-
Notifications
You must be signed in to change notification settings - Fork 0
/
hatespeech_cleaning.py
76 lines (44 loc) · 1.58 KB
/
hatespeech_cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python
# coding: utf-8
# # Hate Speech Detection Using Machine Learning
#
# In[2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from pandas import *
#clean_string(): takes a string removes not alphabet characters and stopwords
#clean_dataset(): takes a csv filename and cleans it returns data sets and labels
def clean_string(raw_review):
    """Strip HTML markup, non-alphabetic characters, and English stopwords.

    Args:
        raw_review: Raw text, possibly containing HTML markup.

    Returns:
        A lowercase string of space-separated words with English
        stopwords removed.
    """
    # Name the parser explicitly: bs4 warns when none is given and may
    # otherwise pick a different parser per environment.
    text = BeautifulSoup(raw_review, "html.parser").get_text()
    # Replace every non-letter character with a space before tokenizing.
    letters_only = re.sub(r'[^a-zA-Z]', " ", text)
    words = letters_only.lower().split()
    # Set lookup is O(1) per word versus O(n) on the raw stopword list.
    stops = set(stopwords.words('english'))
    meaningful_words = [w for w in words if w not in stops]
    return " ".join(meaningful_words)
def clean_dataset(filename):
    """Read a labeled CSV file and build a cleaned binary dataset.

    Expects the class code in column index 5 (comma-split). Label
    mapping: class 0 (hate speech) -> 1, class 2 (not hate speech) -> 0;
    class 1 and rows without a numeric column 5 are skipped.

    Args:
        filename: Path to the labeled CSV file (first row is a header).

    Returns:
        Tuple (data_set, data_labels) of cleaned line texts and their
        binary labels, in file order.
    """
    with open(filename, 'r') as dat:
        lines = dat.readlines()
    data_set = []
    data_labels = []
    # Skip the header row; split each line exactly once instead of
    # re-splitting the same line up to four times per iteration.
    for line in lines[1:]:
        fields = line.split(',')
        # Guard against rows where the naive comma split does not leave a
        # numeric class in column 5 (e.g. tweets containing commas).
        if len(fields) < 6 or not fields[5].isdigit():
            continue
        label = int(fields[5])
        if label == 0:  # hatespeech
            data_labels.append(1)
            data_set.append(clean_string(line))
        elif label == 2:  # not hatespeech
            data_labels.append(0)
            data_set.append(clean_string(line))
    return data_set, data_labels
def main():
    """Clean the labeled hate-speech dataset and write it to data.csv.

    Reads 'labeled_data.csv' from the working directory and writes the
    cleaned tweets and binary labels as a tab-separated 'data.csv'.
    """
    filename = 'labeled_data.csv'
    data_set, data_labels = clean_dataset(filename)
    # create a csv file of the clean dataset; pd.DataFrame is explicit
    # rather than relying on the star-import from pandas.
    df = pd.DataFrame({'tweet': data_set, 'class': data_labels})
    df.to_csv('data.csv', sep='\t')


# Guard the entry point so importing this module does not trigger the
# full cleaning run as a side effect.
if __name__ == '__main__':
    main()
# In[ ]: