In [2]:
import pandas as pd
import re

from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

In [149]:
train = pd.read_csv("output_filtered.csv")
print("Training set", train.columns, ", ", train.shape)

Training set Index(['hate_speech_count', 'tweet'], dtype='object') ,  (718757, 2)


In [14]:
def preproc(data_frame, field):
    data_frame[field] = data_frame[field].str.lower()
    data_frame[field] = data_frame[field].apply(lambda x:re.sub(r"(@[\w]+)|([^0-9A-z \t])|(\w+:\/\/\s+)|^rt|http.+?", '', x))
    return data_frame

In [150]:
train = preproc(train, 'tweet')
print(train)

        hate_speech_count                                              tweet
0                       0   rt  as a woman you shouldnt complain about cl...
1                       0   rt  boy dats coldtyga dwn bad for cuffin dat ...
2                       0   rt  dawg rt  you ever fuck a bitch and she st...
3                       0                        rt   she look like a tranny
4                       0   rt  the shit you hear about me might be true ...
...                   ...                                                ...
718752                  1  i mute this telecasting and played kanye west ...
718753                  1  but hell yeah he s not a bachelor but looooooo...
718754                  1  great video musician but s not my musician lol...
718755                  1  not great pop video yeah he s not a pedophile ...
718756                  1  great video yeah he s non a paedophile lolllll...

[718757 rows x 2 columns]


In [151]:
train_maj = train[train.hate_speech_count == 0]
train_min = train[train.hate_speech_count == 1]

In [152]:
train_min_upsampled = resample(train_min, replace=True, n_samples=len(train_maj), random_state = 99)
train_subsampled = pd.concat([train_min_upsampled, train_maj])
print(train_subsampled['hate_speech_count'].value_counts())

0    372654
1    372654
Name: hate_speech_count, dtype: int64


In [153]:
SGD = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('nb', SGDClassifier())])
x_train, X_test, Y_train, Y_test = train_test_split(train_subsampled['tweet'], train_subsampled['hate_speech_count'], random_state=42,test_size=0.2)

model = SGD.fit(x_train, Y_train)
y_predict = model.predict(X_test)

print(f1_score(Y_test, y_predict)*100)

81.90597991970519


In [154]:
print("Is hate_speech: ",bool(model.predict(["""
Because sometimes, against all the odds, we keep the faith, and hope! 

We don't care which political party is doing what publicity. We don't want to waste our energy on the people who're 'NOT' speaking up. We would rather give our energy where it is needed   to unite with anyone who is standing up for the seen truth. The truth of evil. The truth of human rights violation. The truth of islamophobia. The truth of   being jer* and the truth of people of g@z@.

We will show  solidarity because we know they need us. They need to be heard. And our voices will reach out to them. Our voices are the weapons for them. 

What can we do more? 
We will keep posting. We will put pressure on the authorities for ceasefire. We want genocide to stop. We will donate, we will learn the history. We will educate ourselves and others. And most importantly, we will pray for them! It's high time. It's high time for us to unite for  

Prayers and love for p@lest!n3 and its people, always and forever!  
"""])[0]))

Is hate_speech:  True


In [145]:
import joblib

In [155]:
joblib.dump(model, 'benGvir_NLP_filtered.joblib')

['benGvir_NLP_filtered.joblib']

In [156]:
loaded_model = joblib.load('benGvir_NLP_filtered.joblib')

In [157]:
new_text = ["en passant"]

new_text_predictions = loaded_model.predict(new_text)

print(f'Predictions for the new text: {new_text_predictions}')

Predictions for the new text: [0]
