In [1]:
from text_processor import text_processor
import pandas as pd

# Train Test Split
from sklearn.model_selection import train_test_split

# Vectorisation module
from sklearn.feature_extraction.text import CountVectorizer

# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Evaluation metrics
from sklearn.metrics import f1_score

In [5]:
# Load the processed dataset from disk and preview one randomly chosen row.
dataset_path = r'F:\GUVI_DATA_SCIENCE\Project\Hate_Speech_Classification\Hate_Speech_Dataset\final_processed.csv'
df = pd.read_csv(dataset_path)
df.sample()

Unnamed: 0.1,Unnamed: 0,text,label
785,792,suspect mr harrison mr currie repeat probation...,noHate


In [6]:
# Number of rows per class in the target column.
label_counts = df['label'].value_counts()
label_counts

label
noHate    9507
hate      1196
Name: count, dtype: int64

In [7]:
# Count the missing values in every column.
df.isna().sum()

Unnamed: 0      0
text          121
label           0
dtype: int64

In [8]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis='index')

In [9]:
# Re-count the missing values per column after the rows with nulls were dropped.
df.isna().sum()

Unnamed: 0    0
text          0
label         0
dtype: int64

In [10]:
# Encode the string labels as integers: 'noHate' -> 1, 'hate' -> 0.
label_to_id = {'noHate' : 1, 'hate':0}

# Values that are not keys of the mapping pass through unchanged. That covers
# the integers produced by an earlier run of this cell, so re-running it is
# harmless; a plain Series.map(dict) would turn them into NaN and wipe out
# every label. Unexpected label strings are likewise kept as-is instead of
# silently becoming NaN.
# assign() builds a new frame rather than writing into the existing one.
df = df.assign(label = df['label'].map(lambda value: label_to_id.get(value, value)))

##### **Model Training**

In [11]:
# Model input is the text column; the target is the label column.
x, y = df['text'], df['label']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
print(x_train.shape, x_test.shape)

(8465,) (2117,)


##### **Vectorisation of Text**

In [12]:
# The vectorizer is fitted on the training texts only; the test texts are then
# transformed with that already-fitted vectorizer.
cv = CountVectorizer()
x_train_vectorised_1 = cv.fit_transform(x_train)

x_test_vectorised_1 = cv.transform(x_test)

##### **Fitting the model**

In [14]:
def model_training(model_name, x_train, y_train):
    """Create the classifier selected by ``model_name`` and fit it.

    Parameters
    ----------
    model_name : str
        One of 'RandomForestClassifier', 'XGBClassifier', 'SVC' or
        'KNeighborsClassifier'.
    x_train, y_train
        Training features and targets; handed to the estimator's ``fit``
        unchanged.

    Returns
    -------
    The estimator after ``fit`` has been called on it.

    Raises
    ------
    ValueError
        If ``model_name`` is none of the supported names.
    """
    # Each entry pairs a supported name with a lazy factory, so only the
    # requested estimator is ever constructed.
    supported = (
        ('RandomForestClassifier', lambda: RandomForestClassifier(random_state=42)),
        ('XGBClassifier', lambda: XGBClassifier(random_state=42)),
        ('SVC', lambda: SVC(random_state=42)),
        ('KNeighborsClassifier', lambda: KNeighborsClassifier()),
    )

    build = next((factory for name, factory in supported if model_name == name), None)
    if build is None:
        raise ValueError('Invalid Model name')

    estimator = build()
    estimator.fit(x_train, y_train)
    return estimator

##### **Random Forest Classifier**

In [15]:
# Fit the random forest, then predict on the training and the test features.
rfc = model_training('RandomForestClassifier', x_train_vectorised_1, y_train)

y_train_pred_1, y_test_pred_1 = (
    rfc.predict(features)
    for features in (x_train_vectorised_1, x_test_vectorised_1)
)

##### **XGBClassifier**

In [16]:
# Fit the XGBoost classifier, then predict on the training and the test features.
xgbc = model_training('XGBClassifier', x_train_vectorised_1, y_train)

y_train_pred_2, y_test_pred_2 = (
    xgbc.predict(features)
    for features in (x_train_vectorised_1, x_test_vectorised_1)
)

##### **SVC**

In [17]:
# Fit the support vector classifier, then predict on the training and the test features.
svc = model_training('SVC', x_train_vectorised_1, y_train)

y_train_pred_3, y_test_pred_3 = (
    svc.predict(features)
    for features in (x_train_vectorised_1, x_test_vectorised_1)
)

##### **KNeighborsClassifier**

In [18]:
# Fit the k-nearest-neighbours classifier, then predict on the training and the test features.
knc = model_training('KNeighborsClassifier', x_train_vectorised_1, y_train)

y_train_pred_4, y_test_pred_4 = (
    knc.predict(features)
    for features in (x_train_vectorised_1, x_test_vectorised_1)
)

##### **Evaluation**

##### **1. f1 score**

##### **Random Forest Classifier**

In [23]:
# F1 of the random forest on both splits.
# NOTE(review): f1_score uses label 1 as the positive class by default, and
# label 1 is 'noHate' under the encoding applied to df['label'].
train_score_1 = f1_score(y_train, y_train_pred_1)
test_score_1 = f1_score(y_test, y_test_pred_1)

print('The f1 score of training set :',train_score_1)
print('The f1 score of testing set :',test_score_1)

The f1 score of training set : 0.9980161354318212
The f1 score of testing set : 0.9200102223358038


##### **XGBClassifier**

In [24]:
# F1 of the XGBoost classifier on both splits.
# NOTE(review): f1_score uses label 1 as the positive class by default, and
# label 1 is 'noHate' under the encoding applied to df['label'].
train_score_2 = f1_score(y_train, y_train_pred_2)
test_score_2 = f1_score(y_test, y_test_pred_2)

print('The f1 score of training set :',train_score_2)
print('The f1 score of testing set :',test_score_2)

The f1 score of training set : 0.9433302093095908
The f1 score of testing set : 0.9345747357825868


##### **SVC**

In [25]:
# F1 of the support vector classifier on both splits.
# NOTE(review): f1_score uses label 1 as the positive class by default, and
# label 1 is 'noHate' under the encoding applied to df['label'].
train_score_3 = f1_score(y_train, y_train_pred_3)
test_score_3 = f1_score(y_test, y_test_pred_3)

print('The f1 score of training set :',train_score_3)
print('The f1 score of testing set :',test_score_3)

The f1 score of training set : 0.9495000314445632
The f1 score of testing set : 0.9345747357825868


##### **KNeighborsClassifier**

In [26]:
# F1 of the k-nearest-neighbours classifier on both splits.
# NOTE(review): f1_score uses label 1 as the positive class by default, and
# label 1 is 'noHate' under the encoding applied to df['label'].
train_score_4 = f1_score(y_train, y_train_pred_4)
test_score_4 = f1_score(y_test, y_test_pred_4)

print('The f1 score of training set :',train_score_4)
print('The f1 score of testing set :',test_score_4)

The f1 score of training set : 0.9369838648202261
The f1 score of testing set : 0.912045395924684


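##### *Conclusion - RandomForestClassifier is chosen for this scenario, with an F1 score of about 0.998 (99%) on the training set and about 0.920 (92%) on the testing set. Note that these are F1 scores, not accuracy, and that XGBClassifier and SVC both reach a slightly higher F1 score (about 0.935) on the testing set.*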

##### **Saving the model**

In [27]:
import pickle

In [28]:
# Persist the fitted artifacts that are needed at prediction time. Each file is
# opened in a `with` block so it is closed (and therefore flushed) as soon as
# the dump has finished, instead of leaving the handle open.

# Model Object
with open(r'F:\GUVI_DATA_SCIENCE\Project\Hate_Speech_Classification\Artifacts\model.pkl', 'wb') as model_file:
    pickle.dump(rfc, model_file)

# Vectorizer
with open(r'F:\GUVI_DATA_SCIENCE\Project\Hate_Speech_Classification\Artifacts\vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

##### **User Input Prediction**

In [30]:
text = 'As of March 13th , 2014 , the booklet had been downloaded over 18,300 times and counting .'

# Text processing
# NOTE(review): text_processor comes from the project module; the result is
# indexed with ['text'] below, so it presumably returns a DataFrame-like
# object with a 'text' column - confirm against text_processor.
processed_df = text_processor(text)

# Load the saved artifacts. The `with` blocks close each file once it has been
# read, instead of leaving the handle open.
# NOTE: pickle.load can execute arbitrary code, so only load trusted artifact files.
with open(r'F:\GUVI_DATA_SCIENCE\Project\Hate_Speech_Classification\Artifacts\model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

with open(r'F:\GUVI_DATA_SCIENCE\Project\Hate_Speech_Classification\Artifacts\vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# Vectorise the processed text with the saved vectorizer.
input_vectorized = vectorizer.transform(processed_df['text'])

input_prediction = model.predict(input_vectorized)

# Only one text was supplied, so the first prediction is the answer.
input_prediction = input_prediction[0]

# Label 1 is 'noHate' and label 0 is 'hate' under the encoding applied to df['label'].
if input_prediction == 1:
    print('no hate')
else:
    print('hate')

no hate
