# Importing the libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
import os

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Importing the dataset

In [2]:
dataset = pd.read_csv('/content/labeled_data.csv')
dataset.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [4]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,24783.0,12681.192027,7299.553863,0.0,6372.5,12703.0,18995.5,25296.0
count,24783.0,3.243473,0.88306,3.0,3.0,3.0,3.0,9.0
hate_speech,24783.0,0.280515,0.631851,0.0,0.0,0.0,0.0,7.0
offensive_language,24783.0,2.413711,1.399459,0.0,2.0,3.0,3.0,9.0
neither,24783.0,0.549247,1.113299,0.0,0.0,0.0,0.0,9.0
class,24783.0,1.110277,0.462089,0.0,1.0,1.0,1.0,2.0


In [5]:
dt_trasformed = dataset[['class', 'tweet']]
y = dt_trasformed.iloc[:, :-1].values

In [6]:
print(y)

[[2]
 [1]
 [1]
 ...
 [1]
 [1]
 [2]]


### Encoding the Dependent Variable

In [7]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
y = np.array(ct.fit_transform(y))

In [8]:
print(y)

[[0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


I separated this y in two variables that we will use to fit hate speech models and offensive speech models

In [9]:
y_df = pd.DataFrame(y)
y_hate = np.array(y_df[0])
y_offensive = np.array(y_df[1])

In [10]:
print(y_hate)
print(y_offensive)

[0. 0. 0. ... 0. 0. 0.]
[0. 1. 1. ... 1. 1. 0.]


## Cleaning the texts

In [11]:
corpus = []
for i in range(0, 24783):
  review = re.sub('[^a-zA-Z]', ' ', dt_trasformed['tweet'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

In [12]:
cv = CountVectorizer(max_features = 2000)
X = cv.fit_transform(corpus).toarray()

## Splitting the dataset into the Training set and Test set

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y_hate, test_size = 0.20, random_state = 0)

## Finding the best models to predict hate speech

**Naive Bayes**

In [14]:
classifier_np = GaussianNB()
classifier_np.fit(X_train, y_train)

GaussianNB()

**Decision Tree**

In [15]:
classifier_dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier_dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

**KNN**

In [16]:
classifier_knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier_knn.fit(X_train, y_train)

KNeighborsClassifier()

**Logistic Regression**

In [17]:
classifier_lr = LogisticRegression(random_state = 0)
classifier_lr.fit(X_train, y_train)

LogisticRegression(random_state=0)

**Random Forest**

In [18]:
classifier_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier_rf.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

# Making the Confusion Matrix for each model

In [19]:
#Naive Bayes
y_pred_np = classifier_np.predict(X_test)
cm = confusion_matrix(y_test, y_pred_np)
print(cm)

[[1969 2709]
 [  90  189]]


In [20]:
#Decision Tree
y_pred_dt = classifier_dt.predict(X_test)
cm = confusion_matrix(y_test, y_pred_dt)
print(cm)

[[4455  223]
 [ 188   91]]


In [21]:
#Linear Regression
y_pred_lr = classifier_lr.predict(X_test)
cm = confusion_matrix(y_test, y_pred_lr)
print(cm)

[[4614   64]
 [ 229   50]]


In [22]:
#Random Forest
y_pred_rf = classifier_rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred_rf)
print(cm)

[[4586   92]
 [ 213   66]]


In [23]:
rf_score = accuracy_score(y_test, y_pred_rf)
lr_score = accuracy_score(y_test, y_pred_lr)
dt_score = accuracy_score(y_test, y_pred_dt)
np_score = accuracy_score(y_test, y_pred_np)

print('Random Forest Accuracy: ', str(rf_score))
print('Linear Regression Accuracy: ', str(lr_score))
print('Decision Tree Accuracy: ', str(dt_score))
print('Naive Bayes Accuracy: ', str(np_score))

Random Forest Accuracy:  0.9384708493040145
Linear Regression Accuracy:  0.940891668347791
Decision Tree Accuracy:  0.9170869477506557
Naive Bayes Accuracy:  0.43534395803913656


In [24]:
import joblib

# Save the model as a pickle in a file
joblib.dump(classifier_lr, 'classifier_lr_hate.pkl')

# Load the model from the file
classifier_lr_hate_from_joblib = joblib.load('classifier_lr_hate.pkl')

# Use the loaded model to make predictions
classifier_lr_hate_from_joblib.predict(X_test[:2])

array([0., 0.])

## Finding best models to predict offensive speech

In [25]:
X_train_off, X_test_off, y_train_off, y_test_off = train_test_split(X, y_offensive, test_size = 0.20, random_state = 0)

### Naive Bayes

In [26]:
classifier_np_off = GaussianNB()
classifier_np_off.fit(X_train_off, y_train_off)

GaussianNB()

### Decision Trees

In [27]:
classifier_dt_off = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier_dt_off.fit(X_train_off, y_train_off)

DecisionTreeClassifier(criterion='entropy', random_state=0)

### KNN

In [28]:
classifier_knn_off = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier_knn_off.fit(X_train_off, y_train_off)

KNeighborsClassifier()

### Logistic Regression

In [29]:
classifier_lr_off = LogisticRegression(random_state = 0)
classifier_lr_off.fit(X_train_off, y_train_off)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=0)

### Random Forest

In [30]:
classifier_rf_off = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier_rf_off.fit(X_train_off, y_train_off)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

## Confusion Matrix

In [34]:
#Naive Bayes
y_pred_np_off = classifier_np_off.predict(X_test_off)
cm = confusion_matrix(y_test, y_pred_np_off)
print(cm)

[[3481 1197]
 [ 240   39]]


In [35]:
#Decision Tree
y_pred_dt_off = classifier_dt_off.predict(X_test_off)
cm = confusion_matrix(y_test, y_pred_dt_off)
print(cm)

[[1056 3622]
 [ 122  157]]


In [37]:
#Linear Regression
y_pred_lr_off = classifier_lr.predict(X_test_off)
cm = confusion_matrix(y_test, y_pred_lr_off)
print(cm)

[[4614   64]
 [ 229   50]]


In [38]:
#Random Forest
y_pred_rf_off = classifier_rf.predict(X_test_off)
cm = confusion_matrix(y_test, y_pred_rf_off)
print(cm)

[[4586   92]
 [ 213   66]]


In [39]:
rf_score = accuracy_score(y_test, y_pred_rf_off)
lr_score = accuracy_score(y_test, y_pred_lr_off)
dt_score = accuracy_score(y_test, y_pred_dt_off)
np_score = accuracy_score(y_test, y_pred_np_off)

print('Random Forest Accuracy: ', str(rf_score))
print('Linear Regression Accuracy: ', str(lr_score))
print('Decision Tree Accuracy: ', str(dt_score))
print('Naive Bayes Accuracy: ', str(np_score))

Random Forest Accuracy:  0.9384708493040145
Linear Regression Accuracy:  0.940891668347791
Decision Tree Accuracy:  0.24470445834173896
Naive Bayes Accuracy:  0.7101069195077668


In [40]:
import joblib

# Save the model as a pickle in a file
joblib.dump(classifier_lr_off, 'classifier_lr_off.pkl')

# Load the model from the file
classifier_lr_hate_from_joblib = joblib.load('classifier_lr_off.pkl')

# Use the loaded model to make predictions
classifier_lr_hate_from_joblib.predict(X_test_off[:2])

array([0., 1.])

In [41]:
y_test_off[:2]

array([0., 1.])