## Importing libraries

In [1]:
import pandas as pd
import numpy as np

## Data Collection and Pre-processing

In [2]:
df = pd.read_csv('labeled_data.csv')

In [3]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
5,5,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."
6,6,3,0,3,0,1,"!!!!!!""@__BrighterDays: I can not just sit up ..."
7,7,3,0,3,0,1,!!!!&#8220;@selfiequeenbri: cause I'm tired of...
8,8,3,0,3,0,1,""" &amp; you might not get ya bitch back &amp; ..."
9,9,3,1,2,0,1,""" @rhythmixx_ :hobbies include: fighting Maria..."


In [4]:
df = df.rename(columns={'Unnamed: 0':'Index'})
df.head(10)

Unnamed: 0,Index,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
5,5,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."
6,6,3,0,3,0,1,"!!!!!!""@__BrighterDays: I can not just sit up ..."
7,7,3,0,3,0,1,!!!!&#8220;@selfiequeenbri: cause I'm tired of...
8,8,3,0,3,0,1,""" &amp; you might not get ya bitch back &amp; ..."
9,9,3,1,2,0,1,""" @rhythmixx_ :hobbies include: fighting Maria..."


#### Checking for null values

In [5]:
df.isnull().sum()

Index                 0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

## Data Description

In [6]:
df.shape

(24783, 7)

In [7]:
df.describe()

Unnamed: 0,Index,count,hate_speech,offensive_language,neither,class
count,24783.0,24783.0,24783.0,24783.0,24783.0,24783.0
mean,12681.192027,3.243473,0.280515,2.413711,0.549247,1.110277
std,7299.553863,0.88306,0.631851,1.399459,1.113299,0.462089
min,0.0,3.0,0.0,0.0,0.0,0.0
25%,6372.5,3.0,0.0,2.0,0.0,1.0
50%,12703.0,3.0,0.0,3.0,0.0,1.0
75%,18995.5,3.0,0.0,3.0,0.0,1.0
max,25296.0,9.0,7.0,9.0,9.0,2.0


In [8]:
df["labels"] = df["class"].map({0: "Hate Speech", 1: "Offensive Language", 2: "Neither"})
df.head()

Unnamed: 0,Index,count,hate_speech,offensive_language,neither,class,tweet,labels
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,Neither
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive Language
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Offensive Language
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Offensive Language
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Offensive Language


### Cleaning the texts

In [9]:
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

def clean(speech):
    speech = str(speech).lower()
    speech = re.sub('\[.*?\]','',speech)
    speech = re.sub('https?://\S+|www\.\S+','',speech)
    speech = re.sub('<.*?>+','',speech)
    speech = re.sub('[%s]' % re.escape(string.punctuation),'',speech)
    speech = re.sub('\n','',speech)
    speech = re.sub('\w*\d\w*','',speech)
    
    stopwords_set = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    
    cleaned_speech = []
    for word in speech.split(' '):
        if word not in stopwords_set:
            cleaned_word = stemmer.stem(word)
            cleaned_speech.append(cleaned_word)
    
    return ' '.join(cleaned_speech)

In [10]:
df["tweet"]= df["tweet"].apply(clean)
df.head(5)

Unnamed: 0,Index,count,hate_speech,offensive_language,neither,class,tweet,labels
0,0,3,0,0,3,2,rt mayasolov woman shouldnt complain clean ho...,Neither
1,1,3,0,3,0,1,rt boy dat coldtyga dwn bad cuffin dat hoe ...,Offensive Language
2,2,3,0,3,0,1,rt urkindofbrand dawg rt ever fuck bitch sta...,Offensive Language
3,3,3,0,2,1,1,rt cganderson vivabas look like tranni,Offensive Language
4,4,6,0,6,0,1,rt shenikarobert shit hear might true might f...,Offensive Language


### Splitting dataset into train and test

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
x = np.array(df["tweet"])
y = np.array(df["labels"])

cv = CountVectorizer()
x = cv.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size =0.25, random_state=20)

### Testing the accuracy of a few ML Models on the dataset

In [13]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [14]:
#Naive Bytes
clf = MultinomialNB()
clf.fit(x_train, y_train)
clf_predict = clf.predict(x_test)
clf_acc = accuracy_score(clf_predict, y_test)
print("Test accuracy: {:.2f}%".format(clf_acc * 100))

Test accuracy: 87.48%


In [15]:
#Decision Tree 
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
clf_predict = clf.predict(x_test)
clf_acc = accuracy_score(clf_predict, y_test)
print("Test accuarcy: {:.2f}%".format(clf_acc*100))

Test accuarcy: 87.88%


In [16]:
#Logistic Regression
clf = LogisticRegression(max_iter=1000)
clf.fit(x_train, y_train)
clf_predict = clf.predict(x_test)
clf_acc = accuracy_score(clf_predict, y_test)
print("Test accuarcy: {:.2f}%".format(clf_acc*100))

Test accuarcy: 90.09%


### Logistic Regression gives the highest accuracy score.

#### Classification Report and Confusion Matrix for better understanding

In [17]:
print(confusion_matrix(y_test, clf_predict))
print("\n")
print(classification_report(y_test, clf_predict))

[[  93   29  208]
 [   2  881  138]
 [  77  160 4608]]


                    precision    recall  f1-score   support

       Hate Speech       0.54      0.28      0.37       330
           Neither       0.82      0.86      0.84      1021
Offensive Language       0.93      0.95      0.94      4845

          accuracy                           0.90      6196
         macro avg       0.76      0.70      0.72      6196
      weighted avg       0.89      0.90      0.89      6196



### Sample Examples

In [18]:
sample1 = "You suck"
data = cv.transform([sample1]).toarray()
print(clf.predict(data))

['Offensive Language']


In [19]:
sample2 = "You are good"
data = cv.transform([sample2]).toarray()
print(clf.predict(data))

['Neither']


In [20]:
sample2 = "I hate you and will murder you"
data = cv.transform([sample2]).toarray()
print(clf.predict(data))

['Hate Speech']
