In [52]:
import numpy as np                         
import pandas as pd         
import matplotlib.pyplot as plt         
import re                                  
import string                              
import nltk                                
from nltk.corpus import stopwords          
from nltk.stem import PorterStemmer       
import random                             

%matplotlib inline   

### Importing Data

In [53]:
df = pd.read_csv('labeled_data.csv')

In [55]:
df = df.drop(['Unnamed: 0','count','hate_speech', 'offensive_language', 'neither'], axis=1)

In [56]:
df.head()

Unnamed: 0,class,tweet
0,2,"""@mayasolovely: As a woman you shouldn't compl..."
1,1,"""@mleew17: boy dats cold...tyga dwn bad for cu..."
2,1,"""@UrKindOfBrand Dawg!!!! RT @80sbaby4life: You..."
3,1,"""@C_G_Anderson: @viva_based she look like a tr..."
4,1,"""@ShenikaRoberts: The shit you hear about me m..."


### Text Processing

In [67]:
df['tweet'] = df.tweet.apply(lambda x: [i for i in x.split(" ") if not ("@" in i)])

In [68]:
df.tweet

0        [As, a, woman, you, shouldn't, complain, about...
1        [boy, dats, cold...tyga, dwn, bad, for, cuffin...
2        [Dawg!!!!, RT, You, ever, fuck, a, bitch, and,...
3                             [she, look, like, a, tranny]
4        [The, shit, you, hear, about, me, might, be, t...
                               ...                        
24778    [you's, a, muthaf***in, lie, right!, His, TL, ...
24779    [you've, gone, and, broke, the, wrong, heart, ...
24780    [young, buck, wanna, eat!!.., dat, nigguh, lik...
24781        [youu, got, wild, bitches, tellin, you, lies]
24782    [~~Ruffled, |, Ntac, Eileen, Dahlia, -, Beauti...
Name: tweet, Length: 24783, dtype: object

In [69]:
df.tweet = df.tweet.apply(lambda x: " ".join(x))

In [75]:
df.tweet[400]

'Niggas whole hoes out here cuz" Couldn\'t had said it better'

In [76]:
df['tweet'] = df['tweet'].str.replace('\d+', '')

In [73]:
punctuation = "!#$%^&*()_+<>?:.,;''"

In [80]:
df['tweet'] = df['tweet'].str.replace('#', '')
df['tweet'] = df['tweet'].str.replace('!', '')
df['tweet'] = df['tweet'].str.replace('$', '')
df['tweet'] = df['tweet'].str.replace('%', '')
df['tweet'] = df['tweet'].str.replace('*', '')
df['tweet'] = df['tweet'].str.replace(';', '')
df['tweet'] = df['tweet'].str.replace('&', '')
df['tweet'] = df['tweet'].str.replace('""', '')
df['tweet'] = df['tweet'].str.replace('.', '')
df['tweet'] = df['tweet'].str.replace('_','')

### Train and Test Split

In [81]:
from sklearn.model_selection import train_test_split

In [82]:
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['class'] ,test_size=0.3,random_state=42)


### Vectorization

In [83]:
from sklearn.feature_extraction.text import CountVectorizer

In [84]:
bow_transformer = CountVectorizer()

In [85]:
bow_transformer.fit(X_train)

CountVectorizer()

In [86]:
print(f'There are {len(bow_transformer.get_feature_names())} tokens in the vocabulary:')
print(bow_transformer.get_feature_names())

There are 19240 tokens in the vocabulary:


In [87]:
X_train_bow = bow_transformer.transform(X_train)

In [88]:
pd.DataFrame(X_train_bow.toarray(), columns=bow_transformer.get_feature_names())

Unnamed: 0,aa,aaahhhhh,aahahah,aakwohdfgv,aaliyah,aamjhhowy,aampt,aan,aanpsujmyi,aap,...,zwoawgi,zwrimdzqnv,zwroiu,zwttsbcc,zwxyrmi,zxzlhtim,zyccjbitches,zycuodiwkz,zzzquil,zzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17343,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17344,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17345,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17346,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [109]:
print(f'Shape of Sparse Matrix: {X_train_bow.shape}')
print(f'Amount of Non-Zero occurences: {X_train_bow.nnz}')

# the sparsity of the matrix is calculated as the number of zero-valued elements divided by the total number of elements 
# a matrix will be sparse when its sparsity is greater than 0.5
sparsity = (X_train_bow.shape[0] * X_train_bow.shape[1] - X_train_bow.nnz)/ (X_train_bow.shape[0] * X_train_bow.shape[1]) *100
print(f'Sparsity: {sparsity}')

Shape of Sparse Matrix: (17348, 19240)
Amount of Non-Zero occurences: 204196
Sparsity: 99.93882235581567


### Training a Model

In [90]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

In [91]:
model = OneVsRestClassifier(LinearSVC(random_state=0))

In [92]:
model.fit(X_train_bow, y_train)

OneVsRestClassifier(estimator=LinearSVC(random_state=0))

### Model Evaluation

In [93]:
X_test_bow = bow_transformer.transform(X_test)
X_test_bow.shape

(7435, 19240)

In [94]:
predictions = model.predict(X_test_bow)

In [95]:
from sklearn.metrics import confusion_matrix

In [96]:
confusion_matrix(y_test,predictions)

array([[ 129,  257,   41],
       [ 175, 5404,  168],
       [  38,  210, 1013]], dtype=int64)

In [97]:
from sklearn.metrics import classification_report

In [98]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.38      0.30      0.34       427
           1       0.92      0.94      0.93      5747
           2       0.83      0.80      0.82      1261

    accuracy                           0.88      7435
   macro avg       0.71      0.68      0.69      7435
weighted avg       0.87      0.88      0.88      7435



## Training Other Models

In [115]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [107]:
model = DecisionTreeClassifier()
model.fit(X_train_bow, y_train)

X_test_bow = bow_transformer.transform(X_test)
X_test_bow.shape

predictions = model.predict(X_test_bow)
confusion_matrix(y_test,predictions)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.31      0.30      0.30       427
           1       0.93      0.93      0.93      5747
           2       0.82      0.83      0.83      1261

    accuracy                           0.87      7435
   macro avg       0.69      0.68      0.68      7435
weighted avg       0.87      0.87      0.87      7435



In [112]:
model = RandomForestClassifier()
model.fit(X_train_bow, y_train)

X_test_bow = bow_transformer.transform(X_test)
X_test_bow.shape

predictions = model.predict(X_test_bow)
confusion_matrix(y_test,predictions)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.44      0.13      0.20       427
           1       0.88      0.97      0.92      5747
           2       0.87      0.66      0.75      1261

    accuracy                           0.87      7435
   macro avg       0.73      0.59      0.62      7435
weighted avg       0.85      0.87      0.85      7435



In [114]:
model = MLPClassifier()
model.fit(X_train_bow, y_train)

X_test_bow = bow_transformer.transform(X_test)
X_test_bow.shape

predictions = model.predict(X_test_bow)
confusion_matrix(y_test,predictions)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.32      0.26      0.29       427
           1       0.91      0.93      0.92      5747
           2       0.80      0.75      0.77      1261

    accuracy                           0.86      7435
   macro avg       0.68      0.65      0.66      7435
weighted avg       0.85      0.86      0.86      7435





In [116]:
model = RidgeClassifier()
model.fit(X_train_bow, y_train)

X_test_bow = bow_transformer.transform(X_test)
X_test_bow.shape

predictions = model.predict(X_test_bow)
confusion_matrix(y_test,predictions)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.41      0.17      0.24       427
           1       0.90      0.94      0.92      5747
           2       0.77      0.74      0.75      1261

    accuracy                           0.86      7435
   macro avg       0.69      0.62      0.64      7435
weighted avg       0.85      0.86      0.85      7435



In [118]:
model = GradientBoostingClassifier()
model.fit(X_train_bow, y_train)

X_test_bow = bow_transformer.transform(X_test)
X_test_bow.shape

predictions = model.predict(X_test_bow)
confusion_matrix(y_test,predictions)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.54      0.18      0.27       427
           1       0.89      0.96      0.92      5747
           2       0.83      0.70      0.76      1261

    accuracy                           0.87      7435
   macro avg       0.75      0.61      0.65      7435
weighted avg       0.86      0.87      0.86      7435

