In [None]:
import re
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC

stopwords = nltk.corpus.stopwords.words('english')

### Load Dataset and drop unnecessary columns

In [2]:
# Read the labelled tweets, then keep only the rows at positions 0..5000
# (5001 rows); every row from position 5001 onward is discarded.
df = pd.read_csv("labeled_data.csv").pipe(
    lambda frame: frame.drop(index=frame.index[5001:])
)
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [3]:
# Per-row sum of the hate_speech and offensive_language columns.
df["negative_class"] = df["hate_speech"].add(df["offensive_language"])

In [4]:
# Binary target: 1 when negative_class is strictly greater than "neither", else 0.
df["class"] = df["negative_class"].gt(df["neither"]).astype(int)
# The source columns of the label are dropped once it has been derived.
# Reassigning (rather than inplace=True) keeps the step chainable.
df = df.drop(columns=["count", "hate_speech", "offensive_language", "negative_class", "neither"])
df.head()

Unnamed: 0.1,Unnamed: 0,class,tweet
0,0,0,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


### Preprocessing
#### Steps
* Removing Punctuation
* Removing URLs
* Removing Stop Words
* Lower Casing
* Tokenization
* Stemming
* Lemmatization

Reference: https://www.analyticsvidhya.com/blog/2021/06/text-preprocessing-in-nlp-with-python-codes/

In [5]:
# Punctuation Removal
def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Whitespace left at either end after the removal is stripped.
    """
    deletion_table = str.maketrans("", "", string.punctuation)
    return text.translate(deletion_table).strip()
# Store the punctuation-free version of every tweet in its own column.
df["punctuation_free"] = df["tweet"].map(remove_punctuation)
df.head()

Unnamed: 0.1,Unnamed: 0,class,tweet,punctuation_free
0,0,0,!!! RT @mayasolovely: As a woman you shouldn't...,RT mayasolovely As a woman you shouldnt compla...
1,1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,RT mleew17 boy dats coldtyga dwn bad for cuffi...
2,2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,RT UrKindOfBrand Dawg RT 80sbaby4life You ever...
3,3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,RT CGAnderson vivabased she look like a tranny
4,4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,RT ShenikaRoberts The shit you hear about me m...


In [6]:
# Convert to lowercase
# Lower-case each punctuation-free tweet.
df["tweet_lower"] = df["punctuation_free"].map(str.lower)
df.head()

Unnamed: 0.1,Unnamed: 0,class,tweet,punctuation_free,tweet_lower
0,0,0,!!! RT @mayasolovely: As a woman you shouldn't...,RT mayasolovely As a woman you shouldnt compla...,rt mayasolovely as a woman you shouldnt compla...
1,1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,RT mleew17 boy dats coldtyga dwn bad for cuffi...,rt mleew17 boy dats coldtyga dwn bad for cuffi...
2,2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,RT UrKindOfBrand Dawg RT 80sbaby4life You ever...,rt urkindofbrand dawg rt 80sbaby4life you ever...
3,3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,RT CGAnderson vivabased she look like a tranny,rt cganderson vivabased she look like a tranny
4,4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,RT ShenikaRoberts The shit you hear about me m...,rt shenikaroberts the shit you hear about me m...


In [7]:
# Tokenization
def tokenize(text): # Reference: https://pynative.com/python-regex-split/
    """Split ``text`` on runs of whitespace and return the list of tokens.

    Empty strings produced by ``re.split`` (for an empty input, or for
    leading/trailing whitespace) are discarded, so an empty or
    whitespace-only ``text`` yields ``[]`` rather than ``['']``.
    """
    return [token for token in re.split(r"\s+", text) if token]
# Tokenise every lower-cased tweet into a list of words.
df["tokenized_tweet"] = df["tweet_lower"].map(tokenize)
df.head()

Unnamed: 0.1,Unnamed: 0,class,tweet,punctuation_free,tweet_lower,tokenized_tweet
0,0,0,!!! RT @mayasolovely: As a woman you shouldn't...,RT mayasolovely As a woman you shouldnt compla...,rt mayasolovely as a woman you shouldnt compla...,"[rt, mayasolovely, as, a, woman, you, shouldnt..."
1,1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,RT mleew17 boy dats coldtyga dwn bad for cuffi...,rt mleew17 boy dats coldtyga dwn bad for cuffi...,"[rt, mleew17, boy, dats, coldtyga, dwn, bad, f..."
2,2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,RT UrKindOfBrand Dawg RT 80sbaby4life You ever...,rt urkindofbrand dawg rt 80sbaby4life you ever...,"[rt, urkindofbrand, dawg, rt, 80sbaby4life, yo..."
3,3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,RT CGAnderson vivabased she look like a tranny,rt cganderson vivabased she look like a tranny,"[rt, cganderson, vivabased, she, look, like, a..."
4,4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,RT ShenikaRoberts The shit you hear about me m...,rt shenikaroberts the shit you hear about me m...,"[rt, shenikaroberts, the, shit, you, hear, abo..."


In [8]:
# Remove Stopwords
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in the module-level ``stopwords``.

    ``text`` is an iterable of tokens; order and duplicates are preserved.
    """
    # NOTE(review): tokens reach this step with apostrophes already removed,
    # and e.g. "shouldnt" survives in the displayed output - confirm whether
    # contraction stop words are expected to be filtered here.
    kept_tokens = []
    for token in text:
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Drop stop words from every token list.
df["no_stopwords"] = df["tokenized_tweet"].map(remove_stopwords)
df.head()

Unnamed: 0.1,Unnamed: 0,class,tweet,punctuation_free,tweet_lower,tokenized_tweet,no_stopwords
0,0,0,!!! RT @mayasolovely: As a woman you shouldn't...,RT mayasolovely As a woman you shouldnt compla...,rt mayasolovely as a woman you shouldnt compla...,"[rt, mayasolovely, as, a, woman, you, shouldnt...","[rt, mayasolovely, woman, shouldnt, complain, ..."
1,1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,RT mleew17 boy dats coldtyga dwn bad for cuffi...,rt mleew17 boy dats coldtyga dwn bad for cuffi...,"[rt, mleew17, boy, dats, coldtyga, dwn, bad, f...","[rt, mleew17, boy, dats, coldtyga, dwn, bad, c..."
2,2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,RT UrKindOfBrand Dawg RT 80sbaby4life You ever...,rt urkindofbrand dawg rt 80sbaby4life you ever...,"[rt, urkindofbrand, dawg, rt, 80sbaby4life, yo...","[rt, urkindofbrand, dawg, rt, 80sbaby4life, ev..."
3,3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,RT CGAnderson vivabased she look like a tranny,rt cganderson vivabased she look like a tranny,"[rt, cganderson, vivabased, she, look, like, a...","[rt, cganderson, vivabased, look, like, tranny]"
4,4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,RT ShenikaRoberts The shit you hear about me m...,rt shenikaroberts the shit you hear about me m...,"[rt, shenikaroberts, the, shit, you, hear, abo...","[rt, shenikaroberts, shit, hear, might, true, ..."


In [9]:
# Lemmatization
# One shared lemmatizer instance, reused for every tweet.
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(text):
    """Return a list with ``wordnet_lemmatizer.lemmatize`` applied to each token of ``text``."""
    return list(map(wordnet_lemmatizer.lemmatize, text))
# Lemmatize the stop-word-free tokens of every tweet.
df["lemmatized_tweet"] = df["no_stopwords"].map(lemmatizer)
df.head()

Unnamed: 0.1,Unnamed: 0,class,tweet,punctuation_free,tweet_lower,tokenized_tweet,no_stopwords,lemmatized_tweet
0,0,0,!!! RT @mayasolovely: As a woman you shouldn't...,RT mayasolovely As a woman you shouldnt compla...,rt mayasolovely as a woman you shouldnt compla...,"[rt, mayasolovely, as, a, woman, you, shouldnt...","[rt, mayasolovely, woman, shouldnt, complain, ...","[rt, mayasolovely, woman, shouldnt, complain, ..."
1,1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,RT mleew17 boy dats coldtyga dwn bad for cuffi...,rt mleew17 boy dats coldtyga dwn bad for cuffi...,"[rt, mleew17, boy, dats, coldtyga, dwn, bad, f...","[rt, mleew17, boy, dats, coldtyga, dwn, bad, c...","[rt, mleew17, boy, dat, coldtyga, dwn, bad, cu..."
2,2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,RT UrKindOfBrand Dawg RT 80sbaby4life You ever...,rt urkindofbrand dawg rt 80sbaby4life you ever...,"[rt, urkindofbrand, dawg, rt, 80sbaby4life, yo...","[rt, urkindofbrand, dawg, rt, 80sbaby4life, ev...","[rt, urkindofbrand, dawg, rt, 80sbaby4life, ev..."
3,3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,RT CGAnderson vivabased she look like a tranny,rt cganderson vivabased she look like a tranny,"[rt, cganderson, vivabased, she, look, like, a...","[rt, cganderson, vivabased, look, like, tranny]","[rt, cganderson, vivabased, look, like, tranny]"
4,4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,RT ShenikaRoberts The shit you hear about me m...,rt shenikaroberts the shit you hear about me m...,"[rt, shenikaroberts, the, shit, you, hear, abo...","[rt, shenikaroberts, shit, hear, might, true, ...","[rt, shenikaroberts, shit, hear, might, true, ..."


### Additional Preprocessing Steps..
Additional preprocessing steps that may improve the results, such as URL removal and spelling correction, will be implemented later if time permits. <br>
Reference URL Removal: https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python

### Feature Extraction
* Count Vectorization(Bag-of-Words)
* Tf-idf

In [10]:
# Feature Extraction using Count Vectorization
# Vocabulary = every distinct token found in the lemmatized tweets.
# It is stored as a sorted list so the feature columns built from it come out
# in the same order on every kernel start; iteration order of a plain set of
# strings can change between interpreter runs because of hash randomisation.
vocabulary = sorted({word for tweet in df["lemmatized_tweet"] for word in tweet})

def generate_bag_of_words(text_vector):
    """Return the bag-of-words count vector for one tokenised tweet.

    Element ``i`` of the returned NumPy array is the number of times the
    ``i``-th word yielded by iterating the module-level ``vocabulary``
    occurs in ``text_vector``. Words absent from the tweet count as 0, and
    tokens that are not in ``vocabulary`` are ignored.
    """
    # Tally the tweet once instead of rescanning it for every vocabulary word.
    occurrences = Counter(text_vector)
    return np.array([occurrences[word] for word in vocabulary])
# Build the count vector of every lemmatized tweet.
df["count_vectorized_tweets"] = df["lemmatized_tweet"].map(generate_bag_of_words)
df.head()

Unnamed: 0.1,Unnamed: 0,class,tweet,punctuation_free,tweet_lower,tokenized_tweet,no_stopwords,lemmatized_tweet,count_vectorized_tweets
0,0,0,!!! RT @mayasolovely: As a woman you shouldn't...,RT mayasolovely As a woman you shouldnt compla...,rt mayasolovely as a woman you shouldnt compla...,"[rt, mayasolovely, as, a, woman, you, shouldnt...","[rt, mayasolovely, woman, shouldnt, complain, ...","[rt, mayasolovely, woman, shouldnt, complain, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,RT mleew17 boy dats coldtyga dwn bad for cuffi...,rt mleew17 boy dats coldtyga dwn bad for cuffi...,"[rt, mleew17, boy, dats, coldtyga, dwn, bad, f...","[rt, mleew17, boy, dats, coldtyga, dwn, bad, c...","[rt, mleew17, boy, dat, coldtyga, dwn, bad, cu...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,RT UrKindOfBrand Dawg RT 80sbaby4life You ever...,rt urkindofbrand dawg rt 80sbaby4life you ever...,"[rt, urkindofbrand, dawg, rt, 80sbaby4life, yo...","[rt, urkindofbrand, dawg, rt, 80sbaby4life, ev...","[rt, urkindofbrand, dawg, rt, 80sbaby4life, ev...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,RT CGAnderson vivabased she look like a tranny,rt cganderson vivabased she look like a tranny,"[rt, cganderson, vivabased, she, look, like, a...","[rt, cganderson, vivabased, look, like, tranny]","[rt, cganderson, vivabased, look, like, tranny]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,RT ShenikaRoberts The shit you hear about me m...,rt shenikaroberts the shit you hear about me m...,"[rt, shenikaroberts, the, shit, you, hear, abo...","[rt, shenikaroberts, shit, hear, might, true, ...","[rt, shenikaroberts, shit, hear, might, true, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [11]:
# Feature Extraction using Tf-idf
# Todo

In [12]:
# Split the dataset between Train and Test Set
# 80/20 shuffled split with a fixed seed; features are the per-tweet count
# vectors, labels are the binary "class" column.
X_train, X_test, y_train, y_test = train_test_split(
    df["count_vectorized_tweets"].tolist(),
    df["class"],
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

### Fit the models
Algorithms
* Logistic Regression
* Naive Bayes Classifier
* Support Vector Machine/Decision Trees/Kth Nearest Neighbor/Simple Neural Networks

### Logistic Regression

In [13]:
# Fit a logistic-regression classifier with default settings on the training
# split, then print the value returned by .score() on the test split.
logistic_regression = LogisticRegression().fit(X_train, y_train)
print(logistic_regression.score(X_test, y_test))

0.9330669330669331


### Naive Bayes Classifier

In [14]:
# Fit a Gaussian naive-Bayes classifier on the training split; the cell's
# displayed value is its .score() on the test split.
naive_bayes_classifier = GaussianNB().fit(X_train, y_train)
naive_bayes_classifier.score(X_test, y_test)

0.7582417582417582

### K-Nearest Neighbors

In [None]:
# K-Nearest Neighbours classifier with scikit-learn's default settings.
# KNeighborsClassifier is provided by sklearn.neighbors (imported in the
# notebook's import cell).
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# Accuracy is computed from y_pred so the test set is predicted only once;
# knn.score() would run the same prediction a second time.
knn_accuracy = np.mean(y_pred == np.asarray(y_test))
print("Test set score: {:.2f}".format(knn_accuracy))

In [None]:
# TODO: evaluate the models with accuracy, confusion matrices, F1 score, etc.

In [None]:
# Include a lot of plots throughout the project