In [None]:
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
import nltk
import string
import contractions
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score ,accuracy_score,classification_report

# DATA CLEANING

In [None]:
# Load the labelled dataset used throughout the notebook; the path is relative to the working directory.
df = pd.read_csv('labeled_data.csv')

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts, memory usage).
# info is a method: without the call parentheses the cell only shows the bound-method repr.
df.info()

In [None]:
# Preview the first ten rows to check how the file was parsed.
df.head(10)

In [None]:
#Give the "Unnamed: 0" column a meaningful name ("Tweet ID").
# NOTE(review): "Unnamed: 0" is presumably the header-less index column of the CSV - confirm.
# rename only changes the label, so positional column indices are unaffected.
df = df.rename(columns={"Unnamed: 0": "Tweet ID"})

In [None]:
#Count the missing values in every column, listed from fewest to most
missing_per_column = df.isnull().sum()
print(missing_per_column.sort_values())

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Missing-value count per column, in the frame's original column order.
df.isna().sum()

In [None]:
# Memory used by the index and by each column, in bytes
# (object columns are measured shallowly unless deep=True is passed).
df.memory_usage()

# DATA PREPROCESSING

In [None]:
#Expansion of Clitics:
#For example, Shouldn't -> Should not, I'll -> I will
def cliticexp(sent:str):
    """Expand contractions (clitics) in a sentence.

    Args:
        sent: Raw sentence text.

    Returns:
        The sentence with contractions expanded by ``contractions.fix`` and
        with leading/trailing whitespace removed.
    """
    # contractions.fix has to see whole words. Iterating over `sent` would
    # hand it one character at a time, so no contraction could ever match.
    return contractions.fix(sent).strip()

In [None]:
#Splits a sentence into word tokens
def sent_tokenizer(sent:str):
    """Split ``sent`` on every character that is not an ASCII letter, a digit,
    a hyphen or an apostrophe, and return the non-empty pieces in order."""
    pieces = re.split(r"[^A-Za-z0-9-']",sent)
    # Consecutive separators yield empty strings; keep only the real tokens.
    return [piece for piece in pieces if piece]

In [None]:
#Joins a list of tokens into one lower-case, space-separated string
def listtostring(L:list):
    """Lower-case every token in ``L`` and join them with single spaces.

    Args:
        L: List of string tokens.

    Returns:
        The joined string with leading/trailing whitespace stripped
        (an empty string for an empty list).
    """
    # str.join avoids rebuilding the string on every iteration and avoids a
    # local variable that would shadow the imported `string` module.
    return ' '.join(token.lower() for token in L).strip()

In [None]:
#Tokenize the text in column position 6 of every row and rebuild it as a
#lower-case, space-separated string (presumably the raw tweet text - confirm)
tokenized = [listtostring(sent_tokenizer(text)) for text in df.iloc[:, 6]]

In [None]:
# Show only the first few processed entries; echoing the whole list floods the cell output.
tokenized[:5]

In [None]:
# Store the processed text as a new column (added to, or overwritten on, df in place).
df['Processed_Tweets'] = tokenized

In [None]:
# Preview the first five rows of the frame.
df.head(5)

# EXPLORATORY DATA ANALYSIS

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Number of rows per class label, most frequent first - shows how balanced the classes are.
df['class'].value_counts()

In [None]:
# Concise summary of the frame: columns, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Bar chart of the number of rows per class label.
# Drawn with the pandas/matplotlib already imported by this notebook: seaborn (`sns`)
# is never imported, so calling it raises NameError on a fresh kernel.
fig, ax = plt.subplots(figsize=(5,5))
df['class'].value_counts().sort_index().plot.bar(ax=ax, rot=0)
ax.set(title="Tweets per class", xlabel="class", ylabel="count");

# BUILDING A BASELINE MODEL

In [None]:
# Build the TF-IDF vectorizer and the document-term matrix for a set of documents
def vectorize_train(training_docs):
    """Fit a TF-IDF model on ``training_docs``.

    The vectorizer drops English stop words and uses 1- to 3-gram features.

    Returns:
        A ``(vectorizer, matrix)`` tuple: the fitted ``TfidfVectorizer`` and
        the document-term matrix produced by ``fit_transform``.
    """
    tfidf_model = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
    doc_term_matrix = tfidf_model.fit_transform(training_docs)
    return tfidf_model, doc_term_matrix

In [None]:
#Fit the TF-IDF vectorizer on the processed tweets and build the document-term matrix
# NOTE(review): this fits on every row of df, i.e. before any train/test split,
# so vocabulary and IDF weights are computed from rows that later end up in the test set.
vectorizer, tfidf_train = vectorize_train(df['Processed_Tweets'])

In [None]:
#Train Test Split
# Split the processed text first, then fit TF-IDF on the training portion only.
# Fitting the vectorizer on the full corpus would let vocabulary and IDF statistics
# from the held-out tweets leak into training and inflate the test scores.
docs_train, docs_test, Y_train, Y_test = train_test_split(
    df['Processed_Tweets'], df['class'], test_size=0.25, random_state=0)
vectorizer, X_train = vectorize_train(docs_train)
# Project the held-out documents onto the vocabulary learned from the training split.
X_test = vectorizer.transform(docs_test)

In [None]:
#Building a logistic regression model
# C=5e1 (= 50) is the inverse regularisation strength, so regularisation is weaker than the default C=1.0.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases (1.5+; confirm against
# the installed version). No max_iter is passed, so the solver's default iteration limit applies.
LR = LogisticRegression(C=5e1, solver='lbfgs', multi_class='multinomial')

In [None]:
#Fitting the model
# Train the classifier on the training-split features and their class labels.
LR.fit(X_train,Y_train)

In [None]:
#Making the prediction
# Predict a class label for every row of the held-out test split.
pred = LR.predict(X_test)

In [None]:
#RESULTS
# Confusion matrix (rows = true class, columns = predicted class) and overall accuracy.
# Per-class precision, recall and F1 are derived from this matrix in the cells that follow.
confusion = confusion_matrix(Y_test, pred)
accuracy = accuracy_score(Y_test, pred)

In [None]:
# Show the confusion matrix under a heading.
print("CONFUSION MATRIX", confusion, sep="\n")

In [None]:
#Class-wise Precision score
# Precision = diagonal entry / column total (all samples predicted as that class).
predicted_totals = confusion.sum(axis=0)
ph = confusion[0, 0] / predicted_totals[0]
poff = confusion[1, 1] / predicted_totals[1]
pn = confusion[2, 2] / predicted_totals[2]
print("Precision for Hate Tweets:", ph)
print("Precision for Offensive Tweets:", poff)
print("Precision for Neutral Tweets:", pn)

In [None]:
#Class-wise Recall/Sensitivity score
# Recall = diagonal entry / row total (all samples that truly belong to that class).
actual_totals = confusion.sum(axis=1)
rh = confusion[0, 0] / actual_totals[0]
roff = confusion[1, 1] / actual_totals[1]
rn = confusion[2, 2] / actual_totals[2]
print("Recall for Hate Tweets:", rh)
print("Recall for Offensive Tweets:", roff)
print("Recall for Neutral Tweets:", rn)

In [None]:
#Class-wise F1 Score
def _f1(precision, recall):
    """Harmonic mean of a precision/recall pair."""
    return (2 * precision * recall) / (precision + recall)

f1h = _f1(ph, rh)
f1off = _f1(poff, roff)
f1n = _f1(pn, rn)
print("F1 Score for Hate Tweets:", f1h)
print("F1 Score for Offensive Tweets:", f1off)
print("F1 Score for Neutral Tweets:", f1n)

In [None]:
#Class-wise Accuracy Score
# One-vs-rest accuracy for class k: (TP_k + TN_k) / N.
# This cell previously repeated the recall computation and printed "Recall" labels.
n_samples = confusion.sum()
true_pos = np.diag(confusion)
# TN_k = N - (row k total) - (column k total) + TP_k, because TP_k sits in both totals.
true_neg = n_samples - confusion.sum(axis=1) - confusion.sum(axis=0) + true_pos
class_accuracy = (true_pos + true_neg) / n_samples
acch, accoff, accn = class_accuracy[0], class_accuracy[1], class_accuracy[2]
print("Accuracy for Hate Tweets:",acch)
print("Accuracy for Offensive Tweets:",accoff)
print("Accuracy for Neutral Tweets:",accn)

In [None]:
#Classification Report
# Per-class precision/recall/F1 table from scikit-learn, followed by overall accuracy in percent.
report = classification_report(Y_test, pred)
print(report)
overall_pct = "{0:.4f}".format(accuracy*100)
print("Overall Accuracy =", overall_pct, "%")