In [284]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Data Collection
Loading the Dataset obtained from online 

In [285]:
df=pd.read_csv('labeled_data.csv',index_col=0)
df=df.drop(columns=['count','hate_speech','offensive_language','neither'])

In [286]:
df.head()

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


# Data PreParation
We have cleaned the tweets without any handles, removed Special characters and removed unwanted punctuations.
We did stemming and stop word removal on this dataset.

In [287]:
# this function uses regex to remove links and unwanted punctuations and special characters in the tweets
def split_it(tweet):
    x = re.sub(r'@[a-zA-Z@#_$%^&*0-9;:]*',r'', tweet)
    x= re.sub(r'&[@#$%&()0-9]*',r'',x)
    x= re.sub(r'\sRT\s|!',r'',x)
    x = re.sub(r'https?\S+',r'',x)
    x = x.translate(str.maketrans(string.punctuation,' '*len(string.punctuation)))
    tokens = x.split()
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokens = [ps.stem(token) for token in tokens if token not in stop_words] 
    return ' '.join([token for token in tokens if token not in stop_words])


In [288]:
df['tweet']=df['tweet'].apply(split_it)
df.head()

Unnamed: 0,class,tweet
0,2,As woman complain clean hous amp man alway tak...
1,1,boy dat cold tyga dwn bad cuffin dat hoe 1st p...
2,1,dawg ever fuck bitch start cri confus shit
3,1,look like tranni
4,1,shit hear might true might faker bitch told ya


In [289]:
# saving the cleaned data for future reference
df.to_csv('cleaned_tweets.csv')

# Balancing the Data
Data Splitting to build binary classifier for each category such a way that for each category data is balanced

In [290]:
df_0=df[df['class']==0]
df_1=df[df['class']==1]
df_2=df[df['class']==2]

#getting data from other classifier to balance 0th category
df_0=df_0.append(df_1[:715])
df_0=df_0.append(df_2[:715])

#getting data from other classifier to balance 1st category
df_1=df_1.append(df_0[:1430])
df_1=df_1.append(df_2[:4600])

#getting data from other classifier to balance 2nd category
df_2=df_2.append(df_2[:2000])
df_2=df_2.append(df_0[:1000])
df_2=df_2.append(df_1[:1000])


# Converting all the values to a binary classifier

In [291]:
df_0['class']=df_0['class'].apply(lambda x: 1 if x==0 else 0)
df_1['class']=df_1['class'].apply(lambda x: 1 if x==1 else 0)
df_2['class']=df_2['class'].apply(lambda x: 1 if x==2 else 0)

# Splitting the data into train and test

In [292]:
from sklearn.model_selection import train_test_split
Train_X0, Test_X0, Train_Y0, Test_Y0 = train_test_split(df_0['tweet'],df_0['class'],test_size=0.3)
Train_X1, Test_X1, Train_Y1, Test_Y1 = train_test_split(df_1['tweet'],df_1['class'],test_size=0.3)
Train_X2, Test_X2, Train_Y2, Test_Y2 = train_test_split(df_2['tweet'],df_2['class'],test_size=0.3)

# Vectoriser and building the model

In [293]:
def vector(df):
    Tfidf_vect = TfidfVectorizer(max_features=500)
    Tfidf_vect.fit(df['tweet'])
    return Tfidf_vect

def pass_the_value(Tfidf_vect,Train_X,Test_X,Train_Y,Test_Y):
    Train_X_Tfidf = Tfidf_vect.transform(Train_X)
    Test_X_Tfidf = Tfidf_vect.transform(Test_X)
    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto',probability=True)
    SVM.fit(Train_X_Tfidf,Train_Y)
    predictions_SVM = SVM.predict(Test_X_Tfidf)
    print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
    return SVM

In [294]:
Tfidf_vect_cat_0=vector(df_0)
svm_cat_0=pass_the_value(Tfidf_vect,Train_X0, Test_X0, Train_Y0, Test_Y0)


Tfidf_vect_cat_1=vector(df_1)
svm_cat_1=pass_the_value(Tfidf_vect_cat_1,Train_X1, Test_X1, Train_Y1, Test_Y1)

Tfidf_vect_cat_2=vector(df_2)
svm_cat_2=pass_the_value(Tfidf_vect_cat_2,Train_X2, Test_X2, Train_Y2, Test_Y2)

SVM Accuracy Score ->  83.68298368298368
SVM Accuracy Score ->  90.78681909885677
SVM Accuracy Score ->  93.56469002695418


In [301]:
text=["i hate black people"]
Tfidf_vect_cat_0 = Tfidf_vect.transform(text)
Tfidf_vect_cat_1 = Tfidf_vect.transform(text)
Tfidf_vect_cat_2 = Tfidf_vect.transform(text)

predictions_0 = svm_cat_0.predict_proba(Tfidf_vect_cat_0)[0]
predictions_1 = svm_cat_1.predict_proba(Tfidf_vect_cat_1)[0]
predictions_2 = svm_cat_2.predict_proba(Tfidf_vect_cat_2)[0]

predictions_0

array([0.15053589, 0.84946411])

In [302]:
predictions_1

array([0.62949235, 0.37050765])

In [303]:
if prediction_0

array([0.08491395, 0.91508605])