In [83]:
#Importing the required Libraries

In [109]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
#unidecode will be used to remove Greek Characters
from unidecode import unidecode

In [169]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [137]:
df = pd.read_csv('data/train_E6oV3lV.csv')

In [138]:
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


## Preprocessing

In [139]:
#Removing @ symbol
df['clean_tweet'] = df['tweet'].apply(lambda x : ' '.join([tweet for tweet in x.split()if not tweet.startswith("@")]))

In [140]:
df.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause they...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ur...
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation


In [141]:
#Removing Numbers in the clean_tweet column
df['clean_tweet'] = df['clean_tweet'].apply(lambda x : ' '.join([tweet for tweet in x.split()if not tweet == '\d*']))

In [142]:
df.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause they...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ur...
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation


In [143]:
#Removing Greek Letters
df['clean_tweet'] = df['clean_tweet'].apply(lambda x : ' '.join([unidecode(word) for word in x.split()]))

In [144]:
df.head(10)

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause they...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ur...
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow dannya|
7,8,0,the next school year is the year for exams.ð...,the next school year is the year for exams.d- ...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome here ! i'm it's so #gr8 !


In [145]:
#Removing hmm nd it's variants
df['clean_tweet'] = df['clean_tweet'].apply(lambda x : ' '.join([tweet for tweet in x.split()if not tweet == '(hm)+']))

In [146]:
#Creating a dictionary of slang words, can use a better source if available.
slang_dict = {'bihday':'birthday','gr8':'great','luv':'love','wud':'would','lyk':'like','wateva':'whatever','ttyl':'talk to you later',
               'kul':'cool','fyn':'fine','omg':'oh my god!','fam':'family','bruh':'brother',
               'cud':'could','fud':'food'}

In [147]:
#Removing slangs 
df['clean_tweet'] = df['clean_tweet'].apply(lambda x : ' '.join([slang_dict[word] if word in slang_dict else word for word in x.split()]))

In [148]:
df.head(10)

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for #lyft credit i can't use cause they...
2,3,0,bihday your majesty,birthday your majesty
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ur...
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow dannya|
7,8,0,the next school year is the year for exams.ð...,the next school year is the year for exams.d- ...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome here ! i'm it's so #gr8 !


In [149]:
#separating hashtags from the tweets
df['hashtags'] = df['clean_tweet'].apply(lambda x: ''.join([word for word in x.split() if word.startswith('#')]))
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: ''.join([(word+' ') for word in x.split() if not word.startswith('#')]))

In [150]:
hashtag = pd.DataFrame(df['hashtags'])

In [151]:
hashtag[hashtag['hashtags'] == ''] = 'No Hashtag'

In [152]:
hashtag.head()

Unnamed: 0,hashtags
0,#run
1,#lyft#disapointed#getthanked
2,No Hashtag
3,#model
4,#motivation


In [153]:
df['hashtag'] = hashtag['hashtags']

In [154]:
df.drop('hashtags',axis=1,inplace=True)

In [155]:
df.head()

Unnamed: 0,id,label,tweet,clean_tweet,hashtag
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...,#run
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i can't use cause they don't...,#lyft#disapointed#getthanked
2,3,0,bihday your majesty,birthday your majesty,No Hashtag
3,4,0,#model i love u take with u all the time in ...,i love u take with u all the time in urd+-!!! ...,#model
4,5,0,factsguide: society now #motivation,factsguide: society now,#motivation


In [156]:
#Removing stopwords
df['clean_tweet'] = df['clean_tweet'].apply(lambda x : ' '.join([word for word in x.split() if not word in set(stopwords.words('english'))]))

In [157]:
#This resource is required for WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [158]:
#Lemmitization
lemmatizer = WordNetLemmatizer()
df['clean_tweet'] = df['clean_tweet'].apply(lambda x : ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

In [159]:
#Stemming
ps = PorterStemmer()
df['clean_tweet'] = df['clean_tweet'].apply(lambda x : ' '.join([ps.stem(word) for word in x.split()]))

In [160]:
df.head(10)

Unnamed: 0,id,label,tweet,clean_tweet,hashtag
0,1,0,@user when a father is dysfunctional and is s...,father dysfunct selfish drag kid dysfunction.,#run
1,2,0,@user @user thanks for #lyft credit i can't us...,thank credit can't use caus offer wheelchair v...,#lyft#disapointed#getthanked
2,3,0,bihday your majesty,birthday majesti,No Hashtag
3,4,0,#model i love u take with u all the time in ...,love u take u time urd+-!!! dddd d|d|d|,#model
4,5,0,factsguide: society now #motivation,factsguide: societi,#motivation
5,6,0,[2/2] huge fan fare and big talking before the...,[2/2] huge fan fare big talk leave. chao pay d...,#allshowandnogo
6,7,0,@user camping tomorrow @user @user @user @use...,camp tomorrow dannya|,No Hashtag
7,8,0,the next school year is the year for exams.ð...,next school year year exams.d- can't think,#school#exams#hate#imagine#actorslife#revoluti...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,won!!! love land!!! a|,#allin#cavs#champions#cleveland#clevelandcaval...
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcom ! i'm !,#gr8


In [161]:
#Creating a corpus of tweets
#Tokenization
corpus = []
for i in range(0,31962):
    tweet = df['clean_tweet'][i]
    tweet = tweet.lower()
    tweet = tweet.split()
    tweet = [ps.stem(word) for word in tweet if not word in set(stopwords.words('english'))]
    tweet = ' '.join(tweet)
    corpus.append(tweet)

In [162]:
corpus[0]

'father dysfunct selfish drag kid dysfunction.'

In [163]:
df['clean_tweet'][0]

'father dysfunct selfish drag kid dysfunction.'

In [185]:
#Techniques to convert the tweets into Bag-of-Words, TF-IDF, and Word Embeddings
#Building various classifiers: -
#TF-IDF approach
X_train, X_test, y_train, y_test = train_test_split(df['clean_tweet'], df['label'], test_size = 0.3, random_state=101, shuffle = True, stratify=df['label'])

## Using Random Forest Classifier

In [180]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,confusion_matrix

In [190]:
rf_pipe = Pipeline([('tfidf',TfidfVectorizer()),('randomforest',RandomForestClassifier())])

In [191]:
rf_pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

In [193]:
rf_pred = rf_pipe.predict(X_test)

In [195]:
print(classification_report(y_test,rf_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      8916
           1       0.81      0.42      0.56       673

    accuracy                           0.95      9589
   macro avg       0.88      0.71      0.77      9589
weighted avg       0.95      0.95      0.95      9589



In [196]:
df['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [202]:
# As we can see that there is a huge difference in the dataset of 0 and 1 so we would be atleast creating a trainset which would have equal composition of the data by downgrading the number of samples of Non-Hate tweets.
#We would only sample the training set and remain testing set as it is as that is the general distribution seen by the classifier

In [205]:
X_train

26879                                      duck fes. dd a|
10123    95 days, 16 hours, 34 minut &amp; 38 second ma...
3991     yup. he' former pm, 20% nation think he' bee' ...
22355    0 day loui ledra beach hotel, papho region, it...
19287    see someth surpris fuck outta still shock d+-d...
                               ...                        
30215    sex blond free onlin white girlvideo porn blac...
29375    n seen, time time again, least 90% success der...
9438                                   content w/ life atm
12991                              deep sorrow. save drown
15473                                  joy first time snow
Name: clean_tweet, Length: 22373, dtype: object

In [206]:
y_train

26879    0
10123    0
3991     0
22355    0
19287    0
        ..
30215    0
29375    0
9438     0
12991    0
15473    0
Name: label, Length: 22373, dtype: int64

In [207]:
train_df = pd.DataFrame(X_train)

In [209]:
train_df['label'] = y_train

In [211]:
train_df.label.value_counts()

0    20804
1     1569
Name: label, dtype: int64

In [212]:
hate = train_df[train_df['label'] == 1]

In [218]:
hate.shape[0]

1569

In [216]:
nonhate = train_df[train_df['label'] == 0]

In [217]:
nonhate.shape

(20804, 2)

In [219]:
nonhate_samp = nonhate.sample(n = hate.shape[0])

In [220]:
nonhate_samp.shape

(1569, 2)

In [225]:
ds = pd.concat([hate,nonhate_samp]).sample(frac = 1)

In [226]:
ds.label.value_counts()

1    1569
0    1569
Name: label, dtype: int64

In [229]:
ds.head(10)

Unnamed: 0,clean_tweet,label
22934,live! swiftli sharpen fang tale a|,1
13987,knee-jerk reaction assum ait must isis.a a|,1
8374,far blog comment claim senad lulic statement r...,1
30545,vandalis condemn act,1
31863,refus veto resolut,1
10558,finmin aso: close watch uk referendum exit eu,0
28832,nation news-channel play old youtub video girl...,0
18408,enjoy everi moment life dd,0
28753,time spend 3/4 feel posit at...,0
7556,"4 beat ? no, 3 bp might make 2 cou aliv serv l...",1


In [230]:
#Let us see what effect does it have on the result

In [241]:
X_train_samp =ds.drop('label',axis=1)

In [246]:
X_train_samp = X_train_samp.iloc[:,0]

In [237]:
y_train_samp = ds.label

In [239]:
type(y_train_samp)

pandas.core.series.Series

In [247]:
type(X_train_samp)

pandas.core.series.Series

In [248]:
rf_pipe_samp = Pipeline([('tfidf',TfidfVectorizer()),('randomforest',RandomForestClassifier())])

In [249]:
rf_pipe_samp.fit(X_train_samp,y_train_samp)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

In [251]:
rf_pred_samp =rf_pipe_samp.predict(X_test)

In [252]:
print(classification_report(y_test,rf_pred_samp))

              precision    recall  f1-score   support

           0       0.98      0.85      0.91      8916
           1       0.28      0.74      0.40       673

    accuracy                           0.85      9589
   macro avg       0.63      0.79      0.66      9589
weighted avg       0.93      0.85      0.88      9589



In [253]:
print(classification_report(y_test,rf_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      8916
           1       0.81      0.42      0.56       673

    accuracy                           0.95      9589
   macro avg       0.88      0.71      0.77      9589
weighted avg       0.95      0.95      0.95      9589



In [254]:
#Atleast in the case of Random Forest Classifier our downsampling didnt work as we had planned. Lets try for other prominent classifiers.

In [255]:
ds.to_csv('data/train_samp_data.csv')

## Logistic Regression

In [256]:
from sklearn.linear_model import LogisticRegression

In [259]:
lr_pipe = Pipeline([('tfidf',TfidfVectorizer()),('log_reg',LogisticRegression())])

In [260]:
#First we would be fitting it to the normal Dataset with a division of about 93/7

In [261]:
lr_pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('log_reg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_sca

In [262]:
lr_pred = lr_pipe.predict(X_test)

In [263]:
print(classification_report(y_test,lr_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      8916
           1       0.88      0.22      0.35       673

    accuracy                           0.94      9589
   macro avg       0.91      0.61      0.66      9589
weighted avg       0.94      0.94      0.93      9589



In [264]:
#Now we would be using the downggraded dataset

In [265]:
lr_pipe_samp = Pipeline([('tfidf',TfidfVectorizer()),('log_reg',LogisticRegression())])

In [266]:
lr_pipe_samp.fit(X_train_samp,y_train_samp)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('log_reg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_sca

In [267]:
lr_pred_samp = lr_pipe_samp.predict(X_test)

In [268]:
print(classification_report(y_test,lr_pred_samp))

              precision    recall  f1-score   support

           0       0.98      0.83      0.90      8916
           1       0.25      0.76      0.38       673

    accuracy                           0.83      9589
   macro avg       0.62      0.80      0.64      9589
weighted avg       0.93      0.83      0.86      9589



## Naive Bayes

In [269]:
from sklearn.naive_bayes import MultinomialNB

In [270]:
nb = Pipeline([('tfidf',TfidfVectorizer()),('nb',MultinomialNB())])

In [271]:
nb.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [272]:
nb_pred = nb.predict(X_test)

In [273]:
print(classification_report(y_test,nb_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.97      8916
           1       1.00      0.07      0.13       673

    accuracy                           0.93      9589
   macro avg       0.97      0.53      0.55      9589
weighted avg       0.94      0.93      0.91      9589



In [274]:
#We will now use the sampled dataset that we created

In [275]:
nb_samp =  Pipeline([('tfidf',TfidfVectorizer()),('nb',MultinomialNB())])

In [276]:
nb_samp.fit(X_train_samp,y_train_samp)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [277]:
nb_pred_samp = nb.predict(X_test)

In [278]:
print(classification_report(y_test,nb_pred_samp))

              precision    recall  f1-score   support

           0       0.93      1.00      0.97      8916
           1       1.00      0.07      0.13       673

    accuracy                           0.93      9589
   macro avg       0.97      0.53      0.55      9589
weighted avg       0.94      0.93      0.91      9589

