**Loading the dataset**

In [1]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive


In [2]:
#go to the location of the dataset on your drive
%cd 'My Drive'

/gdrive/My Drive


In [3]:
%cd FYP

/gdrive/My Drive/FYP


In [4]:
import pandas as pd
tweets_df = pd.read_csv('HOT_dataset1.csv')
tweets_df = tweets_df[['Label', 'Tweet']]
tweets_df.head(100)

Unnamed: 0,Label,Tweet
0,0,@saud5683 @Mutayyab410 @shivang598 @Ranask35 @...
1,1,"Banti hai empowered woman, feminism pe gyan pe..."
2,1,RT @kim_jong_korea: @updatingwait @Acutereply ...
3,1,@InviSibleSold @mabkhan86 @dridadahn Punjab in...
4,1,RT @MrMonsterSaid: Agar koi bole ki ja ke chil...
...,...,...
95,1,Anushka: baby tum mujhe shadi bad kis nam se b...
96,1,@AvijitEmmi Bahenchod .... experienced lagte h...
97,1,@behenchodabhay @sarita_beta @parvatikhan1 @sh...
98,1,Or koi gaali bhi nahi dega bahenchod\xf0\x9f\x...


# **Data Preprocessing** 

**Simple cleaning function to remove all irrelevant html tags, twitter mentions and retweets.**

In [5]:
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer

def tweet_cleaner_updated(text):

    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()

    clean = re.sub(r'@\S+|(https?://[^ ]+)|(RT)|(\\x[a-zA-z0-9]+)|(www.[^ ]+)','',souped)
    clean_lower_case = clean.lower()
    clean_letters_only = re.sub("[^a-zA-Z]", " ", clean_lower_case)
  
    #unnecessary white spaces may have been created due to cleaning
    #Therefore it is necessary to tokenize the string and then join them together to eliminate extra spaces
    tokenizer  = WordPunctTokenizer()
    words = [x for x  in tokenizer.tokenize(clean_letters_only) if len(x) > 1]
    return (" ".join(words)).strip()

In [6]:
# cleaned_tweet is used to store the tweets after they have been cleaned
cleaned_tweet = []
for tweet in tweets_df['Tweet']:
  cleaned_tweet.append(tweet_cleaner_updated(tweet))
print(cleaned_tweet)

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup




In [7]:
cleaned = pd.DataFrame(cleaned_tweet, columns =['Tweet'])  #cleaned is turned to a dataframe
cleaned.head()


Unnamed: 0,Tweet
0,haa jaise tum bhi abhi
1,banti hai empowered woman feminism pe gyan pel...
2,ab usko chhod mjse bat kr tera baap aa gya hai...
3,punjab in madarchodon ko khila raha hai nokria...
4,agar koi bole ki ja ke chill maar to madarchod...


**Function to remove stop words**

In [8]:
def create_stopwords_set(filename):
  stop_words = pd.read_csv(filename) 
  stop_words_list = stop_words.values.tolist()
  new_list =[]
  for stopword in stop_words_list:
    for word in stopword:
      new_list.append(word)
  return set(new_list)

set_of_stopwords = create_stopwords_set("Hinglish StopWords.csv")

In [9]:
cleaned['Tweet'] = cleaned['Tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in set_of_stopwords]))
cleaned.head()

Unnamed: 0,Tweet
0,haa
1,banti empowered woman feminism gyan pelti din ...
2,usko chhod mjse bat kr baap aa gya ldki beech ...
3,punjab madarchodon khila nokrian day imran ma
4,chill maar madarchod gand maar lene


In [10]:
#The Tweet column of the orignal dataset is replaced by the Tweet column of the cleaned dataframe
tweets_df['Tweet'] = cleaned['Tweet'] 
tweets_df

Unnamed: 0,Label,Tweet
0,0,haa
1,1,banti empowered woman feminism gyan pelti din ...
2,1,usko chhod mjse bat kr baap aa gya ldki beech ...
3,1,punjab madarchodon khila nokrian day imran ma
4,1,chill maar madarchod gand maar lene
...,...,...
3151,1,islamic kashmiri jihaadi suar jisk
3152,1,love jihaadi killed
3153,1,madarjaat rand aullad islamic jihaadi
3154,1,jihaadi kutte double maut aane


In [11]:
tweets_df.to_csv('Cleaned Tweets.csv')

In [12]:
tweets_df.dropna(inplace = True)
tweets_df

Unnamed: 0,Label,Tweet
0,0,haa
1,1,banti empowered woman feminism gyan pelti din ...
2,1,usko chhod mjse bat kr baap aa gya ldki beech ...
3,1,punjab madarchodon khila nokrian day imran ma
4,1,chill maar madarchod gand maar lene
...,...,...
3151,1,islamic kashmiri jihaadi suar jisk
3152,1,love jihaadi killed
3153,1,madarjaat rand aullad islamic jihaadi
3154,1,jihaadi kutte double maut aane


## **EXTRACTION OF FEATURES FROM THE DATA SET**

Splitting the clean Dataset into Train and Test in 80:20 ratio

In [13]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(tweets_df, test_size=0.2)

In [14]:
#save the test set
test.to_csv("test.csv")

**Bag of word features**

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer()
train_bow = bow_vectorizer.fit_transform(train['Tweet'])

In [16]:
test_bow = bow_vectorizer.transform(test['Tweet'])

**TF-IDF Features**

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
# TF-IDF feature matrix
train_tfidf = tfidf_vectorizer.fit_transform(train['Tweet'])

In [18]:
test_tfidf = tfidf_vectorizer.transform(test['Tweet'])

# **Model Building**

splitting data into training and validation set

In [19]:
#splitting the bow features
xtrain_bow, xvalid_bow, ytrain_bow, yvalid_bow = train_test_split(train_bow, train['Label'], random_state=1000, test_size=0.1)

In [20]:
#splitting the tfidf features
xtrain_tfidf, xvalid_tfidf, ytrain_tfidf, yvalid_tfidf = train_test_split(train_tfidf, train['Label'], random_state=1000, test_size = 0.1)

**1. Logistic Regression**

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score

import numpy as np

**Using Bag-of-Words features**

In [22]:
logistic_classifier_bow = LogisticRegression()
logistic_classifier_bow.fit(xtrain_bow, ytrain_bow)

#predicting on the validation set
prediction_bow = logistic_classifier_bow.predict_proba(xvalid_bow)
prediction_bow_int = prediction_bow[:,1] >= 0.6
prediction_bow_int = prediction_bow_int.astype(np.int)

Printing out the metrics of performance measure for the Bag of Word features

In [23]:
print("F1 Score= ", f1_score(yvalid_bow, prediction_bow_int)) # calculating f1 score
print(confusion_matrix(yvalid_bow, prediction_bow_int))
print("Accuracy Score= ", accuracy_score(yvalid_bow, prediction_bow_int))
print("Precision Score= ", precision_score(yvalid_bow, prediction_bow_int))
print("Recall Score= ", recall_score(yvalid_bow, prediction_bow_int))

F1 Score=  0.9006622516556292
[[ 87   8]
 [ 22 136]]
Accuracy Score=  0.8814229249011858
Precision Score=  0.9444444444444444
Recall Score=  0.8607594936708861


Testing the performance of the BoW Logistic Regression model on the Test set

In [24]:
test_bow_prediction = logistic_classifier_bow.predict_proba(test_bow)
test_bow_prediction_int = test_bow_prediction[:, 1] >= 0.6
test_bow_prediction_int = test_bow_prediction_int.astype(np.int)

In [25]:
print("F1 Score= ", f1_score(test['Label'], test_bow_prediction_int)) 
print(confusion_matrix(test['Label'], test_bow_prediction_int))
print("Accuracy Score= ", accuracy_score(test['Label'], test_bow_prediction_int))
print("Precision Score= ", precision_score(test['Label'], test_bow_prediction_int))
print("Recall Score= ", recall_score(test['Label'],test_bow_prediction_int))

F1 Score=  0.9174757281553398
[[186  19]
 [ 49 378]]
Accuracy Score=  0.8924050632911392
Precision Score=  0.9521410579345088
Recall Score=  0.8852459016393442


**Using TF-IDF Features**

In [26]:
logistic_classifier_tfidf = LogisticRegression()
logistic_classifier_tfidf.fit(xtrain_tfidf, ytrain_tfidf)

#predicting on the validation set
prediction_tfidf = logistic_classifier_tfidf.predict_proba(xvalid_bow)
prediction_tfidf_int = prediction_tfidf[:, 1] >= 0.6
prediction_tfidf_int = prediction_tfidf_int.astype(np.int)

Printing out the metrics of performance measure for the TF-IDF features

In [27]:
print("F1 Score= ", f1_score(yvalid_tfidf, prediction_tfidf_int))
print(confusion_matrix(yvalid_tfidf, prediction_tfidf_int))
print("Accuracy Score= ", accuracy_score(yvalid_tfidf, prediction_tfidf_int))
print("Precision Score= ", precision_score(yvalid_tfidf, prediction_tfidf_int))
print("Recall Score= ", recall_score(yvalid_tfidf, prediction_tfidf_int))

F1 Score=  0.910828025477707
[[ 82  13]
 [ 15 143]]
Accuracy Score=  0.8893280632411067
Precision Score=  0.9166666666666666
Recall Score=  0.9050632911392406


Testing the performance of the TF-IDF Logistic Regression Model on the test set

In [28]:
test_tfidf_prediction = logistic_classifier_tfidf.predict_proba(test_tfidf)
test_tfidf_prediction_int = test_tfidf_prediction[:, 1] >= 0.6
test_tfidf_prediction_int = test_tfidf_prediction_int.astype(np.int)

In [29]:
print("F1 Score= ", f1_score(test['Label'], test_tfidf_prediction_int))
print(confusion_matrix(test['Label'], test_tfidf_prediction_int))
print("Accuracy Score= ", accuracy_score(test['Label'], test_tfidf_prediction_int))
print("Precision Score= ", precision_score(test['Label'], test_tfidf_prediction_int))
print("Recall Score= ", recall_score(test['Label'], test_tfidf_prediction_int))

F1 Score=  0.923444976076555
[[182  23]
 [ 41 386]]
Accuracy Score=  0.8987341772151899
Precision Score=  0.9437652811735942
Recall Score=  0.9039812646370023


**2. Support Vector Machine**

In [30]:
from sklearn import svm

Using Bag of Words Features

In [31]:
svc_bow = svm.SVC(kernel='linear', C=1, probability=True).fit(xtrain_bow, ytrain_bow)

prediction_svc = svc_bow.predict_proba(xvalid_bow)
prediction_svc_int = prediction_svc[:,1] >= 0.6
prediction_svc_int = prediction_svc_int.astype(np.int)

Printing out the metrics of performance measure for the BoW features

In [32]:
print("F1 Score= ", f1_score(yvalid_bow, prediction_svc_int))
print(confusion_matrix(yvalid_bow, prediction_svc_int))
print("Accuracy Score= ", accuracy_score(yvalid_bow, prediction_svc_int))
print("Precision Score= ", precision_score(yvalid_bow, prediction_svc_int))
print("Recall Score= ", recall_score(yvalid_bow, prediction_svc_int))

F1 Score=  0.926517571884984
[[ 85  10]
 [ 13 145]]
Accuracy Score=  0.9090909090909091
Precision Score=  0.9354838709677419
Recall Score=  0.9177215189873418


Testing the performance of BoW SVM

In [33]:
test_bow_prediction_SVM = svc_bow.predict_proba(test_bow)
test_bow_prediction_SVM_int = test_bow_prediction_SVM[:, 1] >= 0.6
test_bow_prediction_SVM_int = test_bow_prediction_SVM_int.astype(np.int)

In [34]:
print("F1 Score= ", f1_score(test['Label'], test_bow_prediction_SVM_int))
print(confusion_matrix(test['Label'], test_bow_prediction_SVM_int))
print("Accuracy Score= ", accuracy_score(test['Label'], test_bow_prediction_SVM_int))
print("Precision Score= ", precision_score(test['Label'], test_bow_prediction_SVM_int))
print("Recall Score= ", recall_score(test['Label'], test_bow_prediction_SVM_int))

F1 Score=  0.9182692307692307
[[182  23]
 [ 45 382]]
Accuracy Score=  0.8924050632911392
Precision Score=  0.9432098765432099
Recall Score=  0.8946135831381733


Using TF-IDF Features

In [35]:
svc_tfidf = svm.SVC(kernel='linear', C=1, probability=True).fit(xtrain_tfidf, ytrain_tfidf)
prediction_tfidf_svc = svc_tfidf.predict_proba(xvalid_tfidf)
prediction_tfidf_svc_int = prediction_tfidf_svc[:,1] >= 0.6
prediction_tfidf_svc_int = prediction_tfidf_svc_int.astype(np.int)

Printing out the metrics of performance measure for the TFIDF features

In [36]:
print("F1 Score= ", f1_score(yvalid_tfidf, prediction_tfidf_svc_int))
print(confusion_matrix(yvalid_tfidf, prediction_tfidf_svc_int))
print("Accuracy Score= ", accuracy_score(yvalid_tfidf, prediction_tfidf_svc_int))
print("Precision Score= ", precision_score(yvalid_tfidf, prediction_tfidf_svc_int))
print("Recall Score= ", recall_score(yvalid_tfidf, prediction_tfidf_svc_int))

F1 Score=  0.9032258064516129
[[ 83  12]
 [ 18 140]]
Accuracy Score=  0.8814229249011858
Precision Score=  0.9210526315789473
Recall Score=  0.8860759493670886


Testing the performance of TF-IDF SVM

In [37]:
test_tfidf_prediction_SVM = svc_tfidf.predict_proba(test_tfidf)
test_tfidf_prediction_SVM_int = test_tfidf_prediction_SVM[:, 1] >= 0.6
test_tfidf_prediction_SVM_int = test_tfidf_prediction_SVM_int.astype(np.int)

In [38]:
print("F1 Score= ", f1_score(test['Label'], test_tfidf_prediction_SVM_int))
print(confusion_matrix(test['Label'], test_tfidf_prediction_SVM_int))
print("Accuracy Score= ", accuracy_score(test['Label'], test_tfidf_prediction_SVM_int))
print("Precision Score= ", precision_score(test['Label'], test_tfidf_prediction_SVM_int))
print("Recall Score= ", recall_score(test['Label'], test_tfidf_prediction_SVM_int))

F1 Score=  0.9140164899882214
[[171  34]
 [ 39 388]]
Accuracy Score=  0.884493670886076
Precision Score=  0.919431279620853
Recall Score=  0.9086651053864169


**3. Random Forrest Classifier**

In [39]:
from sklearn.ensemble import RandomForestClassifier

Using BoW features

In [40]:
rf = RandomForestClassifier(n_estimators=50, random_state=11).fit(xtrain_bow, ytrain_bow)

prediction_bow_rf = rf.predict_proba(xvalid_bow)
prediction_bow_rf_int = prediction_bow_rf[:,1] >= 0.6
prediction_bow_rf_int = prediction_bow_rf_int.astype(np.int)

Printing out the metrics of performance measure for the BoW features

In [41]:
print("F1 Score= ", f1_score(yvalid_bow, prediction_bow_rf_int))
print(confusion_matrix(yvalid_bow, prediction_bow_rf_int))
print("Accuracy Score= ", accuracy_score(yvalid_bow, prediction_bow_rf_int))
print("Precision Score= ", precision_score(yvalid_bow, prediction_bow_rf_int))
print("Recall Score= ", recall_score(yvalid_bow, prediction_bow_rf_int))

F1 Score=  0.9133333333333333
[[ 90   5]
 [ 21 137]]
Accuracy Score=  0.8972332015810277
Precision Score=  0.9647887323943662
Recall Score=  0.8670886075949367


Testing the performance of BoW Random Forest Classifier

In [42]:
test_bow_prediction_rf = rf.predict_proba(test_bow)
test_bow_prediction_rf_int = test_bow_prediction_rf[:, 1] >= 0.6
test_bow_prediction_rf_int = test_bow_prediction_rf_int.astype(np.int)

In [43]:
print("F1 Score= ", f1_score(test['Label'], test_bow_prediction_rf_int))
print(confusion_matrix(test['Label'], test_bow_prediction_rf_int))
print("Accuracy Score= ", accuracy_score(test['Label'], test_bow_prediction_rf_int))
print("Precision Score= ", precision_score(test['Label'], test_bow_prediction_rf_int))
print("Recall Score= ", recall_score(test['Label'],test_bow_prediction_rf_int))

F1 Score=  0.8981132075471698
[[194  11]
 [ 70 357]]
Accuracy Score=  0.8718354430379747
Precision Score=  0.970108695652174
Recall Score=  0.8360655737704918


Using TF-IDF features

In [44]:
rf_tfidf = RandomForestClassifier(n_estimators=50, random_state=11).fit(xtrain_tfidf, ytrain_tfidf)

prediction_tfidf_rf = rf_tfidf.predict_proba(xvalid_tfidf)
prediction_tfidf_rf_int = prediction_tfidf_rf[:,1] >= 0.6
prediction_tfidf_rf_int = prediction_tfidf_rf_int.astype(np.int)

Printing out the metrics of performance measure for the TF-IDF features

In [45]:
print("F1 Score= ", f1_score(yvalid_tfidf, prediction_tfidf_rf_int))
print(confusion_matrix(yvalid_tfidf, prediction_tfidf_rf_int))
print("Accuracy Score= ", accuracy_score(yvalid_tfidf, prediction_tfidf_rf_int))
print("Precision Score= ", precision_score(yvalid_tfidf, prediction_tfidf_rf_int))
print("Recall Score= ", recall_score(yvalid_tfidf, prediction_tfidf_rf_int))

F1 Score=  0.9016949152542374
[[ 91   4]
 [ 25 133]]
Accuracy Score=  0.8853754940711462
Precision Score=  0.9708029197080292
Recall Score=  0.8417721518987342


Testing the performance of TF-IDF Random Forest Classifier

In [46]:
test_tfidf_prediction_rf = rf_tfidf.predict_proba(test_bow)
test_tfidf_prediction_rf_int = test_tfidf_prediction_rf[:, 1] >= 0.6
test_tfidf_prediction_rf_int = test_tfidf_prediction_rf_int.astype(np.int)

In [47]:
print("F1 Score= ", f1_score(test['Label'], test_tfidf_prediction_rf_int))
print(confusion_matrix(test['Label'], test_tfidf_prediction_rf_int))
print("Accuracy Score= ", accuracy_score(test['Label'], test_tfidf_prediction_rf_int))
print("Precision Score= ", precision_score(test['Label'], test_tfidf_prediction_rf_int))
print("Recall Score= ", recall_score(test['Label'],test_tfidf_prediction_rf_int))

F1 Score=  0.8902900378310213
[[192  13]
 [ 74 353]]
Accuracy Score=  0.8623417721518988
Precision Score=  0.9644808743169399
Recall Score=  0.8266978922716628
