In [56]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, auc, precision_score, recall_score, f1_score, roc_auc_score, classification_report,balanced_accuracy_score, precision_recall_curve, plot_precision_recall_curve
from utils import *
from sklearn import svm
from sklearn import preprocessing
seed = 42
np.random.seed(seed)
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.preprocessing import sequence
import torch
from torch import optim
import scipy
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Read the Datasets 

### HASOC Dataset 

In [2]:
hasoc = pd.read_csv('data/Cleaning/HASOC_Dataset.csv',on_bad_lines='skip')
hasoc.head()

Unnamed: 0,label;clean_tweet;Hash Words
0,0;murder let u bombard;#murderer
1,0;funni attack doctor happen nonbjp state medi...
2,0;think focu upheld spo man sprit rather glove...
3,0;arrog refus turn c4debat allerg debat hust b...
4,0;minut left almond oneplus7pro avail sbi cash...


In [3]:
hasoc= hasoc.rename(columns={'label;clean_tweet;Hash Words': 'temp'})
hasoc= hasoc.temp.str.split(";",expand=True)
hasoc.columns = ['label', 'text', 'hashtags']

In [4]:
hasoc

Unnamed: 0,label,text,hashtags
0,0,murder let u bombard,#murderer
1,0,funni attack doctor happen nonbjp state medium...,#doctorsfightback #bengalviolence #kerala #doc...
2,0,think focu upheld spo man sprit rather glove s...,#suppodhoni #dhonikeepstheglove
3,0,arrog refus turn c4debat allerg debat hust bor...,#c4debate. #borisjohnsonshouldnotbepm
4,0,minut left almond oneplus7pro avail sbi cash b...,#oneplus7pro #sbi #oneplus7 #oneplus7ishere #o...
...,...,...,...
4637,0,lost polit good momotah hatao doctorsfightback,#doctorsfightback
4638,0,world cup go icc shameonicc,#shameonicc
4639,0,whitehous visit year wethenoh fucktrump,#wethenoh #fucktrump
4640,0,made ok nazi fucktrump hope spend next bihday ...,#fucktrump.


In [5]:
hasoc.label.value_counts()

0    3523
1    1119
Name: label, dtype: int64

### Aristotle Dataset

In [6]:
aris = pd.read_csv('data/Cleaning/Aristotle_Dataset.csv',on_bad_lines='skip')
aris.head()

Unnamed: 0,label;clean_tweet;Hash Words
0,1;found transpond snail exclus shot skypiea ka...
1,0;logic treasuri sign deal even exist provid g...
2,0;result risen isil dateoveral end sta chang b...
3,0;nowplay onair 90 actuel sur live run water s...
4,1;democrat never thisthey wrote law could read...


In [7]:
aris= aris.rename(columns={'label;clean_tweet;Hash Words': 'temp'})
aris= aris.temp.str.split(";",expand=True)
aris.columns = ['label', 'text', 'hashtags']
aris

Unnamed: 0,label,text,hashtags
0,1,found transpond snail exclus shot skypiea kami...,#trecru
1,0,logic treasuri sign deal even exist provid gua...,No hashtags
2,0,result risen isil dateoveral end sta chang big,No hashtags
3,0,nowplay onair 90 actuel sur live run water sme...,#nowplaying #onair #90s
4,1,democrat never thisthey wrote law could read t...,No hashtags
...,...,...,...
7075,0,omg see bigol gt star 4 n share,#bigolive
7076,1,take pleasur insult hindu sad exist one conve ...,No hashtags
7077,0,made remind 45 well promisebreak,#promisebreaker
7078,1,everyth lie one degre anoth real truth even ey...,No hashtags


In [8]:
aris.label.value_counts()

0    5542
1    1538
Name: label, dtype: int64

### Hugging Face Dataset 

In [9]:
hf = pd.read_csv('data/Cleaning/HuggingFace_Dataset.csv',on_bad_lines='skip')
hf.columns

Index(['label;clean_tweet;Hash Words;;'], dtype='object')

In [10]:
hf= hf.rename(columns={'label;clean_tweet;Hash Words;;': 'temp'})
hf= hf.temp.str.split(";",expand=True)
hf.columns = ['label', 'text', 'hashtags','temp1','temp2']
hf.drop(columns=['temp1','temp2'],inplace=True)

In [11]:
hf.label.value_counts()

0    28859
1     2087
Name: label, dtype: int64

### Copenhagen Dataset 

In [12]:
copen = pd.read_csv('data/Cleaning/Copenhagen_Dataset.csv',on_bad_lines='skip')
copen.head()

Unnamed: 0,label;clean_tweet;Hash Words
0,0;new mascot;No hashtags
1,0;cook public safe option good option want peo...
2,0;one said threat seriou howev differ isi game...
3,0;disneyland realli;No hashtags
4,1;vagina tri funni;No hashtags


In [13]:
copen= copen.rename(columns={'label;clean_tweet;Hash Words': 'temp'})
copen= copen.temp.str.split(";",expand=True)
copen.columns = ['label', 'text', 'hashtags']
copen

Unnamed: 0,label,text,hashtags
0,0,new mascot,No hashtags
1,0,cook public safe option good option want peopl...,No hashtags
2,0,one said threat seriou howev differ isi gamerg,No hashtags
3,0,disneyland realli,No hashtags
4,1,vagina tri funni,No hashtags
...,...,...,...
9871,0,go npo never get rich p goal make enough live ...,No hashtags
9872,0,yeah men never talk shit know noth like bih co...,No hashtags
9873,0,someon realli want keep,No hashtags
9874,1,yup,No hashtags


In [14]:
copen.label.value_counts()

0    7210
1    2666
Name: label, dtype: int64

# Train and Test Data Preparation 

### Hugging Face Dataset to Train and Others to Test.

In [15]:
hf_x_train = hf.text
hasoc_x_test = hasoc.text
aris_x_test = aris.text
copen_x_test = copen.text

hf_y_train = hf.label
hasoc_y_test = hasoc.label
aris_y_test = aris.label
copen_y_test = copen.label

### Implement TFIDF for Unshuffled Data

In [16]:
vec = TfidfVectorizer()
tfidf_tr = vec.fit_transform(hf_x_train)
tfidf_hasoc_test = vec.transform(hasoc_x_test)
tfidf_aris_test = vec.transform(aris_x_test)
tfidf_copen_test = vec.transform(copen_x_test)

### Shuffle the Data for Neural Network Model

In [17]:
hf_shuffle = hf.sample(frac=1)
hasoc_shuffle = hasoc.sample(frac=1)
aris_shuffle = aris.sample(frac=1)
copen_shuffle = copen.sample(frac=1)

In [18]:
hf_x_train_shuffle = hf_shuffle.text
hasoc_x_test_shuffle = hasoc_shuffle.text
aris_x_test_shuffle = aris_shuffle.text
copen_x_test_shuffle = copen_shuffle.text

hf_y_train_shuffle = hf_shuffle.label
hasoc_y_test_shuffle = hasoc_shuffle.label
aris_y_test_shuffle = aris_shuffle.label
copen_y_test_shuffle = copen_shuffle.label

### Implement TFIDF for Shuffled Data 

In [19]:
vec = TfidfVectorizer()
tfidf_tr_shuffle = vec.fit_transform(hf_x_train_shuffle)
tfidf_hasoc_test_shuffle = vec.transform(hasoc_x_test_shuffle)
tfidf_aris_test_shuffle = vec.transform(aris_x_test_shuffle)
tfidf_copen_test_shuffle = vec.transform(copen_x_test_shuffle)

# Artifical Neural Network 

In [20]:
hf_y_train_encode = preprocessing.LabelEncoder().fit_transform(hf_y_train_shuffle)
hasoc_y_test_encode = preprocessing. LabelEncoder().fit_transform(hasoc_y_test_shuffle)
aris_y_test_encode = preprocessing. LabelEncoder().fit_transform(aris_y_test_shuffle)
copen_y_test_encode = preprocessing. LabelEncoder().fit_transform(copen_y_test_shuffle)

In [21]:
hf_x_train = scipy.sparse.csr_matrix.todense(tfidf_tr_shuffle)
hasoc_x_test = scipy.sparse.csr_matrix.todense(tfidf_hasoc_test_shuffle)
aris_x_test = scipy.sparse.csr_matrix.todense(tfidf_aris_test_shuffle)
copen_x_test = scipy.sparse.csr_matrix.todense(tfidf_copen_test_shuffle)

In [22]:
# define and fit the model
def get_model(trainX,trainy):
 # define model
    model = tf.keras.Sequential([
                            tf.keras.layers.Dense(128, activation='relu', input_shape=(trainX.shape[1],)),
                            tf.keras.layers.Dropout(0.2),
                            tf.keras.layers.Dense(32, activation='relu'),
                            tf.keras.layers.Dropout(0.2),
                            tf.keras.layers.Dense(128, activation='softmax')])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(trainX, trainy, epochs=10, verbose=2)
    return model

### Use HASOC to Test

In [23]:
ann_model = get_model(hf_x_train,hf_y_train_encode)

2023-03-11 19:44:23.503102: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/10
968/968 - 7s - loss: 0.4044 - accuracy: 0.9423 - 7s/epoch - 7ms/step
Epoch 2/10
968/968 - 7s - loss: 0.0700 - accuracy: 0.9767 - 7s/epoch - 7ms/step
Epoch 3/10
968/968 - 7s - loss: 0.0324 - accuracy: 0.9891 - 7s/epoch - 7ms/step
Epoch 4/10
968/968 - 7s - loss: 0.0158 - accuracy: 0.9949 - 7s/epoch - 7ms/step
Epoch 5/10
968/968 - 7s - loss: 0.0086 - accuracy: 0.9974 - 7s/epoch - 7ms/step
Epoch 6/10
968/968 - 7s - loss: 0.0055 - accuracy: 0.9985 - 7s/epoch - 7ms/step
Epoch 7/10
968/968 - 7s - loss: 0.0032 - accuracy: 0.9990 - 7s/epoch - 7ms/step
Epoch 8/10
968/968 - 7s - loss: 0.0018 - accuracy: 0.9994 - 7s/epoch - 7ms/step
Epoch 9/10
968/968 - 7s - loss: 0.0019 - accuracy: 0.9995 - 7s/epoch - 7ms/step
Epoch 10/10
968/968 - 7s - loss: 0.0011 - accuracy: 0.9996 - 7s/epoch - 7ms/step


In [24]:
y_hasoc_predict = ann_model.predict(hasoc_x_test,verbose=0)
y_predict_hasoc_classes=np.argmax(y_hasoc_predict,axis=1)

In [25]:
ann_Evaluation(hasoc_y_test_encode, y_predict_hasoc_classes)

Accuracy:  0.6809564842740198
F1:  0.1512893982808023
Recall:  0.11796246648793565
Precision:  0.2108626198083067
ROC-AUC:  0.48887053213695675
PR-AUC:  0.237497743744669


### Use Aristotle Dataset to Test 

In [26]:
y_aris_predict = ann_model.predict(aris_x_test,verbose=0)
y_predict_aris_classes=np.argmax(y_aris_predict,axis=1)

In [27]:
ann_Evaluation(aris_y_test_encode, y_predict_aris_classes)

Accuracy:  0.7816384180790961
F1:  0.2262262262262262
Recall:  0.1469440832249675
Precision:  0.49130434782608695
ROC-AUC:  0.5523605295229854
PR-AUC:  0.2575050014390217


### Use Copenhagen Dataset to Test 

In [28]:
y_copen_predict = ann_model.predict(copen_x_test,verbose=0)
y_predict_copen_classes=np.argmax(y_copen_predict,axis=1)

In [29]:
ann_Evaluation(copen_y_test_encode, y_predict_copen_classes)

Accuracy:  0.7562778452814904
F1:  0.3876876112948359
Recall:  0.28582145536384096
Precision:  0.6023715415019762
ROC-AUC:  0.6080286194988415
PR-AUC:  0.36496131414504673


# Random Forest Classifier

In [30]:
rf = RandomForestClassifier(n_estimators=100).fit(tfidf_tr, hf_y_train)

### Use HASOC to Test

In [31]:
hasoc_rf_test = rf.predict(tfidf_hasoc_test)
get_metrics_confusion(tfidf_hasoc_test, hasoc_y_test, hasoc_rf_test, rf)

Accuracy:  0.7397673416630762
F1 Score:  0.0944527736131934
ROC-AUC:  0.5124972192184285
Recall:  0.05630026809651475
Precision:  0.2930232558139535
PR-AUC:  0.2496060623866653


### Use Aristotle Dataset to Test 

In [32]:
aris_rf_test = rf.predict(tfidf_aris_test)
get_metrics_confusion(tfidf_aris_test, aris_y_test, aris_rf_test, rf)

Accuracy:  0.7879943502824859
F1 Score:  0.11861421021726362
ROC-AUC:  0.7088533994337601
Recall:  0.06566970091027308
Precision:  0.6121212121212121
PR-AUC:  0.39234347640983047


### Use Copenhagen Dataset to Test

In [33]:
copen_rf_test = rf.predict(tfidf_copen_test)
get_metrics_confusion(tfidf_copen_test, copen_y_test, copen_rf_test, rf)

Accuracy:  0.7996152288375861
F1 Score:  0.47296937416777624
ROC-AUC:  0.7245990242359481
Recall:  0.33308327081770445
Precision:  0.8154269972451791
PR-AUC:  0.5713141207204333


# Support Vector Classifier

In [34]:
svc = svm.LinearSVC(random_state=42).fit(tfidf_tr, hf_y_train)

### Use HASOC to Test 

In [35]:
hasoc_svc_test = svc.predict(tfidf_hasoc_test)
get_metrics_2(tfidf_hasoc_test, hasoc_y_test, hasoc_svc_test, svc)

Accuracy:  0.7156398104265402
F1:  0.12928759894459105
Recall:  0.08757819481680071
Precision:  0.24685138539042822
ROC-AUC:  0.5037936582706722
PR-AUC:  0.2506970111854758


### Use Aristotle Dataset to Test 

In [36]:
aris_svc_test = svc.predict(tfidf_aris_test)
get_metrics_2(tfidf_aris_test, aris_y_test, aris_svc_test, svc)

Accuracy:  0.7889830508474577
F1:  0.1827133479212254
Recall:  0.10858257477243173
Precision:  0.5758620689655173
ROC-AUC:  0.6810545103263927
PR-AUC:  0.38254934906360244


### Use Copenhagen Dataset to Test 

In [37]:
copen_svc_test = svc.predict(tfidf_copen_test)
get_metrics_2(tfidf_copen_test, copen_y_test, copen_svc_test, svc)

Accuracy:  0.7775415147833131
F1:  0.405090712158137
Recall:  0.2805701425356339
Precision:  0.7283349561830574
ROC-AUC:  0.7178558942787016
PR-AUC:  0.5427381183279067


# Multinomial Naive Bayes

In [40]:
nb = MultinomialNB().fit(tfidf_tr, hf_y_train)

### Use HASOC to Test 

In [41]:
hasoc_nb_test = nb.predict(tfidf_hasoc_test)

In [42]:
get_metrics_confusion(tfidf_hasoc_test, hasoc_y_test, hasoc_nb_test, nb)

Accuracy:  0.7585092632485998
F1 Score:  0.0070859167404783
ROC-AUC:  0.47425941159803436
Recall:  0.0035746201966041107
Precision:  0.4
PR-AUC:  0.2274568832318805


###  Use Aristotle Dataset to Test 

In [43]:
aris_nb_test = nb.predict(tfidf_aris_test)

In [44]:
get_metrics_confusion(tfidf_aris_test, aris_y_test, aris_nb_test, nb)

Accuracy:  0.782909604519774
F1 Score:  0.001299545159194282
ROC-AUC:  0.6346916841201765
Recall:  0.0006501950585175553
Precision:  1.0
PR-AUC:  0.31763151362702025


### Use Copenhagen Dataset to Test 

In [45]:
copen_nb_test = nb.predict(tfidf_copen_test)

In [46]:
get_metrics_confusion(tfidf_copen_test, copen_y_test, copen_nb_test, nb)

Accuracy:  0.7300526528959093
F1 Score:  0.0007496251874062968
ROC-AUC:  0.6282697408055203
Recall:  0.00037509377344336085
Precision:  0.5
PR-AUC:  0.3903570018342335


# Logistic Regression

In [49]:
log = LogisticRegression().fit(tfidf_tr, hf_y_train)

### Use HASOC to Test

In [50]:
hasoc_log_test = log.predict(tfidf_hasoc_test)

In [51]:
get_metrics_confusion(tfidf_hasoc_test, hasoc_y_test, hasoc_log_test, log)

Accuracy:  0.7462300732442912
F1 Score:  0.05152979066022545
ROC-AUC:  0.5182732544999197
Recall:  0.028596961572832886
Precision:  0.2601626016260163
PR-AUC:  0.25332573986045753


### Use Aristotle Dataset to Test 

In [52]:
aris_log_test = log.predict(tfidf_aris_test)

In [53]:
get_metrics_confusion(tfidf_aris_test, aris_y_test, aris_log_test, log)

Accuracy:  0.7877118644067796
F1 Score:  0.08409506398537478
ROC-AUC:  0.7233732687471345
Recall:  0.044863459037711315
Precision:  0.6699029126213593
PR-AUC:  0.434280330515161


### Use Copenhagen Dataset to Test 

In [54]:
copen_log_test = log.predict(tfidf_copen_test)

In [55]:
get_metrics_confusion(tfidf_copen_test, copen_y_test, copen_log_test, log)

Accuracy:  0.743215876873228
F1 Score:  0.1484217595701813
ROC-AUC:  0.714625639766391
Recall:  0.08289572393098274
Precision:  0.7083333333333334
PR-AUC:  0.5018639536322111


# Gradient Boosting

In [57]:
gbc = GradientBoostingClassifier().fit(tfidf_tr, hf_y_train)

### Use HASOC to Test  

In [58]:
hasoc_gbc_test = gbc.predict(tfidf_hasoc_test)

In [59]:
get_metrics_confusion(tfidf_hasoc_test, hasoc_y_test, hasoc_gbc_test,gbc)

Accuracy:  0.7539853511417492
F1 Score:  0.035472972972972965
ROC-AUC:  0.5055505287987505
Recall:  0.01876675603217158
Precision:  0.3230769230769231
PR-AUC:  0.24900747237360282


### Use Aristotle Dataset to Test  

In [60]:
aris_gbc_test = gbc.predict(tfidf_aris_test)

In [61]:
get_metrics_confusion(tfidf_aris_test, aris_y_test, aris_gbc_test,gbc)

Accuracy:  0.7854519774011299
F1 Score:  0.06523076923076923
ROC-AUC:  0.6477778862348708
Recall:  0.03446033810143043
Precision:  0.6091954022988506
PR-AUC:  0.33989571603967006


### Use Copenhagen Dataset to Test  

In [62]:
copen_gbc_test = gbc.predict(tfidf_copen_test)

In [63]:
get_metrics_confusion(tfidf_copen_test, copen_y_test, copen_gbc_test,gbc)

Accuracy:  0.7382543539894694
F1 Score:  0.09710094306671324
ROC-AUC:  0.6933008304087117
Recall:  0.052138034508627154
Precision:  0.7055837563451777
PR-AUC:  0.4763762784731524
