In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, auc, precision_score, recall_score, f1_score, roc_auc_score, classification_report,balanced_accuracy_score, precision_recall_curve, plot_precision_recall_curve
from sklearn import svm
import numpy as np
from sklearn import preprocessing
seed = 42
np.random.seed(seed)
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.preprocessing import sequence
import torch
from torch import optim
import scipy
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression


# Function of Metrics Confusion.

In [2]:
def get_metrics_confusion(y, y_pred):
    
    acc = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    rec = recall_score(y, y_pred)
    prec = precision_score(y, y_pred)
    

    print('Accuracy: ', acc)
    print('Recall: ', rec)
    print('Precision: ', prec)
    print('F1 Score: ', f1)

# Read the Datasets 

### HASOC Dataset 

In [3]:
hasoc = pd.read_csv('data/Cleaning/HASOC_Dataset.csv',on_bad_lines='skip')
hasoc.head()

Unnamed: 0,label,clean_tweet,Hash Words
0,0,lulz new meme day opkkk optrump opdomesticterr...,#opkkk #optrump #opdomesticterrorism #fucktrum...
1,1,gandinaaliabus arnob r great job showcas genui...,#gandinaaliabuse
2,0,enough harass still like unwelcom rel leav eve...,#johnmccainday #trumpisatraitor #collusion
3,0,3rd class organ world biggest cup sad see help...,#icc #shameonicc
4,0,hey love india cc dhonikeepstheglov balidaanba...,#dhonikeepstheglove #balidaanbadge #dhonikesaa...


In [4]:
hasoc

Unnamed: 0,label,clean_tweet,Hash Words
0,0,lulz new meme day opkkk optrump opdomesticterr...,#opkkk #optrump #opdomesticterrorism #fucktrum...
1,1,gandinaaliabus arnob r great job showcas genui...,#gandinaaliabuse
2,0,enough harass still like unwelcom rel leav eve...,#johnmccainday #trumpisatraitor #collusion
3,0,3rd class organ world biggest cup sad see help...,#icc #shameonicc
4,0,hey love india cc dhonikeepstheglov balidaanba...,#dhonikeepstheglove #balidaanbadge #dhonikesaa...
...,...,...,...
4536,0,leader free world find need even care mayor ci...,#pathetic #sociopathic #douchebag
4537,0,befit repli section medium look controversi dh...,#dhonikeepstheglove #dhonigloves
4538,0,heard pa clear trumpisatraitor trumplieseveryt...,#trumpisatraitor #trumplieseverytimehespeaks #...
4539,0,celebr putinspuppet desperatedonald trumpisatr...,#putinspuppet #desperatedonald #trumpisatraito...


In [5]:
hasoc.label.value_counts()

0    3419
1    1122
Name: label, dtype: int64

### Aristotle Dataset

In [6]:
aris = pd.read_csv('data/Cleaning/Aristotle_Dataset.csv',on_bad_lines='skip')
aris.head()

Unnamed: 0,label,clean_tweet,Hash Words
0,0,take shot predict bengal first two draft day i...,#bengals
1,0,next week localnewspap shop wednesday afternoo...,#localnewspaper
2,0,final get player dnd soon align go think chaot...,No hashtags
3,0,commit oral write idea goal like honor commitm...,No hashtags
4,1,syrian alli iran blast you missil strike dange...,No hashtags


In [7]:
aris.label.value_counts()

0    5352
1    1538
Name: label, dtype: int64

### Hugging Face Dataset 

In [8]:
hf = pd.read_csv('data/Cleaning/HuggingFace_Dataset.csv',on_bad_lines='skip')
hf.columns

Index(['label', 'clean_tweet', 'Hash Words'], dtype='object')

In [9]:
hf.head()

Unnamed: 0,label,clean_tweet,Hash Words
0,0,spread positiv give someon high five today pos...,#positiveenergy #highfive #tuesdaymotivation
1,0,difficult ask mani languag measur happi unifor...,#languages- #happiness
2,0,pray orlando orlando orlandostrong bestrong pr...,#orlando #orlandostrong #bestrong #pray #belie...
3,0,bought thing kyli cosmet today ddd lovethemal,#lovethemall
4,0,gorgeou modeltoi look wild sexi follow victori...,#model


In [10]:
hf.label.value_counts()

0    25255
1     1676
Name: label, dtype: int64

### Copenhagen Dataset 

In [11]:
copen = pd.read_csv('data/Cleaning/Copenhagen_Dataset.csv',on_bad_lines='skip')
copen.head()

Unnamed: 0,label,clean_tweet,Hash Words
0,0,ok enough instant restaur alreadi,No hashtags
1,0,femal cowork asse woman choos work hooter okay,No hashtags
2,1,call sexist want girl never comment footbal me...,No hashtags
3,0,live long prosper leonard nimoy repo dead 83,No hashtags
4,1,sexist right hate girl,No hashtags


In [12]:
copen.label.value_counts()

0    6698
1    2570
Name: label, dtype: int64

# Train and Test Data Preparation 

### Hugging Face Dataset to Train and Others to Test.

In [13]:
hf_x_train = hf.clean_tweet
hasoc_x_test = hasoc.clean_tweet
aris_x_test = aris.clean_tweet
copen_x_test = copen.clean_tweet

hf_y_train = hf.label
hasoc_y_test = hasoc.label
aris_y_test = aris.label
copen_y_test = copen.label

### Implement TFIDF for Unshuffled Data

In [14]:
vec = TfidfVectorizer()
tfidf_tr = vec.fit_transform(hf_x_train)
tfidf_hasoc_test = vec.transform(hasoc_x_test)
tfidf_aris_test = vec.transform(aris_x_test)
tfidf_copen_test = vec.transform(copen_x_test)

In [15]:
tfidf_tr.shape

(26931, 37214)

In [16]:
first_document_vector=tfidf_hasoc_test[1]

In [17]:
print(first_document_vector)

  (0, 29780)	0.46275361977981866
  (0, 29778)	0.2735459351853603
  (0, 26640)	0.45018801771431777
  (0, 18310)	0.3066054002230288
  (0, 14979)	0.2534897903538734
  (0, 14166)	0.41838888140766856
  (0, 4181)	0.41838888140766856


In [18]:
feature_names = vec.get_feature_names()
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False).head(45)



Unnamed: 0,tfidf
showcas,0.462754
prime,0.450188
genuin,0.418389
bastard,0.418389
job,0.306605
show,0.273546
great,0.25349
ottbik,0.0
otss,0.0
otta,0.0


### Shuffle the Data for Neural Network Model

In [19]:
hf_shuffle = hf.sample(frac=1)
hasoc_shuffle = hasoc.sample(frac=1)
aris_shuffle = aris.sample(frac=1)
copen_shuffle = copen.sample(frac=1)

In [20]:
hf_x_train_shuffle = hf_shuffle.clean_tweet
hasoc_x_test_shuffle = hasoc_shuffle.clean_tweet
aris_x_test_shuffle = aris_shuffle.clean_tweet
copen_x_test_shuffle = copen_shuffle.clean_tweet

hf_y_train_shuffle = hf_shuffle.label
hasoc_y_test_shuffle = hasoc_shuffle.label
aris_y_test_shuffle = aris_shuffle.label
copen_y_test_shuffle = copen_shuffle.label

### Implement TFIDF for Shuffled Data 

In [21]:
vec = TfidfVectorizer()
tfidf_tr_shuffle = vec.fit_transform(hf_x_train_shuffle)
tfidf_hasoc_test_shuffle = vec.transform(hasoc_x_test_shuffle)
tfidf_aris_test_shuffle = vec.transform(aris_x_test_shuffle)
tfidf_copen_test_shuffle = vec.transform(copen_x_test_shuffle)

# Artifical Neural Network 

In [22]:
hf_y_train_encode = preprocessing.LabelEncoder().fit_transform(hf_y_train_shuffle)
hasoc_y_test_encode = preprocessing. LabelEncoder().fit_transform(hasoc_y_test_shuffle)
aris_y_test_encode = preprocessing. LabelEncoder().fit_transform(aris_y_test_shuffle)
copen_y_test_encode = preprocessing. LabelEncoder().fit_transform(copen_y_test_shuffle)

In [23]:
hf_x_train = scipy.sparse.csr_matrix.todense(tfidf_tr_shuffle)
hasoc_x_test = scipy.sparse.csr_matrix.todense(tfidf_hasoc_test_shuffle)
aris_x_test = scipy.sparse.csr_matrix.todense(tfidf_aris_test_shuffle)
copen_x_test = scipy.sparse.csr_matrix.todense(tfidf_copen_test_shuffle)

In [24]:
# define and fit the model
def get_model(trainX,trainy):
 # define model
    model = tf.keras.Sequential([
                            tf.keras.layers.Dense(128, activation='relu', input_shape=(trainX.shape[1],)),
                            tf.keras.layers.Dropout(0.2),
                            tf.keras.layers.Dense(32, activation='relu'),
                            tf.keras.layers.Dropout(0.2),
                            tf.keras.layers.Dense(128, activation='softmax')])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(trainX, trainy, epochs=10, verbose=2)
    return model

### Use HASOC to Test

In [25]:
ann_model = get_model(hf_x_train,hf_y_train_encode)

2023-03-19 22:10:35.069122: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/10
842/842 - 7s - loss: 0.4773 - accuracy: 0.9381 - 7s/epoch - 8ms/step
Epoch 2/10
842/842 - 6s - loss: 0.0920 - accuracy: 0.9679 - 6s/epoch - 7ms/step
Epoch 3/10
842/842 - 6s - loss: 0.0437 - accuracy: 0.9854 - 6s/epoch - 7ms/step
Epoch 4/10
842/842 - 6s - loss: 0.0225 - accuracy: 0.9926 - 6s/epoch - 7ms/step
Epoch 5/10
842/842 - 6s - loss: 0.0111 - accuracy: 0.9969 - 6s/epoch - 7ms/step
Epoch 6/10
842/842 - 6s - loss: 0.0069 - accuracy: 0.9980 - 6s/epoch - 7ms/step
Epoch 7/10
842/842 - 6s - loss: 0.0047 - accuracy: 0.9985 - 6s/epoch - 7ms/step
Epoch 8/10
842/842 - 6s - loss: 0.0031 - accuracy: 0.9990 - 6s/epoch - 7ms/step
Epoch 9/10
842/842 - 6s - loss: 0.0018 - accuracy: 0.9994 - 6s/epoch - 7ms/step
Epoch 10/10
842/842 - 7s - loss: 0.0011 - accuracy: 0.9996 - 7s/epoch - 8ms/step


In [26]:
y_hasoc_predict = ann_model.predict(hasoc_x_test,verbose=0)
y_predict_hasoc_classes=np.argmax(y_hasoc_predict,axis=1)

In [27]:
get_metrics_confusion(hasoc_y_test_encode, y_predict_hasoc_classes)

Accuracy:  0.6798062100858842
Recall:  0.12477718360071301
Precision:  0.22875816993464052
F1 Score:  0.16147635524798154


### Use Aristotle Dataset to Test 

In [28]:
y_aris_predict = ann_model.predict(aris_x_test,verbose=0)
y_predict_aris_classes=np.argmax(y_aris_predict,axis=1)

In [29]:
get_metrics_confusion(aris_y_test_encode, y_predict_aris_classes)

Accuracy:  0.7761973875181423
Recall:  0.12483745123537061
Precision:  0.4948453608247423
F1 Score:  0.19937694704049844


### Use Copenhagen Dataset to Test 

In [30]:
y_copen_predict = ann_model.predict(copen_x_test,verbose=0)
y_predict_copen_classes=np.argmax(y_copen_predict,axis=1)

In [31]:
get_metrics_confusion(copen_y_test_encode, y_predict_copen_classes)

Accuracy:  0.7556107034958999
Recall:  0.3007782101167315
Precision:  0.6228847703464948
F1 Score:  0.4056678037260561


# Random Forest Classifier

In [32]:
rf = RandomForestClassifier(n_estimators=100).fit(tfidf_tr, hf_y_train)

### Use HASOC to Test

In [33]:
hasoc_rf_test = rf.predict(tfidf_hasoc_test)
get_metrics_confusion(hasoc_y_test, hasoc_rf_test)

Accuracy:  0.7346399471482052
Recall:  0.045454545454545456
Precision:  0.2756756756756757
F1 Score:  0.07804131599081868


### Use Aristotle Dataset to Test 

In [34]:
aris_rf_test = rf.predict(tfidf_aris_test)
get_metrics_confusion(aris_y_test, aris_rf_test)

Accuracy:  0.781422351233672
Recall:  0.053966189856957086
Precision:  0.6194029850746269
F1 Score:  0.0992822966507177


### Use Copenhagen Dataset to Test

In [35]:
copen_rf_test = rf.predict(tfidf_copen_test)
get_metrics_confusion(copen_y_test, copen_rf_test)

Accuracy:  0.8000647388864911
Recall:  0.35175097276264594
Precision:  0.8285976168652612
F1 Score:  0.4938541382136029


# Support Vector Classifier

In [36]:
svc = svm.LinearSVC(random_state=42).fit(tfidf_tr, hf_y_train)

### Use HASOC to Test 

In [37]:
hasoc_svc_test = svc.predict(tfidf_hasoc_test)
get_metrics_confusion(hasoc_y_test, hasoc_svc_test)

Accuracy:  0.7181237612860604
Recall:  0.08377896613190731
Precision:  0.27167630057803466
F1 Score:  0.12806539509536785


### Use Aristotle Dataset to Test 

In [38]:
aris_svc_test = svc.predict(tfidf_aris_test)
get_metrics_confusion(aris_y_test, aris_svc_test)

Accuracy:  0.7820029027576197
Recall:  0.09752925877763328
Precision:  0.5681818181818182
F1 Score:  0.16648168701442842


### Use Copenhagen Dataset to Test 

In [39]:
copen_svc_test = svc.predict(tfidf_copen_test)
get_metrics_confusion(copen_y_test, copen_svc_test)

Accuracy:  0.7678031937850669
Recall:  0.26731517509727626
Precision:  0.7186192468619247
F1 Score:  0.38967668746454903


# Logistic Regression

In [40]:
log = LogisticRegression().fit(tfidf_tr, hf_y_train)

### Use HASOC to Test

In [41]:
hasoc_log_test = log.predict(tfidf_hasoc_test)

In [42]:
get_metrics_confusion(hasoc_y_test, hasoc_log_test)

Accuracy:  0.7443294428539969
Recall:  0.020499108734402853
Precision:  0.27058823529411763
F1 Score:  0.03811101905550953


### Use Aristotle Dataset to Test 

In [43]:
aris_log_test = log.predict(tfidf_aris_test)

In [44]:
get_metrics_confusion(aris_y_test, aris_log_test)

Accuracy:  0.7818577648766328
Recall:  0.03966189856957087
Precision:  0.7011494252873564
F1 Score:  0.07507692307692307


### Use Copenhagen Dataset to Test 

In [45]:
copen_log_test = log.predict(tfidf_copen_test)

In [46]:
get_metrics_confusion(copen_y_test, copen_log_test)

Accuracy:  0.7342468709538196
Recall:  0.07042801556420233
Precision:  0.7098039215686275
F1 Score:  0.12814159292035399


# Gradient Boosting

In [47]:
gbc = GradientBoostingClassifier().fit(tfidf_tr, hf_y_train)

### Use HASOC to Test  

In [48]:
hasoc_gbc_test = gbc.predict(tfidf_hasoc_test)

In [49]:
get_metrics_confusion(hasoc_y_test, hasoc_gbc_test)

Accuracy:  0.7511561330103501
Recall:  0.022281639928698752
Precision:  0.43103448275862066
F1 Score:  0.04237288135593221


### Use Aristotle Dataset to Test  

In [50]:
aris_gbc_test = gbc.predict(tfidf_aris_test)

In [51]:
get_metrics_confusion(aris_y_test, aris_gbc_test)

Accuracy:  0.7786647314949202
Recall:  0.0305591677503251
Precision:  0.5802469135802469
F1 Score:  0.058060531192093895


### Use Copenhagen Dataset to Test  

In [52]:
copen_gbc_test = gbc.predict(tfidf_copen_test)

In [53]:
get_metrics_confusion(copen_y_test, copen_gbc_test)

Accuracy:  0.7287440656020716
Recall:  0.04202334630350195
Precision:  0.675
F1 Score:  0.07912087912087913
