In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, auc, precision_score, recall_score, f1_score, roc_auc_score, classification_report,balanced_accuracy_score, precision_recall_curve, plot_precision_recall_curve
from sklearn import svm
import numpy as np
from sklearn import preprocessing
seed = 42
np.random.seed(seed)
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.preprocessing import sequence
import torch
from torch import optim
import scipy
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression


# Function of Metrics Confusion.

In [2]:
def get_metrics_confusion(y, y_pred):
    
    acc = accuracy_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    rec = recall_score(y, y_pred)
    prec = precision_score(y, y_pred)
    

    print('Accuracy: ', acc)
    print('Recall: ', rec)
    print('Precision: ', prec)
    print('F1 Score: ', f1)

# Read the Datasets 

### HASOC Dataset 

In [3]:
hasoc = pd.read_csv('data/Cleaning/HASOC_Dataset.csv',on_bad_lines='skip')
hasoc.head()

Unnamed: 0,label,clean_tweet,Hash Words
0,0,lulz new meme day opkkk optrump opdomesticterr...,#opkkk #optrump #opdomesticterrorism #fucktrum...
1,1,gandinaaliabus arnob r great job showcas genui...,#gandinaaliabuse
2,0,enough harass still like unwelcom rel leav eve...,#johnmccainday #trumpisatraitor #collusion
3,0,3rd class organ world biggest cup sad see help...,#icc #shameonicc
4,0,hey love india cc dhonikeepstheglov balidaanba...,#dhonikeepstheglove #balidaanbadge #dhonikesaa...


In [4]:
hasoc

Unnamed: 0,label,clean_tweet,Hash Words
0,0,lulz new meme day opkkk optrump opdomesticterr...,#opkkk #optrump #opdomesticterrorism #fucktrum...
1,1,gandinaaliabus arnob r great job showcas genui...,#gandinaaliabuse
2,0,enough harass still like unwelcom rel leav eve...,#johnmccainday #trumpisatraitor #collusion
3,0,3rd class organ world biggest cup sad see help...,#icc #shameonicc
4,0,hey love india cc dhonikeepstheglov balidaanba...,#dhonikeepstheglove #balidaanbadge #dhonikesaa...
...,...,...,...
4536,0,leader free world find need even care mayor ci...,#pathetic #sociopathic #douchebag
4537,0,befit repli section medium look controversi dh...,#dhonikeepstheglove #dhonigloves
4538,0,heard pa clear trumpisatraitor trumplieseveryt...,#trumpisatraitor #trumplieseverytimehespeaks #...
4539,0,celebr putinspuppet desperatedonald trumpisatr...,#putinspuppet #desperatedonald #trumpisatraito...


In [5]:
hasoc.label.value_counts()

0    3419
1    1122
Name: label, dtype: int64

### Aristotle Dataset

In [6]:
aris = pd.read_csv('data/Cleaning/Aristotle_Dataset.csv',on_bad_lines='skip')
aris.head()

Unnamed: 0,label,clean_tweet,Hash Words
0,0,take shot predict bengal first two draft day i...,#bengals
1,0,next week localnewspap shop wednesday afternoo...,#localnewspaper
2,0,final get player dnd soon align go think chaot...,No hashtags
3,0,commit oral write idea goal like honor commitm...,No hashtags
4,1,syrian alli iran blast you missil strike dange...,No hashtags


In [7]:
aris.label.value_counts()

0    5352
1    1538
Name: label, dtype: int64

### Hugging Face Dataset 

In [8]:
hf = pd.read_csv('data/Cleaning/HuggingFace_Dataset.csv',on_bad_lines='skip')
hf.columns

Index(['label', 'clean_tweet', 'Hash Words'], dtype='object')

In [9]:
hf.head()

Unnamed: 0,label,clean_tweet,Hash Words
0,0,spread positiv give someon high five today pos...,#positiveenergy #highfive #tuesdaymotivation
1,0,difficult ask mani languag measur happi unifor...,#languages- #happiness
2,0,pray orlando orlando orlandostrong bestrong pr...,#orlando #orlandostrong #bestrong #pray #belie...
3,0,bought thing kyli cosmet today ddd lovethemal,#lovethemall
4,0,gorgeou modeltoi look wild sexi follow victori...,#model


In [10]:
hf.label.value_counts()

0    25255
1     1676
Name: label, dtype: int64

### Copenhagen Dataset 

In [11]:
copen = pd.read_csv('data/Cleaning/Copenhagen_Dataset.csv',on_bad_lines='skip')
copen.head()

Unnamed: 0,label,clean_tweet,Hash Words
0,0,ok enough instant restaur alreadi,No hashtags
1,0,femal cowork asse woman choos work hooter okay,No hashtags
2,1,call sexist want girl never comment footbal me...,No hashtags
3,0,live long prosper leonard nimoy repo dead 83,No hashtags
4,1,sexist right hate girl,No hashtags


In [12]:
copen.label.value_counts()

0    6698
1    2570
Name: label, dtype: int64

# Train and Test Data Preparation 

### Hugging Face Dataset to Train and Others to Test.

In [13]:
copen_x_train = copen.clean_tweet
hasoc_x_test = hasoc.clean_tweet
aris_x_test = aris.clean_tweet
hf_x_test = hf.clean_tweet

copen_y_train = copen.label
hf_y_test = hf.label
hasoc_y_test = hasoc.label
aris_y_test = aris.label

### Implement TFIDF for Unshuffled Data

In [14]:
vec = TfidfVectorizer()
tfidf_tr = vec.fit_transform(copen_x_train)
tfidf_hasoc_test = vec.transform(hasoc_x_test)
tfidf_aris_test = vec.transform(aris_x_test)
tfidf_hf_test = vec.transform(hf_x_test)

In [15]:
tfidf_tr.shape

(9268, 9293)

In [16]:
first_document_vector=tfidf_hasoc_test[1]

In [17]:
print(first_document_vector)

  (0, 7316)	0.28428519424008153
  (0, 6322)	0.4789268289653885
  (0, 4352)	0.3334796045453805
  (0, 3555)	0.3206900897795523
  (0, 3389)	0.4567291734612506
  (0, 868)	0.516873971691242


In [18]:
feature_names = vec.get_feature_names()
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False).head(45)



Unnamed: 0,tfidf
bastard,0.516874
prime,0.478927
genuin,0.456729
job,0.33348
great,0.32069
show,0.284285
pommi,0.0
pommiesfohewin,0.0
ponder,0.0
poni,0.0


### Shuffle the Data for Neural Network Model

In [19]:
hf_shuffle = hf.sample(frac=1)
hasoc_shuffle = hasoc.sample(frac=1)
aris_shuffle = aris.sample(frac=1)
copen_shuffle = copen.sample(frac=1)

In [20]:
copen_x_train_shuffle = copen_shuffle.clean_tweet
hasoc_x_test_shuffle = hasoc_shuffle.clean_tweet
aris_x_test_shuffle = aris_shuffle.clean_tweet
hf_x_test_shuffle = hf_shuffle.clean_tweet

copen_y_train_shuffle = copen_shuffle.label
hasoc_y_test_shuffle = hasoc_shuffle.label
aris_y_test_shuffle = aris_shuffle.label
hf_y_test_shuffle = hf_shuffle.label

### Implement TFIDF for Shuffled Data 

In [21]:
vec = TfidfVectorizer()
tfidf_tr_shuffle = vec.fit_transform(copen_x_train_shuffle)
tfidf_hasoc_test_shuffle = vec.transform(hasoc_x_test_shuffle)
tfidf_aris_test_shuffle = vec.transform(aris_x_test_shuffle)
tfidf_hf_test_shuffle = vec.transform(hf_x_test_shuffle)

# Artifical Neural Network 

In [22]:
copen_y_train_encode = preprocessing.LabelEncoder().fit_transform(copen_y_train_shuffle)
hasoc_y_test_encode = preprocessing. LabelEncoder().fit_transform(hasoc_y_test_shuffle)
aris_y_test_encode = preprocessing. LabelEncoder().fit_transform(aris_y_test_shuffle)
hf_y_test_encode = preprocessing. LabelEncoder().fit_transform(hf_y_test_shuffle)

In [23]:
copen_x_train = scipy.sparse.csr_matrix.todense(tfidf_tr_shuffle)
hasoc_x_test = scipy.sparse.csr_matrix.todense(tfidf_hasoc_test_shuffle)
aris_x_test = scipy.sparse.csr_matrix.todense(tfidf_aris_test_shuffle)
hf_x_test = scipy.sparse.csr_matrix.todense(tfidf_hf_test_shuffle)

In [24]:
# define and fit the model
def get_model(trainX,trainy):
 # define model
    model = tf.keras.Sequential([
                            tf.keras.layers.Dense(128, activation='relu', input_shape=(trainX.shape[1],)),
                            tf.keras.layers.Dropout(0.2),
                            tf.keras.layers.Dense(32, activation='relu'),
                            tf.keras.layers.Dropout(0.2),
                            tf.keras.layers.Dense(128, activation='softmax')])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(trainX, trainy, epochs=10, verbose=2)
    return model

### Use HASOC to Test

In [25]:
ann_model = get_model(copen_x_train,copen_y_train_encode)

2023-03-19 22:12:41.710196: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/10
290/290 - 1s - loss: 1.3059 - accuracy: 0.7299 - 1s/epoch - 4ms/step
Epoch 2/10
290/290 - 1s - loss: 0.3437 - accuracy: 0.8608 - 812ms/epoch - 3ms/step
Epoch 3/10
290/290 - 1s - loss: 0.2349 - accuracy: 0.9097 - 786ms/epoch - 3ms/step
Epoch 4/10
290/290 - 1s - loss: 0.1710 - accuracy: 0.9376 - 797ms/epoch - 3ms/step
Epoch 5/10
290/290 - 1s - loss: 0.1341 - accuracy: 0.9530 - 788ms/epoch - 3ms/step
Epoch 6/10
290/290 - 1s - loss: 0.1062 - accuracy: 0.9619 - 784ms/epoch - 3ms/step
Epoch 7/10
290/290 - 1s - loss: 0.0872 - accuracy: 0.9698 - 790ms/epoch - 3ms/step
Epoch 8/10
290/290 - 1s - loss: 0.0736 - accuracy: 0.9719 - 783ms/epoch - 3ms/step
Epoch 9/10
290/290 - 1s - loss: 0.0597 - accuracy: 0.9783 - 796ms/epoch - 3ms/step
Epoch 10/10
290/290 - 1s - loss: 0.0543 - accuracy: 0.9815 - 782ms/epoch - 3ms/step


In [26]:
y_hasoc_predict = ann_model.predict(hasoc_x_test,verbose=0)
y_predict_hasoc_classes=np.argmax(y_hasoc_predict,axis=1)

In [27]:
get_metrics_confusion(hasoc_y_test_encode, y_predict_hasoc_classes)

Accuracy:  0.6793657784628936
Recall:  0.18092691622103388
Precision:  0.2743243243243243
F1 Score:  0.2180451127819549


### Use Aristotle Dataset to Test 

In [28]:
y_aris_predict = ann_model.predict(aris_x_test,verbose=0)
y_predict_aris_classes=np.argmax(y_aris_predict,axis=1)

In [29]:
get_metrics_confusion(aris_y_test_encode, y_predict_aris_classes)

Accuracy:  0.7307692307692307
Recall:  0.21261378413524057
Precision:  0.3367662203913491
F1 Score:  0.2606616181745715


### Use Hugging Face Dataset to Test 

In [30]:
y_hf_predict = ann_model.predict(hf_x_test,verbose=0)
y_predict_hf_classes=np.argmax(y_hf_predict,axis=1)

In [31]:
get_metrics_confusion(hf_y_test_encode, y_predict_hf_classes)

Accuracy:  0.8543685715346626
Recall:  0.24105011933174225
Precision:  0.13228552717747216
F1 Score:  0.1708245243128964


# Random Forest Classifier

In [32]:
rf = RandomForestClassifier(n_estimators=100).fit(tfidf_tr, copen_y_train)

### Use HASOC to Test

In [33]:
hasoc_rf_test = rf.predict(tfidf_hasoc_test)
get_metrics_confusion(hasoc_y_test, hasoc_rf_test)

Accuracy:  0.7381634001321294
Recall:  0.0374331550802139
Precision:  0.2781456953642384
F1 Score:  0.06598586017282011


### Use Aristotle Dataset to Test 

In [34]:
aris_rf_test = rf.predict(tfidf_aris_test)
get_metrics_confusion(aris_y_test, aris_rf_test)

Accuracy:  0.7702467343976778
Recall:  0.06501950585175553
Precision:  0.40816326530612246
F1 Score:  0.11217049915872127


### Use Hugging Face Dataset to Test

In [35]:
hf_rf_test = rf.predict(tfidf_hf_test)
get_metrics_confusion(hf_y_test, hf_rf_test)

Accuracy:  0.9141138464966024
Recall:  0.07040572792362769
Precision:  0.13516609392898052
F1 Score:  0.09258532757944292


# Support Vector Classifier

In [36]:
svc = svm.LinearSVC(random_state=42).fit(tfidf_tr, copen_y_train)

### Use HASOC to Test 

In [37]:
hasoc_svc_test = svc.predict(tfidf_hasoc_test)
get_metrics_confusion(hasoc_y_test, hasoc_svc_test)

Accuracy:  0.7093151288262497
Recall:  0.12477718360071301
Precision:  0.2928870292887029
F1 Score:  0.175


### Use Aristotle Dataset to Test 

In [38]:
aris_svc_test = svc.predict(tfidf_aris_test)
get_metrics_confusion(aris_y_test, aris_svc_test)

Accuracy:  0.7516690856313498
Recall:  0.12288686605981794
Precision:  0.3430127041742287
F1 Score:  0.18094782192436573


### Use Hugging Face Dataset to Test 

In [39]:
hf_svc_test = svc.predict(tfidf_hf_test)
get_metrics_confusion(hf_y_test, hf_svc_test)

Accuracy:  0.9030113995024321
Recall:  0.1575178997613365
Precision:  0.18032786885245902
F1 Score:  0.1681528662420382


# Logistic Regression

In [40]:
log = LogisticRegression().fit(tfidf_tr, copen_y_train)

### Use HASOC to Test

In [41]:
hasoc_log_test = log.predict(tfidf_hasoc_test)

In [42]:
get_metrics_confusion(hasoc_y_test, hasoc_log_test)

Accuracy:  0.7485135432724069
Recall:  0.0106951871657754
Precision:  0.2727272727272727
F1 Score:  0.02058319039451115


### Use Aristotle Dataset to Test 

In [43]:
aris_log_test = log.predict(tfidf_aris_test)

In [44]:
get_metrics_confusion(aris_y_test, aris_log_test)

Accuracy:  0.7799709724238026
Recall:  0.03446033810143043
Precision:  0.6309523809523809
F1 Score:  0.06535141800246609


### Use Hugging Face Dataset to Test 

In [45]:
hf_log_test = log.predict(tfidf_hf_test)

In [46]:
get_metrics_confusion(hf_y_test, hf_log_test)

Accuracy:  0.9331253945267536
Recall:  0.04116945107398568
Precision:  0.2623574144486692
F1 Score:  0.07117070654976793


# Gradient Boosting

In [47]:
gbc = GradientBoostingClassifier().fit(tfidf_tr, copen_y_train)

### Use HASOC to Test  

In [48]:
hasoc_gbc_test = gbc.predict(tfidf_hasoc_test)

In [49]:
get_metrics_confusion(hasoc_y_test, hasoc_gbc_test)

Accuracy:  0.7509359171988549
Recall:  0.006238859180035651
Precision:  0.30434782608695654
F1 Score:  0.01222707423580786


### Use Aristotle Dataset to Test  

In [50]:
aris_gbc_test = gbc.predict(tfidf_aris_test)

In [51]:
get_metrics_confusion(aris_y_test, aris_gbc_test)

Accuracy:  0.7769230769230769
Recall:  0.009102730819245773
Precision:  0.5185185185185185
F1 Score:  0.017891373801916934


### Use Hugging Face Dataset to Test  

In [52]:
hf_gbc_test = gbc.predict(tfidf_hf_test)

In [53]:
get_metrics_confusion(hf_y_test, hf_gbc_test)

Accuracy:  0.9362816085551966
Recall:  0.016706443914081145
Precision:  0.2916666666666667
F1 Score:  0.03160270880361174
