In [98]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
import pickle
import warnings
from keras.layers import Input, Dense, Dropout, LSTM, Embedding
from keras.models import Model
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential

In [42]:
warnings.filterwarnings("ignore")

In [7]:
# Read TSV file
train_data = pd.read_csv('train_en.tsv', delimiter='\t')
dev_data = pd.read_csv('dev_en.tsv', delimiter='\t')

In [8]:
# Count the null values in train data
train_data.isnull().sum()

id      0
text    0
HS      0
TR      0
AG      0
dtype: int64

In [9]:
# Count the null values in dev data
dev_data.isnull().sum()

id      0
text    0
HS      0
TR      0
AG      0
dtype: int64

In [10]:
train_data.head()

Unnamed: 0,id,text,HS,TR,AG
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1,0,0
1,202,Why would young fighting age men be the vast m...,1,0,0
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1,0,0
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0,0,0
4,205,Orban in Brussels: European leaders are ignori...,0,0,0


In [11]:
dev_data.head()

Unnamed: 0,id,text,HS,TR,AG
0,18201,I swear I’m getting to places just in the nick...,0,0,0
1,18202,I’m an immigrant — and Trump is right on immig...,0,0,0
2,18203,#IllegalImmigrants #IllegalAliens #ElectoralSy...,1,0,1
3,18204,@DRUDGE_REPORT We have our own invasion issues...,1,0,1
4,18205,Worker Charged With Sexually Molesting Eight C...,0,0,0


In [12]:
# There is one wrong label in AG column of train data
train_data['AG'] = train_data['AG'].replace(['discredit'], '0')

In [13]:
# Change the type of values of AG column in train data to int
train_data[['AG']] = train_data[['AG']].apply(pd.to_numeric)

In [14]:
# Drop the column id
# We don't need it

train_data = train_data.drop('id', axis=1)
dev_data = dev_data.drop('id', axis=1)

### Text Preprocessing

In [15]:
# Load NLTK stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('punkt')
nltk.download('wordnet')

# Load NLTK stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove special characters and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove punctuation and lowercase the tokens
    tokens = [token.lower() for token in tokens if token not in string.punctuation]
    
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stem and lemmatize the tokens
    tokens = [stemmer.stem(lemmatizer.lemmatize(token, pos='v')) for token in tokens]
    
    # Join the tokens back into a string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/chaitanya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/chaitanya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/chaitanya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
train_data['text'] = train_data['text'].apply(preprocess_text)
dev_data['text'] = dev_data['text'].apply(preprocess_text)

In [17]:
# train data after preprocessing text
train_data.head()

Unnamed: 0,text,HS,TR,AG
0,hurray save us mani way potu realdonaldtrump l...,1,0,0
1,would young fight age men vast major one escap...,1,0,0
2,kamalaharri illeg dump kid border like road ki...,1,0,0
3,ny time nearli white state pose array problem ...,0,0,0
4,orban brussel european leader ignor peopl want...,0,0,0


In [18]:
# dev data after preprocessing text
dev_data.head()

Unnamed: 0,text,HS,TR,AG
0,swear get place nick time exhaust sam schulman...,0,0,0
1,immigr trump right immigr http co pldngi fmv m...,0,0,0
2,illegalimmigr illegalalien electoralsystem ele...,1,0,1
3,drudg report invas issu mexican buildthatwal,1,0,1
4,worker charg sexual molest eight children immi...,0,0,0


### Create vectors of text

#### Tfidf vectors

In [19]:
# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the text data
vectorizer.fit(train_data['text'])

# Transform the 'text' column
train_tfidf_vectors = vectorizer.transform(train_data['text'])
dev_tfidf_vectors = vectorizer.transform(dev_data['text'])

#### Word2Vec vectors

In [59]:
# Convert the preprocessed text into a list of words
train_sentences = [text.split() for text in train_data['text']]
dev_sentences = [text.split() for text in dev_data['text']]
all_sentences = train_sentences + dev_sentences

In [60]:
# Train the Word2Vec model on the train data
train_word2vec_model = Word2Vec(train_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Train the Word2Vec model on the dev data
dev_word2vec_model = Word2Vec(all_sentences, vector_size=100, window=5, min_count=1, workers=4)

In [61]:
# train_word2vec_vectors
train_word2vec_vectors = []
for sentence in train_sentences:
    sentence_vector = []
    for word in sentence:
        if word in train_word2vec_model.wv.key_to_index:
            sentence_vector.append(train_word2vec_model.wv[word])
    if len(sentence_vector) > 0:
        sentence_avg = sum(sentence_vector) / len(sentence_vector)
    else:
        sentence_avg = np.zeros(train_word2vec_model.vector_size)
    train_word2vec_vectors.append(sentence_avg)

train_word2vec_vectors = np.array(train_word2vec_vectors)

# dev_word2vec_vectors
dev_word2vec_vectors = []
for sentence in dev_sentences:
    sentence_vector = []
    for word in sentence:
        if word in dev_word2vec_model.wv.key_to_index:
            sentence_vector.append(dev_word2vec_model.wv[word])
    if len(sentence_vector) > 0:
        sentence_avg = sum(sentence_vector) / len(sentence_vector)
    else:
        sentence_avg = np.zeros(dev_word2vec_model.vector_size)
    dev_word2vec_vectors.append(sentence_avg)

dev_word2vec_vectors = np.array(dev_word2vec_vectors)

#### GloVe vectors

In [23]:
glove_file = 'glove.6B.300d.txt'
glove_vectors = {}

with open(glove_file, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_vectors[word] = vector

def get_glove_vectors(text):
    tokens = word_tokenize(text)
    vectors = []
    for token in tokens:
        if token in glove_vectors:
            vectors.append(glove_vectors[token])
    if len(vectors) == 0:
        return np.zeros(300)
    else:
        return sum(vectors) / len(vectors)

train_glove_vectors = []
for text in train_data['text']:
    train_glove_vectors.append(get_glove_vectors(text))
    
dev_glove_vectors = []
for text in dev_data['text']:
    dev_glove_vectors.append(get_glove_vectors(text))
    
train_glove_vectors = np.array(train_glove_vectors)
dev_glove_vectors = np.array(dev_glove_vectors)

#### BERT vectors

In [30]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_vectors(text):
    # Tokenize the text
    tokens = tokenizer.encode(text, add_special_tokens=True)
    # Convert the tokens to PyTorch tensors
    tokens = torch.tensor([tokens])
    # Get the BERT embeddings for the tokens
    with torch.no_grad():
        embeddings = bert_model(tokens)[0]
    # Take the mean of the embeddings to get a single vector for the text
    vector = torch.mean(embeddings, dim=1).squeeze()
    # Convert the vector to a numpy array
    vector = vector.numpy()
    return vector

train_bert_vectors = []
for text in train_data['text']:
    train_bert_vectors.append(get_bert_vectors(text))
    
dev_bert_vectors = []
for text in dev_data['text']:
    dev_bert_vectors.append(get_bert_vectors(text))
    
train_bert_vectors = np.array(train_bert_vectors)
dev_bert_vectors = np.array(dev_bert_vectors)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
train_bert_vectors.shape

(9000, 768)

### Training & Testing

In [65]:
# Get the labels 
train_output = train_data.loc[:, ['HS', 'TR', 'AG']].values
dev_output =  dev_data.loc[:, ['HS', 'TR', 'AG']].values

# Types of embeddings
vector_types = ["Tfidf vectors", "Word2Vec vectors", "GloVe vectors", "BERT vectors"]

# Types of Labels
labels = ['HS', 'TR', 'AG']

train_vectors = {
    "Tfidf vectors" : train_tfidf_vectors,
    "Word2Vec vectors" : train_word2vec_vectors,
    "GloVe vectors" : train_glove_vectors,
    "BERT vectors" : train_bert_vectors
}

dev_vectors = {
    "Tfidf vectors" : dev_tfidf_vectors,
    "Word2Vec vectors" : dev_word2vec_vectors,
    "GloVe vectors" : dev_glove_vectors,
    "BERT vectors" : dev_bert_vectors
}

In [102]:
with open('train_vectors.pkl', 'wb') as f:
    pickle.dump(train_vectors, f)

with open('dev_vectors.pkl', 'wb') as f:
    pickle.dump(dev_vectors, f)

In [80]:
evaluation_results = {
  'model type' : [],
  'vector type' : [],
  'HS F1' : [],
  'TR F1' : [],
  'AG F1' :  [],
  'macro-averaged F1' : []
}

#### Logistic Regression

In [81]:
for vector_type in vector_types:
  # create a logistic regression object
  logreg = LogisticRegression(max_iter=1000)
  # create a multioutput classifier object
  multi_logreg = MultiOutputClassifier(logreg)
  # fit the multioutput classifier on training data
  multi_logreg.fit(train_vectors[vector_type], train_output)
  # predict on test data
  y_pred = multi_logreg.predict(dev_vectors[vector_type])
  # classification report
  report = classification_report(dev_output, y_pred, target_names=labels, zero_division=0, output_dict=True)
  # storing and printing the results
  evaluation_results['model type'].append('Logistic Regression')
  evaluation_results['vector type'].append(vector_type)
  evaluation_results['HS F1'].append(report['HS']['f1-score'])
  evaluation_results['TR F1'].append(report['TR']['f1-score'])
  evaluation_results['AG F1'].append(report['AG']['f1-score'])
  evaluation_results['macro-averaged F1'].append(report['macro avg']['f1-score'])

  print('Model: Logistic Regression')
  print(f'Vectors: {vector_type}')
  print('HS (F1 score):', report['HS']['f1-score'])
  print('TR (F1 score):', report['TR']['f1-score'])
  print('AG (F1 score):', report['AG']['f1-score'])
  print('macro-averaged (F1 score):', report['macro avg']['f1-score'], '\n\n')

Model: Logistic Regression
Vectors: Tfidf vectors
HS (F1 score): 0.6996466431095407
TR (F1 score): 0.5380116959064327
AG (F1 score): 0.3680555555555556
macro-averaged (F1 score): 0.5352379648571763 


Model: Logistic Regression
Vectors: Word2Vec vectors
HS (F1 score): 0.2777777777777778
TR (F1 score): 0.36500000000000005
AG (F1 score): 0.0
macro-averaged (F1 score): 0.2142592592592593 


Model: Logistic Regression
Vectors: GloVe vectors
HS (F1 score): 0.645465253239105
TR (F1 score): 0.43692307692307686
AG (F1 score): 0.22745098039215686
macro-averaged (F1 score): 0.43661310351811294 


Model: Logistic Regression
Vectors: BERT vectors
HS (F1 score): 0.6797235023041475
TR (F1 score): 0.5978260869565217
AG (F1 score): 0.4210526315789474
macro-averaged (F1 score): 0.5662007402798722 




#### Decision Tree

In [82]:
for vector_type in vector_types:
  # create a decision tree classifier object
  tree = DecisionTreeClassifier()
  # create a multioutput classifier object
  multi_tree = MultiOutputClassifier(tree)
  # fit the multioutput classifier on training data
  multi_tree.fit(train_vectors[vector_type], train_output)
  # predict on test data
  y_pred = multi_tree.predict(dev_vectors[vector_type])
  # classification report
  report = classification_report(dev_output, y_pred, target_names=labels, zero_division=0, output_dict=True)
  # storing and printing the results
  evaluation_results['model type'].append('Decision Tree')
  evaluation_results['vector type'].append(vector_type)
  evaluation_results['HS F1'].append(report['HS']['f1-score'])
  evaluation_results['TR F1'].append(report['TR']['f1-score'])
  evaluation_results['AG F1'].append(report['AG']['f1-score'])
  evaluation_results['macro-averaged F1'].append(report['macro avg']['f1-score'])

  print('Model: Decision Tree')
  print(f'Vectors: {vector_type}')
  print('HS (F1 score):', report['HS']['f1-score'])
  print('TR (F1 score):', report['TR']['f1-score'])
  print('AG (F1 score):', report['AG']['f1-score'])
  print('macro-averaged (F1 score):', report['macro avg']['f1-score'], '\n\n')

Model: Decision Tree
Vectors: Tfidf vectors
HS (F1 score): 0.6198156682027651
TR (F1 score): 0.567237163814181
AG (F1 score): 0.4305555555555555
macro-averaged (F1 score): 0.5392027958575005 


Model: Decision Tree
Vectors: Word2Vec vectors
HS (F1 score): 0.5465465465465464
TR (F1 score): 0.2929936305732484
AG (F1 score): 0.2541176470588235
macro-averaged (F1 score): 0.36455260805953943 


Model: Decision Tree
Vectors: GloVe vectors
HS (F1 score): 0.5611353711790393
TR (F1 score): 0.4267352185089974
AG (F1 score): 0.2871046228710462
macro-averaged (F1 score): 0.4249917375196943 


Model: Decision Tree
Vectors: BERT vectors
HS (F1 score): 0.579185520361991
TR (F1 score): 0.41791044776119396
AG (F1 score): 0.35046728971962615
macro-averaged (F1 score): 0.4491877526142704 




#### Random Forest

In [83]:
for vector_type in vector_types:
  # create a random forest classifier object
  forest = RandomForestClassifier()
  # create a multioutput classifier object
  multi_forest = MultiOutputClassifier(forest)
  # fit the multioutput classifier on training data
  multi_forest.fit(train_vectors[vector_type], train_output)
  # predict on test data
  y_pred = multi_forest.predict(dev_vectors[vector_type])
  # classification report
  report = classification_report(dev_output, y_pred, target_names=labels, zero_division=0, output_dict=True)
  # storing and printing the results
  evaluation_results['model type'].append('Random Forest')
  evaluation_results['vector type'].append(vector_type)
  evaluation_results['HS F1'].append(report['HS']['f1-score'])
  evaluation_results['TR F1'].append(report['TR']['f1-score'])
  evaluation_results['AG F1'].append(report['AG']['f1-score'])
  evaluation_results['macro-averaged F1'].append(report['macro avg']['f1-score'])

  print('Model: Random Forest')
  print(f'Vectors: {vector_type}')
  print('HS (F1 score):', report['HS']['f1-score'])
  print('TR (F1 score):', report['TR']['f1-score'])
  print('AG (F1 score):', report['AG']['f1-score'])
  print('macro-averaged (F1 score):', report['macro avg']['f1-score'], '\n\n')

Model: Random Forest
Vectors: Tfidf vectors
HS (F1 score): 0.6963855421686748
TR (F1 score): 0.5259938837920489
AG (F1 score): 0.33802816901408456
macro-averaged (F1 score): 0.5201358649916027 


Model: Random Forest
Vectors: Word2Vec vectors
HS (F1 score): 0.5742411812961443
TR (F1 score): 0.345679012345679
AG (F1 score): 0.33978132884777124
macro-averaged (F1 score): 0.41990050749653146 


Model: Random Forest
Vectors: GloVe vectors
HS (F1 score): 0.6467780429594272
TR (F1 score): 0.35738831615120276
AG (F1 score): 0.07207207207207207
macro-averaged (F1 score): 0.35874614372756736 


Model: Random Forest
Vectors: BERT vectors
HS (F1 score): 0.6456310679611651
TR (F1 score): 0.3745583038869258
AG (F1 score): 0.18257261410788386
macro-averaged (F1 score): 0.4009206619853249 




#### Support Vector Machine

In [84]:
for vector_type in vector_types:
  # create a SVM classifier object
  svm = SVC()
  # create a multioutput classifier object
  multi_svm = MultiOutputClassifier(svm)
  # fit the multioutput classifier on training data
  multi_svm.fit(train_vectors[vector_type], train_output)
  # predict on test data
  y_pred = multi_svm.predict(dev_vectors[vector_type])
  # classification report
  report = classification_report(dev_output, y_pred, target_names=labels, zero_division=0, output_dict=True)
  # storing and printing the results
  evaluation_results['model type'].append('Support Vector Machine')
  evaluation_results['vector type'].append(vector_type)
  evaluation_results['HS F1'].append(report['HS']['f1-score'])
  evaluation_results['TR F1'].append(report['TR']['f1-score'])
  evaluation_results['AG F1'].append(report['AG']['f1-score'])
  evaluation_results['macro-averaged F1'].append(report['macro avg']['f1-score'])

  print('Model: Support Vector Machine')
  print(f'Vectors: {vector_type}')
  print('HS (F1 score):', report['HS']['f1-score'])
  print('TR (F1 score):', report['TR']['f1-score'])
  print('AG (F1 score):', report['AG']['f1-score'])
  print('macro-averaged (F1 score):', report['macro avg']['f1-score'], '\n\n')

Model: Support Vector Machine
Vectors: Tfidf vectors
HS (F1 score): 0.7137809187279152
TR (F1 score): 0.5485714285714286
AG (F1 score): 0.33935018050541516
macro-averaged (F1 score): 0.5339008426015863 


Model: Support Vector Machine
Vectors: Word2Vec vectors
HS (F1 score): 0.6058394160583942
TR (F1 score): 0.0
AG (F1 score): 0.0
macro-averaged (F1 score): 0.20194647201946472 


Model: Support Vector Machine
Vectors: GloVe vectors
HS (F1 score): 0.6867749419953597
TR (F1 score): 0.4466019417475728
AG (F1 score): 0.09999999999999999
macro-averaged (F1 score): 0.4111256279143109 


Model: Support Vector Machine
Vectors: BERT vectors
HS (F1 score): 0.6979542719614921
TR (F1 score): 0.46349206349206346
AG (F1 score): 0.19834710743801653
macro-averaged (F1 score): 0.45326448096385735 




#### Naive Bayes

In [85]:
for vector_type in vector_types:
  # create a Naive Bayes classifier object
  nb = GaussianNB()
  # create a multioutput classifier object
  multi_nb = MultiOutputClassifier(nb)
  # fit the multioutput classifier on training data
  if vector_type == 'Tfidf vectors':
    multi_nb.fit(train_vectors[vector_type].toarray(), train_output)
  else:
    multi_nb.fit(train_vectors[vector_type], train_output)
  # predict on test data
  if vector_type == 'Tfidf vectors':
    y_pred = multi_nb.predict(dev_vectors[vector_type].toarray())
  else:
    y_pred = multi_nb.predict(dev_vectors[vector_type])
  # classification report
  report = classification_report(dev_output, y_pred, target_names=labels, zero_division=0, output_dict=True)
  # storing and printing the results
  evaluation_results['model type'].append('Naive Bayes')
  evaluation_results['vector type'].append(vector_type)
  evaluation_results['HS F1'].append(report['HS']['f1-score'])
  evaluation_results['TR F1'].append(report['TR']['f1-score'])
  evaluation_results['AG F1'].append(report['AG']['f1-score'])
  evaluation_results['macro-averaged F1'].append(report['macro avg']['f1-score'])

  print('Model: Naive Bayes')
  print(f'Vectors: {vector_type}')
  print('HS (F1 score):', report['HS']['f1-score'])
  print('TR (F1 score):', report['TR']['f1-score'])
  print('AG (F1 score):', report['AG']['f1-score'])
  print('macro-averaged (F1 score):', report['macro avg']['f1-score'], '\n\n')

Model: Naive Bayes
Vectors: Tfidf vectors
HS (F1 score): 0.6260236578707916
TR (F1 score): 0.43478260869565216
AG (F1 score): 0.3356401384083045
macro-averaged (F1 score): 0.46548213499158275 


Model: Naive Bayes
Vectors: Word2Vec vectors
HS (F1 score): 0.5992982456140351
TR (F1 score): 0.0
AG (F1 score): 0.3397169025811823
macro-averaged (F1 score): 0.3130050493984058 


Model: Naive Bayes
Vectors: GloVe vectors
HS (F1 score): 0.6422594142259413
TR (F1 score): 0.5991561181434599
AG (F1 score): 0.43163538873994634
macro-averaged (F1 score): 0.5576836403697826 


Model: Naive Bayes
Vectors: BERT vectors
HS (F1 score): 0.6572295247724974
TR (F1 score): 0.5906542056074766
AG (F1 score): 0.425
macro-averaged (F1 score): 0.557627910126658 




#### K-Nearest Neighbours (KNN)

In [86]:
for vector_type in vector_types:
  # create a KNN classifier object
  knn = KNeighborsClassifier()
  # create a multioutput classifier object
  multi_knn = MultiOutputClassifier(knn)
  # fit the multioutput classifier on training data
  multi_knn.fit(train_vectors[vector_type], train_output)
  # predict on test data
  y_pred = multi_knn.predict(dev_vectors[vector_type])
  # classification report
  report = classification_report(dev_output, y_pred, target_names=labels, zero_division=0, output_dict=True)
  # storing and printing the results
  evaluation_results['model type'].append('KNN')
  evaluation_results['vector type'].append(vector_type)
  evaluation_results['HS F1'].append(report['HS']['f1-score'])
  evaluation_results['TR F1'].append(report['TR']['f1-score'])
  evaluation_results['AG F1'].append(report['AG']['f1-score'])
  evaluation_results['macro-averaged F1'].append(report['macro avg']['f1-score'])

  print('Model: KNN')
  print(f'Vectors: {vector_type}')
  print('HS (F1 score):', report['HS']['f1-score'])
  print('TR (F1 score):', report['TR']['f1-score'])
  print('AG (F1 score):', report['AG']['f1-score'])
  print('macro-averaged (F1 score):', report['macro avg']['f1-score'], '\n\n')

Model: KNN
Vectors: Tfidf vectors
HS (F1 score): 0.6382978723404255
TR (F1 score): 0.5406976744186046
AG (F1 score): 0.28668941979522183
macro-averaged (F1 score): 0.488561655518084 


Model: KNN
Vectors: Word2Vec vectors
HS (F1 score): 0.5787159190853122
TR (F1 score): 0.42916666666666664
AG (F1 score): 0.21710526315789475
macro-averaged (F1 score): 0.40832928296995785 


Model: KNN
Vectors: GloVe vectors
HS (F1 score): 0.6142857142857142
TR (F1 score): 0.4930747922437674
AG (F1 score): 0.29447852760736193
macro-averaged (F1 score): 0.4672796780456145 


Model: KNN
Vectors: BERT vectors
HS (F1 score): 0.6711560044893378
TR (F1 score): 0.5604395604395604
AG (F1 score): 0.38395415472779365
macro-averaged (F1 score): 0.5385165732188973 




#### Neural Network (NN)

In [87]:
for vector_type in vector_types:
  # define model architecture
  if vector_type == 'Tfidf vectors':
    input_layer = Input(shape=len(train_vectors[vector_type].toarray()[0]))
  else:
    input_layer = Input(shape=len(train_vectors[vector_type][0]))
  dense_layer1 = Dense(64, activation='relu')(input_layer)
  dropout_layer1 = Dropout(0.2)(dense_layer1)
  dense_layer2 = Dense(32, activation='relu')(dropout_layer1)
  dropout_layer2 = Dropout(0.2)(dense_layer2)
  output_layer = Dense(3, activation='sigmoid')(dropout_layer2)

  model = Model(inputs=input_layer, outputs=output_layer)
  model.compile(loss='binary_crossentropy', optimizer='adam')
  # define ideal batch size
  batch_size = 32
  # train the model
  if vector_type == 'Tfidf vectors':
    history = model.fit(train_vectors[vector_type].toarray(), train_output, epochs=40, batch_size=batch_size, validation_split=0.3)
  else:
    history = model.fit(train_vectors[vector_type], train_output, epochs=40, batch_size=batch_size, validation_split=0.3)
  # make predictions on test data
  if vector_type == 'Tfidf vectors':
    predictions = model.predict(dev_vectors[vector_type].toarray(), batch_size=batch_size)
  else:
    predictions = model.predict(dev_vectors[vector_type], batch_size=batch_size)
  # round the predictions to 0 or 1
  rounded_predictions = np.round(predictions)
  # classification report
  report = classification_report(dev_output, rounded_predictions, target_names=labels, zero_division=0, output_dict=True)
  # storing and printing the results
  evaluation_results['model type'].append('Neural Network')
  evaluation_results['vector type'].append(vector_type)
  evaluation_results['HS F1'].append(report['HS']['f1-score'])
  evaluation_results['TR F1'].append(report['TR']['f1-score'])
  evaluation_results['AG F1'].append(report['AG']['f1-score'])
  evaluation_results['macro-averaged F1'].append(report['macro avg']['f1-score'])

  print('Model: Neural Network')
  print(f'Vectors: {vector_type}')
  print('HS (F1 score):', report['HS']['f1-score'])
  print('TR (F1 score):', report['TR']['f1-score'])
  print('AG (F1 score):', report['AG']['f1-score'])
  print('macro-averaged (F1 score):', report['macro avg']['f1-score'], '\n\n')

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Model: Neural Network
Vectors: Tfidf vectors
HS (F1 score): 0.6678445229681979
TR (F1 score): 0.548148148148148
AG (F1 score): 0.3958762886597938
macro-averaged (F1 score): 0.5372896532587133 


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 2

#### LSTM

In [101]:
# Define the maximum number of words to consider and the maximum sequence length
max_words = 10000
maxlen = 100

# Create a tokenizer and fit it to the training text data
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_data['text'])

# Convert the training and dev text data to sequences of integers
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
dev_sequences = tokenizer.texts_to_sequences(dev_data['text'])

# Pad the sequences to ensure they all have the same length
train_lstm = pad_sequences(train_sequences, maxlen=maxlen)
dev_lstm = pad_sequences(dev_sequences, maxlen=maxlen)

# Define the model architecture
model = Sequential()
model.add(Embedding(max_words, 128, input_length=maxlen))
model.add(LSTM(128))
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam')

# Train the model
model.fit(train_lstm, train_output, epochs=30, batch_size=32, validation_split=0.3)

# Use the model to make predictions on the test data
y_pred = model.predict(dev_lstm)

# Convert the predicted probabilities to binary labels
y_pred = (y_pred > 0.5).astype(int)

# classification report
report = classification_report(dev_output, y_pred, target_names=labels, zero_division=0, output_dict=True)

print('Model: LSTM')
print('HS (F1 score):', report['HS']['f1-score'])
print('TR (F1 score):', report['TR']['f1-score'])
print('AG (F1 score):', report['AG']['f1-score'])
print('macro-averaged (F1 score):', report['macro avg']['f1-score'], '\n\n')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Model: LSTM
HS (F1 score): 0.672
TR (F1 score): 0.5649484536082474
AG (F1 score): 0.4232365145228216
macro-averaged (F1 score): 0.553394989377023 




In [107]:
# storing the results
evaluation_results['model type'].append('LSTM')
evaluation_results['vector type'].append('')
evaluation_results['HS F1'].append(report['HS']['f1-score'])
evaluation_results['TR F1'].append(report['TR']['f1-score'])
evaluation_results['AG F1'].append(report['AG']['f1-score'])
evaluation_results['macro-averaged F1'].append(report['macro avg']['f1-score'])

### Results

In [110]:
results = pd.DataFrame(evaluation_results)
results

Unnamed: 0,model type,vector type,HS F1,TR F1,AG F1,macro-averaged F1
0,Logistic Regression,Tfidf vectors,0.699647,0.538012,0.368056,0.535238
1,Logistic Regression,Word2Vec vectors,0.277778,0.365,0.0,0.214259
2,Logistic Regression,GloVe vectors,0.645465,0.436923,0.227451,0.436613
3,Logistic Regression,BERT vectors,0.679724,0.597826,0.421053,0.566201
4,Decision Tree,Tfidf vectors,0.619816,0.567237,0.430556,0.539203
5,Decision Tree,Word2Vec vectors,0.546547,0.292994,0.254118,0.364553
6,Decision Tree,GloVe vectors,0.561135,0.426735,0.287105,0.424992
7,Decision Tree,BERT vectors,0.579186,0.41791,0.350467,0.449188
8,Random Forest,Tfidf vectors,0.696386,0.525994,0.338028,0.520136
9,Random Forest,Word2Vec vectors,0.574241,0.345679,0.339781,0.419901


In [105]:
# storing the results
with open('results.pkl', 'wb') as f:
    pickle.dump(evaluation_results, f)

In [106]:
# load the results
with open('results.pkl', 'rb') as f:
    diya = pickle.load(f)

In [111]:
diya

{'model type': ['Logistic Regression',
  'Logistic Regression',
  'Logistic Regression',
  'Logistic Regression',
  'Decision Tree',
  'Decision Tree',
  'Decision Tree',
  'Decision Tree',
  'Random Forest',
  'Random Forest',
  'Random Forest',
  'Random Forest',
  'Support Vector Machine',
  'Support Vector Machine',
  'Support Vector Machine',
  'Support Vector Machine',
  'Naive Bayes',
  'Naive Bayes',
  'Naive Bayes',
  'Naive Bayes',
  'KNN',
  'KNN',
  'KNN',
  'KNN',
  'Neural Network',
  'Neural Network',
  'Neural Network',
  'Neural Network',
  'LSTM'],
 'vector type': ['Tfidf vectors',
  'Word2Vec vectors',
  'GloVe vectors',
  'BERT vectors',
  'Tfidf vectors',
  'Word2Vec vectors',
  'GloVe vectors',
  'BERT vectors',
  'Tfidf vectors',
  'Word2Vec vectors',
  'GloVe vectors',
  'BERT vectors',
  'Tfidf vectors',
  'Word2Vec vectors',
  'GloVe vectors',
  'BERT vectors',
  'Tfidf vectors',
  'Word2Vec vectors',
  'GloVe vectors',
  'BERT vectors',
  'Tfidf vectors',
  '