In [1]:
#Importing of necessary packages
import nltk
import re
import emoji
import demoji
import torch
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from datetime import datetime
from numpy import loadtxt, savetxt

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score

from xgboost import XGBClassifier
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D, BatchNormalization, Bidirectional

from bayes_opt import BayesianOptimization

warnings.filterwarnings("ignore")

In [2]:
# Reading in the dataset
df_tweets = pd.read_csv('tweets_dataset_processed.csv')

In [3]:
df_tweets = df_tweets[["cleaned_text", "isBot"]]
df_tweets

Unnamed: 0,cleaned_text,isBot
0,aleah is me,0
1,__user_mention__ I got you bruh,0
2,__user_mention__ its a diet where u can only e...,1
3,When we are no longer able to change a situati...,1
4,__user_mention__ __user_mention__ __user_menti...,0
...,...,...
49995,__user_mention__ jfc whats wrong with the foru...,0
49996,__user_mention__ die hard with a vengeance __u...,0
49997,What would you do.... IF! a little old man pic...,1
49998,__user_mention__ lucky you..I can't unless if ...,1


# Model
1. Logistic Regression (BoW)
2. Random Forest Classifier (BoW)
3. Logistic Regression (TF-IDF)
4. Random Forest Classifier (TF-IDF)
5. LSTM
6. Logistic Regression (BERT)
7. Random Forest Classifier (BERT)
8. Adaboost Classifier (BERT)
9. XGBoost (BERT)

## BERT Models
While previously the word embeddings from Glove are not fully context dependent, let us consider an embedding that considers both context and both directions, ie the BERT word embedding

Here, first, we generate the BERT feature vector for each tweet and then we train models on those feature representations 

In [4]:
#initialising a pretrained bert model
tokenizer = BertTokenizer.from_pretrained("bert-base-cased", padding = True)
model = BertModel.from_pretrained("bert-base-cased")
nlp = pipeline("feature-extraction", tokenizer = tokenizer, model = model)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
#store all the tweets after they have been encoded by BERT

# feature_list = []

# for index, row in df_tweets.iterrows():
#     #extracting the ith tweet and restricting the characters to 512, which is fine because twitter's limit is 280
#     text = row['cleaned_text'][:512]
#     #encoding all the individual words present in the tweet
#     vec = np.array(nlp(text))
#     #getting the mean representation of the words present in the tweet
#     vec = vec.reshape((vec.shape[1], vec.shape[2])).mean(axis = 0)
#     feature_list.append(vec)

# feature_vectors = np.array(feature_list)

In [6]:
feature_vectors = pd.read_csv("feature_vect.csv")
feature_vectors = feature_vectors.drop(columns = "Unnamed: 0")

In [7]:
#splitting the generated features into training and testing features
x_train, x_test, y_train, y_test = train_test_split(feature_vectors, df_tweets['isBot'], test_size = 0.2)

## 06 Logistic Regression (BERT)
* Model
* Error Metrics

In [8]:
# Model
log_regression = LogisticRegression(max_iter = 100000)

In [9]:
%%time

# Fit Model
log_model = log_regression.fit(x_train, y_train)

Wall time: 15.4 s


In [10]:
%%time

# y_prediction
y_pred = log_model.predict(x_test)

Wall time: 29.7 ms


In [11]:
# Error Metrics
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
lg_loss = metrics.log_loss(y_test, y_pred)
print(f'Log Loss: {lg_loss}')
roc_auc = metrics.roc_auc_score(y_test, y_pred)
print(f'ROC AUC: {roc_auc}')
f1_score = metrics.f1_score(y_test, y_pred)
print(f'F1-score: {f1_score}')
precision = metrics.precision_score(y_test, y_pred)
print(f'Precision: {precision}')
recall = metrics.recall_score(y_test, y_pred)
print(f'Recall: {recall}')
report = metrics.classification_report(y_test, y_pred)
print(f'Classification Report: \n {report}')

Accuracy: 0.7351
Log Loss: 9.149428453349286
ROC AUC: 0.7350831729810059
F1-score: 0.7323972118395797
Precision: 0.7311415893505446
Recall: 0.7336571544221817
Classification Report: 
               precision    recall  f1-score   support

           0       0.74      0.74      0.74      5059
           1       0.73      0.73      0.73      4941

    accuracy                           0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000



## 07 Random Forest Classifier (BERT)
* Model
* Error Metrics

### Model (Optimal Hyperparameters)

In [12]:
# Model (initialise the object based on parameters selected by random search)
rf_classifier = RandomForestClassifier(bootstrap = False, 
                                       max_depth = 80, 
                                       max_features = "auto", 
                                       min_samples_split = 10, 
                                       n_estimators = 200)

In [13]:
%%time

# Fit Model
rf_model = rf_classifier.fit(x_train, np.ravel(y_train))

Wall time: 10min 30s


In [14]:
%%time

# y_prediction for the best model
y_pred_optimal_rf = rf_model.predict(x_test)

Wall time: 2.93 s


In [15]:
# Metrics for tuned random forest
accuracy = metrics.accuracy_score(y_test, y_pred_optimal_rf)
print(f'Accuracy: {accuracy}')
lg_loss = metrics.log_loss(y_test, y_pred_optimal_rf)
print(f'Log Loss: {lg_loss}')
roc_auc = metrics.roc_auc_score(y_test, y_pred_optimal_rf)
print(f'ROC AUC: {roc_auc}')
f1_score = metrics.f1_score(y_test, y_pred_optimal_rf)
print(f'F1-score: {f1_score}')
precision = metrics.precision_score(y_test, y_pred_optimal_rf)
print(f'Precision: {precision}')
recall = metrics.recall_score(y_test, y_pred_optimal_rf)
print(f'Recall: {recall}')
report = metrics.classification_report(y_test, y_pred_optimal_rf)
print(f'Classification Report: \n {report}')

Accuracy: 0.801
Log Loss: 6.873295902612044
ROC AUC: 0.8009675667239906
F1-score: 0.798542215023284
Precision: 0.7988657079197894
Recall: 0.7982189840113337
Classification Report: 
               precision    recall  f1-score   support

           0       0.80      0.80      0.80      5059
           1       0.80      0.80      0.80      4941

    accuracy                           0.80     10000
   macro avg       0.80      0.80      0.80     10000
weighted avg       0.80      0.80      0.80     10000



## 08 Adaboost Classifier (BERT)
* Model
* Error Metrics

### Model (Optimal Hyperparameters)

In [16]:
# Model (initialise the object based on parameters selected by random search)
adaboost_classifier = AdaBoostClassifier(n_estimators = 500, 
                                         learning_rate = 0.1)

In [17]:
%%time

# Fit Model
adaboost_model = adaboost_classifier.fit(x_train, y_train)

Wall time: 41min 32s


In [18]:
%%time

# y_prediction for the best model
y_pred_optimal_ada = adaboost_model.predict(x_test)

Wall time: 11.1 s


In [19]:
# Metrics
accuracy = metrics.accuracy_score(y_test, y_pred_optimal_ada)
print(f'Accuracy: {accuracy}')
lg_loss = metrics.log_loss(y_test, y_pred_optimal_ada)
print(f'Log Loss: {lg_loss}')
roc_auc = metrics.roc_auc_score(y_test, y_pred_optimal_ada)
print(f'ROC AUC: {roc_auc}')
f1_score = metrics.f1_score(y_test, y_pred_optimal_ada)
print(f'F1-score: {f1_score}')
precision = metrics.precision_score(y_test, y_pred_optimal_ada)
print(f'Precision: {precision}')
recall = metrics.recall_score(y_test, y_pred_optimal_ada)
print(f'Recall: {recall}')
report = metrics.classification_report(y_test, y_pred_optimal_ada)
print(f'Classification Report: \n {report}')

Accuracy: 0.7156
Log Loss: 9.822949945320703
ROC AUC: 0.7158035084805209
F1-score: 0.718080888183981
Precision: 0.7037108995531377
Recall: 0.733049989880591
Classification Report: 
               precision    recall  f1-score   support

           0       0.73      0.70      0.71      5059
           1       0.70      0.73      0.72      4941

    accuracy                           0.72     10000
   macro avg       0.72      0.72      0.72     10000
weighted avg       0.72      0.72      0.72     10000



## 09 XGBoost Classifier (BERT)
* Model
* Error Metrics

### Model (Optimal Hyperparameters)

In [20]:
# Model (initialise the object based on parameters selected by bayesian optimisation)
weight_train = y_train.value_counts()[0] / y_train.value_counts()[1]

xgb_classifier = XGBClassifier(gamma = 0.1,
                               alpha = 0.5,
                               max_depth = 25, 
                               eta = 0.01, 
                               subsample = 0.8,
                               colsample_bytree = 1.0,
                               scale_pos_weight = weight_train,
                               objective = "binary:logistic",
                               eval_metric = "logloss")

In [21]:
%%time

# Fit Model
xgb_model = xgb_classifier.fit(x_train, y_train)

Wall time: 13min 17s


In [22]:
%%time

# y_prediction for the best model
y_pred_optimal_xgb = xgb_model.predict(x_test)

Wall time: 310 ms


In [23]:
# Metrics
accuracy = metrics.accuracy_score(y_test, y_pred_optimal_xgb)
print(f'Accuracy: {accuracy}')
lg_loss = metrics.log_loss(y_test, y_pred_optimal_xgb)
print(f'Log Loss: {lg_loss}')
roc_auc = metrics.roc_auc_score(y_test, y_pred_optimal_xgb)
print(f'ROC AUC: {roc_auc}')
f1_score = metrics.f1_score(y_test, y_pred_optimal_xgb)
print(f'F1-score: {f1_score}')
precision = metrics.precision_score(y_test, y_pred_optimal_xgb)
print(f'Precision: {precision}')
recall = metrics.recall_score(y_test, y_pred_optimal_xgb)
print(f'Recall: {recall}')
report = metrics.classification_report(y_test, y_pred_optimal_xgb)
print(f'Classification Report: \n {report}')

Accuracy: 0.7883
Log Loss: 7.311953235339612
ROC AUC: 0.7885549383896214
F1-score: 0.7908722710658896
Precision: 0.772481667309919
Recall: 0.810159886662619
Classification Report: 
               precision    recall  f1-score   support

           0       0.81      0.77      0.79      5059
           1       0.77      0.81      0.79      4941

    accuracy                           0.79     10000
   macro avg       0.79      0.79      0.79     10000
weighted avg       0.79      0.79      0.79     10000

