In [1]:
#Importing of necessary packages
import nltk
import re
import emoji
import demoji
import torch
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from datetime import datetime
from numpy import loadtxt, savetxt

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score

from xgboost import XGBClassifier
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D, BatchNormalization, Bidirectional

from bayes_opt import BayesianOptimization

warnings.filterwarnings("ignore")

In [2]:
# Reading in the dataset
df_human = pd.read_csv('human_tweets_processed.csv')
df_fake = pd.read_csv('bot_tweets_fake_processed.csv')
df_social = pd.read_csv('bot_tweets_social_processed.csv')

In [3]:
df_tweets_all = pd.concat([df_human, df_fake, df_social])
df_tweets_all = df_tweets_all.reset_index(inplace = False, drop = True)

In [4]:
df_tweets = df_tweets_all[["text", "cleaned_text", "bot"]]
df_tweets = df_tweets.rename(columns = {"bot": "isBot"})
df_tweets

Unnamed: 0,text,cleaned_text,isBot
0,@KyleDavidHall @YouTube YASS THANKS BABE,__user_mention__ __user_mention__ YASS THANKS ...,0
1,RT @AbnInfVet: ...And Then Bloomberg Says He D...,RT __user_mention__ ...And Then Bloomberg Says...,0
2,RT @lgbtqnation: Federal judge considers separ...,RT __user_mention__ Federal judge considers se...,0
3,RT @OKFosterWishes: URGENT; Beds needed to get...,RT __user_mention__ URGENT; Beds needed to get...,0
4,"RT @luke_brooks: N.America and EU! Our EP ""Wou...","RT __user_mention__ N.America and EU! Our EP ""...",0
...,...,...,...
248177,theawkwardmoment#theawkwardmoment when you sta...,theawkwardmoment#theawkwardmoment when you sta...,1
248178,I will have the rest of the site updated when ...,I will have the rest of the site updated when ...,1
248179,RT @EMANSANGELS: Follow @scottstorch and view ...,RT __user_mention__ Follow __user_mention__ an...,1
248180,Stupidity in numbers. penn state riot,Stupidity in numbers. penn state riot,1


# Model
1. Logistic Regression (BoW)
2. Random Forest Classifier (BoW)
3. Logistic Regression (TF-IDF)
4. Random Forest Classifier (TF-IDF)
5. LSTM
6. Logistic Regression (BERT)
7. Random Forest Classifier (BERT)
8. Adaboost Classifier (BERT)
9. XGBoost (BERT)

## BERT Models
While previously the word embeddings from Glove are not fully context dependent, let us consider an embedding that considers both context and both directions, ie the BERT word embedding

Here, first, we generate the BERT feature vector for each tweet and then we train models on those feature representations 

In [6]:
#initialising a pretrained bert model
tokenizer = BertTokenizer.from_pretrained("bert-base-cased", padding = True)
model = BertModel.from_pretrained("bert-base-cased")
nlp = pipeline("feature-extraction", tokenizer = tokenizer, model = model)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
#store all the tweets after they have been encoded by BERT

# feature_list = []

# for index, row in df_tweets_searches.iterrows():
#     #extracting the ith tweet and restricting the characters to 512, which is fine because twitter's limit is 280
#     text = row['cleaned_text'][:512]
#     #encoding all the individual words present in the tweet
#     vec = np.array(nlp(text))
#     #getting the mean representation of the words present in the tweet
#     vec = vec.reshape((vec.shape[1], vec.shape[2])).mean(axis = 0)
#     feature_list.append(vec)

# feature_vectors = np.array(feature_list)

In [8]:
feature_vectors = pd.read_csv("feature_vect.csv")
feature_vectors = feature_vectors.drop(columns = "Unnamed: 0")

In [9]:
#splitting the generated features into training and testing features
x_train, x_test, y_train, y_test = train_test_split(feature_vectors, df_tweets['isBot'], test_size = 0.2)

## 06 Logistic Regression (BERT)
* Model
* Error Metrics

In [10]:
# Model
log_regression = LogisticRegression(max_iter = 100000)

In [11]:
%%time

# Fit Model
log_model = log_regression.fit(x_train, y_train)

Wall time: 1min 44s


In [12]:
%%time

# y_prediction
y_pred = log_model.predict(x_test)

Wall time: 116 ms


In [13]:
# Error Metrics
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
lg_loss = metrics.log_loss(y_test, y_pred)
print(f'Log Loss: {lg_loss}')
roc_auc = metrics.roc_auc_score(y_test, y_pred)
print(f'ROC AUC: {roc_auc}')
f1_score = metrics.f1_score(y_test, y_pred)
print(f'F1-score: {f1_score}')
precision = metrics.precision_score(y_test, y_pred)
print(f'Precision: {precision}')
recall = metrics.recall_score(y_test, y_pred)
print(f'Recall: {recall}')
report = metrics.classification_report(y_test, y_pred)
print(f'Classification Report: \n {report}')

Accuracy: 0.7675524306464936
Log Loss: 8.028520023572051
ROC AUC: 0.7160054008279689
F1-score: 0.6185785123966943
Precision: 0.697376267143709
Recall: 0.5557799691101343
Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.88      0.83     32803
           1       0.70      0.56      0.62     16834

    accuracy                           0.77     49637
   macro avg       0.75      0.72      0.73     49637
weighted avg       0.76      0.77      0.76     49637



## 07 Random Forest Classifier (BERT)
* Model
* Error Metrics

### Model (Optimal Hyperparameters)

In [14]:
# Model (initialise the object based on parameters selected by random search)
rf_classifier = RandomForestClassifier(bootstrap = False, 
                                       max_depth = 80, 
                                       max_features = "auto", 
                                       min_samples_split = 10, 
                                       n_estimators = 200)

In [18]:
%%time

# Fit Model
rf_model = rf_classifier.fit(x_train, np.ravel(y_train))

Wall time: 1h 13min 40s


In [19]:
%%time

# y_prediction for the best model
y_pred_optimal_rf = rf_model.predict(x_test)

Wall time: 11.8 s


In [20]:
# Metrics for tuned random forest
accuracy = metrics.accuracy_score(y_test, y_pred_optimal_rf)
print(f'Accuracy: {accuracy}')
lg_loss = metrics.log_loss(y_test, y_pred_optimal_rf)
print(f'Log Loss: {lg_loss}')
roc_auc = metrics.roc_auc_score(y_test, y_pred_optimal_rf)
print(f'ROC AUC: {roc_auc}')
f1_score = metrics.f1_score(y_test, y_pred_optimal_rf)
print(f'F1-score: {f1_score}')
precision = metrics.precision_score(y_test, y_pred_optimal_rf)
print(f'Precision: {precision}')
recall = metrics.recall_score(y_test, y_pred_optimal_rf)
print(f'Recall: {recall}')
report = metrics.classification_report(y_test, y_pred_optimal_rf)
print(f'Classification Report: \n {report}')

Accuracy: 0.9551342748353043
Log Loss: 1.5496120336020083
ROC AUC: 0.9381485126352425
F1-score: 0.9304822850007805
Precision: 0.9804618117229129
Recall: 0.8853510752049424
Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.99      0.97     32803
           1       0.98      0.89      0.93     16834

    accuracy                           0.96     49637
   macro avg       0.96      0.94      0.95     49637
weighted avg       0.96      0.96      0.95     49637



## 08 Adaboost Classifier (BERT)
* Model
* Error Metrics

### Model (Optimal Hyperparameters)

In [21]:
# Model (initialise the object based on parameters selected by random search)
adaboost_classifier = AdaBoostClassifier(n_estimators = 200, 
                                         learning_rate = 1.0)

In [22]:
%%time

# Fit Model
adaboost_model = adaboost_classifier.fit(x_train, y_train)

Wall time: 1h 50min 32s


In [23]:
%%time

# y_prediction for the best model
y_pred_optimal_ada = adaboost_model.predict(x_test)

Wall time: 25.7 s


In [24]:
# Metrics
accuracy = metrics.accuracy_score(y_test, y_pred_optimal_ada)
print(f'Accuracy: {accuracy}')
lg_loss = metrics.log_loss(y_test, y_pred_optimal_ada)
print(f'Log Loss: {lg_loss}')
roc_auc = metrics.roc_auc_score(y_test, y_pred_optimal_ada)
print(f'ROC AUC: {roc_auc}')
f1_score = metrics.f1_score(y_test, y_pred_optimal_ada)
print(f'F1-score: {f1_score}')
precision = metrics.precision_score(y_test, y_pred_optimal_ada)
print(f'Precision: {precision}')
recall = metrics.recall_score(y_test, y_pred_optimal_ada)
print(f'Recall: {recall}')
report = metrics.classification_report(y_test, y_pred_optimal_ada)
print(f'Classification Report: \n {report}')

Accuracy: 0.7529665370590487
Log Loss: 8.53230676963379
ROC AUC: 0.701528514821457
F1-score: 0.5979408485802348
Precision: 0.6673009367681498
Recall: 0.5416419151716764
Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.86      0.82     32803
           1       0.67      0.54      0.60     16834

    accuracy                           0.75     49637
   macro avg       0.73      0.70      0.71     49637
weighted avg       0.75      0.75      0.75     49637



## 9 XGBoost Classifier (BERT)
* Model
* Error Metrics

### Model (Optimal Hyperparameters)

In [25]:
# Model (initialise the object based on parameters selected by bayesian optimisation)
weight_train = y_train.value_counts()[0] / y_train.value_counts()[1]

xgb_classifier = XGBClassifier(gamma = 0.1,
                               alpha = 0.5,
                               max_depth = 15, 
                               eta = 0.5, 
                               subsample = 0.8,
                               colsample_bytree = 0.7,
                               scale_pos_weight = weight_train,
                               objective = "binary:logistic",
                               eval_metric = "logloss")

In [None]:
%%time

# Fit Model
xgb_model = xgb_classifier.fit(x_train, y_train)

In [None]:
%%time

# y_prediction for the best model
y_pred_optimal_xgb = xgb_model.predict(x_test)

In [None]:
# Metrics
accuracy = metrics.accuracy_score(y_test, y_pred_optimal_xgb)
print(f'Accuracy: {accuracy}')
lg_loss = metrics.log_loss(y_test, y_pred_optimal_xgb)
print(f'Log Loss: {lg_loss}')
roc_auc = metrics.roc_auc_score(y_test, y_pred_optimal_xgb)
print(f'ROC AUC: {roc_auc}')
f1_score = metrics.f1_score(y_test, y_pred_optimal_xgb)
print(f'F1-score: {f1_score}')
precision = metrics.precision_score(y_test, y_pred_optimal_xgb)
print(f'Precision: {precision}')
recall = metrics.recall_score(y_test, y_pred_optimal_xgb)
print(f'Recall: {recall}')
report = metrics.classification_report(y_test, y_pred_optimal_xgb)
print(f'Classification Report: \n {report}')