In [21]:
#import necessary libraries
import pandas as pd
import numpy as np
import warnings
import nltk

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay, recall_score, precision_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import shuffle

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import matplotlib.pyplot as plt

import pickle

pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore")

In [2]:
#Reading in the dataset
df_tweets_humans = pd.read_csv('human_tweets_processed.csv')
df_tweets_fake = pd.read_csv('bot_tweets_fake_processed.csv')
df_tweets_social = pd.read_csv('bot_tweets_social_processed.csv')

In [3]:
df_tweets_humans['isBot'] = 0
df_tweets_fake['isBot'] = 1
df_tweets_social['isBot'] = 1

In [4]:
df_tweets_humans_new = df_tweets_humans.sample(25000).reset_index(drop = True)
df_tweets_fake_new = df_tweets_fake.sample(12500).reset_index(drop = True)
df_tweets_social_new = df_tweets_social.sample(12500).reset_index(drop = True)

In [5]:
combined_tweets = pd.concat([df_tweets_humans_new, df_tweets_fake_new, df_tweets_social_new]).reset_index(drop = True)
combined_tweets.drop(combined_tweets.filter(regex="Unnamed"),axis=1, inplace=True)
combined_tweets

Unnamed: 0,id,text,source,user_id,retweeted_status_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,bot,cleaned_text,isBot
0,445378199917047810,@AlexJamesFitz @marcoarment Teach a man to Phi...,"<a href=""http://tapbots.com/software/tweetbot/...",12287842.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,2.0,Mon Mar 17 01:56:52 +0000 2014,0,__user_mention__ __user_mention__ Teach a man ...,0
1,504364795080867840,Newport 100s last so long that's why i love em,"<a href=""http://twitter.com/download/android"" ...",601401812.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,Tue Aug 26 20:28:32 +0000 2014,0,Newport 100s last so long that's why i love em,0
2,513459117030002688,cunliffe pls step down and let labour re-inven...,"<a href=""https://about.twitter.com/products/tw...",595827319.0,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,Sat Sep 20 22:46:07 +0000 2014,0,cunliffe pls step down and let labour re-inven...,0
3,593239621905047552,@harry_stylesYou've done so so much for methat...,"<a href=""https://about.twitter.com/products/tw...",219845841.0,0.000000e+00,1.0,0.0,0.0,0.0,1.0,1.0,Wed Apr 29 02:25:22 +0000 2015,0,__user_mention__ done so so much for methats w...,0
4,365003680128708608,"RT @donttrythis: Walter White, you have some c...","<a href=""http://twitter.com/download/iphone"" r...",186855547.0,3.648066e+17,211.0,0.0,0.0,0.0,1.0,5.0,Wed Aug 07 06:57:13 +0000 2013,0,"RT __user_mention__ Walter White, you have som...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,217723693193904130,Check out our E-Boutique Store for the Ladies,web,619305569.0,,0.0,0.0,0.0,0.0,0.0,0.0,Tue Jun 26 20:59:08 +0000 2012,1,Check out our E-Boutique Store for the Ladies,1
49996,88331881086586880,@ruthaprelia @arini_dwi Happy Birthday!,web,242186430.0,,0.0,1.0,1.0,0.0,0.0,2.0,Tue Jul 05 19:42:16 +0000 2011,1,__user_mention__ __user_mention__ Happy Birthday!,1
49997,306737245128368129,o´¯`❄.¸(░)`O.¸¸.¸.o´¯`❄.¸(░)`O.❄。¨¯`*✲ ´*。.❄¨¯...,web,616220039.0,,0.0,0.0,0.0,0.0,0.0,0.0,Wed Feb 27 12:07:12 +0000 2013,1,o´¯`❄.¸(░)`O.¸¸.¸.o´¯`❄.¸(░)`O.❄。¨¯`*✲ ´*。.❄¨¯...,1
49998,5260113690,@st3phy890527 definitely! har har har! RAWR!,"<a href=""http://www.tweetdeck.com/"" rel=""nofol...",72835654.0,,0.0,0.0,0.0,0.0,0.0,0.0,Thu Oct 29 14:39:44 +0000 2009,1,__user_mention__ definitely! har har har! RAWR!,1


In [6]:
combined_tweets_final = shuffle(combined_tweets)
combined_tweets_final = combined_tweets_final.reset_index(drop = True)
combined_tweets_final

Unnamed: 0,id,text,source,user_id,retweeted_status_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,bot,cleaned_text,isBot
0,578732050921865216,"Whoo, go get get it, Olivia! #ScandalABC Oh, d...","<a href=""https://twitter.com/download/android""...",1.780163e+07,0.000000e+00,0.0,0.0,0.0,1.0,0.0,0.0,Fri Mar 20 01:37:27 +0000 2015,0,"Whoo, go get get it, Olivia! __hashtag__ Oh, d...",0
1,570955971486482432,RT @purpleprism: [MPD/직캠] 150226 Congratulatio...,"<a href=""http://twitter.com/download/android"" ...",2.065326e+08,5.709108e+17,21.0,0.0,0.0,0.0,1.0,1.0,Thu Feb 26 14:38:06 +0000 2015,0,RT __user_mention__ [MPD/직캠] 150226 Congratula...,0
2,581606081895522304,"@cnrsocial2 @socialsensing All of Cairo did, a...","<a href=""https://about.twitter.com/products/tw...",1.694767e+08,0.000000e+00,0.0,0.0,1.0,0.0,0.0,2.0,Fri Mar 27 23:57:50 +0000 2015,0,__user_mention__ __user_mention__ All of Cairo...,0
3,488872386746261505,@MichelleFaithT thanks but her mom and dad are...,"<a href=""http://twitter.com/download/iphone"" r...",2.240095e+09,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,Tue Jul 15 02:27:14 +0000 2014,0,__user_mention__ thanks but her mom and dad ar...,0
4,268612298204123136,"""To achieve greatness, start where you are, us...",web,6.162287e+08,,0.0,0.0,0.0,0.0,0.0,0.0,Wed Nov 14 07:12:17 +0000 2012,1,"""To achieve greatness, start where you are, us...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,406753071054598144,... 24 days till Christmas,"<a href=""http://twitter.com/download/iphone"" r...",1.690357e+08,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,Sat Nov 30 11:54:23 +0000 2013,0,... 24 days till Christmas,0
49996,593286877257584640,@harry_stylesYou've done so so much for methat...,"<a href=""https://twitter.com/download/android""...",2.198458e+08,0.000000e+00,1.0,0.0,0.0,0.0,1.0,1.0,Wed Apr 29 05:33:08 +0000 2015,0,__user_mention__ done so so much for methats w...,0
49997,164197153051394048,Join @lovemiyake giveaway! :DD,web,8.329319e+07,,0.0,0.0,0.0,0.0,0.0,0.0,Tue Jan 31 04:03:46 +0000 2012,1,Join __user_mention__ giveaway! :DD,1
49998,445583842653773824,I posted a new photo to Facebook http://t.co/Z...,"<a href=""http://www.facebook.com/twitter"" rel=...",1.480228e+08,0.000000e+00,0.0,0.0,0.0,0.0,1.0,0.0,Mon Mar 17 15:34:01 +0000 2014,0,I posted a new photo to Facebook,0


In [7]:
combined_tweets_final.to_csv('tweets_dataset_allcols.csv', index = False)

In [8]:
combined_tweets_new = combined_tweets_final[['cleaned_text', 'isBot']]
combined_tweets_new

Unnamed: 0,cleaned_text,isBot
0,"Whoo, go get get it, Olivia! __hashtag__ Oh, d...",0
1,RT __user_mention__ [MPD/직캠] 150226 Congratula...,0
2,__user_mention__ __user_mention__ All of Cairo...,0
3,__user_mention__ thanks but her mom and dad ar...,0
4,"""To achieve greatness, start where you are, us...",1
...,...,...
49995,... 24 days till Christmas,0
49996,__user_mention__ done so so much for methats w...,0
49997,Join __user_mention__ giveaway! :DD,1
49998,I posted a new photo to Facebook,0


In [9]:
combined_tweets_new['tokenized'] = combined_tweets_new['cleaned_text'].apply(word_tokenize)

In [10]:
stop = list(stopwords.words('english'))
contextual_stop = ['day','new','one', 'time','need', "__user_mention__", "__hashtag__"]
lemmatizer = WordNetLemmatizer()

In [11]:
#Removed stop words
combined_tweets_new['processed_data'] = combined_tweets_new.tokenized.apply(lambda x: [word.casefold() for word in x if word.casefold() not in stop
                                                                    and word not in contextual_stop])
#lemmatize the words
combined_tweets_new['processed_data'] = combined_tweets_new.processed_data.apply(lambda x:[lemmatizer.lemmatize(word) for word in x])

combined_tweets_new['sentence'] = combined_tweets_new['processed_data'].apply(lambda x : " ".join(x))
#Change the sentence to lower case
combined_tweets_new['sentence'] = combined_tweets_new['sentence'].str.lower()

In [12]:
combined_tweets_new.to_csv('tweets_dataset_processed.csv', index = False)

In [13]:
combined_tweets_new['isBot'].value_counts()

0    25000
1    25000
Name: isBot, dtype: int64

we can consider TfIdf vectors, which could potentially identify rare but useful words that serve as useful features for classification

In [14]:
dataset = pd.read_csv('tweets_dataset_processed.csv')
dataset

Unnamed: 0,cleaned_text,isBot,tokenized,processed_data,sentence
0,"Whoo, go get get it, Olivia! __hashtag__ Oh, d...",0,"['Whoo', ',', 'go', 'get', 'get', 'it', ',', '...","['whoo', ',', 'go', 'get', 'get', ',', 'olivia...","whoo , go get get , olivia ! oh , damn flashba..."
1,RT __user_mention__ [MPD/직캠] 150226 Congratula...,0,"['RT', '__user_mention__', '[', 'MPD/직캠', ']',...","['rt', '[', 'mpd/직캠', ']', '150226', 'congratu...",rt [ mpd/직캠 ] 150226 congratulation 4minute~ !...
2,__user_mention__ __user_mention__ All of Cairo...,0,"['__user_mention__', '__user_mention__', 'All'...","['cairo', ',', 'yes', 'fine', ':', ')']","cairo , yes fine : )"
3,__user_mention__ thanks but her mom and dad ar...,0,"['__user_mention__', 'thanks', 'but', 'her', '...","['thanks', 'mom', 'dad', 'home', ""'m"", 'way', ...",thanks mom dad home 'm way home already .
4,"""To achieve greatness, start where you are, us...",1,"['``', 'To', 'achieve', 'greatness', ',', 'sta...","['``', 'achieve', 'greatness', ',', 'start', '...","`` achieve greatness , start , use , . '' -- a..."
...,...,...,...,...,...
49995,... 24 days till Christmas,0,"['...', '24', 'days', 'till', 'Christmas']","['...', '24', 'day', 'till', 'christmas']",... 24 day till christmas
49996,__user_mention__ done so so much for methats w...,0,"['__user_mention__', 'done', 'so', 'so', 'much...","['done', 'much', 'methats', 'got', 'tattoo', '...",done much methats got tattoo ! hope see & amp ...
49997,Join __user_mention__ giveaway! :DD,1,"['Join', '__user_mention__', 'giveaway', '!', ...","['join', 'giveaway', '!', ':', 'dd']",join giveaway ! : dd
49998,I posted a new photo to Facebook,0,"['I', 'posted', 'a', 'new', 'photo', 'to', 'Fa...","['posted', 'photo', 'facebook']",posted photo facebook


In [15]:
#using tfidf vectors
tfidf_vectorizer = TfidfVectorizer(max_df = 0.5, min_df = 10)
#Create an array that contains the BOW representation of each tweet
tfidf_bow_array  = tfidf_vectorizer.fit_transform(dataset['sentence'].values.astype(str))
# #Converting the result into a pandas dataframe
tfidf_bow = pd.DataFrame(tfidf_bow_array.toarray(), columns = tfidf_vectorizer.get_feature_names_out())

In [16]:
tfidf_bow.shape

(50000, 4368)

In [17]:
#Next, treat the BOW vectors as features and split the data into training and testing sets
X = tfidf_bow
y = dataset['isBot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

#### Logistic Regression TFIDF

In [18]:
%%time

model_log_reg_tfidf = LogisticRegression()

logreg_tfidf_model = model_log_reg_tfidf.fit(X_train, y_train)

CPU times: user 51.7 s, sys: 1.1 s, total: 52.8 s
Wall time: 16.8 s


In [None]:
y_pred = logreg_tfidf_model.predict(X_test)

# Error Metrics
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
lg_loss = metrics.log_loss(y_test, y_pred)
print(f'Log Loss: {lg_loss}')
roc_auc = metrics.roc_auc_score(y_test, y_pred)
print(f'ROC AUC: {roc_auc}')
f1_score = metrics.f1_score(y_test, y_pred)
print(f'F1-score: {f1_score}')
precision = metrics.precision_score(y_test, y_pred)
print(f'Precision: {precision}')
recall = metrics.recall_score(y_test, y_pred)
print(f'Recall: {recall}')
report = metrics.classification_report(y_test, y_pred)
print(f'Classification Report: \n {report}')


Accuracy: 0.6839
Log Loss: 11.393398836299932
ROC AUC: 0.683804922162403
F1-score: 0.6734841441999794
Precision: 0.6940600383223334
Recall: 0.6540930979133226
Classification Report: 
               precision    recall  f1-score   support

           0       0.67      0.71      0.69      5016
           1       0.69      0.65      0.67      4984

    accuracy                           0.68     10000
   macro avg       0.68      0.68      0.68     10000
weighted avg       0.68      0.68      0.68     10000



#### Random Forest TFIDF

In [24]:
%%time

model_rf_tfidf = RandomForestClassifier()

rf_tfidf_model = model_rf_tfidf.fit(X_train, y_train)

CPU times: user 18min 37s, sys: 15 s, total: 18min 52s
Wall time: 20min 5s


In [26]:
y_pred = rf_tfidf_model.predict(X_test)

# Error Metrics
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
lg_loss = metrics.log_loss(y_test, y_pred)
print(f'Log Loss: {lg_loss}')
roc_auc = metrics.roc_auc_score(y_test, y_pred)
print(f'ROC AUC: {roc_auc}')
f1_score = metrics.f1_score(y_test, y_pred)
print(f'F1-score: {f1_score}')
precision = metrics.precision_score(y_test, y_pred)
print(f'Precision: {precision}')
recall = metrics.recall_score(y_test, y_pred)
print(f'Recall: {recall}')
report = metrics.classification_report(y_test, y_pred)
print(f'Classification Report: \n {report}')


Accuracy: 0.7435
Log Loss: 9.24519709430855
ROC AUC: 0.7434868133049681
F1-score: 0.7418218419728232
Precision: 0.7442940820036357
Recall: 0.7393659711075441
Classification Report: 
               precision    recall  f1-score   support

           0       0.74      0.75      0.75      5016
           1       0.74      0.74      0.74      4984

    accuracy                           0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000

