In [4]:
#import necessary libraries
import pandas as pd
import numpy as np
import warnings
import nltk

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay, recall_score, precision_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import shuffle

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import matplotlib.pyplot as plt

import pickle

pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore")

In [5]:
#Reading in the dataset
df_tweets_humans = pd.read_csv('human_tweets_processed.csv')
df_tweets_fake = pd.read_csv('bot_tweets_fake_processed.csv')
df_tweets_social = pd.read_csv('bot_tweets_social_processed.csv')

In [6]:
df_tweets_humans['isBot'] = 0
df_tweets_fake['isBot'] = 1
df_tweets_social['isBot'] = 1

In [7]:
df_tweets_humans_new = df_tweets_humans.sample(25000).reset_index(drop = True)
df_tweets_fake_new = df_tweets_fake.sample(12500).reset_index(drop = True)
df_tweets_social_new = df_tweets_social.sample(12500).reset_index(drop = True)

In [8]:
combined_tweets = pd.concat([df_tweets_humans_new, df_tweets_fake_new, df_tweets_social_new]).reset_index(drop = True)
combined_tweets.drop(combined_tweets.filter(regex="Unnamed"),axis=1, inplace=True)
combined_tweets

Unnamed: 0,id,text,source,user_id,retweeted_status_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,bot,cleaned_text,isBot
0,475357917839773696,@elementalbird_ i'd still beat dat ass those f...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",1.953036e+09,0.000000e+00,0.0,0.0,1.0,0.0,0.0,1.0,Sat Jun 07 19:25:33 +0000 2014,0,__user_mention__ i'd still beat dat ass those ...,0
1,560373851306360832,RT @peter_tl: Social media-advertising backlas...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",5.856182e+06,5.603508e+17,6.0,0.0,0.0,0.0,1.0,1.0,Wed Jan 28 09:48:32 +0000 2015,0,RT __user_mention__ Social media-advertising b...,0
2,548898415917031424,I dont want to walk this earth if i gotta do i...,"<a href=""http://twitter.com/download/iphone"" r...",5.572702e+08,0.000000e+00,0.0,0.0,1.0,0.0,0.0,0.0,Sat Dec 27 17:49:15 +0000 2014,0,I dont want to walk this earth if i gotta do i...,0
3,582393807179759616,Etta James - Something's Got A Hold On Mehttps...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",2.356426e+09,0.000000e+00,0.0,0.0,1.0,0.0,1.0,1.0,Mon Mar 30 04:07:58 +0000 2015,0,Etta James - Something's Got A Hold On MeÁmote...,0
4,421465899783045121,@sparknarry I'm going to tell him I know what ...,"<a href=""http://twitter.com/download/iphone"" r...",1.532884e+08,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1.0,Fri Jan 10 02:17:55 +0000 2014,0,__user_mention__ I'm going to tell him I know ...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,186948319484973056,Are you Ready to change your #iPhone view to 5...,web,1.232223e+08,,0.0,0.0,0.0,1.0,1.0,0.0,Mon Apr 02 22:48:47 +0000 2012,1,Are you Ready to change your __hashtag__ view ...,1
49996,324906881925668864,RT @RwandaEnergy: #Rwanda gears up to recharge...,web,4.757323e+08,,1500.0,0.0,1415.0,3.0,1.0,1.0,Thu Apr 18 15:26:51 +0000 2013,1,RT __user_mention__ __hashtag__ gears up to re...,1
49997,79869686153621504,"Put it down on mee , put it down on me ... &lt;3","<a href=""http://mobile.twitter.com"" rel=""nofol...",2.395259e+08,,0.0,0.0,0.0,0.0,0.0,0.0,Sun Jun 12 11:16:31 +0000 2011,1,"Put it down on mee , put it down on me ... &lt;3",1
49998,222874765097050112,donTrying to find ways to work more from home ...,web,6.166874e+08,,0.0,0.0,0.0,1.0,0.0,0.0,Wed Jul 11 02:07:39 +0000 2012,1,donTrying to find ways to work more from home ...,1


In [9]:
combined_tweets_final = shuffle(combined_tweets)
combined_tweets_final = combined_tweets_final.reset_index(drop = True)
combined_tweets_final

Unnamed: 0,id,text,source,user_id,retweeted_status_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,bot,cleaned_text,isBot
0,558857025270988801,aleah is me http://t.co/PXwTMPzLFf,"<a href=""http://twitter.com/download/iphone"" r...",2.286244e+09,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,Sat Jan 24 05:21:12 +0000 2015,0,aleah is me,0
1,361688175418150913,@WittyOfficial I got you bruh,"<a href=""http://twitter.com/download/iphone"" r...",2.575995e+07,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1.0,Mon Jul 29 03:22:35 +0000 2013,0,__user_mention__ I got you bruh,0
2,109649117361344513,@MissKotzyba its a diet where u can only eat p...,"<a href=""http://blackberry.com/twitter"" rel=""n...",1.093557e+07,,0.0,0.0,0.0,0.0,1.0,1.0,Fri Sep 02 15:29:21 +0000 2011,1,__user_mention__ its a diet where u can only e...,1
3,252533532839260161,When we are no longer able to change a situati...,web,6.176962e+08,,0.0,0.0,0.0,0.0,0.0,0.0,Sun Sep 30 22:21:00 +0000 2012,1,When we are no longer able to change a situati...,1
4,576292622286749696,@cat_lover_996 @Lauri777Ellonen @Wendy_Rich_UK...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",2.223970e+09,0.000000e+00,0.0,0.0,0.0,0.0,0.0,4.0,Fri Mar 13 08:04:02 +0000 2015,0,__user_mention__ __user_mention__ __user_menti...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,539658384753319937,@Mishakeet jfc whats wrong with the forum im s...,"<a href=""http://twitter.com/download/iphone"" r...",5.680155e+08,0.000000e+00,0.0,0.0,0.0,0.0,0.0,1.0,Tue Dec 02 05:52:39 +0000 2014,0,__user_mention__ jfc whats wrong with the foru...,0
49996,382063574794784768,@emmawehipeihana die hard with a vengeance @sa...,"<a href=""http://twitter.com/download/android"" ...",1.724285e+07,0.000000e+00,0.0,0.0,1.0,0.0,0.0,2.0,Mon Sep 23 08:47:09 +0000 2013,0,__user_mention__ die hard with a vengeance __u...,0
49997,47561861469716480,What would you do.... IF! a little old man pic...,"<a href=""http://formspring.me"" rel=""nofollow"">...",2.310354e+08,,0.0,0.0,0.0,0.0,0.0,0.0,Tue Mar 15 07:36:46 +0000 2011,1,What would you do.... IF! a little old man pic...,1
49998,212450434785361920,@SHADOW_VANITY6 lucky you..I can't unless if i...,"<a href=""http://twitter.com/download/android"" ...",1.645082e+08,,0.0,1.0,0.0,0.0,0.0,1.0,Tue Jun 12 07:45:05 +0000 2012,1,__user_mention__ lucky you..I can't unless if ...,1


In [10]:
combined_tweets_final.to_csv('tweets_dataset_allcols.csv', index = False)

In [11]:
combined_tweets_new = combined_tweets_final[['cleaned_text', 'isBot']]
combined_tweets_new

Unnamed: 0,cleaned_text,isBot
0,aleah is me,0
1,__user_mention__ I got you bruh,0
2,__user_mention__ its a diet where u can only e...,1
3,When we are no longer able to change a situati...,1
4,__user_mention__ __user_mention__ __user_menti...,0
...,...,...
49995,__user_mention__ jfc whats wrong with the foru...,0
49996,__user_mention__ die hard with a vengeance __u...,0
49997,What would you do.... IF! a little old man pic...,1
49998,__user_mention__ lucky you..I can't unless if ...,1


In [12]:
combined_tweets_new['tokenized'] = combined_tweets_new['cleaned_text'].apply(word_tokenize)

In [13]:
stop = list(stopwords.words('english'))
contextual_stop = ['day','new','one', 'time','need', "__user_mention__", "__hashtag__"]
lemmatizer = WordNetLemmatizer()

In [14]:
#Removed stop words
combined_tweets_new['processed_data'] = combined_tweets_new.tokenized.apply(lambda x: [word.casefold() for word in x if word.casefold() not in stop
                                                                    and word not in contextual_stop])
#lemmatize the words
combined_tweets_new['processed_data'] = combined_tweets_new.processed_data.apply(lambda x:[lemmatizer.lemmatize(word) for word in x])

combined_tweets_new['sentence'] = combined_tweets_new['processed_data'].apply(lambda x : " ".join(x))
#Change the sentence to lower case
combined_tweets_new['sentence'] = combined_tweets_new['sentence'].str.lower()

In [15]:
combined_tweets_new.to_csv('tweets_dataset_processed.csv', index = False)

In [16]:
combined_tweets_new['isBot'].value_counts()

0    25000
1    25000
Name: isBot, dtype: int64

we can consider TfIdf vectors, which could potentially identify rare but useful words that serve as useful features for classification

In [26]:
dataset = pd.read_csv('tweets_dataset_processed.csv')
dataset

Unnamed: 0,cleaned_text,isBot,tokenized,processed_data,sentence
0,aleah is me,0,"['aleah', 'is', 'me']",['aleah'],aleah
1,__user_mention__ I got you bruh,0,"['__user_mention__', 'I', 'got', 'you', 'bruh']","['got', 'bruh']",got bruh
2,__user_mention__ its a diet where u can only e...,1,"['__user_mention__', 'its', 'a', 'diet', 'wher...","['diet', 'u', 'eat', 'protein', 'exercise', '(...",diet u eat protein exercise ( walk ) every wor...
3,When we are no longer able to change a situati...,1,"['When', 'we', 'are', 'no', 'longer', 'able', ...","['longer', 'able', 'change', 'situation', ',',...","longer able change situation , challenged chan..."
4,__user_mention__ __user_mention__ __user_menti...,0,"['__user_mention__', '__user_mention__', '__us...","['complete', 'first', 'heist']",complete first heist
...,...,...,...,...,...
49995,__user_mention__ jfc whats wrong with the foru...,0,"['__user_mention__', 'jfc', 'whats', 'wrong', ...","['jfc', 'whats', 'wrong', 'forum', 'im', 'sorr...",jfc whats wrong forum im sorry 're acting like...
49996,__user_mention__ die hard with a vengeance __u...,0,"['__user_mention__', 'die', 'hard', 'with', 'a...","['die', 'hard', 'vengeance', 'maaaan', ':', '-...",die hard vengeance maaaan : - )
49997,What would you do.... IF! a little old man pic...,1,"['What', 'would', 'you', 'do', '....', 'IF', '...","['would', '....', '!', 'little', 'old', 'man',...",would .... ! little old man picked nose flicke...
49998,__user_mention__ lucky you..I can't unless if ...,1,"['__user_mention__', 'lucky', 'you', '..', 'I'...","['lucky', '..', 'ca', ""n't"", 'unless', 'wanted...",lucky .. ca n't unless wanted get slapped


In [28]:
#using tfidf vectors
tfidf_vectorizer = TfidfVectorizer(max_df = 0.5, min_df = 10)
#Create an array that contains the BOW representation of each tweet
tfidf_bow_array  = tfidf_vectorizer.fit_transform(dataset['sentence'].values.astype(str))
# #Converting the result into a pandas dataframe
tfidf_bow = pd.DataFrame(tfidf_bow_array.toarray(), columns = tfidf_vectorizer.get_feature_names_out())

In [29]:
tfidf_bow.shape

(50000, 4293)

In [30]:
#Next, treat the BOW vectors as features and split the data into training and testing sets
X = tfidf_bow
y = dataset['isBot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

#### Logistic Regression TFIDF

In [31]:
%%time

model_log_reg_tfidf = LogisticRegression()

logreg_tfidf_model = model_log_reg_tfidf.fit(X_train, y_train)

CPU times: total: 2min 28s
Wall time: 25.4 s


In [32]:
y_pred = logreg_tfidf_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.71      0.69      5024
           1       0.69      0.66      0.67      4976

    accuracy                           0.68     10000
   macro avg       0.68      0.68      0.68     10000
weighted avg       0.68      0.68      0.68     10000



#### Random Forest TFIDF

In [33]:
%%time

model_rf_tfidf = RandomForestClassifier()

rf_tfidf_model = model_rf_tfidf.fit(X_train, y_train)

CPU times: total: 29min 37s
Wall time: 30min 35s


In [34]:
y_pred = rf_tfidf_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.75      0.75      5024
           1       0.75      0.73      0.74      4976

    accuracy                           0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000

