In [22]:
import os
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# DATA PRE-PROCESSING

In [2]:
# Path to the raw bot-detection CSV, relative to the directory the notebook runs from.
raw_path = '../data/raw/bot_detection_data.csv'
# Read the file with pandas' default parsing into the frame the later cells transform.
data = pd.read_csv(filepath_or_buffer=raw_path)

In [3]:
# Bare last expression: renders the loaded frame as this cell's output for a quick look.
data

Unnamed: 0,User ID,Username,Tweet,Retweet Count,Mention Count,Follower Count,Verified,Bot Label,Location,Created At,Hashtags
0,132131,flong,Station activity person against natural majori...,85,1,2353,False,1,Adkinston,2020-05-11 15:29:50,
1,289683,hinesstephanie,Authority research natural life material staff...,55,5,9617,True,0,Sanderston,2022-11-26 05:18:10,both live
2,779715,roberttran,Manage whose quickly especially foot none to g...,6,2,4363,True,0,Harrisonfurt,2022-08-08 03:16:54,phone ahead
3,696168,pmason,Just cover eight opportunity strong policy which.,54,5,2242,True,1,Martinezberg,2021-08-14 22:27:05,ever quickly new I
4,704441,noah87,Animal sign six data good or.,26,3,8438,False,1,Camachoville,2020-04-13 21:24:21,foreign mention
...,...,...,...,...,...,...,...,...,...,...,...
49995,491196,uberg,Want but put card direction know miss former h...,64,0,9911,True,1,Lake Kimberlyburgh,2023-04-20 11:06:26,teach quality ten education any
49996,739297,jessicamunoz,Provide whole maybe agree church respond most ...,18,5,9900,False,1,Greenbury,2022-10-18 03:57:35,add walk among believe
49997,674475,lynncunningham,Bring different everyone international capital...,43,3,6313,True,1,Deborahfort,2020-07-08 03:54:08,onto admit artist first
49998,167081,richardthompson,Than about single generation itself seek sell ...,45,1,6343,False,0,Stephenside,2022-03-22 12:13:44,star


In [4]:
# Convert the 'Created At' strings into pandas datetime values, replacing the column in place.
data["Created At"] = data["Created At"].pipe(pd.to_datetime)

In [27]:
# Summarise column dtypes and non-null counts.
# NOTE: the output recorded below already lists 'Tokenized Tweet', 'Stemmed Tweet' and
# 'Lemmatized Tweet', which are only created in later cells, so it comes from a later,
# out-of-order run of this cell (execution count 27) rather than from this point in the flow.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   User ID           50000 non-null  int64         
 1   Username          50000 non-null  object        
 2   Tweet             50000 non-null  object        
 3   Retweet Count     50000 non-null  int64         
 4   Mention Count     50000 non-null  int64         
 5   Follower Count    50000 non-null  int64         
 6   Verified          50000 non-null  bool          
 7   Bot Label         50000 non-null  int64         
 8   Location          50000 non-null  object        
 9   Created At        50000 non-null  datetime64[ns]
 10  Hashtags          50000 non-null  object        
 11  Tokenized Tweet   50000 non-null  object        
 12  Stemmed Tweet     50000 non-null  object        
 13  Lemmatized Tweet  50000 non-null  object        
dtypes: bool(1), datetime64

In [5]:
# Tweets without hashtags load as missing values; swap those for an empty string so the
# string operations in the following cells can treat every row uniformly.
data['Hashtags'] = data['Hashtags'].where(data['Hashtags'].notna(), '')

In [6]:
# Lower-case every free-text column, one after another, overwriting each in place.
for text_column in ('Username', 'Tweet', 'Hashtags', 'Location'):
    data[text_column] = data[text_column].str.lower()

In [7]:
# Translation table that deletes every character listed in string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)
# Apply the same deletion to the tweet text and to the hashtags.
for text_column in ('Tweet', 'Hashtags'):
    data[text_column] = data[text_column].str.translate(punctuation_table)

In [9]:
# Fetch the 'punkt' resource that nltk.word_tokenize relies on.
nltk.download('punkt')
# Split each cleaned tweet into a list of word tokens.
data['Tokenized Tweet'] = data['Tweet'].map(nltk.word_tokenize)

[nltk_data] Downloading package punkt to /home/sush/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Fetch NLTK's stop-word lists and keep the English one as a set for fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# Keep only the tokens that are not stop words; the token column is overwritten in place.
# NOTE(review): punctuation was already stripped from the tweets, so any stop word that
# contains an apostrophe can presumably never match here - confirm whether that matters.
data['Tokenized Tweet'] = data['Tokenized Tweet'].map(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

[nltk_data] Downloading package stopwords to /home/sush/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Porter-stem every token of every tweet, storing the stems as a parallel list column.
stemmer = PorterStemmer()
data['Stemmed Tweet'] = data['Tokenized Tweet'].map(
    lambda tokens: list(map(stemmer.stem, tokens))
)

In [13]:
# Fetch the WordNet corpus used by the lemmatizer.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
# Lemmatize every token (with the lemmatizer's default arguments) into a parallel list column.
data['Lemmatized Tweet'] = data['Tokenized Tweet'].map(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

[nltk_data] Downloading package wordnet to /home/sush/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
def _join_tokens(tokens):
    """Return the tokens as one space-separated string.

    A value that is already a string is returned unchanged. Without this guard,
    re-running the cell would call ' '.join on the joined text itself and split
    it into space-separated characters, because the columns are overwritten in place.
    """
    if isinstance(tokens, str):
        return tokens
    return ' '.join(tokens)


# Collapse the stemmed / lemmatized token lists back into plain text for the vectorizer.
for token_column in ('Stemmed Tweet', 'Lemmatized Tweet'):
    data[token_column] = data[token_column].map(_join_tokens)

In [19]:
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(data['Lemmatized Tweet'])

In [21]:
y = data['Bot Label']
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# MODEL 

In [23]:
# Logistic regression with scikit-learn's default settings, trained on the training split.
model = LogisticRegression()
# fit() is left as the cell's last expression, so its return value is what the cell displays.
model.fit(X_train, y_train)

In [24]:
# Predicted labels for the held-out test features; scored against y_test in the next cell.
y_pred = model.predict(X_test)

In [26]:
# accuracy_score / classification_report are imported in the notebook's top import cell,
# so a fresh "Restart & Run All" shows every dependency in one place.
# NOTE: the run recorded below scores ~0.50 accuracy on a near-balanced test set
# (4968 vs 5032 rows), i.e. the tweet text alone does no better than chance here.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5039
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.51      0.50      4968
           1       0.51      0.50      0.50      5032

    accuracy                           0.50     10000
   macro avg       0.50      0.50      0.50     10000
weighted avg       0.50      0.50      0.50     10000

