# Detection of social bots using tweets


#### Group 3: Stefani Platakidou, Kinga Jenei, Alejandro Lozada, Liam Le Tran



### Installations

In [3]:
# Uncomment the following line if you run the notebook for the first time

#%pip install emoji

### Imports

In [4]:
#Ignore print warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd

import nltk
from nltk.corpus import stopwords
import emoji
import re

In [5]:
emoji.__version__

'2.2.0'

### Data presentation

In [14]:
df = pd.read_csv("data_sample_nlp.csv")
df.tail()

Unnamed: 0,label,id,author_id,created_at,type,text,language,geo_location,referenced_tweets_types,referenced_tweets_ids,user_mentions,media_types,in_reply_to_user_id,possibly_sensitive
299999,human,t1471504881882075140,17268418,2021-12-16 15:37:57+00:00,post,"RT @ElemPE1: ""Toys for Tots""🎁\n\nLess lecturin...",en,False,{},{},{907432611319914497},{},,False
300000,human,t1499064851347304449,33152005,2022-03-02 16:51:26+00:00,post,@DavidVimes Same.,en,False,{},{},{966462024145391616},{},9.66462e+17,False
300001,human,t1492208040795324425,76086369,2022-02-11 18:44:55+00:00,post,@BigelowLab I spy a @wellsreserve alumna!,en,False,{},{},"{260908660,76086369}",{},260908700.0,False
300002,human,t942173606888595457,709501413819293696,2017-12-16 23:24:27+00:00,post,RT @MPSHaringey: Who thinks #PDBoots deserves ...,en,False,{},{},{407149465},{},,False
300003,human,t1498058873269370880,2218413296,2022-02-27 22:14:02+00:00,post,"1 BTC = 37,535 USD",vi,False,{},{},{},{},,False


In [5]:
df["text"][23]

'RT @EntheosAi: If we show an AI model millions of images and ask it to learn to categorize the world around us, what underlying geometry of…'

### Data pre-processing

In [15]:
# Filter out tweets that aren't english
df = df.loc[df["language"] == "en"]
df.loc[20:28]

220152

In [16]:
df.label.value_counts() # unbalanced

human    202760
bot       17392
Name: label, dtype: int64

In [18]:
# # Download the stopwords collection from library
nltk.download('stopwords')
# Put it into a set to guarantee each word only appear once
STOPWORDS = list(set(stopwords.words('english')))
# Add punctuation to the stopwords list
STOPWORDS += [".", "!", "?", ",", ";", ":", "[", "]", "{", "}", "-", "+", 
    "_", "/", "#", "$", "%", "^", "&", "*", "(", ")", "<", ">", "|", "=",
    ".-", ".,", "'", '"', ',"', ".>", ".<"]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\stefa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
"""
Questions : 
1. In general, what are the characteristics of a text written by a social bot?
2. Do they not contain account tags? 
3. How do we handle emojies? Do they indicate that an account is a bot?
4. Do we only stick to English?

Notes: 
The number of unique words are still too high?
Very slow when remove emojies

"""

def preprocess(df):
    """
    This function takes a dataframe with different features 
    and returns pre-processed texts from the tweets 
    """

    # update tweets by lowercase, strip and tokenize
    unique_word_freqs = set()
    df['text'].str.lower().str.split().apply(unique_word_freqs.update)

    # exclude stop words, tagged accounts, punctuation, links and numbers    
    unique_word_freqs = list(unique_word_freqs)
    data = [word for i,word in enumerate(unique_word_freqs) if ( '@' not in word) and ('http' not in word) \
            and (word not in STOPWORDS) and (not word.isnumeric())]
    print(data[:100])
    
    # remove emojies
    clean_text = " ".join([word for word in data if not any(i in word for i in emoji.EMOJI_DATA)])
    
    #remove punctuation inside of words
    clean_text = re.sub(r'[^\w\s]', '', clean_text).split()

    
    print(f'\nTotal unique words in all tweets: {len(clean_text)}\n')   
    return clean_text
    
data = preprocess(df)
print(data[:100])

['#sharpiegate', 'poland.…', '#healthinsurance.”', '#aievents', '#0339', '8-12,', '#tedtalks', 'fastmri.', 'minute-by-minute,', 'mencken', 'chills', '#tips', 'newsblocks', 't&amp;k', 'mobiles', '#thelastofuspart2', '#breakfastlover', 'nova?', '#authorsrock', 'implications.', 'suici…', '"3x', '#cseiiitd', 'changer.', 'stinger', 'brother#your', 'tamale,', '4(b)', 'kibale', 'bus/lirr/bike/walk/other', '#smallbusinessowner', 'globe.', 'spins!', '#shetland', 'server,', 'tuhina', '⬜⬜⬜⬜🟩', 'alleviate.', 'hamiltonian.', '#chayat', 'informative', 'emotion', 'hush', 'spectacle', 'unexplainable', '#nse', '#ool2020', '#worldtravelerer', '#clicklessanalytics4comparison', 'andy', 'nada', 'cecil', 'cvcs', 'fyi', 'lbp/usd.', 'retransitioned', '(aaveusdt)', 'hackney', '#freejazzz', '#gilford', 'narowal', '😱😱😱😱', 'online-only', '#machinery', '#instagamer', 'atos', 'hoagie', 'insulins', 'ompc/f', 'farsi', '#consumerprivacy', 'announcement....', 'source)', 'carbene', 'bex!!!', '🇬🇧:', 'krita', 'felons', 'r

### Rep zng

### Build classifier

In [None]:
def classifier():
    return 0

### Validation