# Detection of social bots using tweets


#### Group 3: Stefani Platakidou, Kinga Jenei, Alejandro Lozada, Liam Le Tran



### Instalations

In [1]:
# Uncommon the following line if you run the notebook for the first time

# %pip install emoji

### Imports

In [2]:
#Ignore print warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd

import nltk
from nltk.corpus import stopwords
import emoji
import re

In [3]:
emoji.__version__

'2.2.0'

### Data presentation

In [4]:
df = pd.read_csv("data_sample_nlp.csv")
df.loc[20:28]

Unnamed: 0,label,id,author_id,created_at,type,text,language,geo_location,referenced_tweets_types,referenced_tweets_ids,user_mentions,media_types,in_reply_to_user_id,possibly_sensitive
20,bot,t1477711712191090689,2620097396,2022-01-02 18:41:41+00:00,post,@Quarry_Rock I had Harris as anytime scorer to...,en,False,{},{},{840817404},{},840817404.0,False
21,human,t1493923820893261831,1065878129963646976,2022-02-16 12:22:49+00:00,post,RT @forwardfooding: Alex Campos from @Nova_Mea...,en,False,{},{},"{2486723058,1065878129963646976}",{},,False
22,human,t1499799906508173314,32817728,2022-03-04 17:32:17+00:00,post,I go into this all in more detail as part of m...,en,False,{},{},{91478624},{},32817728.0,False
23,bot,t1423386273784467463,37692343,2021-08-05 20:51:47+00:00,post,RT @EntheosAi: If we show an AI model millions...,en,False,{},{},{793303720205230080},{},,False
24,human,t1499430143441440769,31373289,2022-03-03 17:02:58+00:00,post,Thanks to the Scottish Rugby Union and other E...,en,False,{},{},{},{photo},,False
25,human,t1499058948648939528,932779154084417537,2022-03-02 16:27:59+00:00,post,RT @chvyshld: @TheEagleist https://t.co/Q6dDjg...,und,False,{},{},"{22800603,932779154084417537}",{photo},,False
26,human,t1478759851337519108,839166270289362946,2022-01-05 16:06:37+00:00,post,Did a really rad in-class project today with @...,en,False,{},{},{1018830203768967170},{photo},,False
27,human,t1473458429440462856,3196118028,2021-12-22 01:00:39+00:00,post,直前に申し訳ないですが、CCC Winter 2021のオープニング開始は10:10〜とさせ...,ja,False,{},{},{},{},,False
28,human,t1499112813859688461,278206598,2022-03-02 20:02:01+00:00,post,📣 Attn #LincolnON - the Town is undergoing a w...,en,False,{},{},{},{photo},,False


In [5]:
df["text"][23]

'RT @EntheosAi: If we show an AI model millions of images and ask it to learn to categorize the world around us, what underlying geometry of…'

### Data pre-processing

In [6]:
# Filter out tweets that aren't english
df = df.loc[df["language"] == "en"]
df.loc[20:28]

Unnamed: 0,label,id,author_id,created_at,type,text,language,geo_location,referenced_tweets_types,referenced_tweets_ids,user_mentions,media_types,in_reply_to_user_id,possibly_sensitive
20,bot,t1477711712191090689,2620097396,2022-01-02 18:41:41+00:00,post,@Quarry_Rock I had Harris as anytime scorer to...,en,False,{},{},{840817404},{},840817404.0,False
21,human,t1493923820893261831,1065878129963646976,2022-02-16 12:22:49+00:00,post,RT @forwardfooding: Alex Campos from @Nova_Mea...,en,False,{},{},"{2486723058,1065878129963646976}",{},,False
22,human,t1499799906508173314,32817728,2022-03-04 17:32:17+00:00,post,I go into this all in more detail as part of m...,en,False,{},{},{91478624},{},32817728.0,False
23,bot,t1423386273784467463,37692343,2021-08-05 20:51:47+00:00,post,RT @EntheosAi: If we show an AI model millions...,en,False,{},{},{793303720205230080},{},,False
24,human,t1499430143441440769,31373289,2022-03-03 17:02:58+00:00,post,Thanks to the Scottish Rugby Union and other E...,en,False,{},{},{},{photo},,False
26,human,t1478759851337519108,839166270289362946,2022-01-05 16:06:37+00:00,post,Did a really rad in-class project today with @...,en,False,{},{},{1018830203768967170},{photo},,False
28,human,t1499112813859688461,278206598,2022-03-02 20:02:01+00:00,post,📣 Attn #LincolnON - the Town is undergoing a w...,en,False,{},{},{},{photo},,False


In [7]:
# # Download the stopwords collection from library
nltk.download('stopwords')
# Put it into a set to guarantee each word only appear once
STOPWORDS = list(set(stopwords.words('english')))
# Add punctuation to the stopwords list
STOPWORDS+=[".", "!", "?", ",", ";", ":", "[", "]", "{", "}", "-", "+", 
    "_", "/", "@", "#", "$", "%", "^", "&", "*", "(", ")", "<", ">", "|", "=",
    ".-", ".,", "'", '"', ',"', ".>", ".<"]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Liam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
"""
Questions : 
1. In general, what are the characteristics of a text written by a social bot?
2. Do they not contain account tags? 
3. How do we handle emojies? Do they indicate that an account is a bot?
4. Do we only stick to English?

Notes: 
The number of unique words are still too high?
Very slow when remove emojies

"""

def preprocess(df):
    """
    This function takes a dataframe with different features 
    and returns pre-processed texts from the tweets 
    """

    # update tweets by lowercase, strip and tokenize
    unique_word_freqs = set()
    df['text'].str.lower().str.split().apply(unique_word_freqs.update)

    # exclude stop words, tagged accounts, punctuation, links and numbers    
    unique_word_freqs = list(unique_word_freqs)
    data = [word for i,word in enumerate(unique_word_freqs) if ( '@' not in word) and ('http' not in word) \
            and (word not in STOPWORDS) and (not word.isnumeric())]
    print(data[:100])
    
    # remove emojies
    clean_text = " ".join([word for word in data if not any(i in word for i in emoji.EMOJI_DATA)])
    
    #remove punctuation inside of words
    clean_text = re.sub(r'[^\w\s]', '', clean_text).split()

    
    print(f'\nTotal unique words in all tweets: {len(clean_text)}\n')   
    return clean_text
    
data = preprocess(df)
print(data[:100])

['#linkbelow', '#alai', '1980!', '#bcafc', '🔹russian…', 'transparent,', 'impulse.', 'like”', 'imitate.', 'froze', 'rss', '1%).', 'polysiloxane-based', '#anti', 'ghazouani', '🤔🔍', 'translate,', 'slang,', 'roshan.', '#harriscounty', 'co-stars', '#trophyhusband', '15244:', 'asp.net', 'souvenir', '#jmp', 'hrc1155', 'coalition.', 'spectated', "george's", 'geos', 'daughter:', '#desantis', "#utsw's", 'dell,', 'angle”.', 'said--', 'checkout…', '#connecthome', 'vibe!', 'dirtier', 'asho', 'want/deserve', 'cityperc', '1990.', 'chiefly', '4096-bit', 'harris"', '#bpwinternational', 'think]', '#pushpatherule', 'barbies', 'burma', 'spook-tacular', '“meditate,', 'awis.', '#gwanderers', '#civilliability', 'high,', 'walaalo', 'technical,', 'wakrah', 'musical', 'desperate."', 'cesium', '#jmugivingday', 'gumming', 'nutanix,', 'action."', 'sideline', 'one-on-one,', '#cumslut', '#abujabusiness', 'birthdays', 'monday!)', '#arpinaction', '#sxswi', 'trusted.', "'simple'", 'gorge.', 'rac-ct,', 'salespeople),', 

### Rep zng

### Build classifier

In [None]:
def classifier():
    return 0

### Validation