# Data preprocessing

### Installations

In [1]:
# Uncomment the following line if you run the notebook for the first time

#%pip install emoji

### Imports

In [2]:
#Ignore print warnings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd

import nltk
from nltk.corpus import stopwords
import emoji
import re

In [3]:
emoji.__version__

'2.2.0'

### Data presentation

In [4]:
df = pd.read_csv("data_sample_nlp.csv")
df.tail()

Unnamed: 0,label,id,author_id,created_at,type,text,language,geo_location,referenced_tweets_types,referenced_tweets_ids,user_mentions,media_types,in_reply_to_user_id,possibly_sensitive
299999,human,t1471504881882075140,17268418,2021-12-16 15:37:57+00:00,post,"RT @ElemPE1: ""Toys for Tots""🎁\n\nLess lecturin...",en,False,{},{},{907432611319914497},{},,False
300000,human,t1499064851347304449,33152005,2022-03-02 16:51:26+00:00,post,@DavidVimes Same.,en,False,{},{},{966462024145391616},{},9.66462e+17,False
300001,human,t1492208040795324425,76086369,2022-02-11 18:44:55+00:00,post,@BigelowLab I spy a @wellsreserve alumna!,en,False,{},{},"{260908660,76086369}",{},260908700.0,False
300002,human,t942173606888595457,709501413819293696,2017-12-16 23:24:27+00:00,post,RT @MPSHaringey: Who thinks #PDBoots deserves ...,en,False,{},{},{407149465},{},,False
300003,human,t1498058873269370880,2218413296,2022-02-27 22:14:02+00:00,post,"1 BTC = 37,535 USD",vi,False,{},{},{},{},,False


In [5]:
df["text"][23]

'RT @EntheosAi: If we show an AI model millions of images and ask it to learn to categorize the world around us, what underlying geometry of…'

### Data pre-processing

In [6]:
# Filter out tweets that aren't english
df = df.loc[df["language"] == "en"]
df.loc[20:28]

Unnamed: 0,label,id,author_id,created_at,type,text,language,geo_location,referenced_tweets_types,referenced_tweets_ids,user_mentions,media_types,in_reply_to_user_id,possibly_sensitive
20,bot,t1477711712191090689,2620097396,2022-01-02 18:41:41+00:00,post,@Quarry_Rock I had Harris as anytime scorer to...,en,False,{},{},{840817404},{},840817404.0,False
21,human,t1493923820893261831,1065878129963646976,2022-02-16 12:22:49+00:00,post,RT @forwardfooding: Alex Campos from @Nova_Mea...,en,False,{},{},"{2486723058,1065878129963646976}",{},,False
22,human,t1499799906508173314,32817728,2022-03-04 17:32:17+00:00,post,I go into this all in more detail as part of m...,en,False,{},{},{91478624},{},32817728.0,False
23,bot,t1423386273784467463,37692343,2021-08-05 20:51:47+00:00,post,RT @EntheosAi: If we show an AI model millions...,en,False,{},{},{793303720205230080},{},,False
24,human,t1499430143441440769,31373289,2022-03-03 17:02:58+00:00,post,Thanks to the Scottish Rugby Union and other E...,en,False,{},{},{},{photo},,False
26,human,t1478759851337519108,839166270289362946,2022-01-05 16:06:37+00:00,post,Did a really rad in-class project today with @...,en,False,{},{},{1018830203768967170},{photo},,False
28,human,t1499112813859688461,278206598,2022-03-02 20:02:01+00:00,post,📣 Attn #LincolnON - the Town is undergoing a w...,en,False,{},{},{},{photo},,False


In [7]:
df.label.value_counts() # unbalanced

human    202760
bot       17392
Name: label, dtype: int64

In [70]:
# # Download the stopwords collection from library
nltk.download('stopwords')
# Put it into a set to guarantee each word only appear once
STOPWORDS = list(set(stopwords.words('english')))
# Add punctuation to the stopwords list
STOPWORDS += [".", "!", "?", ",", ";", ":", "[", "]", "{", "}", "-", "+", 
    "_", "/", "#", "@", "$", "%", "^", "&", "*", "(", ")", "<", ">", "|", "=",
    ".-", ".,", "'", '"', ',"', ".>", ".<"]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kingu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [85]:
df['text'].head()

1    RT @Wieneraaron: When you need your emotional ...
3    RT @InfoSecSherpa: "The Thurgood Marshall Coll...
4    RT @NCIAorg: BREAKING – SAFE Banking Act Reint...
5           @L1nds Oooh I like https://t.co/Y5V50GCHdB
8          No war!.....PLEASE! https://t.co/YGxOZpN5Kk
Name: text, dtype: object

In [82]:
def preprocessing(df, lowercase=False, stopwords=False, links=False, tags=False, numbers=False, emojis=False, hashtag=False,
                 rt=False):
    
    new_df = df.copy()
    text = new_df['text']
    
    # lowercasing everything
    if lowercase:
        text = text.apply(lambda x: str.lower(x))
    
    # removing stopword
    if stopwords:
        # we have to look at the lowercase words, since the stopwords are lowercase
        text = text.apply(lambda x: " ".join([word for word in x.split() if str.lower(word) not in STOPWORDS]))
    
    # removing links
    if links:
        text = text.apply(lambda x: " ".join([word for word in x.split() if 'http' not in word]))
    
    # removing tags
    if tags:
        text = text.apply(lambda x: " ".join([word for word in x.split() if '@' not in word]))
    
    # removing numbers only if the whole word is numeric - eg. we remove 1123 but not 1123a
    if numbers:
        text = text.apply(lambda x: " ".join([word for word in x.split() if not word.isnumeric()]))
    
    # removing emojis (whole word if it contains an emoji)
    if emojis:
        text = text.apply(lambda x: " ".join([word for word in x.split() if not any(i in word for i in emoji.EMOJI_DATA)]))
    
    # removing hashtags
    if hashtag:
        text = text.apply(lambda x: " ".join([word for word in x.split() if '#' not in word]))
        
    # removing rt from the beginning
    if rt:
        text = text.apply(lambda x: " ".join([word for i,word in enumerate(x.split()) if not (i==0 and str.lower(word)=='rt')]))
    
    new_df['text'] = text
    
    return new_df

In [97]:
df_preprocessed=preprocessing(df.head(), lowercase=False, stopwords=True, links=True, tags=True, numbers=True, 
              emojis=True, hashtag=True, rt=True)
df_preprocessed['text']

1                      need emotional support chicken.
3    "The Thurgood Marshall College Fund (TMCF), Pa...
4    BREAKING – SAFE Banking Act Reintroduced House...
5                                            Oooh like
8                                     war!.....PLEASE!
Name: text, dtype: object

In [21]:
"""
Questions : 
1. In general, what are the characteristics of a text written by a social bot?
2. Do they not contain account tags? 
3. How do we handle emojies? Do they indicate that an account is a bot?
4. Do we only stick to English?

Notes: 
The number of unique words are still too high?
Very slow when remove emojies

"""

def preprocess(df):
    """
    This function takes a dataframe with different features 
    and returns pre-processed texts from the tweets 
    """

    # update tweets by lowercase, strip and tokenize
    unique_word_freqs = set()
    df['text'].str.lower().str.split().apply(unique_word_freqs.update)

    # exclude stop words, tagged accounts, punctuation, links and numbers    
    unique_word_freqs = list(unique_word_freqs)
    data = [word for i,word in enumerate(unique_word_freqs) if ( '@' not in word) and ('http' not in word) \
            and (word not in STOPWORDS) and (not word.isnumeric())]
    print(data[:100])
    
    # remove emojies
    clean_text = " ".join([word for word in data if not any(i in word for i in emoji.EMOJI_DATA)])
    
    #remove punctuation inside of words
    clean_text = re.sub(r'[^\w\s]', '', clean_text).split()

    
    print(f'\nTotal unique words in all tweets: {len(clean_text)}\n')   
    return clean_text
    
data = preprocess(df)
print(data[:100])

['#sharpiegate', 'poland.…', '#healthinsurance.”', '#aievents', '#0339', '8-12,', '#tedtalks', 'fastmri.', 'minute-by-minute,', 'mencken', 'chills', '#tips', 'newsblocks', 't&amp;k', 'mobiles', '#thelastofuspart2', '#breakfastlover', 'nova?', '#authorsrock', 'implications.', 'suici…', '"3x', '#cseiiitd', 'changer.', 'stinger', 'brother#your', 'tamale,', '4(b)', 'kibale', 'bus/lirr/bike/walk/other', '#smallbusinessowner', 'globe.', 'spins!', '#shetland', 'server,', 'tuhina', '⬜⬜⬜⬜🟩', 'alleviate.', 'hamiltonian.', '#chayat', 'informative', 'emotion', 'hush', 'spectacle', 'unexplainable', '#nse', '#ool2020', '#worldtravelerer', '#clicklessanalytics4comparison', 'andy', 'nada', 'cecil', 'cvcs', 'fyi', 'lbp/usd.', 'retransitioned', '(aaveusdt)', 'hackney', '#freejazzz', '#gilford', 'narowal', '😱😱😱😱', 'online-only', '#machinery', '#instagamer', 'atos', 'hoagie', 'insulins', 'ompc/f', 'farsi', '#consumerprivacy', 'announcement....', 'source)', 'carbene', 'bex!!!', '🇬🇧:', 'krita', 'felons', 'r