# Cleaning text

This notebook cleans the tweet text. The processing steps here apply to all models and all EDA steps; some models may additionally have processing steps that are specific to them.

In [1]:
#import necessary libraries
import pandas as pd
import numpy as np

import nltk
import re

import emoji
import demoji

### Function for cleaning text
The function below applies the universal preprocessing steps. Each step is explained in the comments inside the function.

In [2]:
#This is the wrapper preprocessing function that is universally applicable to all our text models/eda steps

#matches anything delimited by angle brackets (e.g. <b> or </a>); non-greedy so each tag is removed on its own
CLEANR = re.compile('<.*?>')
#matches http/https urls; compiled once here instead of on every call
URL_PATTERN = re.compile(r'(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]*)')


def _process_emoji(emo):
    """Return the ``emoji.demojize`` form of ``emo`` with every ':' removed."""
    try:
        return emoji.demojize(emo).replace(":", "")
    except UnicodeDecodeError:
        #if unable to decode emoji, just keep a place holder for it
        return "__emoji__"


def _process_tagged_accounts(account):
    """Replace a tagged account (a token starting with '@') with ``__user_mention__``."""
    #a lone "@" is not a mention, so it is left untouched
    if account.startswith("@") and len(account) > 1:
        return "__user_mention__"
    return account


def _process_hashtags(token):
    """Replace a token starting with '#' with the ``__hashtag__`` place holder."""
    if token.startswith("#"):
        return "__hashtag__"
    return token


def _process_urls_html(text):
    """Delete urls and ``<...>`` spans from ``text``."""
    text = URL_PATTERN.sub("", text)
    return CLEANR.sub("", text)


def _process_token(token):
    """Apply the emoji, mention and hashtag replacements (in that order) to one token."""
    if emoji.is_emoji(token):
        token = _process_emoji(token)
    return _process_hashtags(_process_tagged_accounts(token))


def process_text(text):
    """Apply the universal cleaning steps to one piece of text.

    Steps, in order:
      1. split on whitespace (this also drops new line characters),
      2. replace every token for which ``emoji.is_emoji`` is true by its
         ``emoji.demojize`` form with the ':' characters removed,
      3. replace tokens starting with '@' (and longer than one character)
         with ``__user_mention__``,
      4. replace tokens starting with '#' with ``__hashtag__``,
      5. re-join with single spaces, then delete urls and ``<...>`` spans,
      6. collapse the whitespace left behind by step 5.

    Args:
        text: the raw text. ``None`` and float NaN (what pandas uses for an
            empty csv cell) are treated as empty text; any other non-string
            value is converted with ``str`` first.

    Returns:
        The cleaned text as a single-space separated string, ``""`` when
        there is nothing left.
    """
    if not isinstance(text, str):
        #NaN is the only float that is not equal to itself
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    txt_lst = [_process_token(x) for x in text.split()]
    cleaned = _process_urls_html(" ".join(txt_lst))

    #removing urls/tags leaves double or trailing spaces behind, so normalise once more
    return " ".join(cleaned.split())

In [6]:
#load the english bot tweets that come from the fake followers dataset
fake_followers_csv = 'bot_tweets_fake_english.csv'
bot_tweets_fake = pd.read_csv(fake_followers_csv)

In [7]:
#load the english bot tweets that come from the social dataset
social_csv = 'bot_tweets_social_english.csv'
bot_tweets_social = pd.read_csv(social_csv)

In [8]:
#finally, load the english tweets written by humans
human_csv = 'human_tweets_english.csv'
human_tweets = pd.read_csv(human_csv)

In [9]:
#store the cleaned version of every fake-followers tweet in a new column
bot_tweets_fake['cleaned_text'] = bot_tweets_fake['text'].apply(process_text)

In [10]:
#store the cleaned version of every social-dataset tweet in a new column
bot_tweets_social['cleaned_text'] = bot_tweets_social['text'].apply(process_text)

In [11]:
#store the cleaned version of every human tweet in a new column
human_tweets['cleaned_text'] = human_tweets['text'].apply(process_text)

In [12]:
#write each processed dataset to its own csv file so the model notebooks can load it
processed_outputs = (
    (bot_tweets_fake, "bot_tweets_fake_processed.csv"),
    (bot_tweets_social, 'bot_tweets_social_processed.csv'),
    (human_tweets, "human_tweets_processed.csv"),
)
for processed_frame, out_csv in processed_outputs:
    processed_frame.to_csv(out_csv)