## Initialisation

In [1]:
import pandas as pd
import re

## Define functions


### 1. Emoticons

In [2]:
Train_tmp = pd.read_csv('./Data/Train.csv') # read in train temporarily to find all the emoticons

Emoticon search process

In [3]:
# First of all, get a set of all the words that have appeared in the tweets
single_words = set()
for tweet in Train_tmp['text']:
    for word in tweet.split(' '):
        single_words.add(word)

single_words = list(single_words)

In [4]:
# Find emoticons
emoticons = set()
for word in single_words:
    for elem in re.findall(r'[:;]-{0,1}[()]', word):
        emoticons.add(elem)
        
    for elem in re.findall(r'[()]-{0,1}[:;]', word):
        emoticons.add(elem)
        
    for elem in re.findall(r'[:;][^[a-zA-Z]]{1,2}', word):
        emoticons.add(elem)
        
    for elem in re.findall(r'[^[a-zA-Z]]{1,2}[:;]', word):
        emoticons.add(elem)

emoticons

{'(:', '):', ');', ':(', ':)', ':-(', ':-)', ';(', ';)', ';-)'}

In [5]:
# Find more emoticons
# source: https://stackoverflow.com/questions/28077049/regex-matching-emoticons; added my own to the regex term
emoticons2 = set()
for word in single_words:
    for elem in re.findall(r'(\:\)|\:\(|<3|\:\/|\:-\/|\:\||\:p|\:P|\;\)|\;\(|<3|\;\/|\;-\/|\;\||\;p|\;P|)', word):
        emoticons2.add(elem)
emoticons2

{'', ':(', ':)', ':-/', ':/', ':p', ';(', ';)', ';p', '<3'}

Keep in mind: We need to do same process for train cases too - so final set of emoticons more than what was picked up.

Didn't deal with :0, :p etc because they were ambiguous and also could mess up actual words 

Also, assume that all words are seperated by space

In [6]:
def emoticons(df):
    
    """ Changes emoticons into designated tags denoting positive or negative emoticons """
    
    posEmotic = ['(:', ':)', ':-)', '<3', ';)', '(;', ';-)']
    negEmotic = ['):', ':(', ':-(', ':-/', ':/', ');', ';(', ';-(', ';-/']

    newText = list()

    for tweet in df['text']:
        out = tweet

        trigger = False

        for etc in posEmotic:
            if len(out.split(etc)) > 1:

                trigger = True

                # print(out)
                # print('\n')

                splitted = out.split(etc)

                # print(splitted)
                # print('\n')

                tmp = str()

                for j in range(len(splitted)-1):
                    tmp += splitted[j]
                    tmp += ' PPOSEMOTICONN '
                tmp += splitted[-1]

                out = tmp

                # print(out)
                # print('\n')

        for etc in negEmotic:
            if len(out.split(etc)) > 1:

                # print(out)
                # print('\n')

                splitted = out.split(etc)

                # print(splitted)
                # print('\n')

                tmp = str()

                for j in range(len(splitted)-1):
                    tmp += splitted[j]
                    tmp += ' NNEGEMOTICONN '
                tmp += splitted[-1]

                out = tmp

                # print(out)
                # print('\n')

        newText.append(out)

    df['text'] = newText

    return df
            

In [7]:
def rm_mix_emoticons(df):
    """ Remove all emoticons if both positive and negative emoticons appear in a tweet """
    
    newText = list()

    for tweet in df['text']:

        if 'PPOSEMOTICONN' in tweet and 'NNEGEMOTICONN' in tweet:
            regex = re.compile(r'(PPOSEMOTICONN|NNEGEMOTICONN)')
            tweet = regex.sub('', tweet)

        newText.append(tweet)

    df['text'] = newText

    return df

### 2. Retweet

In [8]:
def replace_retweet(df):
    
    newText = list()
    
    for tweet in df['text']:
        regex = re.compile(r'^[ ]?rt @')
        if re.search(r'^[ ]?rt @', tweet):
            # all text are lower case!!
            tweet = regex.sub(r'@', tweet)
        newText.append(tweet)
    df['text'] = newText

    return df

### 3. Replace username

In [9]:
def replace_username(df):
    for i in range(len(df['text'])):
        df['text'][i] = re.sub(r"\B@\w+", "", df['text'][i])
        
    return df

### 4. Replace HTMLs

In [10]:
def replace_htmls(df):
    for i in range(len(df['text'])):
        df['text'][i] = re.sub(r"http[^\s]+", "", df['text'][i])  
    return df

### 5. Replace repeated letters

In [11]:
def replace_repeated_letters(df):
    
    for i in range(len(df['text'])):
        if re.match(r"([a-zA-Z])\1\1+", df['text'][i]):
            df['text'][i] = re.sub(r"([a-zA-Z])\1\1+", r"\1\1", df['text'][i])
        
    return df

### 6. Remove Duplicated Tweet

**note: test cases DON'T need to do this step**

In [12]:
def remove_duplicated_tweet(df):
    df = df.drop_duplicates(subset='text', keep='first')
    df = df.reset_index(drop=True)
    return df

### 7. Unify Numeric

In [13]:
def unify_numeric(df):

    tmp = list()
    for tweet in df['text']:
        tmp.append(re.sub(r'\d+', '', tweet))
    
    df['text'] = tmp
    
    return df

### 8. Unify Percentage

In [14]:
def unify_percentage(df):
    
    tmp = list()
    for tweet in df['text']:
        tmp.append(re.sub(r'\d+[ ]?(percent|%)', '', tweet))
    
    df['text'] = tmp
    
    return df

### 9. Expand contraction


In [15]:
def expand_contraction(df):
    df['text'] = df.text.apply(contractions_handel)
    return df

def contractions_handel(text):
    import contractions
    return contractions.fix(text)

## Pipeline 

In [16]:
def pipeline(df, mode):

    df = replace_htmls(df)

    df = replace_retweet(df)
    
    if mode == 'Train':
        df = remove_duplicated_tweet(df)

    df = replace_username(df)

    df = emoticons(df)
    df = rm_mix_emoticons(df)

    df = unify_percentage(df)

    df = unify_numeric(df)

    df = replace_repeated_letters(df)

    # df = expand_contraction(df)
    
    return df

In [17]:
# Train and output
Train = pd.read_csv('./Data/Train.csv')
Future = pd.read_csv('./Data/Future.csv')

Train = pipeline(Train, 'Train')
Future = pipeline(Future, 'Future')

Train.to_csv('./Data/Train_Preprocessed.csv', index = False)
Future.to_csv('./Data/Future_Preprocessed.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'][i] = re.sub(r"http[^\s]+", "", df['text'][i])
