## Data Preprocessing

In [204]:
import csv
import logging

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

# Input file locations, given as paths relative to the current working directory.
# Training tweets: read further down as tab-separated text into columns ID and Text.
TRAIN_FILE = "../resources/data/train_tweets.txt"
# Test tweets: read further down as tab-separated text into a single Text column.
TEST_FILE = "../resources/data/test_tweets_unlabeled.txt"

In [205]:
# read data

# Both files are parsed as header-less, tab-separated text.
# QUOTE_NONE makes the parser treat double quotes as ordinary characters:
# with the default quoting, a tweet that begins with '"' is read as a quoted
# field and can swallow the following lines into one row.
# NOTE(review): assumes the files use no CSV-style quoting - confirm against the data.
train_df = pd.read_csv(TRAIN_FILE, delimiter='\t', header=None,
                       names=['ID', 'Text'], quoting=csv.QUOTE_NONE)
test_df = pd.read_csv(TEST_FILE, delimiter='\t', header=None,
                      names=['Text'], quoting=csv.QUOTE_NONE)

In [206]:
# remove all RTs (retweets)
def filter_RT(df):
    """Drop retweets from ``df`` and renumber the remaining rows.

    A row counts as a retweet when its ``Text`` value starts with the literal
    prefix ``'RT @handle'``. Rows whose ``Text`` is missing (or is not a
    string) are dropped as well, because they cannot be handled as text by
    the later steps.

    Args:
        df: DataFrame with a ``Text`` column holding the tweet strings.

    Returns:
        A new DataFrame with only the non-retweet rows and a fresh
        0..n-1 index. All columns are kept, also when ``df`` has no rows.
    """
    # na=True marks missing/non-string values like retweets so they are removed too.
    is_rt = df['Text'].str.startswith('RT @handle', na=True).astype(bool)

    # Index with a boolean Series rather than a Python list: an empty list
    # would be interpreted as "select no columns" and lose the Text column.
    return df[~is_rt].reset_index(drop=True)

In [207]:
# remove special terms like "@handle" and links
def rmv_special_term(df, rmv_all_spec=False):
    """Strip ``@handle`` placeholders and URLs from the string cells of ``df``.

    The replacements are regex-based and applied to the whole frame, not just
    one column.

    Args:
        df: DataFrame to clean.
        rmv_all_spec: when True, additionally delete every character that is
            not an ASCII letter, a digit (0-9), a space or a tab - e.g. the
            characters in ``:-( ' , . / \\``.

    Returns:
        A new DataFrame with the replacements applied; ``df`` is not modified.
    """
    # Raw strings keep the regex escapes (\w, \d, \/, \s ...) away from Python's
    # own string-escape handling, which warns about unknown escape sequences.
    url_pattern = r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*'
    special_char_pattern = r'([^0-9A-Za-z \t])|(\w+:\/\/\S+)'

    # remove @s
    result_df = df.replace(to_replace='@handle', value='', regex=True)
    # remove links and urls
    result_df = result_df.replace(to_replace=url_pattern, value='', regex=True)

    # optionally keep only 0-9 / a-z / A-Z / space / tab
    if rmv_all_spec:
        result_df = result_df.replace(to_replace=special_char_pattern, value='', regex=True)

    return result_df

In [208]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects adjective (J), noun (N), verb (V) or adverb (R).
    Any other tag falls back to noun.
    """
    # pos_tag returns a list of (token, tag) pairs; only the tag's first letter is used
    first_letter = nltk.pos_tag([word])[0][1][0].upper()

    pos_by_letter = dict(J=wordnet.ADJ,
                         N=wordnet.NOUN,
                         V=wordnet.VERB,
                         R=wordnet.ADV)

    if first_letter in pos_by_letter:
        return pos_by_letter[first_letter]
    return wordnet.NOUN

In [209]:
# Module-level helpers created once and reused by preprocess() on every call.
# English stop words from NLTK, stored as a set for the per-token membership test.
cached_stop_words = set(stopwords.words('english'))
# WordNet lemmatizer instance used by preprocess() when lemmatize=True.
lemmatizer = WordNetLemmatizer()

# main entry point of the preprocessing pipeline
def preprocess(df, rmv_all_spec=False, rmv_stop=False, lemmatize=False):
    """Clean the tweets in ``df`` and turn each ``Text`` value into a token list.

    Steps, in order: drop retweets (``filter_RT``), strip handles and links
    (``rmv_special_term``), lower-case and trim each text, tokenize it with
    ``TweetTokenizer``, then optionally remove stop words and lemmatize.

    Args:
        df: DataFrame with a ``Text`` column of tweet strings.
        rmv_all_spec: forwarded to ``rmv_special_term``.
        rmv_stop: when True, drop tokens contained in ``cached_stop_words``.
        lemmatize: when True, lemmatize every token using the POS returned by
            ``get_wordnet_pos``.

    Returns:
        The cleaned DataFrame; its ``Text`` column holds lists of tokens.
    """
    def _normalize(text):
        # lower-case and trim whitespace on both ends
        return text.lower().strip()

    def _without_stop_words(tokens):
        return [tok for tok in tokens if tok not in cached_stop_words]

    def _lemmatized(tokens):
        return [lemmatizer.lemmatize(tok, get_wordnet_pos(tok)) for tok in tokens]

    logging.info('Preprocess starting')

    no_retweets = filter_RT(df)
    result_df = rmv_special_term(no_retweets, rmv_all_spec)

    result_df['Text'] = result_df['Text'].apply(_normalize)

    # split each tweet into tokens
    tweet_tokenizer = TweetTokenizer()
    result_df['Text'] = result_df['Text'].apply(tweet_tokenizer.tokenize)

    # optional: drop stop words
    if rmv_stop:
        result_df['Text'] = result_df['Text'].apply(_without_stop_words)

    # optional: lemmatize tokens (POS-aware)
    if lemmatize:
        result_df['Text'] = result_df['Text'].apply(_lemmatized)

    logging.info('Preprocess ending')
    return result_df

In [210]:
new_train_df = preprocess(train_df)
print(new_train_df.shape)
# Print only the first rows instead of dumping the whole frame.
# NOTE(review): the output saved with this cell shows stop words removed,
# punctuation stripped and tokens lemmatized, which this default-flag call
# does not do - it looks stale; re-run the cell to confirm.
print(new_train_df.head(15))

(307679, 2)
          ID                                               Text
0       8746                [let, try, catch, live, next, week]
1       8746  [go, watch, grey, big, screen, thursday, indul...
2       8746                      [pleasure, patrickhope, well]
3       8746  [hi, travel, lot, lot, come, next, month, reco...
4       8746                           [u, get, invite, justin]
5       8746                  [think, still, good, friend, lol]
6       8746                    [remember, fine, u, whats, new]
7       8746                        [thats, great, good, coach]
8       8746     [dont, want, picture, u, sit, lol, understand]
9       8746                [thanks, rtsare, go, womma, summit]
10      8746                         [grrryou, must, go, crazy]
11      8746                      [hi, catch, trip, news, dale]
12      8746            [still, carwant, jump, 45, minute, eta]
13      8746  [wish, could, 247, w, stus, family, drive, hom...
14      8746                