## LOAD LIBRARIES

In [1]:
#Import all the libraries we need for later use

import numpy as np
import pandas as pd
import csv
import nltk
import string
import re
# import sklearn as sk
# Uncomment the next line the first time you run this.
# nltk.download('all') 

### .....................................................................................................................................................................................

## LOAD DATA (Train File and Test File)

In [2]:
# Helper for loading a headerless tab-separated file (used for the datasets and the lexica)
def load_data(file, col_names, n=0):
    """Read a headerless, tab-separated file into a DataFrame.

    Quote characters are treated as ordinary text (``csv.QUOTE_NONE``) and
    malformed lines are skipped with a warning instead of aborting the load.

    Parameters
    ----------
    file : str or path-like
        Location of the tab-separated file.
    col_names : list of str
        Column names to assign, since the file has no header row.
    n : int, default 0
        Number of rows to read; ``0`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
    """
    import inspect  # local import: only needed for the pandas-version probe below

    read_options = {
        "sep": "\t",
        "header": None,
        "names": col_names,
        "quoting": csv.QUOTE_NONE,
    }
    if n != 0:
        read_options["nrows"] = n

    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in
    # favour of `on_bad_lines`. "warn" matches the old
    # `error_bad_lines=False` behaviour (skip the line, emit a warning).
    if "on_bad_lines" in inspect.signature(pd.read_csv).parameters:
        read_options["on_bad_lines"] = "warn"
    else:
        read_options["error_bad_lines"] = False

    return pd.read_csv(file, **read_options)

In [3]:
# Load the raw train/test TSV files and keep only the columns used downstream.
col_names = ['TweetID', 'ID', 'Sentiment', 'Tweet']

# Train file
train_file = '../twitter_data/train2017.tsv'
train_data = load_data(train_file, col_names)
# .copy() makes the column subset an independent frame, so the later
# `train_data['Tweet'] = ...` assignments do not raise SettingWithCopyWarning.
train_data = train_data[['TweetID', 'Sentiment', 'Tweet']].copy()
# train_data = train_data[:50].copy()  # uncomment for a quick run on a small sample

# Test file (the Sentiment column is dropped here)
test_file = '../twitter_data/test2017.tsv'
test_data = load_data(test_file, col_names)
test_data = test_data[['TweetID', 'Tweet']].copy()
# test_data = test_data[:50].copy()  # uncomment for a quick run on a small sample

In [4]:
# Load lexica
# Each lexicon is read as a two-column (WORD, VALENCE) table.
valence_tweet = load_data(file="../lexica/emotweet/valence_tweet.txt", col_names=["WORD", "VALENCE"])
generic = load_data(file="../lexica/generic/generic.txt", col_names=["WORD", "VALENCE"])
affin = load_data(file="../lexica/affin/affin.txt", col_names=["WORD", "VALENCE"])

### .....................................................................................................................................................................................

## PRE-PROCESSING / DATA CLEANING

In [5]:
# Per-tweet engineered features, one row per tweet, every column starting at 0.0.
# Only the emoticon counts, tweet_length, sum_valence_* and max/min_valence
# columns are written later in this notebook section.
cols = ['numof_posemo', 'numof_negemo', 'good_similarity', 'bad_similarity', 'tomorrow_similarity',
        'tweet_length', 'sum_valence_1', 'sum_valence_2', 'sum_valence_3', 'max_valence', 'min_valence']

train_extra_features = pd.DataFrame(data=0.0, columns=cols, index=np.arange(0, train_data.shape[0]))
# The test frame must be sized from test_data (it was previously sized from
# train_data, leaving it with the wrong number of rows).
test_extra_features = pd.DataFrame(data=0.0, columns=cols, index=np.arange(0, test_data.shape[0]))

### TOKENIZE

In [6]:
# Split every raw tweet string on whitespace into a list of tokens.
train_data['Tweet'] = train_data['Tweet'].map(lambda text: text.split())
test_data['Tweet'] = test_data['Tweet'].map(lambda text: text.split())

### LOWERCASE

In [7]:
# Lower-case every token of every tweet.
train_data['Tweet'] = train_data['Tweet'].apply(lambda tokens: list(map(str.lower, tokens)))
test_data['Tweet'] = test_data['Tweet'].apply(lambda tokens: list(map(str.lower, tokens)))

### REPLACE SPECIAL WORDS (@obama, urls, #hashtag)

In [8]:
def Replace_Special(tweet):
    """Blank out special tokens of a tokenized tweet, in place.

    A token is replaced by the empty string when it
      * starts with '@', '$' or '#' (mentions, cashtags, hashtags),
      * contains any digit, or
      * contains an "http://" or "https://" URL scheme.

    NOTE: the digit rule also blanks digit-bearing emoticons such as "<3".

    Parameters
    ----------
    tweet : list of str
        Tokens of one tweet; the list is modified in place.

    Returns
    -------
    list of str
        The same list object, with special tokens set to ''.
    """
    for i, word in enumerate(tweet):
        if not word:
            # An empty token has no first character to inspect; leave it as is.
            continue
        if (word[0] in ('@', '$', '#')
                or any(char.isdigit() for char in word)
                or "http://" in word
                or "https://" in word):
            tweet[i] = ''
    return tweet
   
# Blank out special tokens in every tweet (Replace_Special edits each token list in place).
train_data['Tweet'] = train_data['Tweet'].apply(Replace_Special)
test_data['Tweet'] = test_data['Tweet'].apply(Replace_Special)

### REMOVE PUNCTUATIONS

In [9]:
# Emoticons that must survive punctuation removal and are counted as features.
positive_emojis = [":)", ":D", ";)", ":-)", "<3", "(:", ":P", "XD", ":p", "^^",
                   ";D", "o.o", ":O", ";s", "=)", ";-)", ":)))", ":3", ":')", "\\m/",
                   "(;", "^_^", ":o", "n.o", "o-o", "<333", "^.^", ":-d", "d:", ":s",
                   ":v", ":]", ";o", ";))", ":ddd", "=)))", "^^'"]
negative_emojis = [":(", "):", ":/", "]:", ":'(", ":-(", ":(((", "-___-", ":-/", "/:", "-__-",
                   ":((", "._.", ":|", ">.>", "(-.-)", ":-(((", ">_<", ":,(", ">:)", ":\\"]

# Tokens are lower-cased before they are compared against these lists, so an
# upper-case entry such as ":D" or "XD" could never match. Append the
# lower-case form of every entry; dict.fromkeys drops duplicates and keeps order.
positive_emojis = list(dict.fromkeys(positive_emojis + [emo.lower() for emo in positive_emojis]))
negative_emojis = list(dict.fromkeys(negative_emojis + [emo.lower() for emo in negative_emojis]))

emojis = positive_emojis + negative_emojis

def Remove_Punctuations(s):
    """Strip ASCII punctuation from a token unless the token is a known emoticon.

    Tokens listed in ``emojis`` are returned unchanged. For any other token,
    every character in ``string.punctuation`` is deleted, which can leave an
    empty string.
    """
    if s in emojis:
        return s
    # Translation table mapping each punctuation character to None (= delete).
    strip_table = str.maketrans(dict.fromkeys(string.punctuation))
    return s.translate(strip_table)

# Without separating emojis: strip punctuation token by token (emoticons are
# kept whole), then drop every token that ended up empty.
train_data['Tweet'] = train_data['Tweet'].apply(
    lambda tokens: [kept for kept in map(Remove_Punctuations, tokens) if kept])
test_data['Tweet'] = test_data['Tweet'].apply(
    lambda tokens: [kept for kept in map(Remove_Punctuations, tokens) if kept])

In [10]:
# Find positive and negative emojis
def Find_Emojis(data, extra_features):
    """Count positive and negative emoticons per tweet.

    For every row of ``data`` the token list in its 'Tweet' column is scanned.
    Each token found in ``positive_emojis`` increments 'numof_posemo', and each
    token found in ``negative_emojis`` increments 'numof_negemo', in the row of
    ``extra_features`` carrying the same index label. ``extra_features`` is
    updated in place; nothing is returned.
    """
    for row_label, record in data.iterrows():
        for token in record['Tweet']:
            if token in positive_emojis:
                counter = 'numof_posemo'
            elif token in negative_emojis:
                counter = 'numof_negemo'
            else:
                continue
            extra_features.at[row_label, counter] += 1

# Fill the emoticon counters of both feature frames (updated in place).
Find_Emojis(data=train_data, extra_features=train_extra_features)
Find_Emojis(data=test_data, extra_features=test_extra_features)

### STEMMING / LEMMATIZATION

In [11]:
# Shared NLTK normalisers. word_stemmer is used by Lemmatize_Stem;
# word_lemmatizer is currently referenced only by commented-out code.
word_stemmer = nltk.stem.SnowballStemmer(language="english")
word_lemmatizer = nltk.WordNetLemmatizer()

# Lemmatize with POS Tag
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``. The upper-cased first
    letter of the tag picks the constant: J -> ADJ, N -> NOUN, V -> VERB,
    R -> ADV. Any other tag falls back to NOUN.
    """
    wordnet = nltk.corpus.wordnet
    first_letter = nltk.pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both resolve to NOUN.
    return wordnet.NOUN

def Lemmatize_Stem(word):
    """Return the Snowball (English) stem of ``word``.

    Only stemming is applied. The POS-aware lemmatisation alternative,
    ``word_lemmatizer.lemmatize(word, get_wordnet_pos(word))``, is disabled.
    """
    return word_stemmer.stem(word)

# Stem every token except emoticons, which are passed through untouched.
train_data['Tweet'] = train_data['Tweet'].apply(
    lambda tokens: [token if token in emojis else Lemmatize_Stem(token) for token in tokens])
test_data['Tweet'] = test_data['Tweet'].apply(
    lambda tokens: [token if token in emojis else Lemmatize_Stem(token) for token in tokens])

### SPELLING CORRECTION

In [12]:
# https://rustyonrampage.github.io/text-mining/2017/11/28/spelling-correction-with-python-and-nltk.html

def Spelling_Correction(word):
    """Normalise character lengthening in a token, then strip punctuation.

    Any character repeated three or more times in a row is collapsed to two
    occurrences (e.g. "coooool" -> "cool"). The result is passed through
    ``Remove_Punctuations``, so stray punctuation such as backslashes is
    removed unless the token is a known emoticon.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    str
        The normalised token; it can be empty if only punctuation remained.
    """
    # Reduce lengthening of word
    word = re.sub(r"(.)\1{2,}", r"\1\1", word)
    # Strings are immutable, so the cleaned value has to be assigned back;
    # the result of this call used to be discarded.
    word = Remove_Punctuations(word)
    return word

# Normalise lengthened tokens; emoticons are passed through untouched.
train_data['Tweet'] = train_data['Tweet'].apply(
    lambda tokens: [token if token in emojis else Spelling_Correction(token) for token in tokens])
test_data['Tweet'] = test_data['Tweet'].apply(
    lambda tokens: [token if token in emojis else Spelling_Correction(token) for token in tokens])

### REMOVE STOPWORDS

In [13]:
# Base list taken from https://www.ranks.nl/stopwords
# (the original author recorded the running set size as 153 -> 203 -> 227).
stop_words = { "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "im", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" }
# Merge in NLTK's English stop-word list.
stop_words.update(nltk.corpus.stopwords.words('english'))
# Every single lower-case ASCII letter counts as a stop word as well.
stop_words.update(string.ascii_lowercase)

def Remove_Stopwords(word_list):
    """Return a new list holding the tokens of ``word_list`` that are not in ``stop_words``.

    Token order is preserved and the input list is left untouched.
    """
    kept_tokens = []
    for token in word_list:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Drop stop words from every tweet's token list.
train_data['Tweet'] = train_data['Tweet'].apply(Remove_Stopwords)
test_data['Tweet'] = test_data['Tweet'].apply(Remove_Stopwords)

### .....................................................................................................................................................................................

In [14]:
# Create general lexica
# Turn each lexicon table into a WORD -> VALENCE lookup dict.
# When a word occurs more than once, the last row wins.
lexico1 = {entry["WORD"]: entry["VALENCE"] for _, entry in valence_tweet.iterrows()}
lexico2 = {entry["WORD"]: entry["VALENCE"] for _, entry in generic.iterrows()}
lexico3 = {entry["WORD"]: entry["VALENCE"] for _, entry in affin.iterrows()}

In [15]:
# Extra features for each tweet
def Add_Features(data, extra_features):
    """Fill the length and lexicon-valence features for every tweet.

    For each row of ``data`` the token list in its 'Tweet' column is looked up
    in the three lexicon dicts (``lexico1``, ``lexico2``, ``lexico3``). The row
    of ``extra_features`` with the same index label receives:

      * 'tweet_length'    - number of tokens,
      * 'sum_valence_1/2/3' - sum of the valences found in each lexicon,
      * 'max_valence' / 'min_valence' - extreme valence over all three lexica.

    ``extra_features`` is modified in place and also returned.

    NOTE(review): the running max/min start at -2 and 2. A tweet with no
    lexicon hit therefore stores max_valence=-2 and min_valence=2, and a
    lexicon value outside [-2, 2] could not move the extreme past its starting
    point. Confirm the value range of the lexica.
    """
    lexica = (lexico1, lexico2, lexico3)

    for row_label, record in data.iterrows():
        tokens = record['Tweet']
        totals = [0, 0, 0]
        highs = [-2, -2, -2]
        lows = [2, 2, 2]

        # Accumulate sum / max / min separately for each lexicon.
        for token in tokens:
            for slot, lexicon in enumerate(lexica):
                if token not in lexicon:
                    continue
                score = lexicon[token]
                totals[slot] = totals[slot] + score
                if score > highs[slot]:
                    highs[slot] = score
                if score < lows[slot]:
                    lows[slot] = score

        # Write the per-tweet results into the feature frame.
        extra_features.at[row_label, 'tweet_length'] = len(tokens)
        extra_features.at[row_label, 'sum_valence_1'] = totals[0]
        extra_features.at[row_label, 'sum_valence_2'] = totals[1]
        extra_features.at[row_label, 'sum_valence_3'] = totals[2]
        extra_features.at[row_label, 'max_valence'] = max(highs)
        extra_features.at[row_label, 'min_valence'] = min(lows)
    return extra_features

In [16]:
# Compute the length / valence features for both datasets.
train_extra_features = Add_Features(data=train_data, extra_features=train_extra_features)
test_extra_features = Add_Features(data=test_data, extra_features=test_extra_features)

In [17]:
# Delete rows that contain tweets which ended up empty
# An empty token list is falsy, so the boolean cast marks exactly the tweets
# that still have at least one token. The extra-feature frames are not
# filtered here.
train_data = train_data.loc[train_data['Tweet'].astype(bool)]
test_data = test_data.loc[test_data['Tweet'].astype(bool)]

## STORE RESULTS

In [18]:
# Store results in a .pickle file for later use
# Tweets are stored in list tokenized form
# Cleaned tweets (token lists) for the train and test sets.
train_data.to_pickle(path="./train_tweets_cleaned.pkl")
test_data.to_pickle(path="./test_tweets_cleaned.pkl")

# Matching engineered-feature frames.
train_extra_features.to_pickle(path="./train_extra.pkl")
test_extra_features.to_pickle(path="./test_extra.pkl")