# Installing packages

In [None]:
# Install the third-party dependencies used by this notebook (quiet, per-user installs).
!pip install --user -q nltk
!pip install --user -q ekphrasis
# NOTE(review): swig is installed right before jamspell, presumably because the jamspell build needs it - confirm.
!apt install swig3.0 -yqq
!pip install --user -q jamspell
!pip install --user -q gdown

***Warning***: Depending on the runtime used, you might have to restart the kernel in order for the new libraries to be located properly.

# Import libraries

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import numpy as np
import string
import re
import pickle
import nltk
import json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from ekphrasis.classes.segmenter import Segmenter
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.dicts.noslang.slangdict import slangdict
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
import jamspell
from tqdm import tqdm
tqdm.pandas()

# Downloading helper datasets needed for processing the original dataset


In [None]:
# Fetch every NLTK resource this notebook downloads, one after the other.
for resource in ('punkt', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(resource)

In [None]:
# Download the pretrained English JamSpell model archive and unpack it in the working directory.
# NOTE(review): the spell-corrector cell further down loads 'en.bin', presumably extracted from this archive - confirm.
!wget https://github.com/bakwc/JamSpell-models/raw/master/en.tar.gz && tar -xvzf en.tar.gz

In [None]:
# Download the three misspelling corpora (.dat files) into ./misspellings.
!mkdir -p misspellings
for i in ['holbrook-missp.dat', 'aspell.dat', 'wikipedia.dat']:
    # -O names the full output path, so each file ends up at misspellings/<file name>
    !wget https://www.dcs.bbk.ac.uk/~ROGER/{i} -P misspellings -O misspellings/{i}

In [None]:
import gdown

# Download extra_slang.json from Google Drive.
# The Drive file id lives in `file_id` so that the built-in `id()` is not shadowed.
file_id = "1eAYPSFxd6GjRstcWG9D4dnO_xbwfm2bZ"
url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(url)

# Misspellings

In [None]:
def dat2dict(file):
    """Build a ``{misspelling: correct word}`` dictionary from a .dat file.

    The file is read line by line and only the first space-separated field
    of each line is used. A field starting with ``$`` is the correct word
    for the misspellings listed on the following lines (up to the next
    ``$`` line). Everything is lower-cased.

    Misspellings of three characters or fewer are ignored because such
    short typos are hard to check. Misspellings listed before the first
    ``$`` line are ignored as well.

    Args:
        file: Path of the .dat file to read.

    Returns:
        dict mapping each misspelling to its correct word.
    """
    misp_dict = {}
    with open(file) as f:
        val = None  # correct word of the group currently being read
        for line in f:
            term = line.rstrip().split(" ")[0]
            if term.startswith('$'):
                # A new group starts here. Nothing is stored for this line,
                # so misspellings of the previous group keep their own
                # correct word instead of being remapped to this one.
                val = term[1:].lower()
                continue
            # NOTE(review): terms containing "_" or "'" are kept as they are;
            # confirm whether such entries should be filtered out.
            if val and len(term) > 3:
                misp_dict[term.lower()] = val
    return misp_dict

def merge_two_dicts(x, y):
    """Return a new dict with the entries of ``x`` overlaid by those of ``y``.

    Neither input is modified; on a key clash the value from ``y`` wins.
    Used to combine the misspelling dictionaries into one.
    """
    return {**x, **y}

In [None]:
# Load the misspelling dictionary, reusing the pickle cache when it exists.
# NOTE: pickle.load can execute arbitrary code - only load a cache file you created yourself.
miss_dict = {}
if Path("roger_misp.pkl").exists():
    with open("roger_misp.pkl", "rb") as f:
        miss_dict = pickle.load(f)
else:
    # sorted() fixes the merge order, so the file that wins on a key clash
    # no longer depends on the directory listing order.
    for dat in sorted(Path("misspellings").glob("*.dat")):
        miss_dict = merge_two_dicts(miss_dict, dat2dict(dat))
    # Write the cache through a context manager so the handle is closed.
    with open("roger_misp.pkl", "wb") as f:
        pickle.dump(miss_dict, f)

# Replacement dicts
Load and process the dictionaries used to replace slang, emojis and contractions

## Emojis

In [None]:
# Emoticon -> tag lookup table. Most values are written as "<tag>".
# Backslashes are escaped explicitly ('\\o/', 'o/\\o') so that no literal
# relies on an invalid escape sequence; the resulting strings are unchanged.
emojis = {
    # kiss
    ':*': '<kiss>',
    ':-*': '<kiss>',
    ':x': '<kiss>',
    # happy
    ':-)': '<happy>',
    ':-))': '<happy>',
    ':)': '<happy>',
    ':))': '<happy>',
    ':o)': '<happy>',
    ':]': '<happy>',
    ':->': '<happy>',
    ':>': '<happy>',
    '8-)': '<happy>',
    '8)': '<happy>',
    ':-}': '<happy>',
    ':-]': '<happy>',
    ':3': '<happy>',
    ':-3': '<happy>',
    ':c)': '<happy>',
    '=]': '<happy>',
    '=)': '<happy>',
    ':}': '<happy>',
    ':^)': '<happy>',
    '|;-)': '<happy>',
    ":'-)": '<happy>',
    ":')": '<happy>',
    '\\o/': '<happy>',
    '^^': '<happy>',
    '^_^': '<happy>',
    '*\\0/*': '<happy>',
    # laugh
    ':-D': '<laugh>',
    ':D': '<laugh>',
    '8-D': '<laugh>',
    '8D': '<laugh>',
    'x-D': '<laugh>',
    'xD': '<laugh>',
    'X-D': '<laugh>',
    'XD': '<laugh>',
    '=-D': '<laugh>',
    '=D': '<laugh>',
    '=-3': '<laugh>',
    '=3': '<laugh>',
    'B^D': '<laugh>',
    "lool": '<laugh>',
    "lol": '<laugh>',
    # sad
    '>:[': '<sad>',
    ':-(': '<sad>',
    ':-((': '<sad>',
    ':(': '<sad>',
    ':((': '<sad>',
    ':-c': '<sad>',
    ':c': '<sad>',
    ':-<': '<sad>',
    ':<': '<sad>',
    ':-[': '<sad>',
    ':[': '<sad>',
    ':{': '<sad>',
    ':-||': '<sad>',
    ':@': '<sad>',
    ":'-(": '<sad>',
    ":'(": '<sad>',
    'D:<': '<sad>',
    'D:': '<sad>',
    'D8': '<sad>',
    'D;': '<sad>',
    'D=': '<sad>',
    'DX': '<sad>',
    'v.v': '<sad>',
    "D-':": '<sad>',
    '(>_<)': '<sad>',
    ':|': '<sad>',
    # surprise
    '>:O': '<surprise>',
    ':-O': '<surprise>',
    ':-o': '<surprise>',
    ':O': '<surprise>',
    '°o°': '<surprise>',
    'o_O': '<surprise>',
    'o_0': '<surprise>',
    'o.O': '<surprise>',
    'o-o': '<surprise>',
    '8-0': '<surprise>',
    '|-O': '<surprise>',
    # wink
    ';-)': '<wink>',
    ';)': '<wink>',
    '*-)': '<wink>',
    '*)': '<wink>',
    ';-]': '<wink>',
    ';]': '<wink>',
    ';D': '<wink>',
    ';^)': '<wink>',
    ':-,': '<wink>',
    # tongue
    '>:P': '<tong>',
    ':-P': '<tong>',
    ':P': '<tong>',
    'X-P': '<tong>',
    'x-p': '<tong>',
    'xp': '<tong>',
    'XP': '<tong>',
    ':-p': '<tong>',
    ':p': '<tong>',
    '=p': '<tong>',
    ':-Þ': '<tong>',
    ':Þ': '<tong>',
    ':-b': '<tong>',
    ':b': '<tong>',
    ':-&': '<tong>',
    # annoyed
    '>:\\': '<annoyed>',
    '>:/': '<annoyed>',
    ':-/': '<annoyed>',
    ':-.': '<annoyed>',
    ':/': '<annoyed>',
    ':\\': '<annoyed>',
    '=/': '<annoyed>',
    '=\\': '<annoyed>',
    ':L': '<annoyed>',
    '=L': '<annoyed>',
    ':S': '<annoyed>',
    '>.<': '<annoyed>',
    ':-|': '<annoyed>',
    '<:-|': '<annoyed>',
    # sealed lips
    ':-X': '<seallips>',
    ':X': '<seallips>',
    ':-#': '<seallips>',
    ':#': '<seallips>',
    # angel
    'O:-)': '<angel>',
    '0:-3': '<angel>',
    '0:3': '<angel>',
    '0:-)': '<angel>',
    '0:)': '<angel>',
    '0;^)': '<angel>',
    # devil
    '>:)': '<devil>',
    '>:D': '<devil>',
    '>:-D': '<devil>',
    '>;)': '<devil>',
    '>:-)': '<devil>',
    '}:-)': '<devil>',
    '}:)': '<devil>',
    '3:-)': '<devil>',
    '3:)': '<devil>',
    # high five
    'o/\\o': '<highfive>',
    '^5': '<highfive>',
    '>_>^': '<highfive>',
    '^<_<': '<highfive>',
    # other
    '<3': '<heart>',
    'xx': 'kiss'
}

In [None]:
# Add a lower-cased alias for every emoticon whose lower-cased form is not already a key.
nd = {
    emoticon.lower(): tag
    for emoticon, tag in emojis.items()
    if emoticon.lower() not in emojis
}
emojis = merge_two_dicts(emojis, nd)

# Strip the angle brackets from the tags: <heart> -> heart
emojis2 = {
    emoticon: tag.replace(">", "").replace("<", '')
    for emoticon, tag in emojis.items()
}
emojis = emojis2

## Slang

Tweak the original slang dictionary from ekphrasis to fix some issues encountered on our dataset

In [None]:
# Remove two entries from the ekphrasis slang dictionary (no error if they are absent).
for unwanted in ('im', 'blog'):
    slangdict.pop(unwanted, None)

# Add or override entries. 'rt' maps to an empty string, so the token is blanked out.
slangdict.update({
    'dm': 'direct message',
    'ed': 'stupid',
    'x': 'kisses',
    'urg': 'pissed',
    'aww': 'affection',
    'aw': 'affection',
    'tf': 'what the fuck',
    'r': 'are',
    'wah': 'wonder',
    'tryna': 'trying to',
    'xo': 'kisses',
    'frikken': 'freaking',
    'kn0w': 'know',
    'rt': '',
})

Load extra slang mappings

In [None]:
# Read the extra slang mappings; the context manager closes the file handle.
with open('extra_slang.json') as f:
    extra_slang = json.load(f)
# Lower-case both keys and values so lookups match lower-cased tweets.
extra_slang = {k.lower(): v.lower() for k, v in extra_slang.items()}

## Contractions

In [None]:
# Extra replacement dictionary passed to the ekphrasis pre-processor:
# a few shorthand fixes, negative contractions (with and without the
# apostrophe) and some dataset-specific corrections.
negative_cont = {
    # shorthand and typo fixes
    '/s': 'sarcasm',
    'rt': '',
    '2': 'to',
    'u': 'you',
    'wi-fi': 'wifi',
    'im': "i am",
    'bea': 'be a',
    'taks': 'tasks',
    'havea': 'have a',
    # negative contractions
    "dont": "do not",
    "don't": "do not",
    "doesnt": "does not",
    "doesn't": "does not",
    "didnt": "did not",
    "didn't": "did not",
    "aint": "am not",
    "ain't": "am not",
    "arent": "are not",
    "aren't": "are not",
    "isnt": "is not",
    "isn't": "is not",
    "wasn't": "was not",
    "wasnt": "was not",
    "haven't": "have not",
    "havent": "have not",
    "hasnt": "has not",
    "hasn't": "has not",
    "hadnt": "had not",
    "hadn't": "had not",
    "wont": "will not",
    "won't": "will not",
    "wouldnt": "would not",
    "wouldn't": "would not",
    "cant": "can not",
    "can't": "can not",
    "couldn't": "could not",
    "couldnt": "could not",
    "shan't": "shall not",
    "shant": "shall not",
    "shouldn't": "should not",
    "shouldnt": "should not",
    "mightn't": "might not",
    "mightnt": "might not",
    "oughtn't": "ought not",
    "oughtnt": "ought not",
    "mustn't": "must not",
    "needn't": "need not",
    "daren't": "dare not",
    # dataset-specific corrections
    "dang": "frustration",
    "dangg": "frustration",
    "ohh": "oh",
    "mee": "me",
    "ed": "stupid",
    "domt": "do not",
    "xo": "kisses"
}

# Text processing

In [None]:
# Shared helper objects used by the text-processing functions in this notebook.
seg= Segmenter("twitter")  # ekphrasis segmenter, used by unpack_hashtag()
lm = nltk.WordNetLemmatizer()  # used by lemmatizer()
sm = EnglishStemmer()  # NLTK Snowball stemmer, used by stemmatizer()

def lemmatizer(data):
    """Return a new list with every token of ``data`` passed through the WordNet lemmatizer ``lm``."""
    return list(map(lm.lemmatize, data))

def stemmatizer(data):
    """Return a new list with every token of ``data`` passed through the stemmer ``sm``.

    Used to build the ``tweet_stem`` column of the dataset.
    """
    return list(map(sm.stem, data))

## Utilities

In [None]:
def replace_numbers(text):
    """Delete every run of word characters that ends in one or more digits.

    Despite the name, nothing is substituted: matches are removed. Because
    the pattern starts with ``\\w*``, letters directly in front of a digit
    are removed together with it (``"abc123"`` disappears entirely), while
    letters after the last digit are kept (``"45x"`` becomes ``"x"``).

    Args:
        text: Input string.

    Returns:
        ``text`` with all matches removed.
    """
    # Raw string: '\w' inside a normal string literal is an invalid escape sequence.
    return re.sub(r'\w*[0-9]+', '', text)

def replace_question(text):
    """Delete every run of question marks from ``text``; nothing is inserted in its place."""
    question_run = re.compile(r'(\?)+')
    return question_run.sub('', text)

def replace_exclamation(text):
    """Delete every run of exclamation marks from ``text``; nothing is inserted in its place."""
    exclamation_run = re.compile(r'(\!)+')
    return exclamation_run.sub('', text)

def remove_punct(text):
    """Return ``text`` without any character listed in ``string.punctuation``.

    Only the ASCII punctuation of ``string.punctuation`` is removed; other
    symbols are left in place.
    """
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)

In [None]:
def unpack_hashtag(text):
    """Segment every hashtag of ``text`` into words.

    Tokens starting with ``#`` lose the ``#`` and are passed to the
    segmenter ``seg``; all other tokens are kept. Tokens are re-joined
    with single spaces.
    """
    pieces = []
    for token in text.split():
        if token.startswith('#'):
            token = seg.segment(token[1:])
        pieces.append(token)
    return ' '.join(pieces)

def remove_stop_words(text):
    """Lower-case ``text`` and drop every token found in NLTK's English stop-word list.

    The remaining tokens are joined with single spaces.
    """
    lowered = text.lower()
    english_stops = set(stopwords.words('english'))
    kept = [token for token in lowered.split() if token not in english_stops]
    return ' '.join(kept)

def handle_elong(text):
    """Limit elongations: a character repeated in a row is kept at most twice.

    This applies to any character except a newline, so runs of digits or
    spaces are shortened as well.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1\1', text)


# Laughter written with "a", "e" or "i" ("haha", "hehe", "hihi", "hahaha", ...).
# The \b anchors make each pattern match whole words only, so ordinary words
# that merely contain "eh" or "ih" (e.g. "behind", "vehicle") are left alone,
# and a long laugh such as "hahaha" is consumed completely.
_LAUGH_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'\bh+(?:a+h+)+a*\b',
        r'\bh*(?:e+h+)+e*\b',
        r'\bh*(?:i+h+)+i*\b',
    )
)


def handle_emoticons(text):
    """Replace laughter words and known emoticons in ``text``.

    First, whole words that spell laughter ("haha", "hehe", "hihi", longer
    repetitions, and short forms such as "heh" or "eh") are replaced by
    ``laugh``. Then every whitespace-separated token that is a key of the
    ``emojis`` dictionary is replaced by its mapped value.

    Args:
        text: Input string; the patterns are lower-case only.

    Returns:
        The processed tokens joined with single spaces.
    """
    for pattern in _LAUGH_PATTERNS:
        text = pattern.sub("laugh", text)
    return ' '.join(emojis.get(w, w) for w in text.split())

In [None]:
def rm_slang(text):
    """Replace every token of ``text`` found in the ekphrasis ``slangdict`` by its expansion.

    Tokens are split on whitespace and re-joined with single spaces.
    """
    expanded = (slangdict.get(token, token) for token in text.split())
    return ' '.join(expanded)

def rm_extra_slang(text):
    """Replace every token of ``text`` found in ``extra_slang`` by its expansion.

    Tokens are split on whitespace and re-joined with single spaces.
    """
    expanded = (extra_slang.get(token, token) for token in text.split())
    return ' '.join(expanded)

In [None]:
# Negation words and their replacement. A plain "NOT" is not emitted as its
# own token: it is attached as a prefix to the token that follows.
negations = {'no': 'NOT', 'not': "NOT", 'none': "NOT_one", 'nobody': "NOT_body",
             'noone': 'NOT_one', 'nothing': 'NOT_thing', 'nowhere': "NOT_where",
             'never': "NOT_ever"}


def replace_negations(text):
    """Mark negations in ``text``.

    * ``no`` / ``not`` are removed and the next token gets a ``NOT_`` prefix
      ("did not like" -> "did NOT_like").
    * Other negation words are replaced by their entry in ``negations``
      ("nothing" -> "NOT_thing").
    * A ``no`` / ``not`` that is the last token has nothing to attach to and
      is kept as a standalone ``NOT`` instead of being dropped.

    Args:
        text: Whitespace-separated, lower-cased text.

    Returns:
        The processed tokens joined with single spaces.
    """
    stext = []
    gets_prefix = False
    for t in text.split():
        if gets_prefix:
            # The token right after "no"/"not" is prefixed verbatim, even if
            # it is itself a negation word.
            stext.append("NOT_" + t)
            gets_prefix = False
            continue
        negt = negations.get(t, "")
        if negt == "NOT":
            gets_prefix = True
        elif negt:
            stext.append(negt)
        else:
            stext.append(t)
    if gets_prefix:
        # Trailing negation: keep the information instead of losing it.
        stext.append("NOT")
    return " ".join(stext)

In [None]:
# Collects every token that remove_tags() has discarded so far.
tags = set()


def remove_tags(tweet):
    """Drop tokens that contain ``<`` or are shorter than three characters.

    The tweet is split on single spaces. Discarded tokens (including the
    empty strings produced by consecutive spaces) are recorded in the
    module-level ``tags`` set; the kept tokens are joined with single spaces.
    """
    kept = []
    for token in tweet.split(" "):
        if len(token) >= 3 and "<" not in token:
            kept.append(token)
            continue
        tags.add(token)
    return " ".join(kept)

## Social Tokenizer & Spellchecker

In [None]:
# Taken from the ekphrasis docs.
# ekphrasis pre-processing pipeline; clean_processor() calls its pre_process_doc().

text_processor = TextPreProcessor(
    # NOTE(review): 'url' appears twice in this list - presumably harmless, confirm.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user','time', 'url', 'date', 'number'],
    annotate={"hashtag", "allcaps", "elongated", "repeated",
              'emphasis', 'censored'},
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,  
    unpack_contractions=True,  
    spell_correct_elong=True,  
    # Tokens are lower-cased by the tokenizer.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # Replacement dictionaries: the emoticons dict imported from ekphrasis
    # plus the negative_cont dict defined in this notebook.
    dicts=[emoticons,negative_cont]
)

In [None]:
def clean_processor(text):
    """Run ``text`` through the ekphrasis ``text_processor`` and join the resulting tokens with spaces."""
    tokens = text_processor.pre_process_doc(text)
    return " ".join(tokens)

# English stop words from NLTK, built once and reused by remove_stopwords().
stop = set(stopwords.words("english"))


def remove_stopwords(text):
    """Lower-case every token of ``text`` and drop those found in ``stop``.

    The remaining tokens are joined with single spaces.
    """
    lowered = (word.lower() for word in text.split())
    return " ".join(word for word in lowered if word not in stop)

In [None]:
# Spell corrector: build a JamSpell corrector and load the language model from 'en.bin'.
# NOTE(review): the result of LoadLangModel is not checked here - confirm how a missing model file is reported.
corrector = jamspell.TSpellCorrector()
corrector.LoadLangModel('en.bin')

In [None]:
def spell_tweet(tweet, debug=False):
    """Return ``tweet`` after spell correction with the JamSpell ``corrector``.

    With ``debug=True`` the tweet is printed before and after correction.
    """
    if not debug:
        return corrector.FixFragment(tweet)
    print("before spellcheck:", tweet)
    fixed = corrector.FixFragment(tweet)
    print("after check: ", fixed)
    return fixed

## Clean tweet

In [None]:
def clean_tweet(tweet, rm_stopwords=True):
    """Run the full cleaning pipeline on one tweet.

    The steps run in a fixed order; each one works on the output of the
    previous one.

    Args:
        tweet: Raw tweet text (a string).
        rm_stopwords: When True (the default), English stop words are
            removed near the end of the pipeline.

    Returns:
        The cleaned tweet as a single space-separated string. It can be
        empty when every token was removed.
    """
    # Lower-case everything; the lookup dictionaries use lower-case keys
    tweet = tweet.lower()

    # Allow a character to repeat at most twice in a row
    tweet = handle_elong(tweet)

    # Replace laughter words and emoticons by their tag word
    tweet = handle_emoticons(tweet)

    # Expand slang: ekphrasis dictionary first, then the extra mappings
    tweet = rm_slang(tweet)
    tweet = rm_extra_slang(tweet)

    # ekphrasis pre-processing: hashtags, emphasis, repetitions, contractions
    tweet = clean_processor(tweet)

    # Delete runs of "!" and "?" (they are removed, not replaced by a word)
    tweet = replace_exclamation(tweet)
    tweet = replace_question(tweet)

    # Delete tokens ending in digits (removed, not replaced by a word)
    tweet = replace_numbers(tweet)

    # Spell-check the tweet
    tweet = spell_tweet(tweet)

    # Remove the remaining <tag>, </tag> and <tag/> style annotations.
    # Raw string: the pattern contains backslashes that are invalid escapes
    # in a normal string literal.
    tweet = re.sub(r'<\/*\w*\/*>', "", tweet)

    # Remove all punctuation that is left
    tweet = remove_punct(tweet)

    # Attach negations to the next token: "i did not like" -> "i did NOT_like"
    tweet = replace_negations(tweet)

    # Tokenize with NLTK and re-join with single spaces.
    # No lemmatization happens here; that is done later via to_func().
    tweet = ' '.join(word_tokenize(tweet))

    if rm_stopwords:
        tweet = remove_stopwords(tweet)

    # Drop tokens containing "<" and tokens shorter than three characters
    return remove_tags(tweet)

In [None]:
def to_func(i, f):
    """Tokenize the string ``i``, apply ``f`` to the token list and join the result with spaces.

    Helper for applying ``stemmatizer`` / ``lemmatizer`` to a whole tweet.
    """
    return " ".join(f(word_tokenize(i)))

# Loading data

## Loading labeled tweets

In [None]:
fname_tweets = "orig_tweets_full.csv"

# Read the ';'-separated file, drop rows with a duplicated tweet text and
# keep a copy of the original text in 'tweet_orig' for sanity checks.
tweets_full = (
    pd.read_csv(fname_tweets, sep=";")
    .drop_duplicates(subset=['tweet'])
    .assign(tweet_orig=lambda frame: frame['tweet'])
)

# Creating datasets

## Creating the tweets_clean_full_min.csv used for training

In [None]:
# Clean every original tweet; the result is stored in 'tweet' and copied to 'tweet_clean'.
tweets_full['tweet'] = tweets_full['tweet_orig'].progress_apply(clean_tweet)
tweets_full['tweet_clean'] = tweets_full['tweet']

# Derive the stemmed and the lemmatized variant from the cleaned text.
for column, normalizer in (('tweet_stem', stemmatizer), ('tweet_lemma', lemmatizer)):
    tweets_full[column] = tweets_full['tweet'].progress_apply(
        lambda x, f=normalizer: to_func(x, f)
    )

In [None]:
# Map the sentiment labels to {1, 0}: a label of 1 stays 1, every other value becomes 0.
# Some models need classes that start at 0 and increase.
tweets_full['sentiment'] = tweets_full['sentiment'].map(lambda label: int(label == 1))

In [None]:
# Drop the rows whose 'tweet' value is missing.
tweets_full = tweets_full.dropna(subset=['tweet'])

In [None]:
# Write the frame to a temporary ';'-separated file without the index; it is read back in the next cell.
tweets_full.to_csv('tweets_clean.tmp', sep=";", index=False)

In [None]:
# Remove empty tweets after the cleanup
td = pd.read_csv("tweets_clean.tmp", sep=';')
td.dropna(subset=['tweet'], inplace=True)

In [None]:
# Full dataset: all columns.
print("Saving tweets")
td.to_csv("tweets_clean_full.csv", sep=';', index=False)
print("Done")

# Minimized dataset: only the cleaned text and the label.
print("Saving mimimized file")
td = td.loc[:, ['tweet', 'sentiment']]
td.to_csv("tweets_clean_full_min.csv", sep=';', index=False)
print("Done")