In [83]:
!pip install ekphrasis
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.over_sampling import RandomOverSampler 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import io, re
import pandas as pd
import numpy as np
nltk.download('stopwords')
nltk.download('wordnet')
corpus = [
     'This is the first test document is it better or fine!!!!??? https://www.google.com 10% £20 #chill :) :/',
     'Playing I am happy to take your donation; any amount will be greatly appreciated.',
     'Sometimes, all you need to do is completely make an ass of yourself and laugh it off to realise that life isn’t so bad after all.',
     'I will never be this young again. Ever. Oh damn… I just got older.',
 ]

[33mYou are using pip version 18.1, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/PhilipADSo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/PhilipADSo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [66]:
#Ekpharsis setup
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
   
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [67]:
# Testing Ekpharsis
tokenisedCorpus = []
for sentence in corpus:
    print(text_processor.pre_process_doc(sentence))
    tokenisedCorpus.append(text_processor.pre_process_doc(sentence))

['this', 'is', 'the', 'first', 'test', 'document', 'is', 'it', 'better', 'or', 'fine', '?', '!', '<repeated>', '<url>', '<percent>', '<money>', '<hashtag>', 'chill', '</hashtag>', '<happy>', '<annoyed>']
['playing', 'i', 'am', 'happy', 'to', 'take', 'your', 'donation', ';', 'any', 'amount', 'will', 'be', 'greatly', 'appreciated', '.']
['sometimes', ',', 'all', 'you', 'need', 'to', 'do', 'is', 'completely', 'make', 'an', 'ass', 'of', 'yourself', 'and', 'laugh', 'it', 'off', 'to', 'realise', 'that', 'life', 'isn', '’', 't', 'so', 'bad', 'after', 'all', '.']
['i', 'will', 'never', 'be', 'this', 'young', 'again', '.', 'ever', '.', 'oh', 'damn', '…', 'i', 'just', 'got', 'older', '.']


In [68]:
#need
def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas


In [69]:
# Testing Stopwords
for sentence in tokenisedCorpus:
    print(remove_stopwords(sentence))
  

['first', 'test', 'document', 'better', 'fine', '?', '!', '<repeated>', '<url>', '<percent>', '<money>', '<hashtag>', 'chill', '</hashtag>', '<happy>', '<annoyed>']
['playing', 'happy', 'take', 'donation', ';', 'amount', 'greatly', 'appreciated', '.']
['sometimes', ',', 'need', 'completely', 'make', 'ass', 'laugh', 'realise', 'life', '’', 'bad', '.']
['never', 'young', '.', 'ever', '.', 'oh', 'damn', '…', 'got', 'older', '.']


In [71]:
# Testing Lemmatization
for sentence in tokenisedCorpus:
    print(lemmatize_verbs(sentence))

['this', 'be', 'the', 'first', 'test', 'document', 'be', 'it', 'better', 'or', 'fine', '?', '!', '<repeated>', '<url>', '<percent>', '<money>', '<hashtag>', 'chill', '</hashtag>', '<happy>', '<annoyed>']
['play', 'i', 'be', 'happy', 'to', 'take', 'your', 'donation', ';', 'any', 'amount', 'will', 'be', 'greatly', 'appreciate', '.']
['sometimes', ',', 'all', 'you', 'need', 'to', 'do', 'be', 'completely', 'make', 'an', 'ass', 'of', 'yourself', 'and', 'laugh', 'it', 'off', 'to', 'realise', 'that', 'life', 'isn', '’', 't', 'so', 'bad', 'after', 'all', '.']
['i', 'will', 'never', 'be', 'this', 'young', 'again', '.', 'ever', '.', 'oh', 'damn', '…', 'i', 'just', 'get', 'older', '.']


In [72]:
# Testing Stemming
for sentence in tokenisedCorpus:
    print(stem_words(sentence))

['thi', 'is', 'the', 'first', 'test', 'docu', 'is', 'it', 'bet', 'or', 'fin', '?', '!', '<repeated>', '<url>', '<percent>', '<money>', '<hashtag>', 'chil', '</hashtag>', '<happy>', '<annoyed>']
['play', 'i', 'am', 'happy', 'to', 'tak', 'yo', 'don', ';', 'any', 'amount', 'wil', 'be', 'gre', 'apprecy', '.']
['sometim', ',', 'al', 'you', 'nee', 'to', 'do', 'is', 'complet', 'mak', 'an', 'ass', 'of', 'yourself', 'and', 'laugh', 'it', 'off', 'to', 'real', 'that', 'lif', 'isn', '’', 't', 'so', 'bad', 'aft', 'al', '.']
['i', 'wil', 'nev', 'be', 'thi', 'young', 'again', '.', 'ev', '.', 'oh', 'damn', '…', 'i', 'just', 'got', 'old', '.']


In [79]:
# import data for over and undersampling

label2emotion = {0:"others", 1:"happy", 2: "sad", 3:"angry"}
emotion2label = {"others":0, "happy":1, "sad":2, "angry":3}

def preprocessData(dataFilePath, mode):
    """Load data from a file, process and return indices, conversations and labels in separate lists
    Input:
        dataFilePath : Path to train/test file to be processed
        mode : "train" mode returns labels. "test" mode doesn't return labels.
    Output:
        indices : Unique conversation ID list
        conversations : List of 3 turn conversations, processed and each turn separated by the <eos> tag
        labels : [Only available in "train" mode] List of labels
    """
    indices = []
    conversations = []
    labels = []
    with io.open(dataFilePath, encoding="utf8") as finput:
        finput.readline()
        for line in finput:
            # Convert multiple instances of . ? ! , to single instance
            # okay...sure -> okay . sure
            # okay???sure -> okay ? sure
            # Add whitespace around such punctuation
            # okay!sure -> okay ! sure
            repeatedChars = ['.', '?', '!', ',']
            for c in repeatedChars:
                lineSplit = line.split(c)
                while True:
                    try:
                        lineSplit.remove('')
                    except:
                        break
                cSpace = ' ' + c + ' '    
                line = cSpace.join(lineSplit)
            
            line = line.strip().split('\t')
            if mode == "train":
                # Train data contains id, 3 turns and label
                label = emotion2label[line[4]]
                labels.append(label)
            
            conv = ' <eos> '.join(line[1:4])
            
            # Remove any duplicate spaces
            duplicateSpacePattern = re.compile(r'\ +')
            conv = re.sub(duplicateSpacePattern, ' ', conv)
            
            indices.append(int(line[0]))
            conversations.append(conv.lower())
    
    if mode == "train":
        return indices, conversations, labels
    else:
        return indices, conversations
    
trainIndices, trainTexts, labels = preprocessData("train.txt", mode="train")
testIndices, testTexts, testLabels = preprocessData("test.txt", mode="train")
devIndices, devTexts, devLabels = preprocessData("test.txt", mode="train")

trainTexts += testTexts + devTexts
labels+= testLabels + devLabels

In [84]:
# Testing Over sampling
trainDF = pd.DataFrame({'convtrain': trainTexts,'labels': labels})
from collections import Counter

resampleTypeData = np.array(trainDF['convtrain'].values.tolist())
ros = RandomOverSampler(random_state=42)
resampleTypeData = resampleTypeData.reshape(-1, 1)
print('Original dataset shape %s' % Counter(trainDF['labels']))
X_res, y_res = ros.fit_resample(resampleTypeData, trainDF['labels'])
print('Resampled dataset shape %s' % Counter(y_res))

Original dataset shape Counter({0: 24302, 3: 6102, 2: 5963, 1: 4811})
Resampled dataset shape Counter({0: 24302, 3: 24302, 2: 24302, 1: 24302})


In [86]:
# Testing Undersampling
trainDF = pd.DataFrame({'convtrain': trainTexts,'labels': labels})
from collections import Counter

resampleTypeData = np.array(trainDF['convtrain'].values.tolist())
rus = RandomUnderSampler(random_state=42)
resampleTypeData = resampleTypeData.reshape(-1, 1)
print('Original dataset shape %s' % Counter(trainDF['labels']))
X_res, y_res = rus.fit_resample(resampleTypeData, trainDF['labels'])
print('Resampled dataset shape %s' % Counter(y_res))

Original dataset shape Counter({0: 24302, 3: 6102, 2: 5963, 1: 4811})
Resampled dataset shape Counter({0: 4811, 1: 4811, 2: 4811, 3: 4811})


In [91]:
# Testing TF-IDF
vectorizer = CountVectorizer()
print(vectorizer.fit_transform(corpus))
print(vectorizer.get_feature_names())

  (0, 15)	1
  (0, 1)	1
  (0, 0)	1
  (0, 16)	1
  (0, 25)	1
  (0, 55)	1
  (0, 29)	1
  (0, 23)	1
  (0, 43)	1
  (0, 14)	1
  (0, 32)	1
  (0, 20)	1
  (0, 49)	1
  (0, 24)	1
  (0, 51)	1
  (0, 30)	2
  (0, 52)	1
  (1, 10)	1
  (1, 27)	1
  (1, 13)	1
  (1, 54)	1
  (1, 6)	1
  (1, 9)	1
  (1, 21)	1
  (1, 58)	1
  :	:
  (2, 39)	1
  (2, 11)	1
  (2, 7)	1
  (2, 36)	1
  (2, 17)	1
  (2, 19)	1
  (2, 37)	1
  (2, 56)	1
  (2, 4)	2
  (2, 47)	1
  (2, 53)	2
  (2, 32)	1
  (2, 30)	1
  (3, 42)	1
  (3, 26)	1
  (3, 33)	1
  (3, 18)	1
  (3, 41)	1
  (3, 22)	1
  (3, 3)	1
  (3, 57)	1
  (3, 38)	1
  (3, 13)	1
  (3, 54)	1
  (3, 52)	1
['10', '20', 'after', 'again', 'all', 'am', 'amount', 'an', 'and', 'any', 'appreciated', 'ass', 'bad', 'be', 'better', 'chill', 'com', 'completely', 'damn', 'do', 'document', 'donation', 'ever', 'fine', 'first', 'google', 'got', 'greatly', 'happy', 'https', 'is', 'isn', 'it', 'just', 'laugh', 'life', 'make', 'need', 'never', 'of', 'off', 'oh', 'older', 'or', 'playing', 'realise', 'so', 'sometimes',

In [92]:
# Testing word count
vectorizer = TfidfVectorizer()
print(vectorizer.fit_transform(corpus))
print(vectorizer.get_feature_names())

  (0, 52)	0.18724231956240409
  (0, 30)	0.37448463912480817
  (0, 51)	0.23749314013993011
  (0, 24)	0.23749314013993011
  (0, 49)	0.23749314013993011
  (0, 20)	0.23749314013993011
  (0, 32)	0.18724231956240409
  (0, 14)	0.23749314013993011
  (0, 43)	0.23749314013993011
  (0, 23)	0.23749314013993011
  (0, 29)	0.23749314013993011
  (0, 55)	0.23749314013993011
  (0, 25)	0.23749314013993011
  (0, 16)	0.23749314013993011
  (0, 0)	0.23749314013993011
  (0, 1)	0.23749314013993011
  (0, 15)	0.23749314013993011
  (1, 44)	0.29031547855150247
  (1, 5)	0.29031547855150247
  (1, 28)	0.29031547855150247
  (1, 53)	0.22888805789011155
  (1, 48)	0.29031547855150247
  (1, 58)	0.29031547855150247
  (1, 21)	0.29031547855150247
  (1, 9)	0.29031547855150247
  :	:
  (2, 11)	0.18990156414931925
  (2, 39)	0.18990156414931925
  (2, 59)	0.18990156414931925
  (2, 8)	0.18990156414931925
  (2, 34)	0.18990156414931925
  (2, 40)	0.18990156414931925
  (2, 45)	0.18990156414931925
  (2, 50)	0.18990156414931925
  (2, 35)