# Fasttext for text classification

In [1]:
import fasttext
from preprocessing import *
from helpers import *
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
tweets = load_tweets(full = True)
tweets = remove_tags(tweets)

In [3]:
tweets.head()

Unnamed: 0,tweet,polarity
0,i dunno justin read my mention or not . only j...,1
1,"because your logic is so dumb , i won't even c...",1
2,just put casper in a box ! looved the battle ...,1
3,thanks sir > > don't trip lil mama ... just ke...,1
4,visiting my brother tmr is the bestest birthda...,1


## 1. Finding the best preprocessing for fasttext

_We will first try to find the preprocessing that works best for fasttext before tuning hyperparameters._

1. Without preprocessing

In [4]:
train_basic, test_basic = train_test_split(tweets, test_size = 0.2, random_state = 1)

In [5]:
construct_fasttext_input(train_basic, 'tweets_basic_train.csv')
construct_fasttext_input(test_basic, 'tweets_basic_test.csv')

In [6]:
model = fasttext.train_supervised(input = 'tweets_basic_train.csv')

In [7]:
model.test('tweets_basic_train.csv')

(1966637, 0.861019089948984, 0.861019089948984)

In [8]:
model.test('tweets_basic_test.csv')

(491660, 0.8354899727453932, 0.8354899727453932)

2. Removing punctuation & neutral stop words

In [9]:
from nltk.corpus import stopwords

In [10]:
nltk_stopwords = set(stopwords.words('english'))

In [11]:
negative_stopwords = set(["above", "against", "ain", "aren't", "but", "couldn", "couldn't", "didn", "didn't", "doesn't", "don", "don't", "hadn", "hadn't",
"hasn't", "haven", "haven't", "isn", "isn't", "mightn", "mightn't", "mustn", "mustn't", "needn", "needn't", "no",
"nor", "not", "over", "shan", "shan't", "shouldn", "shouldn't", "t", "under", "wasn", "wasn't", "weren", "weren't",
"won", "won't", "wouldn", "wouldn't"])

In [12]:
stopwords = nltk_stopwords - negative_stopwords

In [13]:
stopwords

{'a',
 'about',
 'after',
 'again',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'by',
 'can',
 'd',
 'did',
 'do',
 'does',
 'doesn',
 'doing',
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'has',
 'hasn',
 'have',
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'more',
 'most',
 'my',
 'myself',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'own',
 're',
 's',
 'same',
 'she',
 "she's",
 'should',
 "should've",
 'so',
 'some',
 'such',
 'than',
 'that',
 "that'll",
 'the',
 'their',
 'theirs',
 'them',
 'themselves',
 'then',
 'there',
 'these',
 'they',
 'this',
 'those',
 'through',
 'to',
 'too',
 'until',
 'up',
 've',
 'very',
 'was',

In [14]:
def remove_neutral_stopwords(tweets):
    tweets_2 = tokenize_tweets(tweets, stop_words = False, stemming = False)
    tweets_2['tokens_stopwords'] = tweets_2['tokens'].copy().apply(lambda tokens: [token for token in tokens if token not in stopwords])
    tweets_2['tweet'] = tweets_2['tokens_stopwords'].copy().apply(lambda tokens: ' '.join(tokens))
    
    return tweets_2

In [15]:
tweets_2 = remove_neutral_stopwords(tweets)

In [16]:
tweets_2.head()

Unnamed: 0,tweet,polarity,tokens,tokens_stopwords
0,dunno justin read mention not justin god knows...,1,"[i, dunno, justin, read, my, mention, or, not,...","[dunno, justin, read, mention, not, justin, go..."
1,logic dumb wo nt even crop name photo tsk,1,"[because, your, logic, is, so, dumb, i, wo, nt...","[logic, dumb, wo, nt, even, crop, name, photo,..."
2,put casper box looved battle crakkbitch,1,"[just, put, casper, in, a, box, looved, the, b...","[put, casper, box, looved, battle, crakkbitch]"
3,thanks sir nt trip lil mama keep doin ya thang,1,"[thanks, sir, do, nt, trip, lil, mama, just, k...","[thanks, sir, nt, trip, lil, mama, keep, doin,..."
4,visiting brother tmr bestest birthday gift eve...,1,"[visiting, my, brother, tmr, is, the, bestest,...","[visiting, brother, tmr, bestest, birthday, gi..."


In [17]:
train_wo_stopwords, test_wo_stopwords = train_test_split(tweets_2, test_size = 0.2, random_state = 1)

In [18]:
construct_fasttext_input(train_wo_stopwords, 'tweets_wo_stopwords_train.csv')
construct_fasttext_input(test_wo_stopwords, 'tweets_wo_stopwords_test.csv')

In [19]:
model_2 = fasttext.train_supervised(input = 'tweets_wo_stopwords_train.csv')

In [20]:
model_2.test('tweets_wo_stopwords_train.csv')

(1966637, 0.9063716384874281, 0.9063716384874281)

In [21]:
model_2.test('tweets_wo_stopwords_test.csv')

(491660, 0.8110584550298987, 0.8110584550298987)

***Conclusion: Removing neutral stopwords decreases performance.***

3. Adding word stemming

In [22]:
def stem_tweets(tweets):
    tweets_2 = tokenize_tweets(tweets, stop_words = False, stemming = True)
    tweets_2['tweet'] = tweets_2['tokens'].copy().apply(lambda tokens: ' '.join(tokens))
    
    return tweets_2

In [23]:
tweets_3 = stem_tweets(tweets)

In [24]:
tweets_3.head()

Unnamed: 0,tweet,polarity,tokens
0,i dunno justin read my mention or not onli jus...,1,"[i, dunno, justin, read, my, mention, or, not,..."
1,becaus your logic is so dumb i wo nt even crop...,1,"[becaus, your, logic, is, so, dumb, i, wo, nt,..."
2,just put casper in a box loov the battl crakkb...,1,"[just, put, casper, in, a, box, loov, the, bat..."
3,thank sir do nt trip lil mama just keep doin y...,1,"[thank, sir, do, nt, trip, lil, mama, just, ke..."
4,visit my brother tmr is the bestest birthday g...,1,"[visit, my, brother, tmr, is, the, bestest, bi..."


In [25]:
train_stemming, test_stemming = train_test_split(tweets_3, test_size = 0.2, random_state = 1)

In [26]:
construct_fasttext_input(train_stemming, 'tweets_stemming_train.csv')
construct_fasttext_input(test_stemming, 'tweets_stemming_test.csv')

In [27]:
model_3 = fasttext.train_supervised(input = 'tweets_stemming_train.csv')

In [28]:
model_3.test('tweets_stemming_train.csv')

(1966637, 0.8445407057835279, 0.8445407057835279)

In [29]:
model_3.test('tweets_stemming_test.csv')

(491660, 0.8157730952284099, 0.8157730952284099)

***Conclusion: Stemming did not increase performance.***

4. Removing punctuation / special characters

In [30]:
import re

def remove_punctuations_hashtags_mentions(tweets):
    tweets_2 = tweets.copy()
    tweets_2['tweet'] = tweets_2['tweet'].copy().apply(lambda tweet: ' '.join(re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|(\w+:\/\/\S+)|[\.\,\!\?\:\;\-\=\<\>]", " ", tweet).split()))
    
    return tweets_2

In [31]:
tweets_4 = remove_punctuations_hashtags_mentions(tweets)

In [32]:
tweets_4.head()

Unnamed: 0,tweet,polarity
0,i dunno justin read my mention or not only jus...,1
1,because your logic is so dumb i won't even cro...,1
2,just put casper in a box looved the battle,1
3,thanks sir don't trip lil mama just keep doin ...,1
4,visiting my brother tmr is the bestest birthda...,1


In [33]:
train_chars, test_chars = train_test_split(tweets_4, test_size = 0.2, random_state = 1)

In [34]:
construct_fasttext_input(train_chars, 'tweets_chars_train.csv')
construct_fasttext_input(test_chars, 'tweets_chars_test.csv')

In [35]:
model_4 = fasttext.train_supervised(input = 'tweets_chars_train.csv')

In [36]:
model_4.test('tweets_chars_train.csv')

(1966637, 0.8539832211028268, 0.8539832211028268)

In [37]:
model_4.test('tweets_chars_test.csv')

(491660, 0.8295305699060326, 0.8295305699060326)

***Conclusion: removing special chars such as punctuation, hashtags and mentions did not increase performance.***

5. Replacing smileys

In [38]:
# This dictionary was taken from the following GitHub
# https://github.com/charlesmalafosse/FastText-sentiment-analysis-for-tweets/blob/master/betsentiment_sentiment_analysis_fasttext.py
def replace_smiley(tweet):
    smileys = {
        ":‑)":"smile",
        ":-]":"smile",
        ":-3":"smile",
        ":->":"smile",
        "8-)":"smile",
        ":-}":"smile",
        ":)":"smile",
        ":]":"smile",
        ":3":"smile",
        ":>":"smile",
        "8)":"smile",
        ":}":"smile",
        ":o)":"smile",
        ":c)":"smile",
        ":^)":"smile",
        "=]":"smile",
        "=)":"smile",
        ":-))":"smile",
        ":‑D":"smile",
        "8‑D":"smile",
        "x‑D":"smile",
        "X‑D":"smile",
        ":D":"smile",
        "8D":"smile",
        "xD":"smile",
        "XD":"smile",
        ":‑(":"sad",
        ":‑c":"sad",
        ":‑<":"sad",
        ":‑[":"sad",
        ":(":"sad",
        ":c":"sad",
        ":<":"sad",
        ":[":"sad",
        ":-||":"sad",
        ">:[":"sad",
        ":{":"sad",
        ":@":"sad",
        ">:(":"sad",
        ":'‑(":"sad",
        ":'(":"sad",
        "<3":"love"
        }

    for smiley, val in smileys.items():
        tweet = tweet.replace(smiley, val)
    return tweet


def replace_smileys_tweets(tweets):
    tweets_2 = tweets.copy()
    tweets_2['tweet'] = tweets_2['tweet'].copy().apply(lambda tweet: replace_smiley(tweet))
    
    return tweets_2


In [39]:
tweets_5 = replace_smileys_tweets(tweets)

In [40]:
tweets_5.head()

Unnamed: 0,tweet,polarity
0,i dunno justin read my mention or not . only j...,1
1,"because your logic is so dumb , i won't even c...",1
2,just put casper in a box ! looved the battle ...,1
3,thanks sir > > don't trip lil mama ... just ke...,1
4,visiting my brother tmr is the bestest birthda...,1


In [41]:
train_smileys, test_smileys = train_test_split(tweets_5, test_size = 0.2, random_state = 1)

In [42]:
construct_fasttext_input(train_smileys, 'tweets_smileys_train.csv')
construct_fasttext_input(test_smileys, 'tweets_smileys_test.csv')

In [43]:
model_5 = fasttext.train_supervised(input = 'tweets_smileys_train.csv')

In [44]:
model_5.test('tweets_smileys_train.csv')

(1966637, 0.8604058603595884, 0.8604058603595884)

In [45]:
model_5.test('tweets_smileys_test.csv')

(491660, 0.8351197982345523, 0.8351197982345523)

***Conclusion:Perfomances are similar than without replacing smileys.***

## 2. Finding the best hyperparameters for fasttext

_We will now tune the hyperparameters to increase the performance of fasttext. As seen in the previous keeping raw tweets got us the best performance so we won't take any preprocessing for the task._

In [97]:
train, validation_test = train_test_split(tweets_5, test_size = 0.3)

In [99]:
validation, test = train_test_split(validation_test, test_size = 0.5)

In [101]:
print('train contains {x} samples'.format(x = len(train)))
print('validation contains {x} samples'.format(x = len(validation)))
print('test contains {x} samples'.format(x = len(test)))

train contains 1720807 samples
validation contains 368745 samples
test contains 368745 samples


_We will use train for training the fasttext model, validation for perform test to find the best hyperparameters, and test to evaluate the final model._

In [102]:
construct_fasttext_input(train, 'tweets_train.csv')
construct_fasttext_input(validation, 'tweets_validation.csv')
construct_fasttext_input(test, 'tweets_test.csv')

_Here are the hyperparameters that we will tune and their previous default values:_

 - **minCount:**           minimal number of word occurrences [1]
 - **wordNgrams:**         max length of word ngram [1]
 - **lr:**              learning rate [0.1]
 - **dim:**               size of word vectors [100]
 - **ws:**                 size of the context window [5]
 - **epoch:**              number of epochs [5]
 - **neg:**                number of negatives sampled [5]

In [129]:
minCount_val = [1] #, 3, 5]
wordNgrams_val = [3, 4, 5]
lr_val = [0.01]
dim_val = [125, 150]
ws_val = [1, 2, 3]
epoch_val = [7]
bucket_val = [2000000, 3000000, 4000000]
#neg_val = [3, 5, 7, 9]

accuracy = 0
best_params = None

best_params = {}
for minCount in minCount_val:
    for wordNgrams in wordNgrams_val:
        for lr in lr_val:
            for dim in dim_val:
                for ws in ws_val:
                    for epoch in epoch_val:
                        for bucket in bucket_val:
                            model = fasttext.train_supervised(input = 'tweets_train.csv',
                                                             minCount = minCount,
                                                             wordNgrams = wordNgrams,
                                                             lr = lr,
                                                             dim = dim,
                                                             ws = ws,
                                                             epoch = epoch,
                                                             bucket = bucket)
                            
                            new_accuracy = model.test('tweets_validation.csv')
                            
                            params = {'minCount': minCount,
                                      'wordNgrams': wordNgrams,
                                      'lr': lr,
                                      'dim': dim,
                                      'ws': ws,
                                      'epoch': epoch,
                                      'bucket':bucket,
                                     }
                            print('Using params: {p} - Accuracy = {a}'.format(p = params, a = new_accuracy[1]))
                            
                            if new_accuracy[1] > accuracy:
                                accuracy = new_accuracy[1]
                                best_params = params
            
print('best accuracy = {a}'.format(a = accuracy))    
print(best_params)

Using params: {'minCount': 1, 'wordNgrams': 3, 'lr': 0.01, 'dim': 125, 'ws': 1, 'epoch': 7, 'bucket': 2000000} - Accuracy = 0.8662625933910968
Using params: {'minCount': 1, 'wordNgrams': 3, 'lr': 0.01, 'dim': 125, 'ws': 1, 'epoch': 7, 'bucket': 3000000} - Accuracy = 0.8683453334960474
Using params: {'minCount': 1, 'wordNgrams': 3, 'lr': 0.01, 'dim': 125, 'ws': 1, 'epoch': 7, 'bucket': 4000000} - Accuracy = 0.8688443233128584
Using params: {'minCount': 1, 'wordNgrams': 3, 'lr': 0.01, 'dim': 125, 'ws': 2, 'epoch': 7, 'bucket': 2000000} - Accuracy = 0.8662436100828486
Using params: {'minCount': 1, 'wordNgrams': 3, 'lr': 0.01, 'dim': 125, 'ws': 2, 'epoch': 7, 'bucket': 3000000} - Accuracy = 0.8683399096936908
Using params: {'minCount': 1, 'wordNgrams': 3, 'lr': 0.01, 'dim': 125, 'ws': 2, 'epoch': 7, 'bucket': 4000000} - Accuracy = 0.8688226281034319
Using params: {'minCount': 1, 'wordNgrams': 3, 'lr': 0.01, 'dim': 125, 'ws': 3, 'epoch': 7, 'bucket': 2000000} - Accuracy = 0.8661785244545689

In [None]:
epoch_val = [7]

bucket_val = [10000000, 11000000]

accuracy = 0
best_params = None

best_params = {}
for epoch in epoch_val:
    for bucket in bucket_val:
        model = fasttext.train_supervised(input = 'tweets_train.csv',
                                                   minCount = 1,
                                                   wordNgrams = 3,
                                                   lr = 0.01,
                                                   dim = 200,
                                                   ws = 1,
                                                   epoch = epoch,
                                                   bucket = bucket)
                            
        new_accuracy = model.test('tweets_validation.csv')
                            
        params = {'epoch': epoch,
                  'bucket':bucket,
                 }
        print('Using params: {p} - Accuracy = {a}'.format(p = params, a = new_accuracy[1]))
                            
        if new_accuracy[1] > accuracy:
            accuracy = new_accuracy[1]
            best_params = params
            
print('best accuracy = {a}'.format(a = accuracy))    
print(best_params)

Using params: {'epoch': 7, 'bucket': 10000000} - Accuracy = 0.8710084204531587


In [8]:
best_model = fasttext.train_supervised(input = 'tweets_train.csv',
                                                   minCount = 1,
                                                   wordNgrams = 3,
                                                   lr = 0.01,
                                                   dim = 200,
                                                   ws = 1,
                                                   epoch = 7,
                                                   bucket = 10000000)

In [9]:
accuracy = best_model.test('tweets_test.csv')
print('The accuracy of our best model is {a}.'.format(a = accuracy))

The accuracy of our best model is (368745, 0.8732321794193819, 0.8732321794193819).
