# FastText System

### 1. Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import emoji
import fasttext
from nltk.stem.lancaster import LancasterStemmer
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


train = pd.read_csv('data/training.tsv', sep='\t')
dev = pd.read_csv('data/dev.tsv', sep='\t')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/tommcdonald/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/tommcdonald/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tommcdonald/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
def remove_emoji(text):
    return emoji.get_emoji_regexp().sub(u'', text)

url_pattern = r"https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,}"

def text_processor(df):
    for i in range(len(df)):
        # 1. remove url.
        text = re.sub(url_pattern, "", df.loc[i, 'tweet_text'])
        # 2. tokenisation.
        token_word = word_tokenize(text)
        # 3. POS.
        token_words = pos_tag(token_word)
        # 4. WordNetLemmatizer
        words_lematizer = []
        wordnet_lematizer = WordNetLemmatizer()
        for word, tag in token_words:
            if tag.startswith('NN'):
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='n')  
            elif tag.startswith('VB'): 
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='v')  
            elif tag.startswith('JJ'): 
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='a')  
            elif tag.startswith('R'): 
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='r')
            else: 
                word_lematizer =  wordnet_lematizer.lemmatize(word)
            words_lematizer.append(word_lematizer)
        # 5. lower case and remove stop words.
        cleaned_words = [word.lower() for word in words_lematizer if word.lower() not in stopwords.words('english')]
        # 6. remove punctuation.
        characters = [',','’', '\'','.','DBSCAN', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%','-','...','^','{','}']
        words_lists = [word for word in cleaned_words if word not in characters]
        text = str()
        for w in words_lists:
            text+=w
            text+=' '
        # 7. remove emoji
        df.loc[i, 'tweet_text'] = remove_emoji(text)
    return df

In [3]:
train_fast = text_processor(train).drop(['claim', 'topic_id', 'tweet_id', 'tweet_url'], axis=1)
dev_fast = text_processor(dev).drop(['claim', 'topic_id', 'tweet_id', 'tweet_url'], axis=1)

In [4]:
train_fast['claim_worthiness'] = np.where(train_fast['claim_worthiness'] == 1, '__label__Yes', '__label__No') 
dev_fast['claim_worthiness'] = np.where(dev_fast['claim_worthiness'] == 1, '__label__Yes', '__label__No')

train_fast.loc[:3]

Unnamed: 0,tweet_text,claim_worthiness
0,since never get report medium want share copy ...,__label__Yes
1,thanks michaelbloomberg handy little unintenti...,__label__No
2,folks say `` corona virus n't big deal kill di...,__label__No
3,1 case corona virus india people crazy mask da...,__label__Yes


In [5]:
train_fast[['claim_worthiness', 'tweet_text']].to_csv('data/train_fast.txt', sep=' ', header=False, index=False)

In [6]:
dev_fast[['claim_worthiness', 'tweet_text']].to_csv('data/dev_fast.txt', sep=' ', header=False, index=False)

### 2. Modeling

In [7]:
model = fasttext.train_supervised(input="data/train_fast.txt", epoch=51, lr=0.065, wordNgrams=4)

In [8]:
preds_proba = []

for tweet in dev_fast['tweet_text']:
    pred = model.predict(tweet)
    if pred[0][0] == '__label__Yes':
        preds_proba.append(pred[1][0])
    else:
        preds_proba.append(1 - pred[1][0])

In [9]:
results = pd.DataFrame(columns=['topic_id', 'tweet_id', 'score', 'run_id'])
results['tweet_id'] = dev['tweet_id']
results['score'] = preds_proba
results['topic_id'] = 'covid-19'
results['run_id'] = 'Model_4'

results

Unnamed: 0,topic_id,tweet_id,score,run_id
0,covid-19,1235714275752267776,0.545583,Model_4
1,covid-19,1235256530728972290,0.424394,Model_4
2,covid-19,1235648554338791427,0.351625,Model_4
3,covid-19,1235674258858061825,0.548934,Model_4
4,covid-19,1235663306246860800,0.501607,Model_4
...,...,...,...,...
145,covid-19,1235914080931766274,0.532314,Model_4
146,covid-19,1235770706765451264,0.488773,Model_4
147,covid-19,1235973416995315712,0.526114,Model_4
148,covid-19,1235675024738185239,0.414187,Model_4


In [10]:
results.to_csv('golf_system_results_4.tsv', sep='\t', header=False, index=False, float_format='%.15f')

### 3. Grid Search

In [23]:
import itertools
import time
from sklearn.metrics import average_precision_score

epoch_list = [10, 20, 40, 60, 80]
lr_list = [0.001, 0.01, 0.1, 0.3, 0.5]
ngrams_list = [1, 2, 3, 4]
ws_list = [3, 5, 10]
loss_list = ['softmax', 'ns']

best_p, best_e, best_lr, best_n, best_ws, best_loss = 0, 0, 0, 0, 0, 'NA'
combinations = len([x for x in itertools.product(epoch_list, lr_list, ngrams_list, ws_list, loss_list)])
counter = 0
start = time.time()

for e in epoch_list:
    for lr in lr_list:
        for n in ngrams_list:
            for ws in ws_list:
                for loss in loss_list:
                    
                    model = fasttext.train_supervised(input="data/train_fast.txt",
                                                      epoch=e, lr=lr, wordNgrams=n,
                                                      ws=ws, loss=loss)
                    preds_proba = []
                    
                    for tweet in dev_fast['tweet_text']:
                        pred = model.predict(tweet)
                        if pred[0][0] == '__label__Yes':
                            preds_proba.append(pred[1][0])
                        else:
                            preds_proba.append(1 - pred[1][0])
                            
                    avgp = average_precision_score(dev['claim_worthiness'].to_list(), preds_proba)
                    
                    if avgp > best_p:
                        best_p = avgp
                        best_e = e
                        best_lr = lr
                        best_n = n
                        best_ws = ws
                        best_loss = loss
                        
                    counter += 1
                    t = time.time()
                    mins = round((t - start) // 60)
                    secs = round((t - start) % 60)
                    if counter % 100 == 0:
                        print('%d of %d combinations complete in %d minutes and %d seconds' % \
                              (counter, combinations, mins, secs))

100 of 600 combinations complete in 3 minutes and 13 seconds
200 of 600 combinations complete in 6 minutes and 30 seconds
300 of 600 combinations complete in 9 minutes and 55 seconds
400 of 600 combinations complete in 13 minutes and 24 seconds
500 of 600 combinations complete in 16 minutes and 57 seconds
600 of 600 combinations complete in 20 minutes and 32 seconds


In [24]:
print('Best model parameters: %d epochs, %f lr, %d - grams, ws = %d, %s loss' % \
      (best_e, best_lr, best_n, best_ws, best_loss))
print('Average Precision of %.3f' % best_p)  

Best model parameters: 10 epochs, 0.300000 lr, 4 - grams, ws = 5, softmax loss
Average Precision of 0.813


In [25]:
model = fasttext.train_supervised(input="data/train_fast.txt",
                                  epoch=best_e, lr=best_lr, wordNgrams=best_n,
                                  ws=best_ws, loss=best_loss)
preds_proba = []

for tweet in dev_fast['tweet_text']:
    pred = model.predict(tweet)
    if pred[0][0] == '__label__Yes':
        preds_proba.append(pred[1][0])
    else:
        preds_proba.append(1 - pred[1][0])

results = pd.DataFrame(columns=['topic_id', 'tweet_id', 'score', 'run_id'])
results['tweet_id'] = dev['tweet_id']
results['score'] = preds_proba
results['topic_id'] = 'covid-19'
results['run_id'] = 'Model_4'

results

Unnamed: 0,topic_id,tweet_id,score,run_id
0,covid-19,1235714275752267776,0.511541,Model_4
1,covid-19,1235256530728972290,0.435760,Model_4
2,covid-19,1235648554338791427,0.296876,Model_4
3,covid-19,1235674258858061825,0.513744,Model_4
4,covid-19,1235663306246860800,0.484887,Model_4
...,...,...,...,...
145,covid-19,1235914080931766274,0.455108,Model_4
146,covid-19,1235770706765451264,0.480614,Model_4
147,covid-19,1235973416995315712,0.501188,Model_4
148,covid-19,1235675024738185239,0.428038,Model_4


In [26]:
results.to_csv('golf_system_results_4.tsv', sep='\t', header=False, index=False, float_format='%.15f')