## Final System 2 - fastText w/ TFIDF on V2 Data

### 1. Data Preprocessing

In [36]:
import pandas as pd
import numpy as np
import re
import nltk
import emoji
import fasttext
from nltk.stem.lancaster import LancasterStemmer
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


train = pd.read_csv('final_data/training_v2.tsv', sep='\t')
dev = pd.read_csv('final_data/dev_v2.tsv', sep='\t')
train = train.append(dev, ignore_index=True)
test = pd.read_csv('final_data/test-input.tsv', sep='\t')

print(len(train), 'training instances')
print(len(test), 'test instances')

822 training instances
140 test instances


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/tommcdonald/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/tommcdonald/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tommcdonald/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [38]:
def remove_emoji(text):
    return emoji.get_emoji_regexp().sub(u'', text)

url_pattern = r"https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,}"

def text_processor(df):
    for i in range(len(df)):
        # 1. remove url.
        text = re.sub(url_pattern, "", df.loc[i, 'tweet_text'])
        # 2. tokenisation.
        token_word = word_tokenize(text)
        # 3. POS.
        token_words = pos_tag(token_word)
        # 4. WordNetLemmatizer
        words_lematizer = []
        wordnet_lematizer = WordNetLemmatizer()
        for word, tag in token_words:
            if tag.startswith('NN'):
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='n')  
            elif tag.startswith('VB'): 
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='v')  
            elif tag.startswith('JJ'): 
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='a')  
            elif tag.startswith('R'): 
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='r')
            else: 
                word_lematizer =  wordnet_lematizer.lemmatize(word)
            words_lematizer.append(word_lematizer)
        # 5. lower case and remove stop words.
        cleaned_words = [word.lower() for word in words_lematizer if word.lower() not in stopwords.words('english')]
        # 6. remove punctuation.
        characters = [',','’', '\'','.','DBSCAN', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%','-','...','^','{','}']
        words_lists = [word for word in cleaned_words if word not in characters]
        text = str()
        for w in words_lists:
            text+=w
            text+=' '
        # 7. remove emoji
        df.loc[i, 'tweet_text'] = remove_emoji(text)
    return df

In [39]:
train_fast = text_processor(train).drop(['claim', 'topic_id', 'tweet_id', 'tweet_url'], axis=1)
test_fast = text_processor(test).drop(['topic_id', 'tweet_id', 'tweet_url'], axis=1)

In [42]:
train_fast['check_worthiness'] = np.where(train_fast['check_worthiness'] == 1, '__label__Yes', '__label__No') 

train_fast.loc[:3]

Unnamed: 0,tweet_text,check_worthiness
0,since never get report medium want share copy ...,__label__Yes
1,thanks michaelbloomberg handy little unintenti...,__label__No
2,folks say `` corona virus n't big deal kill di...,__label__No
3,1 case corona virus india people crazy mask da...,__label__No


In [44]:
train_fast[['check_worthiness', 'tweet_text']].to_csv('final_data/train_fast.txt', sep=' ', header=False, index=False)

In [45]:
test_fast[['tweet_text']].to_csv('final_data/test_fast.txt', sep=' ', header=False, index=False)

### 2. Modeling

In [24]:
print('Best model parameters: %d epochs, %f lr, %d - grams, ws = %d, %s loss' % \
      (best_e, best_lr, best_n, best_ws, best_loss))
print('Average Precision of %.3f' % best_p)  

Best model parameters: 10 epochs, 0.300000 lr, 4 - grams, ws = 5, softmax loss
Average Precision of 0.813


In [64]:
model = fasttext.train_supervised(input="data/train_fast.txt",
                                  epoch=10, lr=0.3, wordNgrams=4,
                                  ws=5, loss='softmax')
preds_proba = []

for tweet in test_fast['tweet_text']:
    pred = model.predict(tweet)
    if pred[0][0] == '__label__Yes':
        preds_proba.append(pred[1][0])
    else:
        preds_proba.append(1 - pred[1][0])

results = pd.DataFrame(columns=['topic_id',  'tweet_id', 'score', 'run_id'])
results['tweet_id'] = test['tweet_id']
results['score'] = preds_proba
results['topic_id'] = 'covid-19'
results['run_id'] = 'TeamGolfModel2'
results = results.sort_values(['score'], ascending=False)
#results['rank'] = [x for x in range(1, len(test)+1)]
results

Unnamed: 0,topic_id,tweet_id,score,run_id
24,covid-19,1236947198971125761,0.619843,TeamGolfModel2
110,covid-19,1237382759812861952,0.614859,TeamGolfModel2
76,covid-19,1237576863901290496,0.592716,TeamGolfModel2
65,covid-19,1237148966204125186,0.567603,TeamGolfModel2
34,covid-19,1237512557088260097,0.562691,TeamGolfModel2
...,...,...,...,...
35,covid-19,1237226775148662784,0.268967,TeamGolfModel2
133,covid-19,1237005180199219203,0.234906,TeamGolfModel2
80,covid-19,1237179694815969280,0.198092,TeamGolfModel2
66,covid-19,1236859724353716224,0.126218,TeamGolfModel2


In [65]:
results.to_csv('final_results/system_2_results.tsv', sep='\t', header=False, index=False, float_format='%.15f')