# FastText System

### 1. Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import emoji
import fasttext
from nltk.stem.lancaster import LancasterStemmer
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


train = pd.read_csv('data/training.tsv', sep='\t')
dev = pd.read_csv('data/dev.tsv', sep='\t')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/tommcdonald/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/tommcdonald/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tommcdonald/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
def remove_emoji(text):
    return emoji.get_emoji_regexp().sub(u'', text)

url_pattern = r"https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,}"

def text_processor(df):
    for i in range(len(df)):
        # 1. remove url.
        text = re.sub(url_pattern, "", df.loc[i, 'tweet_text'])
        # 2. tokenisation.
        token_word = word_tokenize(text)
        # 3. POS.
        token_words = pos_tag(token_word)
        # 4. WordNetLemmatizer
        words_lematizer = []
        wordnet_lematizer = WordNetLemmatizer()
        for word, tag in token_words:
            if tag.startswith('NN'):
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='n')  
            elif tag.startswith('VB'): 
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='v')  
            elif tag.startswith('JJ'): 
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='a')  
            elif tag.startswith('R'): 
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='r')
            else: 
                word_lematizer =  wordnet_lematizer.lemmatize(word)
            words_lematizer.append(word_lematizer)
        # 5. lower case and remove stop words.
        cleaned_words = [word.lower() for word in words_lematizer if word.lower() not in stopwords.words('english')]
        # 6. remove punctuation.
        characters = [',','’', '\'','.','DBSCAN', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%','-','...','^','{','}']
        words_lists = [word for word in cleaned_words if word not in characters]
        text = str()
        for w in words_lists:
            text+=w
            text+=' '
        # 7. remove emoji
        df.loc[i, 'tweet_text'] = remove_emoji(text)
    return df

In [3]:
train_fast = text_processor(train).drop(['claim', 'topic_id', 'tweet_id', 'tweet_url'], axis=1)
dev_fast = text_processor(dev).drop(['claim', 'topic_id', 'tweet_id', 'tweet_url'], axis=1)

In [4]:
train_fast['claim_worthiness'] = np.where(train_fast['claim_worthiness'] == 1, '__label__Yes', '__label__No') 
dev_fast['claim_worthiness'] = np.where(dev_fast['claim_worthiness'] == 1, '__label__Yes', '__label__No')

train_fast.loc[:3]

Unnamed: 0,tweet_text,claim_worthiness
0,since never get report medium want share copy ...,__label__Yes
1,thanks michaelbloomberg handy little unintenti...,__label__No
2,folks say `` corona virus n't big deal kill di...,__label__No
3,1 case corona virus india people crazy mask da...,__label__Yes


In [5]:
train_fast[['claim_worthiness', 'tweet_text']].to_csv('data/train_fast.txt', sep=' ', header=False, index=False)

In [6]:
dev_fast[['claim_worthiness', 'tweet_text']].to_csv('data/dev_fast.txt', sep=' ', header=False, index=False)

### 2. Modeling

In [137]:
model = fasttext.train_supervised(input="data/train_fast.txt", epoch=51, lr=0.065, wordNgrams=4)

In [138]:
preds_proba = []

for tweet in dev_fast['tweet_text']:
    pred = model.predict(tweet)
    if pred[0][0] == '__label__Yes':
        preds_proba.append(pred[1][0])
    else:
        preds_proba.append(1 - pred[1][0])

In [139]:
results = pd.DataFrame(columns=['topic_id', 'tweet_id', 'score', 'run_id'])
results['tweet_id'] = dev['tweet_id']
results['score'] = preds_proba
results['topic_id'] = 'covid-19'
results['run_id'] = 'Model_4'

results

Unnamed: 0,topic_id,tweet_id,score,run_id
0,covid-19,1235714275752267776,0.544319,Model_4
1,covid-19,1235256530728972290,0.424780,Model_4
2,covid-19,1235648554338791427,0.348335,Model_4
3,covid-19,1235674258858061825,0.547828,Model_4
4,covid-19,1235663306246860800,0.501010,Model_4
...,...,...,...,...
145,covid-19,1235914080931766274,0.529747,Model_4
146,covid-19,1235770706765451264,0.488351,Model_4
147,covid-19,1235973416995315712,0.525457,Model_4
148,covid-19,1235675024738185239,0.414652,Model_4


In [140]:
results.to_csv('golf_system_results_4.tsv', sep='\t', header=False, index=False, float_format='%.15f')