# FastText

In [None]:
# import packages
import numpy as np
import pandas as pd

# model training
import fasttext
from sklearn.metrics import classification_report

# for display
from IPython.display import display, HTML

## Load Data

In [None]:
# Location and naming scheme of the pre-processed emoticon splits.
# Each split is later read from: path + <split name> + suffix.
path = 'data/emoticon/'
train, val, test = 'train', 'val', 'test'
suffix = '_newpreproc_emoticon.csv'

In [None]:
# Splits to load, in order.
data_names = [train, val, test]
# Column whose text is written to the fastText training file and passed to
# model.predict (the original note listed 'phrase', 'phrase_lemma' and
# 'phrase_stem' as other candidate columns).
text_column = 'phrase_emoticon_unique'
# Sub-directory name used in the fastText training-file path and results path.
old_new = 'new_preproc_emoticon'

# Read each split's CSV, store labels as int32, and register the frame in
# `data` under the key "<split>_<text_column>".
data = {}
for name in data_names:
    df = pd.read_csv(f'{path}{name}{suffix}').astype({'label': 'int32'})
    data[f'{name}_{text_column}'] = df

In [None]:
# Sanity check of every loaded frame: key, shape, label distribution, first rows.
for split_key, split_df in data.items():
    print(split_key)
    print(split_df.shape)
    print(split_df['label'].value_counts())
    display(split_df.head(3))
    print('\n')

In [None]:
# Stack the train and val frames into one training frame and register it in
# `data` as well. The original row indices are kept (no re-indexing).
frames_to_merge = [data[f'train_{text_column}'], data[f'val_{text_column}']]
train_all = pd.concat(frames_to_merge)
data[f'train_all_{text_column}'] = train_all

In [None]:
# convert into txt file for training
# Each line written is one example in the form "__label__<tag> <text> ".
train_all_txt = train_all[[text_column, 'label']]

# Numeric label -> 3-letter tag. The prediction cells recover the tag with
# `[-3:]`, so every tag must stay exactly three characters long.
label_tags = {1: 'pos', -1: 'neg', 0: 'zer'}

# utf-8 is requested explicitly: fastText expects UTF-8 input, and the platform
# default encoding may be unable to represent every character in the text.
with open(f"data/fasttext/{old_new}/train_all_{text_column}.txt", "w", encoding="utf-8") as f:
    # Iterate the two columns directly instead of building a row Series per
    # example with .iloc.
    for text, label in zip(train_all_txt[text_column], train_all_txt['label']):
        # Labels outside {1, -1, 0} keep the original fallback of an empty tag.
        label_txt = label_tags.get(label, '')
        # Remove embedded newlines (exactly as done before model.predict) so a
        # single example can never spill over several lines of the file.
        text = str(text).replace("\n", "")
        f.write(f'__label__{label_txt} {text} \n')

## Custom Model

Useful Links:
- https://fasttext.cc/docs/en/supervised-tutorial.html
- https://pypi.org/project/fasttext/
- https://towardsdatascience.com/natural-language-processing-with-fasttext-part-1-an-intro-to-text-classification-with-fasttext-11b9771722d8
- https://fasttext.cc/docs/en/crawl-vectors.html

In [None]:
# Fit a supervised fastText classifier on the file written above; no
# hyper-parameters are passed, so the library defaults apply.
train_file = f"data/fasttext/{old_new}/train_all_{text_column}.txt"
model = fasttext.train_supervised(input=train_file)

## Predict on Test

In [None]:
# Score the test split. For every row keep fastText's raw output, then extract
# the last three characters of the first returned label and the first returned
# probability.
test_pred_df = data[f'test_{text_column}'].copy()
test_pred_df['raw_output'] = test_pred_df[text_column].map(
    # model.predict handles one line at a time, hence the newline removal
    lambda text: model.predict(text.replace("\n", ""))
)
test_pred_df['raw_pred'] = test_pred_df['raw_output'].map(lambda out: out[0][0][-3:])
test_pred_df['raw_prob'] = test_pred_df['raw_output'].map(lambda out: out[1][0])

In [None]:
# How often each 3-letter tag was predicted.
test_pred_df['raw_pred'].value_counts()

In [None]:
def label_from_txt(x):
    """Convert a 3-letter tag into its numeric label.

    'pos' maps to 1, 'neg' maps to -1, and any other value maps to 0.
    """
    if x == 'pos':
        return 1
    return -1 if x == 'neg' else 0

In [None]:
# Translate each predicted tag into its numeric label.
test_pred_df['pred'] = test_pred_df['raw_pred'].map(label_from_txt)

In [None]:
# Classification report of predictions vs. true labels, as a dict for saving.
test_results = classification_report(test_pred_df.label, test_pred_df.pred, output_dict=True)
# save to txt; the context manager closes the file even if the write raises
with open(f'model_results/fasttext/custom/{old_new}/test_{text_column}.txt', "w") as f:
    f.write(str(test_results))

# Human-readable version of the same report, with four decimals.
print(classification_report(test_pred_df.label, test_pred_df.pred, digits=4))

In [None]:
# Distribution of the true labels in the scored frame.
test_pred_df['label'].value_counts()

In [None]:
# Distribution of the predicted numeric labels, for comparison with the true ones.
test_pred_df['pred'].value_counts()

# Stacking Code

## Load Data

In [None]:
# Stacking configuration. `fold_num` selects which train<N>.csv / fold<N>.csv
# pair is read from `path` in the cells below.
fold_num = 5
path, suffix = 'data/stacking_folds/', '.csv'

text_column = 'phrase_stem'  # DO NOT CHANGE THIS

In [None]:
# Files to load: train<fold_num> (used for training below) and fold<fold_num>
# (used for prediction below).
# Alternative for the final run: data_names = ['train_all', 'test']
data_names = [f'train{fold_num}', f'fold{fold_num}']

# Read each CSV, store labels as int32, and register the frame in `data`
# under the file's base name.
data = {}
for name in data_names:
    df = pd.read_csv(f'{path}{name}{suffix}').astype({'label': 'int32'})
    data[name] = df

In [None]:
# Sanity check of every loaded frame: key, shape, label distribution, first rows.
for frame_key, frame in data.items():
    print(frame_key)
    print(frame.shape)
    print(frame['label'].value_counts())
    display(frame.head(3))
    print('\n')

In [None]:
# convert into txt file for training
# Each line written is one example in the form "__label__<tag> <text> ".
train = data[f'train{fold_num}']
# train = data['train_all']
train_txt = train[[text_column, 'label']]

# Numeric label -> 3-letter tag. The prediction cells recover the tag with
# `[-3:]`, so every tag must stay exactly three characters long.
label_tags = {1: 'pos', -1: 'neg', 0: 'zer'}

# utf-8 is requested explicitly: fastText expects UTF-8 input, and the platform
# default encoding may be unable to represent every character in the text.
# with open(f"data/fasttext/stacking_folds/train_all.txt", "w", encoding="utf-8") as f:
with open(f"data/fasttext/stacking_folds/train_{fold_num}.txt", "w", encoding="utf-8") as f:
    # Iterate the two columns directly instead of building a row Series per
    # example with .iloc.
    for text, label in zip(train_txt[text_column], train_txt['label']):
        # Labels outside {1, -1, 0} keep the original fallback of an empty tag.
        label_txt = label_tags.get(label, '')
        # Remove embedded newlines (exactly as done before model.predict) so a
        # single example can never spill over several lines of the file.
        text = str(text).replace("\n", "")
        f.write(f'__label__{label_txt} {text} \n')

## Custom Model

Useful Links:
- https://fasttext.cc/docs/en/supervised-tutorial.html
- https://pypi.org/project/fasttext/
- https://towardsdatascience.com/natural-language-processing-with-fasttext-part-1-an-intro-to-text-classification-with-fasttext-11b9771722d8
- https://fasttext.cc/docs/en/crawl-vectors.html

In [None]:
# Fit a supervised fastText classifier on this fold's training file; no
# hyper-parameters are passed, so the library defaults apply.
# Alternative for the final run: "data/fasttext/stacking_folds/train_all.txt"
fold_train_file = f"data/fasttext/stacking_folds/train_{fold_num}.txt"
model = fasttext.train_supervised(input=fold_train_file)

## Predict on Folds

In [None]:
from scipy.special import softmax

def get_index(lst, tag):
    """Return the position of the first entry in `lst` ending with `tag`.

    An entry matches when its last three characters equal `tag`.
    Returns None when no entry matches.
    """
    for position, entry in enumerate(lst):
        if entry[-3:] == tag:
            return position
    return None
        
def label_from_txt(x):
    """Convert a 3-letter tag into its numeric label.

    'pos' maps to 1, 'neg' maps to -1, and any other value maps to 0.
    """
    if x == 'pos':
        return 1
    return -1 if x == 'neg' else 0

In [None]:
# predict on the fold<fold_num> frame (swap in data['test'] for the final run)
test_pred_df = data[f'fold{fold_num}'].copy()
# test_pred_df = data['test'].copy()
# k=-1 asks fastText for all labels together with their probabilities.
test_pred_df['raw_output'] = test_pred_df.apply(lambda x: model.predict(x[text_column].replace("\n", ""), k=-1), axis=1)
# Top prediction: last three characters of the first label returned by a default predict call.
test_pred_df['raw_pred'] = test_pred_df.apply(lambda x: model.predict(x[text_column].replace("\n", ""))[0][0][-3:], axis=1)
test_pred_df['pred'] = test_pred_df.apply(lambda x: label_from_txt(x['raw_pred']), axis=1)

# model.predict already returns probabilities, so they are stored unchanged.
# Running them through softmax a second time (as this cell used to) squashes
# them toward uniform, e.g. [0.9, 0.08, 0.02] -> ~[0.54, 0.24, 0.22], which
# distorts the features handed to the stacking model.
test_pred_df['raw_prob'] = test_pred_df.apply(lambda x: np.asarray(x.raw_output[1]), axis=1)
# Position of the 'pos' / 'neg' label inside each row's returned label tuple.
test_pred_df['pos_index'] = test_pred_df.apply(lambda x: get_index(list(x.raw_output[0]), 'pos'), axis=1)
test_pred_df['neg_index'] = test_pred_df.apply(lambda x: get_index(list(x.raw_output[0]), 'neg'), axis=1)
# Probability assigned to the positive / negative class for each row.
test_pred_df['fasttext_prob_pos'] = test_pred_df.apply(lambda x: x.raw_prob[x.pos_index], axis=1)
test_pred_df['fasttext_prob_neg'] = test_pred_df.apply(lambda x: x.raw_prob[x.neg_index], axis=1)

In [None]:
# Write the complete prediction frame (all columns) for this fold, without the index.
# Alternative for the final run: 'stacking_preds/fasttext/fasttext_test_full.csv'
full_preds_path = f'stacking_preds/fasttext/fasttext_fold{fold_num}_full.csv'
test_pred_df.to_csv(full_preds_path, index=False)

In [None]:
# Write only the two probability columns for this fold, without the index.
# Alternative for the final run: 'stacking_preds/fasttext/fasttext_test.csv'
prob_columns = ['fasttext_prob_pos', 'fasttext_prob_neg']
test_preds_only = test_pred_df[prob_columns]
test_preds_only.to_csv(f'stacking_preds/fasttext/fasttext_fold{fold_num}.csv', index=False)

In [None]:
# Classification report (four decimals) of the fold predictions vs. the true labels.
fold_report = classification_report(test_pred_df['label'], test_pred_df['pred'], digits=4)
print(fold_report)

In [None]:
# Distribution of the true labels in the scored frame.
test_pred_df['label'].value_counts()

In [None]:
# Distribution of the predicted numeric labels, for comparison with the true ones.
test_pred_df['pred'].value_counts()

# Train Final Model

In [None]:
# Read the full labelled data set and store its labels as int32.
df = pd.read_csv('data/stacking_folds/ALL_LABELLED_DATA.csv').astype({'label': 'int32'})

In [None]:
# Size of the labelled data set, followed by a preview of its first five rows.
print(df.shape)
df.head(n=5)

In [None]:
# convert into txt file for training
# Each line written is one example in the form "__label__<tag> <text> ".
df_txt = df[['phrase_stem', 'label']]

# Numeric label -> 3-letter tag. The prediction cells recover the tag with
# `[-3:]`, so every tag must stay exactly three characters long.
label_tags = {1: 'pos', -1: 'neg', 0: 'zer'}

# utf-8 is requested explicitly: fastText expects UTF-8 input, and the platform
# default encoding may be unable to represent every character in the text.
with open(f"data/fasttext/all_labelled_data.txt", "w", encoding="utf-8") as f:
    # Iterate the two columns directly instead of building a row Series per
    # example with .iloc.
    for text, label in zip(df_txt['phrase_stem'], df_txt['label']):
        # Labels outside {1, -1, 0} keep the original fallback of an empty tag.
        label_txt = label_tags.get(label, '')
        # Remove embedded newlines (exactly as done before model.predict) so a
        # single example can never spill over several lines of the file.
        text = str(text).replace("\n", "")
        f.write(f'__label__{label_txt} {text} \n')

In [None]:
# Fit the final supervised fastText classifier on all labelled data; no
# hyper-parameters are passed, so the library defaults apply.
final_train_file = f"data/fasttext/all_labelled_data.txt"
model = fasttext.train_supervised(input=final_train_file)

In [None]:
# Score the very data the final model was trained on, as a check. For every
# row keep fastText's raw output, then extract the last three characters of the
# first returned label and the first returned probability.
test_pred_df = df.copy()
test_pred_df['raw_output'] = test_pred_df['phrase_stem'].map(
    # model.predict handles one line at a time, hence the newline removal
    lambda text: model.predict(text.replace("\n", ""))
)
test_pred_df['raw_pred'] = test_pred_df['raw_output'].map(lambda out: out[0][0][-3:])
test_pred_df['raw_prob'] = test_pred_df['raw_output'].map(lambda out: out[1][0])

In [None]:
# How often each 3-letter tag was predicted.
test_pred_df['raw_pred'].value_counts()

In [None]:
def label_from_txt(x):
    """Convert a 3-letter tag into its numeric label.

    'pos' maps to 1, 'neg' maps to -1, and any other value maps to 0.
    """
    if x == 'pos':
        return 1
    return -1 if x == 'neg' else 0

In [None]:
# Translate each predicted tag into its numeric label.
test_pred_df['pred'] = test_pred_df['raw_pred'].map(label_from_txt)

In [None]:
# Classification report (four decimals) of the final model on the data it was trained on.
final_report = classification_report(test_pred_df['label'], test_pred_df['pred'], digits=4)
print(final_report)

In [None]:
# Persist the final model to the working directory.
model_path = "model_fasttext.bin"
model.save_model(model_path)