# FastText

In [None]:
# import packages
import numpy as np
import pandas as pd

# model training
import fasttext
from sklearn.metrics import classification_report

# for display
from IPython.display import display, HTML

## Load Data

In [None]:
# location and naming scheme of the split CSVs: <path><split><suffix>
path = 'data/old_labels/'
suffix = '_oldpreproc.csv'
# split names, also used as key prefixes when the data dict is built
train, val, test = 'train', 'val', 'test'

In [None]:
# container for the loaded splits, keyed by '<split>_<text_column>'
data = {}
data_names = [train, val, test]
# change this accordingly: 'phrase', 'phrase_lemma', 'phrase_stem'
text_column = 'phrase_stem'
# tag used in the output paths of the result files
old_new = 'old_preproc'

for name in data_names:
    key = f'{name}_{text_column}'
    csv_file = f'{path}{name}{suffix}'
    # load the split and register it under its key
    df = pd.read_csv(csv_file)
    data[key] = df

In [None]:
# sanity check of every loaded split: key, shape, label balance, first rows
for df_name, df in data.items():
    print(df_name, df.shape, sep='\n')
    print(df['label'].value_counts())
    display(df.head(3))
    print('\n')

## Baseline: Yelp Polarity Dataset

In [None]:
# pre-trained fastText model for Yelp review polarity, used as the baseline
yelp_model_path = "utils/fasttext/yelp_review_polarity.bin"
model_yelp = fasttext.load_model(yelp_model_path)

### Predict on Validation
* Goal: find the lower/upper thresholds on the predicted positive-sentiment probability that best classify phrases into 3 categories: -1 (negative), 0 (neutral), 1 (positive)

In [None]:
# run the baseline model on the validation split
val_pred_df = data['val_' + text_column].copy()

# newlines are stripped from each text before it is passed to predict
val_texts = val_pred_df[text_column]
val_pred_df['raw_output'] = val_texts.apply(
    lambda text: model_yelp.predict(text.replace("\n", ""))
)

# raw_output is indexed as (labels, probabilities); only the first entry of each is used.
# The class id is the last character of the label string
# (presumably '__label__1' / '__label__2' -- confirm against the model).
val_raw = val_pred_df['raw_output']
val_pred_df['raw_pred'] = val_raw.apply(lambda out: int(out[0][0][-1]))
val_pred_df['raw_prob'] = val_raw.apply(lambda out: out[1][0])

# probability of positive sentiment: class 2 is treated as positive,
# so the probability is flipped whenever the model predicted the other class
val_is_positive = val_pred_df['raw_pred'] == 2
val_pred_df['prob_pos'] = val_pred_df['raw_prob'].where(
    val_is_positive, 1 - val_pred_df['raw_prob']
)

In [None]:
# preview the first rows, including the raw model output and the derived columns
val_pred_df.head(3)

In [None]:
# map a positive-sentiment probability onto a class label using two cut-offs
def label_from_prob(x, lower, upper):
    """Return -1, 0 or 1 depending on where `x` falls relative to the thresholds.

    x     -- probability of positive sentiment
    lower -- values strictly below this are labelled -1 (negative)
    upper -- values strictly below this (and not below `lower`) are labelled 0 (neutral)

    Anything else is labelled 1 (positive).
    """
    if x < lower:
        return -1
    if x < upper:
        return 0
    return 1

In [None]:
# grid search for best threshold
# candidate pairs are symmetric around 0.5: lower = 0.05 ... 0.40, upper = 1 - lower
lower_lst = [round(i*0.05, 2) for i in range(1, 9)]
upper_lst = [round(1 - i, 2) for i in lower_lst]

# column layout of the results table
result_columns = ['lower_thresh', 'upper_thresh', 'accuracy',
                  'weighted_ave_f1', 'f1_neg', 'f1_zero', 'f1_pos']

# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
threshold_rows = []

for lower, upper in zip(lower_lst, upper_lst):
    df = val_pred_df.copy()
    # assign labels based on threshold definition
    df['pred'] = df['prob_pos'].apply(lambda p: label_from_prob(p, lower, upper))

    # classification report as a nested dict
    report = classification_report(df.label, df.pred, output_dict=True)

    # per-class entries are looked up by the string form of the label,
    # so this assumes the labels are the integers -1, 0 and 1
    threshold_rows.append({
        'lower_thresh': lower,
        'upper_thresh': upper,
        'accuracy': report['accuracy'],
        'weighted_ave_f1': report['weighted avg']['f1-score'],
        'f1_neg': report['-1']['f1-score'],
        'f1_zero': report['0']['f1-score'],
        'f1_pos': report['1']['f1-score'],
    })

# dataframe to store results (one row per threshold pair, in search order)
val_threshold_results = pd.DataFrame(threshold_rows, columns=result_columns)

In [None]:
# persist the threshold search results, then show them as the cell output
val_threshold_path = f'model_results/fasttext/baseline/{old_new}/val_threshold_{text_column}.csv'
val_threshold_results.to_csv(val_threshold_path)
val_threshold_results

### Apply Best Threshold on Test Set

In [None]:
# pick the threshold pair with the highest weighted-average F1 on validation
f1_scores = val_threshold_results['weighted_ave_f1']
best_row = val_threshold_results.loc[f1_scores == f1_scores.max()]
if best_row.empty:
    raise ValueError('no threshold pair with a valid weighted_ave_f1 found')

# Use positional access: the filtered frame keeps its original index labels,
# so `[0]` (a label lookup) only works when the best row happens to be row 0.
# On ties the first matching row wins.
best_lower = best_row['lower_thresh'].iloc[0]
best_upper = best_row['upper_thresh'].iloc[0]

In [None]:
# run the baseline model on the test split
test_pred_df = data['test_' + text_column].copy()

# newlines are stripped from each text before it is passed to predict
test_texts = test_pred_df[text_column]
test_pred_df['raw_output'] = test_texts.apply(
    lambda text: model_yelp.predict(text.replace("\n", ""))
)

# raw_output is indexed as (labels, probabilities); only the first entry of each is used.
# The class id is the last character of the label string
# (presumably '__label__1' / '__label__2' -- confirm against the model).
test_raw = test_pred_df['raw_output']
test_pred_df['raw_pred'] = test_raw.apply(lambda out: int(out[0][0][-1]))
test_pred_df['raw_prob'] = test_raw.apply(lambda out: out[1][0])

# probability of positive sentiment: class 2 is treated as positive,
# so the probability is flipped whenever the model predicted the other class
test_is_positive = test_pred_df['raw_pred'] == 2
test_pred_df['prob_pos'] = test_pred_df['raw_prob'].where(
    test_is_positive, 1 - test_pred_df['raw_prob']
)

In [None]:
# turn positive-sentiment probabilities into -1/0/1 labels with the thresholds chosen on validation
test_pred_df['pred'] = test_pred_df['prob_pos'].apply(
    lambda prob: label_from_prob(prob, best_lower, best_upper)
)

In [None]:
# classification report on the test set as a nested dict
test_results = classification_report(test_pred_df.label, test_pred_df.pred, output_dict=True)

# save to txt (string form of the dict); the with-block closes the file
# even if the write raises
with open(f'model_results/fasttext/baseline/{old_new}/test_{text_column}.txt', "w") as f:
    f.write(str(test_results))

# human-readable version of the same report
print(classification_report(test_pred_df.label, test_pred_df.pred, digits=4))

In [None]:
# class distribution of the reference labels in the test set
test_pred_df.label.value_counts()

In [None]:
# class distribution of the predicted labels, for comparison with the reference labels
test_pred_df.pred.value_counts()