# FastText Sentiment Analysis Model

In [1]:
# filesystem helpers
import os

# read and pre-process data
import pandas as pd

# modelling
import fasttext

# evaluation metrics
from sklearn.metrics import f1_score, precision_score, recall_score

In [2]:
# base names of the dataset variants; the same name is combined with a
# directory prefix and a file extension to locate the train, test and model files
filename = [
    "sample_crypto_lemmatize_title",
    "sample_crypto_lemmatize_excerpt",
    "sample_crypto_lemmatize_text",
    "sample_crypto_stem_title",
    "sample_crypto_stem_excerpt",
    "sample_crypto_stem_text",
    "sample_reddit_lemmatize",
    "sample_reddit_stem",
    "sample_twitter_lemmatize",
    "sample_twitter_stem",
    "sample_socialmedia_lemmatize",
    "sample_socialmedia_stem",
    "sample_all_lemmatize",
    "sample_all_stem",
]

## Train and Test Model (Default Parameters)
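Perform initial testing to see which model performs best (before hyperparameter tuning): train one model per dataset variant using fastText's default hyperparameters, apart from the 300-dimensional pretrained word vectors, and compare precision, recall and F1 on the corresponding test sets.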

In [3]:
# training inputs: one .txt file per dataset variant
train_all_filename_prefix = "data/fasttext_date/train_all/"
train_all_filename_postfix = ".txt"
train_all_filename_list = [
    f"{train_all_filename_prefix}{name}{train_all_filename_postfix}" for name in filename
]

# test inputs: one .csv file per dataset variant, in the same order as above
test_filename_prefix = "data/fasttext_date/test/"
test_filename_postfix = ".csv"
test_filename_list = [
    f"{test_filename_prefix}{name}{test_filename_postfix}" for name in filename
]

# destinations for the trained models, again one per dataset variant
model_filename_prefix = "models/fasttext/default_date/"
model_filename_postfix = ".bin"
model_filename_list = [
    f"{model_filename_prefix}{name}{model_filename_postfix}" for name in filename
]

In [4]:
# generate all models, output metrics and save model
# The train, test and model path lists are parallel: position i in each list
# refers to the same dataset variant named filename[i].
EMBEDDING_DIM = 300  # has to agree with the dimensionality of the pretrained vectors
PRETRAINED_VECTORS_PATH = "utils/fasttext/wiki-news-300d-1M.vec"

for i in range(len(train_all_filename_list)):
    # make sure the destination folder exists BEFORE training, so a missing
    # directory cannot make save_model fail after the expensive step has run
    os.makedirs(os.path.dirname(model_filename_list[i]) or ".", exist_ok=True)

    model = fasttext.train_supervised(
        train_all_filename_list[i],
        dim=EMBEDDING_DIM,
        pretrainedVectors=PRETRAINED_VECTORS_PATH,
    )

    test_df = pd.read_csv(test_filename_list[i], header=0)
    # fastText's predict() scores one line at a time and raises ValueError when
    # the input contains "\n", so line breaks are flattened to spaces first.
    # The class id is read from the last character of the top predicted label,
    # which assumes single-digit labels such as "__label__1" — TODO confirm
    # against the label format used in the training files.
    y_test_pred = [
        int(model.predict(x.replace("\n", " "))[0][0][-1]) for x in test_df["text"]
    ]
    y_test_actual = list(test_df["label"])

    # print metrics (label 1 is treated as the positive class)
    print("metrics for:", filename[i])
    print("precision score:", precision_score(y_true=y_test_actual, y_pred=y_test_pred, average="binary", pos_label=1))
    print("recall score:", recall_score(y_true=y_test_actual, y_pred=y_test_pred, average="binary", pos_label=1))
    print("f1 score:", f1_score(y_true=y_test_actual, y_pred=y_test_pred, average="binary", pos_label=1))
    print("_____________________________________________________")
    # save model
    model.save_model(model_filename_list[i])

metrics for: sample_crypto_lemmatize_title
precision score: 0.7254901960784313
recall score: 0.5522388059701493
f1 score: 0.6271186440677965
_____________________________________________________
metrics for: sample_crypto_lemmatize_excerpt
precision score: 0.5
recall score: 0.35
f1 score: 0.4117647058823529
_____________________________________________________
metrics for: sample_crypto_lemmatize_text
precision score: 0.6507936507936508
recall score: 0.6119402985074627
f1 score: 0.6307692307692307
_____________________________________________________
metrics for: sample_crypto_stem_title
precision score: 0.6481481481481481
recall score: 0.5223880597014925
f1 score: 0.5785123966942148
_____________________________________________________
metrics for: sample_crypto_stem_excerpt
precision score: 0.5454545454545454
recall score: 0.4
f1 score: 0.4615384615384615
_____________________________________________________
metrics for: sample_crypto_stem_text
precision score: 0.6721311475409836
rec