# FastText Sentiment Analysis Model

In [1]:
# read and pre-process data
import pandas as pd

# modelling
import fasttext

# evaluation metrics
from sklearn.metrics import f1_score, precision_score, recall_score

In [2]:
# Evaluation sets for every pre-processing variant. Each list is ordered the
# same way: the combined corpus ("all") first, then the individual sources.
_test_sources = ("all", "news", "reddit", "twitter")

test_lemmatize_filenames = [
    f"data/fasttext/lemmatize/{source}_test_lemmatize.csv" for source in _test_sources
]
test_stem_filenames = [
    f"data/fasttext/stem/{source}_test_stem.csv" for source in _test_sources
]
test_filenames = [
    f"data/fasttext/normal/{source}_test.csv" for source in _test_sources
]

## Train and Test Model (Default Parameters)
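Train one model per pre-processing variant (none, lemmatized, stemmed) with default parameters and evaluate each on every test set, to see which variant performs best before hyperparameter tuning.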

In [3]:
# no pre-processing model
# Train a supervised fastText classifier on the un-processed training text,
# starting from the pretrained wiki-news vectors, then report precision /
# recall / F1 (positive class = 1) on every test set and save the model.
model = fasttext.train_supervised("data/fasttext/normal/all_train.txt", dim=300, pretrainedVectors="utils/fasttext/wiki-news-300d-1M.vec")

# Predicted labels come back as "<prefix><class id>". No custom `label=` is
# passed to train_supervised above, so the library default prefix applies.
label_prefix = "__label__"

print("default fasttext model")
print("----------------------------------------------------------------------")
for test_path in test_filenames:
    print("results for:", test_path)

    # load data
    test_df = pd.read_csv(test_path, header=0)
    y_actual = list(test_df["label"])

    # predictions
    # - predict() processes one line at a time and raises on embedded
    #   newlines (possible in quoted CSV fields), so they become spaces.
    # - [0][0] is the top-1 label; strip the whole prefix instead of taking
    #   the last character so multi-character class ids are parsed correctly.
    y_pred = [
        int(model.predict(text.replace("\n", " "))[0][0][len(label_prefix):])
        for text in test_df["text"]
    ]

    # print results
    precision = precision_score(y_actual, y_pred, average="binary", pos_label=1)
    recall = recall_score(y_actual, y_pred, average="binary", pos_label=1)
    f1 = f1_score(y_actual, y_pred, average="binary", pos_label=1)

    print("precision:", precision)
    print("recall:", recall)
    print("f1:", f1)
    print("----------------------------------------------------------------------")

# save model
model.save_model("models/fasttext/fasttext_default.bin")

default fasttext model
----------------------------------------------------------------------
results for: data/fasttext/normal/all_test.csv
precision: 0.7232704402515723
recall: 0.6534090909090909
f1: 0.6865671641791045
----------------------------------------------------------------------
results for: data/fasttext/normal/news_test.csv
precision: 0.7065217391304348
recall: 0.5508474576271186
f1: 0.6190476190476191
----------------------------------------------------------------------
results for: data/fasttext/normal/reddit_test.csv
precision: 0.6827956989247311
recall: 0.6939890710382514
f1: 0.6883468834688347
----------------------------------------------------------------------
results for: data/fasttext/normal/twitter_test.csv
precision: 0.95
recall: 0.7450980392156863
f1: 0.8351648351648352
----------------------------------------------------------------------


In [6]:
# lemmatized model
# Train a supervised fastText classifier on the lemmatized training text,
# starting from the pretrained wiki-news vectors, then report precision /
# recall / F1 (positive class = 1) on every lemmatized test set and save
# the model.
model = fasttext.train_supervised("data/fasttext/lemmatize/all_train_lemmatize.txt", dim=300, pretrainedVectors="utils/fasttext/wiki-news-300d-1M.vec")

# Predicted labels come back as "<prefix><class id>". No custom `label=` is
# passed to train_supervised above, so the library default prefix applies.
label_prefix = "__label__"

print("lemmatized fasttext model")
print("----------------------------------------------------------------------")
for test_path in test_lemmatize_filenames:
    print("results for:", test_path)

    # load data
    test_df = pd.read_csv(test_path, header=0)
    y_actual = list(test_df["label"])

    # predictions
    # - predict() processes one line at a time and raises on embedded
    #   newlines (possible in quoted CSV fields), so they become spaces.
    # - [0][0] is the top-1 label; strip the whole prefix instead of taking
    #   the last character so multi-character class ids are parsed correctly.
    y_pred = [
        int(model.predict(text.replace("\n", " "))[0][0][len(label_prefix):])
        for text in test_df["text"]
    ]

    # print results
    precision = precision_score(y_actual, y_pred, average="binary", pos_label=1)
    recall = recall_score(y_actual, y_pred, average="binary", pos_label=1)
    f1 = f1_score(y_actual, y_pred, average="binary", pos_label=1)

    print("precision:", precision)
    print("recall:", recall)
    print("f1:", f1)
    print("----------------------------------------------------------------------")

# save model
model.save_model("models/fasttext/fasttext_default_lemmatize.bin")

lemmatized fasttext model
----------------------------------------------------------------------
results for: data/fasttext/lemmatize/all_test_lemmatize.csv
precision: 0.6447368421052632
recall: 0.5568181818181818
f1: 0.5975609756097561
----------------------------------------------------------------------
results for: data/fasttext/lemmatize/news_test_lemmatize.csv
precision: 0.6436781609195402
recall: 0.4745762711864407
f1: 0.5463414634146342
----------------------------------------------------------------------
results for: data/fasttext/lemmatize/reddit_test_lemmatize.csv
precision: 0.5842696629213483
recall: 0.5683060109289617
f1: 0.5761772853185596
----------------------------------------------------------------------
results for: data/fasttext/lemmatize/twitter_test_lemmatize.csv
precision: 0.9230769230769231
recall: 0.7058823529411765
f1: 0.8000000000000002
----------------------------------------------------------------------


In [5]:
# stemmed model
# Train a supervised fastText classifier on the stemmed training text,
# starting from the pretrained wiki-news vectors, then report precision /
# recall / F1 (positive class = 1) on every stemmed test set and save the
# model.
model = fasttext.train_supervised("data/fasttext/stem/all_train_stem.txt", dim=300, pretrainedVectors="utils/fasttext/wiki-news-300d-1M.vec")

# Predicted labels come back as "<prefix><class id>". No custom `label=` is
# passed to train_supervised above, so the library default prefix applies.
label_prefix = "__label__"

print("stem fasttext model")
print("----------------------------------------------------------------------")
for test_path in test_stem_filenames:
    print("results for:", test_path)

    # load data
    test_df = pd.read_csv(test_path, header=0)
    y_actual = list(test_df["label"])

    # predictions
    # - predict() processes one line at a time and raises on embedded
    #   newlines (possible in quoted CSV fields), so they become spaces.
    # - [0][0] is the top-1 label; strip the whole prefix instead of taking
    #   the last character so multi-character class ids are parsed correctly.
    y_pred = [
        int(model.predict(text.replace("\n", " "))[0][0][len(label_prefix):])
        for text in test_df["text"]
    ]

    # print results
    precision = precision_score(y_actual, y_pred, average="binary", pos_label=1)
    recall = recall_score(y_actual, y_pred, average="binary", pos_label=1)
    f1 = f1_score(y_actual, y_pred, average="binary", pos_label=1)

    print("precision:", precision)
    print("recall:", recall)
    print("f1:", f1)
    print("----------------------------------------------------------------------")

# save model
model.save_model("models/fasttext/fasttext_default_stem.bin")

stem fasttext model
----------------------------------------------------------------------
results for: data/fasttext/stem/all_test_stem.csv
precision: 0.6870967741935484
recall: 0.6051136363636364
f1: 0.6435045317220544
----------------------------------------------------------------------
results for: data/fasttext/stem/news_test_stem.csv
precision: 0.6210526315789474
recall: 0.5
f1: 0.5539906103286385
----------------------------------------------------------------------
results for: data/fasttext/stem/reddit_test_stem.csv
precision: 0.6759776536312849
recall: 0.6612021857923497
f1: 0.6685082872928177
----------------------------------------------------------------------
results for: data/fasttext/stem/twitter_test_stem.csv
precision: 0.9166666666666666
recall: 0.6470588235294118
f1: 0.7586206896551724
----------------------------------------------------------------------


AttributeError: '_FastText' object has no attribute 'save'