# Fasttext

In [2]:
import fasttext
import pandas as pd
from sklearn.model_selection import train_test_split

## Load data and create fasttext data

In [3]:
coal = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/coal.csv")
oil = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/oil.csv")
gas = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/gas.csv")
df = pd.read_csv("~/dev/hist-aware/notebooks/sentiment/df.csv")

In [4]:
df.columns

Index(['Unnamed: 0', 'labels', 'text', 'energy', 'article_filepath',
       'article_name', 'count', 'date', 'dir', 'index_article',
       'index_metadata', 'metadata_filepath', 'newspaper_language',
       'newspaper_publisher', 'newspaper_source', 'newspaper_title',
       'newspaper_volume', 'newspaper_issuenumber', 'newspaper_city',
       'text_clean', 'type', 'text_split'],
      dtype='object')

In [5]:
train_x, valid_x, train_y, valid_y = train_test_split(
    df.text_split,
    df.labels,
    stratify=df.labels,
    random_state=42, 
    test_size=0.2,
    shuffle=True
)

In [6]:
file = open('sentiment/fasttext.df.train', 'w+')
for idx, row in train_x.items():
    line = '__label__' + str(train_y[idx]) + ' ' + train_x[idx]
    file.write(line + '\n')
    
file = open('sentiment/fasttext.df.valid', 'w+')
for idx, row in valid_x.items():
    line = '__label__' + str(valid_y[idx]) + ' ' + valid_x[idx]
    file.write(line + '\n')

## Create Fasttext model

In [54]:
model = fasttext.train_supervised(
    input="sentiment/fasttext.df.train",
    lr=0.2,               # learning rate [0.1]
    dim=100,               # size of word vectors [100]
    ws=10,                # size of the context window [5]
    epoch=50,            # number of epochs [5]
    minCount=20,          # minimal number of word occurences [1]
    wordNgrams=3,        # max length of word ngram [1]
    loss="ova",              # loss function {ns, hs, softmax, ova} [softmax]
    bucket=5000000,           # number of buckets [2000000]
    thread=6,          # number of threads [number of cpus]
    lrUpdateRate=100,     # change the rate of updates for the learning rate [100]
    t=0.0001,                 # sampling threshold [0.0001]
    autotuneValidationFile='sentiment/fasttext.df.valid',
    #verbose           # verbose [2]
)
model.save_model("FASTTEXT_model_manual.bin")

In [None]:
model_2 = fasttext.train_supervised(
    input="sentiment/fasttext.df.train",
    autotuneValidationFile='sentiment/fasttext.df.valid',
    autotuneDuration=1000)
model_2.save_model("FASTTEXT_model_auto.bin")

## Predict model

In [55]:
res = model.test('sentiment/fasttext.df.valid')
print(f"Manually trained model, results on validation:\n- Sample {res[0]} \n- Accuracy: {res[1]}")

print("\n---\n")

res = model_2.test('sentiment/fasttext.df.valid')
print(f"Automatically trained model, results on validation:\n- Sample {res[0]} \n- Accuracy: {res[1]}")

Manually trained model, results on validation:
- Sample 1413 
- Accuracy: 0.7183297947629158

---

Automatically trained model, results on validation:
- Sample 1413 
- Accuracy: 0.7098372257607927


In [61]:
predict_phrase = "coal"
print(f"We want as many predictions as possible (k-1) for the prediction of:\n - {predict_phrase}")
pred = model.predict(predict_phrase, k=-1)
for p, l in zip(pred[1], pred[0]):
    print(f"Label: {l} - Prob: {int(p)}")

We want as many predictions as possible (k-1) for the prediction of:
 - coal
Label: __label__1 - Prob: 1
Label: __label__2 - Prob: 0
Label: __label__0 - Prob: 0
