# Facebook FASTTEXT

In [63]:
import os

import fasttext
import pandas as pd
from sklearn.model_selection import train_test_split

Set PATH

In [64]:
PATH_DATA = "/home/leonardovida/dev/hist-aware/notebooks/data/labeled-full/split_labeled/merged"

## Load data and create fasttext data

In [26]:
# Full data
coal = pd.read_csv(os.path.join(PATH_DATA, "labeled_coal_1970_1990.csv"))
oil = pd.read_csv(os.path.join(PATH_DATA, "labeled_oil_1970_1990.csv"))
gas = pd.read_csv(os.path.join(PATH_DATA, "labeled_gas_1970_1990.csv"))
df = pd.read_csv(os.path.join(PATH_DATA, "labeled_energy_1970_1990.csv"))

---

In [66]:
for DECADE in ["1970", "1980", "1990"]:    
    # create train and valid
    for TOPIC in ["coal", "oil", "gas", "df"]:
        df = pd.read_csv(os.path.join(PATH_DATA, f"edo_{DECADE}s_{TOPIC}_cleaned.csv"))
        
        train_x_, valid_x, train_y, valid_y = train_test_split(
            df.text_clean,
            df.labels,
            stratify=df.labels,
            random_state=42, 
            test_size=0.2,
            shuffle=True
        )
        
        print(train_x_)
    
        file = open(os.path.join(PATH_DATA, f"{DECADE}", f'fasttext.train.{TOPIC}'), 'w+')
        for idx, row in train_x.items():
            line = '__label__' + str(train_y[idx]) + ' ' + train_x[idx]
            file.write(line + '\n')

        file = open(os.path.join(PATH_DATA, f"{DECADE}", f'fasttext.valid.{TOPIC}'), 'w+')
        for idx, row in valid_x.items():
            line = '__label__' + str(valid_y[idx]) + ' ' + valid_x[idx]
            file.write(line + '\n')

['Unnamed: 0' 'labels' 'text' 'energy' 'article_filepath' 'article_name'
 'count' 'date' 'dir' 'index_article' 'index_metadata' 'metadata_filepath'
 'newspaper_language' 'newspaper_publisher' 'newspaper_source'
 'newspaper_title' 'newspaper_volume' 'newspaper_issuenumber'
 'newspaper_city' 'text_clean' 'type']


KeyError: 1860

## Create Fasttext model

In [37]:
model = fasttext.train_supervised(
    input=os.path.join(PATH_DATA, 'fasttext.df.train'),
    lr=0.2,               # learning rate [0.1]
    dim=100,               # size of word vectors [100]
    ws=10,                # size of the context window [5]
    epoch=50,            # number of epochs [5]
    minCount=20,          # minimal number of word occurences [1]
    wordNgrams=3,        # max length of word ngram [1]
    loss="ova",              # loss function {ns, hs, softmax, ova} [softmax]
    bucket=5000000,           # number of buckets [2000000]
    thread=6,          # number of threads [number of cpus]
    lrUpdateRate=100,     # change the rate of updates for the learning rate [100]
    t=0.0001,                 # sampling threshold [0.0001]
    autotuneValidationFile=os.path.join(PATH_DATA, 'fasttext.df.valid'),
    #verbose           # verbose [2]
)
model.save_model("FASTTEXT_model_manual.bin")

In [38]:
model_2 = fasttext.train_supervised(
    input=os.path.join(PATH_DATA, 'fasttext.df.train'),
    autotuneValidationFile=os.path.join(PATH_DATA, 'fasttext.df.valid'),
    autotuneDuration=1000)
model_2.save_model("FASTTEXT_model_auto.bin")

## Predict model

In [42]:
res = model.test(os.path.join(PATH_DATA, 'fasttext.df.valid'))
print(f"Manually trained model, results on validation:\n- Sample {res[0]} \n- Accuracy: {res[1]}")

print("\n---\n")

res = model_2.test(os.path.join(PATH_DATA, 'fasttext.df.valid'),)
print(f"Automatically trained model, results on validation:\n- Sample {res[0]} \n- Accuracy: {res[1]}")

Manually trained model, results on validation:
- Sample 1239 
- Accuracy: 0.6795803066989508

---

Automatically trained model, results on validation:
- Sample 1239 
- Accuracy: 0.6852300242130751


In [49]:
predict_phrase = "oil"
print(f"We want as many predictions as possible (k-1) for the prediction of:\n - {predict_phrase}")
pred = model.predict(predict_phrase, k=-1)
print(pred)

We want as many predictions as possible (k-1) for the prediction of:
 - oil
(('__label__1', '__label__2', '__label__0'), array([1.00001001e+00, 1.00000034e-05, 1.00000034e-05]))
