In [1]:
import fasttext.FastText as ftt
import pandas as pd
import numpy as np
import nltk as text
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_score
from contractions import fix
import string
SEED = 42

In [2]:
class PreproccessText:
    def __init__(self):
        self.stop_words = set(text.corpus.stopwords.words("english") + list(string.punctuation))
        self.lemm = text.stem.WordNetLemmatizer().lemmatize
    
    def fit(self, corpus):
        # ? normalization of str to lower case
        corpus = corpus.lower()
        # ? expand contraction like can't -> cannot
        corpus = fix(corpus)

        # ? tokenize
        word_tokens = text.tokenize.word_tokenize(corpus)
        # ? lemmatize
        word_tokens = [self.lemm(w) for w in word_tokens]
        # ? remove stopwords & punct & 
        filtered_sentence = [self.lemm(w) for w in word_tokens if w.isalpha and not w in self.stop_words ]

        return " ".join(filtered_sentence)


In [3]:
# Prepare data to fast text
cleanText = PreproccessText().fit

data = pd.read_csv("./roars.csv")
data.dropna(inplace=True)

labels = [f"__label__{i}" for i in range(1, 11)]
data.category = data["category"].apply(lambda i: labels[i-1])
data = data[["category", "roar_text"]]

data.roar_text = data["roar_text"].apply(cleanText)


In [4]:
data.head()

Unnamed: 0,category,roar_text
0,__label__2,doe zebra stripe provides camouflage predator ...
1,__label__4,itsy bitsy sipder climb waterspout
2,__label__3,woman get pm premenstrual syndrome pm group sy...
3,__label__3,co-worker guilty unsanitary hygiene would righ...
4,__label__3,risk alternative medicine gigantic question --...


In [5]:
np.savetxt("./dataset/all.txt", data.values, fmt='%s')
X_tr, X_t, Y_tr, Y_t = train_test_split(data["roar_text"], data["category"], train_size=0.8, random_state=SEED)
skf = StratifiedKFold(n_splits=len(set(Y_tr)),random_state=SEED, shuffle=True)

np.savetxt(f"./dataset/train.txt", pd.DataFrame(list(zip(Y_tr, X_tr))).values, fmt="%s")
np.savetxt(f"./dataset/test.txt", pd.DataFrame(list(zip(Y_t, X_t))).values, fmt="%s")

X_tr = X_tr.reset_index()["roar_text"]
X_t = X_t.reset_index()["roar_text"]
Y_tr = Y_tr.reset_index()["category"]
Y_t = Y_t.reset_index()["category"]

y_val = []

i = 0
for t, v in skf.split(X_tr, Y_tr):
    xt, yt, xv, yv = X_tr[t], Y_tr[t], X_tr[v], Y_tr[v]
    y_val.append(yv)
    train = pd.DataFrame(list(zip(yt, xt)), columns=["category", "roar_text"])
    validation = pd.DataFrame(list(zip(yv, xv)), columns=["category", "roar_text"])
    np.savetxt(f"./dataset/train/train_{i}.txt", train.values, fmt="%s")
    np.savetxt(f"./dataset/validation/validation_{i}.txt", validation.values, fmt="%s")
    i += 1

In [6]:
def baseline(x):
    pb = [ 0.11435443314748005, 0.2252760467912977, 0.32592106701650814, 0.42612878539411825, 0.5262271783098283, 0.6256258882693779, 0.7210670165081448, 0.8151087788345905, 0.9077074450639554, 1.0 ]
    lb = [8,6,5,7,4,9,1,10,3,2]
    U = np.random.uniform(0,1)
    for i in range(10):
        if U < pb[i]:
            return f"__label__{lb[i]}"

In [12]:
metrics = []
b_metrics = []

for i in range(10):
   model =  ftt.train_supervised(input=f"./dataset/train/train_{i}.txt")
   metrics.append(model.test(f"./dataset/validation/validation_{i}.txt"))
   bl = [baseline(i) for i in range(metrics[-1][0])]
   b_metrics.append(precision_score(bl, y_val[i], average="weighted"))


Read 0M words
Number of words:  73481
Number of labels: 10
Progress: 100.0% words/sec/thread:  659463 lr:  0.000000 avg.loss:  1.306133 ETA:   0h 0m 0s
Read 0M words
Number of words:  73311
Number of labels: 10
Progress: 100.0% words/sec/thread:  882157 lr:  0.000000 avg.loss:  1.293636 ETA:   0h 0m 0s
Read 0M words
Number of words:  73507
Number of labels: 10
Progress: 100.0% words/sec/thread:  665057 lr:  0.000000 avg.loss:  1.319578 ETA:   0h 0m 0s
Read 0M words
Number of words:  73260
Number of labels: 10
Progress: 100.0% words/sec/thread:  757880 lr:  0.000000 avg.loss:  1.332956 ETA:   0h 0m 0s
Read 0M words
Number of words:  73352
Number of labels: 10
Progress: 100.0% words/sec/thread:  886044 lr:  0.000000 avg.loss:  1.304363 ETA:   0h 0m 0s
Read 0M words
Number of words:  73412
Number of labels: 10
Progress: 100.0% words/sec/thread:  885738 lr:  0.000000 avg.loss:  1.294916 ETA:   0h 0m 0s
Read 0M words
Number of words:  73203
Number of labels: 10
Progress: 100.0% words/sec/th

In [13]:
model = ftt.train_supervised(input="./dataset/train.txt")

Read 0M words
Number of words:  79023
Number of labels: 10
Progress: 100.0% words/sec/thread:  842099 lr:  0.000000 avg.loss:  1.256958 ETA:   0h 0m 0ss


In [14]:
model.test("./dataset/test.txt")

(9147, 0.6180168361211326, 0.6180168361211326)

In [15]:
print("FastText precission", np.mean(pd.DataFrame(metrics, columns=["count", "pr", "rec"])['pr']))
print("Baseline precission",np.mean(b_metrics))

FastText precission 0.6158575490589124
Baseline precission 0.10110014847022777


In [16]:
class TextClassifier: 
    def __init__(self, model, threshold, preprocess: PreproccessText = None):
        self.model = model
        self.threshold = threshold
        self.preprocess = PreproccessText().fit if preprocess == None or not isinstance(preprocess, PreproccessText) else preprocess
    
    def predict(self, corpus):
        corpus = self.preprocess(corpus)
        result = self.model.predict(corpus)
        if result[1][0] >= self.threshold:
            return int(result[0][0].split("__")[-1])
        return -1

In [17]:
clf = TextClassifier(ftt.train_supervised(input="./dataset/all.txt"), 0.6158028444882493)

Read 1M words
Number of words:  92478
Number of labels: 10
Progress: 100.0% words/sec/thread:  817587 lr:  0.000000 avg.loss:  1.198228 ETA:   0h 0m 0s


In [139]:
model.save_model("./model.bin")