In [140]:
import fastText #fasttext==0.8.22
import pandas as pd
import os
from fastText import train_supervised
import numpy as np

In [7]:
# To show the output of all lines in a cell rather that just the last line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [8]:
def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))

In [9]:
DATA_DIR = "~/Downloads/sentiment_labelled_sentences/"

In [109]:
data_amazon = pd.read_table(DATA_DIR+"amazon_cells_labelled.txt"
                          , sep='\t', header=None, names=['sentence', 'sentiment'])
data_imdb = pd.read_table(DATA_DIR+"imdb_labelled.txt"
                          , sep='\t', header=None, names=['sentence', 'sentiment'])
data_yelp = pd.read_table(DATA_DIR+"yelp_labelled.txt"
                          , sep='\t', header=None, names=['sentence', 'sentiment'])

In [110]:
all_data = pd.concat([data_amazon, data_imdb, data_yelp])
all_data.head(2)

Unnamed: 0,sentence,sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1


### Formatting the label as the default for fasttext

In [111]:
all_data['label'] = all_data.apply(lambda row: '__label__' + str(row.sentiment), axis=1)

In [112]:
all_data.head(2)

Unnamed: 0,sentence,sentiment,label
0,So there is no way for me to plug it in here i...,0,__label__0
1,"Good case, Excellent value.",1,__label__1


In [113]:
import re

def normalize(row):
    lower = row['sentence'].lower()
    #correct all multiple white spaces to a single white space
    no_mult_ws = re.sub('[\s]+', ' ', lower)
    text = no_mult_ws.strip()
    return text

all_data['normalized_sentence'] = all_data.apply(normalize, axis=1)

In [114]:
all_data.head(2)

Unnamed: 0,sentence,sentiment,label,normalized_sentence
0,So there is no way for me to plug it in here i...,0,__label__0,so there is no way for me to plug it in here i...
1,"Good case, Excellent value.",1,__label__1,"good case, excellent value."


In [115]:
labeled_data = all_data.drop(['sentence', 'sentiment'], axis=1)
labeled_data.head(2)

Unnamed: 0,label,normalized_sentence
0,__label__0,so there is no way for me to plug it in here i...
1,__label__1,"good case, excellent value."


In [122]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(labeled_data, test_size=0.35, random_state=4)

In [117]:
test_data.head(2)

Unnamed: 0,label,normalized_sentence
177,__label__1,"the atmosphere is modern and hip, while mainta..."
720,__label__1,"cute, quaint, simple, honest."


#### Save model to disk to be read by fasttext

In [118]:
train_data.to_csv(path_or_buf='./sentiment.train', header=False, index=False, sep='\t')
test_data.to_csv(path_or_buf='./sentiment.test', header=False, index=False, sep='\t')

### Fasttext model training/eval/etc.
##### This model is a binary classifier.

In [72]:
train_data_path = os.path.join("./", 'sentiment.train')
test_data_path = os.path.join("./", 'sentiment.test')

sentiment_model = train_supervised(
    input=train_data_path,
    lr=1,
    dim=100,
    ws=5,
    epoch=5,
    minCount=1,
    minCountLabel=0,
    minn=2,
    maxn=3,
    neg=5,
    wordNgrams=2,
    loss="softmax",
    bucket=200000,
    lrUpdateRate=100,
    t=1e-4,
    label="__label__",
    verbose=2,
    pretrainedVectors="",
)
print("On train")
print_results(*sentiment_model.test(train_data_path))
print("On test")
print_results(*sentiment_model.test(test_data_path))
sentiment_model.save_model("sentiment_model.bin")

On train
N	1786
P@1	0.937
R@1	0.937
On test
N	962
P@1	0.794
R@1	0.794


In [73]:
# Testing trained model on some random document
sentiment_model.predict("you are not a cool guy but i really like you", k=2)
sentiment_model.predict("yeah..", k=2)

(('__label__0', '__label__1'), array([ 0.53439021,  0.46562988]))

(('__label__1', '__label__0'), array([ 0.74658495,  0.25343508]))

Quantizing sacrifices a bit of performance to reduce the size of model. Good when size matters, like when deploying to
edge device, e.g. mobile.

In [74]:
sentiment_model.quantize(input=train_data_path, qnorm=True, retrain=True, cutoff=200000)
sentiment_model.save_model("sent_model.ftz")

In [75]:
print_results(*sentiment_model.test(test_data_path))

N	962
P@1	0.802
R@1	0.802


In [76]:
sentiment_model.predict("you are not a cool guy but i really like you", k=2)
sentiment_model.predict("yeah..", k=2)

(('__label__1', '__label__0'), array([ 0.65207916,  0.34794083]))

(('__label__1', '__label__0'), array([ 0.84964049,  0.15037954]))

In [84]:
pr = sentiment_model.predict("you are not a cool guy but i really like you", k=2)

In [85]:
pr[1][0]

0.65207916498184204

In [130]:
test_data['prediction'] = test_data['normalized_sentence'].apply(lambda row: sentiment_model.predict(row, k=2)[1][0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [120]:
test_data[0:5]

Unnamed: 0,label,normalized_sentence,prediction
177,__label__1,"the atmosphere is modern and hip, while mainta...",0.869793
720,__label__1,"cute, quaint, simple, honest.",0.698338
525,__label__0,"it's an empty, hollow shell of a movie.",0.555004
630,__label__0,don't bother - go to the store.,0.990627
137,__label__0,to those who find this movie intelligent or ev...,0.829476


In [57]:
??sentiment_model.predict

### Fasttext model training/eval/etc.
##### This model is a multi-label classifier.

In [108]:
def grid_search(lr, dim, ws, epoch, minn, maxn, wordNgrams):
    for l_rate in lr:
        for d in dim:
            for s in ws:
                for ep in epoch:
                    for mi in minn:
                        for ma in maxn:
                            if (ma >= mi):
                                for n in wordNgrams:
                                    sent_model_mc = train_supervised(
                                                        input=train_data,
                                                        lr=l_rate,
                                                        dim=d,
                                                        ws=s,
                                                        epoch=ep,
                                                        minCount=1,
                                                        minCountLabel=0,
                                                        minn=mi,
                                                        maxn=ma,
                                                        neg=5,
                                                        wordNgrams=n,
                                                        loss="ova",
                                                        bucket=200000,
                                                        lrUpdateRate=100,
                                                        t=1e-4,
                                                        label="__label__",
                                                        verbose=2,
                                                        pretrainedVectors="",
                                                    )
                                    t_res = sent_model_mc.test(train_data)
                                    v_res = sent_model_mc.test(valid_data)
                                    if (t_res[1] > 0.83) & (v_res[1] > 0.83):
                                        print("lr = {}, dim = {}, ws = {}, epoch = {}, minn = {}, maxn = {}, wordNgrams = {} *******".format(l_rate, d, s, ep, mi, ma, n))
                                    else:
                                        print("lr = {}, dim = {}, ws = {}, epoch = {}, minn = {}, maxn = {}, wordNgrams = {}".format(l_rate, d, s, ep, mi, ma, n))
                                    print(*t_res)
                                    print(*v_res)
                                

In [106]:
grid_search(lr=[0.1, 0.5, 0.9], dim=[20, 100], ws=[4,5,6], 
            epoch=[2,5,7], minn=[2,3,4,5], maxn=[2,3,4,5,6,7], wordNgrams=[1,2,3])

lr = 0.1, dim = 20, ws = 4, epoch = 2, minn = 2, maxn = 2, wordNgrams = 1
1937 0.5823438306659783 0.5823438306659783
831 0.5643802647412756 0.5643802647412756
lr = 0.1, dim = 20, ws = 4, epoch = 2, minn = 2, maxn = 2, wordNgrams = 2
1937 0.5849251419721219 0.5849251419721219
831 0.5619735258724429 0.5619735258724429
lr = 0.1, dim = 20, ws = 4, epoch = 2, minn = 2, maxn = 2, wordNgrams = 3
1937 0.5709860609189468 0.5709860609189468
831 0.5535499398315282 0.5535499398315282
lr = 0.1, dim = 20, ws = 4, epoch = 2, minn = 2, maxn = 3, wordNgrams = 1
1937 0.6329375322663914 0.6329375322663914
831 0.601684717208183 0.601684717208183
lr = 0.1, dim = 20, ws = 4, epoch = 2, minn = 2, maxn = 3, wordNgrams = 2
1937 0.6262261228704181 0.6262261228704181
831 0.5752105896510229 0.5752105896510229
lr = 0.1, dim = 20, ws = 4, epoch = 2, minn = 2, maxn = 3, wordNgrams = 3
1937 0.628291171915333 0.628291171915333
831 0.5788206979542719 0.5788206979542719
lr = 0.1, dim = 20, ws = 4, epoch = 2, minn = 2, m

In [110]:
grid_search(lr=[0.95, 1], dim=[10, 50], ws=[5], 
            epoch=[3, 5], minn=[3,4], maxn=[3,4,5,6], wordNgrams=[2])

lr = 0.95, dim = 10, ws = 5, epoch = 3, minn = 3, maxn = 3, wordNgrams = 2
1937 0.9550851832731028 0.9550851832731028
831 0.8014440433212996 0.8014440433212996
lr = 0.95, dim = 10, ws = 5, epoch = 3, minn = 3, maxn = 4, wordNgrams = 2
1937 0.9488900361383583 0.9488900361383583
831 0.8038507821901324 0.8038507821901324
lr = 0.95, dim = 10, ws = 5, epoch = 3, minn = 3, maxn = 5, wordNgrams = 2
1937 0.9452762003097573 0.9452762003097573
831 0.8110709987966306 0.8110709987966306
lr = 0.95, dim = 10, ws = 5, epoch = 3, minn = 3, maxn = 6, wordNgrams = 2
1937 0.9395973154362416 0.9395973154362416
831 0.8134777376654633 0.8134777376654633
lr = 0.95, dim = 10, ws = 5, epoch = 3, minn = 4, maxn = 4, wordNgrams = 2
1937 0.9855446566855963 0.9855446566855963
831 0.8182912154031288 0.8182912154031288
lr = 0.95, dim = 10, ws = 5, epoch = 3, minn = 4, maxn = 5, wordNgrams = 2
1937 0.9752194114610222 0.9752194114610222
831 0.8267148014440433 0.8267148014440433
lr = 0.95, dim = 10, ws = 5, epoch = 3, 

In [17]:
sent_model_mc.predict("you are not a cool guy but still i like you", k=2)
sent_model_mc.predict("yeah..", k=2)
sent_model_mc.predict("k", k=2)

(('__label__0', '__label__1'), array([ 0.93046826,  0.06755669]))

(('__label__0', '__label__1'), array([ 0.9124462 ,  0.08510906]))

(('__label__0', '__label__1'), array([  1.00001001e+00,   1.00000034e-05]))

In [18]:
sent_model_mc.quantize(input=train_data, qnorm=True, retrain=True, cutoff=200000)
sent_model_mc.save_model("sent_model_mc.ftz")

In [19]:
print_results(*sent_model_mc.test(train_data))
print_results(*sent_model_mc.test(valid_data))

N	1937
P@1	1.000
R@1	1.000
N	831
P@1	0.821
R@1	0.821


In [20]:
sent_model_mc.predict("you are not a cool guy but still i like you", k=2)
sent_model_mc.predict("yeah..", k=2)
sent_model_mc.predict("k", k=2)
sent_model_mc.predict("k. Don't talk to me. wtf? love is coming", k=2)
sent_model_mc.predict("I'm sad. But sometimes very happy to see people smile", k=2)

(('__label__0', '__label__1'), array([ 0.8840493 ,  0.11280541]))

(('__label__0', '__label__1'), array([ 0.9466067 ,  0.05185546]))

(('__label__0', '__label__1'), array([  1.00001001e+00,   1.00000034e-05]))

(('__label__0', '__label__1'), array([ 0.74317801,  0.2509228 ]))

(('__label__1', '__label__0'), array([ 0.94335759,  0.05501529]))

##### Sample from fasttext github `https://github.com/facebookresearch/fastText/blob/master/python/doc/examples/train_supervised.py`

In [5]:
DATA_DIR = "../data/cooking/"
train_data = os.path.join(DATA_DIR, 'cooking.train')
valid_data = os.path.join(DATA_DIR, 'cooking.valid')
# train_supervised uses the same arguments and defaults as the fastText cli
model = train_supervised(
    input=train_data, epoch=5, lr=1.0, wordNgrams=2, verbose=2, minCount=1
)
print_results(*model.test(valid_data))
model = train_supervised(
    input=train_data, epoch=5, lr=1.0, wordNgrams=2, verbose=2, minCount=1,
    loss="hs"
)
print_results(*model.test(valid_data))
model.save_model("cooking.bin")

# model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000)
# print_results(*model.test(valid_data))
# model.save_model("cooking.ftz")

N	3000
P@1	0.468
R@1	0.202
N	3000
P@1	0.486
R@1	0.210


In [9]:
sample_model = fastText.load_model("cooking.bin")

In [10]:
txt = "cooking apple pie is nice with knife."
sample_model.predict(txt, k=6)

(('__label__pie',
  '__label__baking',
  '__label__dessert',
  '__label__temperature',
  '__label__beans',
  '__label__eggs'),
 array([ 0.17311673,  0.166216  ,  0.05306365,  0.05296602,  0.0366074 ,
         0.01968909]))

In [6]:
txt = "cooking apple pie is nice with knife."
model.predict(txt, k=6)

(('__label__pie',
  '__label__baking',
  '__label__dessert',
  '__label__temperature',
  '__label__beans',
  '__label__eggs'),
 array([ 0.17311673,  0.166216  ,  0.05306365,  0.05296602,  0.0366074 ,
         0.01968909]))

In [51]:
df = pd.DataFrame(np.random.randn(100, 2))

In [52]:
df.head(3)

Unnamed: 0,0,1
0,1.549507,-1.618582
1,-0.565005,-0.481846
2,0.396015,-0.488801


In [21]:
import collections

print(collections.Counter(['a', 'b', 'c', 'a', 'b', 'b']))
freq = collections.Counter(['a', 'b', 'c', 'a', 'b', 'b'])

Counter({'b': 3, 'a': 2, 'c': 1})


In [19]:
x = ['a', 'b', 'c']


['a', 'b', 'c']

In [28]:
itos = [o for o,p in freq.most_common(2) if p>2]
itos.insert(4, 'h')
itos

['b', 'h']