### Sentiment analysis via Embedding (further normalizing text)
#### Tool: fastText

In [37]:
import fastText #version 0.8.22
import pandas as pd
import os
from fastText import train_supervised
import numpy as np

In [2]:
# To show the output of all lines in a cell rather that just the last line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

The architecture of fasttext supervised learning is like word2vec CBOW where the target word is replaced with the label.

In [3]:
def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))

In [4]:
DATA_DIR = "~/Downloads/sentiment_labelled_sentences/"

In [5]:
data_amazon = pd.read_table(DATA_DIR+"amazon_cells_labelled.txt"
                          , sep='\t', header=None, names=['sentence', 'sentiment'])
data_imdb = pd.read_table(DATA_DIR+"imdb_labelled.txt"
                          , sep='\t', header=None, names=['sentence', 'sentiment'])
data_yelp = pd.read_table(DATA_DIR+"yelp_labelled.txt"
                          , sep='\t', header=None, names=['sentence', 'sentiment'])

In [6]:
all_data = pd.concat([data_amazon, data_imdb, data_yelp])
all_data.head(2)

Unnamed: 0,sentence,sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1


### Formatting the label as the default for fasttext

In [7]:
all_data['label'] = all_data.apply(lambda row: '__label__' + str(row.sentiment), axis=1)

In [8]:
all_data.head(2)

Unnamed: 0,sentence,sentiment,label
0,So there is no way for me to plug it in here i...,0,__label__0
1,"Good case, Excellent value.",1,__label__1


In [6]:
import re
import string

def normalize(row):
    lower = row['sentence'].lower()
    #correct all multiple white spaces to a single white space
    no_mult_ws = re.sub('[\s]+', ' ', lower)
    striped = no_mult_ws.strip()
    #remove punctuations
    no_punc = striped.translate(str.maketrans('', '', string.punctuation))
    return no_punc

In [12]:
all_data['normalized_sentence'] = all_data.apply(normalize, axis=1)

In [13]:
all_data.head(2)

Unnamed: 0,sentence,sentiment,label,normalized_sentence
0,So there is no way for me to plug it in here i...,0,__label__0,so there is no way for me to plug it in here i...
1,"Good case, Excellent value.",1,__label__1,good case excellent value


In [14]:
labeled_data = all_data.drop(['sentence', 'sentiment'], axis=1)
labeled_data.head(2)

Unnamed: 0,label,normalized_sentence
0,__label__0,so there is no way for me to plug it in here i...
1,__label__1,good case excellent value


In [15]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(labeled_data, test_size=0.35, random_state=4)

In [16]:
test_data.head(2)

Unnamed: 0,label,normalized_sentence
177,__label__1,the atmosphere is modern and hip while maintai...
720,__label__1,cute quaint simple honest


#### Save model to disk to be read by fasttext

In [17]:
labeled_data.to_csv(path_or_buf='./sentiment_v2.all', header=False, index=False, sep='\t')
train_data.to_csv(path_or_buf='./sentiment_v2.train', header=False, index=False, sep='\t')
test_data.to_csv(path_or_buf='./sentiment_v2.test', header=False, index=False, sep='\t')

### Fasttext model training/eval/etc.
##### This model is a binary classifier.

In [18]:
train_data_path = os.path.join("./", 'sentiment_v2.train')
test_data_path = os.path.join("./", 'sentiment_v2.test')

sentiment_model = train_supervised(
    input=train_data_path,
    lr=1,
    dim=100,
    ws=5,
    epoch=5,
    minCount=1,
    minCountLabel=0,
    minn=2,
    maxn=3,
    neg=5,
    wordNgrams=2,
    loss="softmax",
    bucket=200000,
    lrUpdateRate=100,
    t=1e-4,
    label="__label__",
    verbose=2,
    pretrainedVectors="",
)
print("On train")
print_results(*sentiment_model.test(train_data_path))
print("On test")
print_results(*sentiment_model.test(test_data_path))
sentiment_model.save_model("sentiment_model_v2.bin")

On train
N	1786
P@1	0.932
R@1	0.932
On test
N	962
P@1	0.796
R@1	0.796


Check here for the input arguments explanations: 'https://fasttext.cc/docs/en/options.html'

In [19]:
# Testing trained model on some random document
sentiment_model.predict("you are not a cool guy but i really like you", k=2)
sentiment_model.predict("yeah..", k=2)

(('__label__0', '__label__1'), array([ 0.65752089,  0.34249911]))

(('__label__1', '__label__0'), array([ 0.74657267,  0.25344735]))

In [20]:
sentiment_model.predict("you are not a cool guy but i really like you", k=2)
sentiment_model.predict("yeah..", k=2)

(('__label__0', '__label__1'), array([ 0.65752089,  0.34249911]))

(('__label__1', '__label__0'), array([ 0.74657267,  0.25344735]))

Model parameter search/tuning..

In [31]:
train_data_path = os.path.join("./", 'sentiment_v2.train')
test_data_path = os.path.join("./", 'sentiment_v2.test')

def grid_search(lr, dim, ws, epoch, minn, maxn, wordNgrams):
    for l_rate in lr:
        for d in dim:
            for s in ws:
                for ep in epoch:
                    for mi in minn:
                        for ma in maxn:
                            if (ma >= mi):
                                for n in wordNgrams:
                                    sentiment_model_mc = train_supervised(
                                                        input=train_data_path,
                                                        lr=l_rate,
                                                        dim=d,
                                                        ws=s,
                                                        epoch=ep,
                                                        minCount=1,
                                                        minCountLabel=0,
                                                        minn=mi,
                                                        maxn=ma,
                                                        neg=5,
                                                        wordNgrams=n,
                                                        loss='softmax', # ns, hs, softmax, ova (for multilabel classification)
                                                        bucket=200000,
                                                        lrUpdateRate=100,
                                                        t=1e-4,
                                                        label="__label__",
                                                        verbose=2,
                                                        pretrainedVectors="",
                                                    )
                                    train_res = round(sentiment_model_mc.test(train_data_path)[1], 2) #precision
                                    test_res = round(sentiment_model_mc.test(test_data_path)[1], 2)
                                    ratio = train_res/test_res
                                    if (ratio > 0.95) & (ratio < 1.06) & (train_res > 0.8):
                                        print("{}, {}: *** lr = {}, dim = {}, ws = {}, epoch = {}, minn = {}, maxn = {}, wordNgrams = {}".format(train_res, test_res, l_rate, d, s, ep, mi, ma, n))
                                    else:
                                        print("{}, {}:     lr = {}, dim = {}, ws = {}, epoch = {}, minn = {}, maxn = {}, wordNgrams = {}".format(train_res, test_res, l_rate, d, s, ep, mi, ma, n))
                                

In [None]:
# loss='softmax', minCount=1, neg=5

In [33]:
grid_search(lr=[0.9, 0.95], dim=[5, 10, 20, 40], ws=[4, 5, 6], 
            epoch=[1, 2, 3], minn=[2, 3, 4], maxn=[3, 4, 5, 6], wordNgrams=[1, 2, 3])

0.74, 0.72:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 3, wordNgrams = 1
0.74, 0.69:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 3, wordNgrams = 2
0.72, 0.68:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 3, wordNgrams = 3
0.68, 0.66:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 4, wordNgrams = 1
0.67, 0.64:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 4, wordNgrams = 2
0.67, 0.64:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 4, wordNgrams = 3
0.67, 0.63:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 5, wordNgrams = 1
0.68, 0.64:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 5, wordNgrams = 2
0.67, 0.63:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 5, wordNgrams = 3
0.67, 0.65:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 6, wordNgrams = 1
0.66, 0.64:     lr = 0.9, dim = 5, ws = 4, epoch = 1, minn = 2, maxn = 6, wordNgrams = 2
0.66, 0.65:     lr = 

In [34]:
grid_search(lr=[1], dim=[5, 20], ws=[5, 6], 
            epoch=[1, 2], minn=[3,4], maxn=[3, 4, 5, 6], wordNgrams=[1, 2, 3])

0.82, 0.76:     lr = 1, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 1
0.81, 0.76:     lr = 1, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 2
0.8, 0.76:     lr = 1, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 3
0.79, 0.75:     lr = 1, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 1
0.78, 0.75:     lr = 1, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 2
0.77, 0.75:     lr = 1, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 3
0.79, 0.76:     lr = 1, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 1
0.77, 0.74:     lr = 1, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 2
0.78, 0.75:     lr = 1, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 3
0.77, 0.75:     lr = 1, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 6, wordNgrams = 1
0.77, 0.74:     lr = 1, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 6, wordNgrams = 2
0.77, 0.73:     lr = 1, dim = 5, ws = 5, epo

In [35]:
grid_search(lr=[0.05, 0.1], dim=[5, 20], ws=[5, 6], 
            epoch=[1, 2], minn=[3,4], maxn=[3, 4, 5, 6], wordNgrams=[1, 2, 3])

0.53, 0.54:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 1
0.53, 0.55:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 2
0.53, 0.55:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 3
0.53, 0.51:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 1
0.54, 0.52:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 2
0.53, 0.52:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 3
0.54, 0.52:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 1
0.53, 0.52:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 2
0.54, 0.53:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 3
0.54, 0.53:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 6, wordNgrams = 1
0.54, 0.54:     lr = 0.05, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 6, wordNgrams = 2
0.55, 0.54

In [36]:
grid_search(lr=[0.5, 2], dim=[5, 20], ws=[5, 6], 
            epoch=[1, 2], minn=[3,4], maxn=[3, 4, 5, 6], wordNgrams=[1, 2, 3])

0.72, 0.68:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 1
0.68, 0.65:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 2
0.65, 0.62:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 3, wordNgrams = 3
0.66, 0.63:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 1
0.64, 0.62:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 2
0.63, 0.6:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 4, wordNgrams = 3
0.62, 0.62:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 1
0.61, 0.61:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 2
0.62, 0.6:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 5, wordNgrams = 3
0.63, 0.61:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 6, wordNgrams = 1
0.63, 0.6:     lr = 0.5, dim = 5, ws = 5, epoch = 1, minn = 3, maxn = 6, wordNgrams = 2
0.62, 0.6:     lr = 0.5,

`0.83, 0.79: *** lr = 0.95, dim = 20, ws = 5, epoch = 1, minn = 3, maxn = 6, wordNgrams = 3`

In [23]:
all_data_path = os.path.join('./', 'sentiment_v2.all')

sentiment_model_all = train_supervised(
    input=all_data_path,
    lr=0.95,
    dim=20,
    ws=5,
    epoch=1,
    minCount=1,
    minCountLabel=0,
    minn=3,
    maxn=6,
    neg=5,
    wordNgrams=3,
    loss='softmax',
    bucket=200000,
    lrUpdateRate=100,
    t=1e-4,
    label="__label__",
    verbose=2,
    pretrainedVectors=""
)
print_results(*sentiment_model_all.test(all_data_path))
sentiment_model_all.save_model("sentiment_model_all_v2.bin")

N	2748
P@1	0.796
R@1	0.796


In [24]:
train_data_path = os.path.join("./", 'sentiment_v2.train')
test_data_path = os.path.join("./", 'sentiment_v2.test')

sentiment_model_transfer = train_supervised(
    input=train_data_path,
    lr=0.95,
    dim=300,
    epoch=1,
    loss='softmax',
    bucket=1000,
    label="__label__",
    pretrainedVectors=''
)
print_results(*sentiment_model_transfer.test(train_data_path))
print_results(*sentiment_model_transfer.test(test_data_path))

N	1786
P@1	0.845
R@1	0.845
N	962
P@1	0.765
R@1	0.765


In [26]:
train_data_path = os.path.join("./", 'sentiment_v2.train')
test_data_path = os.path.join("./", 'sentiment_v2.test')

sentiment_model_transfer = train_supervised(
    input=train_data_path,
    lr=0.95,
    dim=300,
    epoch=1,
    loss='softmax',
    bucket=1000,
    label="__label__",
    pretrainedVectors="/Users/hadi.minooei/Downloads/wiki-news-300d-1M.vec"
)
print_results(*sentiment_model_transfer.test(train_data_path))
print_results(*sentiment_model_transfer.test(test_data_path))
sentiment_model_all.save_model("sentiment_model_transfer_v2.bin")

N	1786
P@1	1.000
R@1	1.000
N	962
P@1	0.819
R@1	0.819


In [27]:
train_data_path = os.path.join("./", 'sentiment_v2.train')
test_data_path = os.path.join("./", 'sentiment_v2.test')

sentiment_model_transfer = train_supervised(
    input=train_data_path,
    lr=0.005,
    dim=300,
    epoch=1,
    loss='softmax',
    bucket=1000,
    label="__label__",
    pretrainedVectors="/Users/hadi.minooei/Downloads/wiki-news-300d-1M.vec"
)
print_results(*sentiment_model_transfer.test(train_data_path))
print_results(*sentiment_model_transfer.test(test_data_path))
sentiment_model_all.save_model("sentiment_model_transfer_v2.bin")

N	1786
P@1	0.855
R@1	0.855
N	962
P@1	0.822
R@1	0.822


In [28]:
train_data_path = os.path.join("./", 'sentiment_v2.train')
test_data_path = os.path.join("./", 'sentiment_v2.test')

sentiment_model_transfer = train_supervised(
    input=train_data_path,
    lr=0.005,
    dim=300,
    epoch=2,
    loss='softmax',
    bucket=2000,
    label="__label__",
    pretrainedVectors="/Users/hadi.minooei/Downloads/wiki-news-300d-1M.vec"
)
print_results(*sentiment_model_transfer.test(train_data_path))
print_results(*sentiment_model_transfer.test(test_data_path))
sentiment_model_all.save_model("sentiment_model_transfer_v2.bin")

N	1786
P@1	0.918
R@1	0.918
N	962
P@1	0.840
R@1	0.840


In [29]:
train_data_path = os.path.join("./", 'sentiment_v2.train')
test_data_path = os.path.join("./", 'sentiment_v2.test')

sentiment_model_transfer = train_supervised(
    input=train_data_path,
    lr=0.004,
    dim=300,
    epoch=2,
    loss='softmax',
    bucket=2000,
    label="__label__",
    pretrainedVectors="/Users/hadi.minooei/Downloads/wiki-news-300d-1M.vec"
)
print_results(*sentiment_model_transfer.test(train_data_path))
print_results(*sentiment_model_transfer.test(test_data_path))
sentiment_model_all.save_model("sentiment_model_transfer_v2.bin")

N	1786
P@1	0.894
R@1	0.894
N	962
P@1	0.833
R@1	0.833


In [30]:
train_data_path = os.path.join("./", 'sentiment_v2.train')
test_data_path = os.path.join("./", 'sentiment_v2.test')

sentiment_model_transfer = train_supervised(
    input=train_data_path,
    lr=0.003,
    dim=300,
    epoch=2,
    loss='softmax',
    bucket=2000,
    label="__label__",
    pretrainedVectors="/Users/hadi.minooei/Downloads/wiki-news-300d-1M.vec"
)
print_results(*sentiment_model_transfer.test(train_data_path))
print_results(*sentiment_model_transfer.test(test_data_path))
sentiment_model_all.save_model("sentiment_model_transfer_v2.bin")

N	1786
P@1	0.871
R@1	0.871
N	962
P@1	0.830
R@1	0.830


### Conclusion: 

There is no fixed rule about using pretrained vectors (transfer learning). In my experience, sometimes you might get better results or sometimes worst (or the same), requires experimentation for your specific case!


Also in here
* One can see some of the many iterations required to find a proper set of parameters.
* The above iterations could be done like the grid_search function. (skipped it in here..) --> So theoretically there could be a better transfer learning parameters set.