# NBSVM Improvement and Test
The goal is to improve the algorithm's efficiency and adapt it to the scikit-learn (sklearn) estimator conventions.

## 0) Prepare Data

In [1]:
from loadData import loadLabeled, review_to_wordlist
from sklearn.feature_extraction.text import CountVectorizer
try:
    # scikit-learn >= 0.18 exposes the splitter here; the old module was
    # removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for the older scikit-learn releases this notebook was written on.
    from sklearn.cross_validation import train_test_split

# `data` holds the list of raw texts and `Class` the corresponding labels
# (1=POS, 0=NEG).
data, Class = loadLabeled('./train')

# Clean every document and re-join its tokens into one whitespace-separated
# string, the input format CountVectorizer expects.
data_cleaned = [" ".join(review_to_wordlist(doc)) for doc in data]

# Hold out 40% of the documents for testing; the fixed seed keeps the split
# reproducible.
test_ratio = 0.4
data_train, data_test, labels_train, labels_test = train_test_split(
    data_cleaned, Class, test_size=test_ratio, random_state=42)

# Binary unigram + bigram presence features. The vocabulary is learned on the
# training split only and then re-used to encode the test split.
count_vect = CountVectorizer(ngram_range=(1, 2), binary=True)
count_matrix = count_vect.fit_transform(data_train)
count_test = count_vect.transform(data_test)

In [8]:
# Report the shapes of the train and test label arrays produced by the split.
print "Train: ",labels_train.shape
print "Test: ",labels_test.shape

Train:  (15000,)
Test:  (10000,)


## 1) Fit part optimization

In [16]:
from time import time

import numpy as np
import scipy.sparse as sp

In [11]:
# Short aliases for the training labels and the training document-term matrix,
# used by the experiments in the following cells.
y = labels_train
X = count_matrix

In [44]:
# Time turning the boolean mask (y == 1) into 0/1 integers by multiplying it by 1.
t0 = time()
print 1*[y==1][0]
print time() - t0

[0 1 1 ..., 1 0 0]
0.00120401382446


In [45]:
# Time the same boolean-to-integer conversion done with astype(int) instead,
# for comparison with the multiplication variant.
t0 = time()
print [y==1][0].astype(int)
print time() - t0

[0 1 1 ..., 1 0 0]
0.00124502182007


### a - Version 1.0

In [66]:
# Version 1.0: compute the per-feature log ratio between the two classes using
# sparse matrices throughout, and time the whole computation.
t0 = time()
alpha = 1.0  # additive smoothing term added to every feature count
nb_doc, voc_length = X.shape
# 0/1 indicator vectors marking the documents labelled 1 and labelled 0
pos_idx = [y==1][0].astype(int)
neg_idx = [y==0][0].astype(int)
#Store the indicator vectors in sparse format to accelerate the computations
pos_idx = sp.csr_matrix(pos_idx.T)
neg_idx = sp.csr_matrix(neg_idx.T)
#Use sparse format dot product to get a weighting vector stored in sparse format
alpha_vec = sp.csr_matrix(alpha*np.ones(voc_length))
# p: smoothed per-feature totals over the label-1 documents, scaled to sum to 1
p = (alpha_vec + pos_idx.dot(X)) 
norm_p = p.sum()
p = p.multiply(1/norm_p)
#print p.toarray()
# q: the same quantity over the label-0 documents
q = (alpha_vec + neg_idx.dot(X))
norm_q = q.sum()
q = q.multiply(1/norm_q)
#print q.toarray()
# log(p / q): multiply p element-wise by the reciprocal of q, then take the
# log of the stored values of the product
ratio = sp.csr_matrix(np.log((p.multiply(sp.csr_matrix(np.expand_dims(q.toarray()[0]**(-1),axis=0)))).data))
print "Time: ",time()-t0, "s"

Time:  0.394751787186 s


### a - Version 1.1

In [55]:
from sklearn.utils import sparsefuncs
from sklearn.utils.extmath import safe_sparse_dot, _fast_dot

In [106]:
# Time the sparse matrix's own dot method: indicator row vector times X.
t0 = time()
pos_idx.dot(X)
print time() - t0

0.0931611061096


In [105]:
# Time scikit-learn's safe_sparse_dot on the same kind of product, for
# comparison with the plain .dot call.
t0 = time()
safe_sparse_dot(neg_idx,X)
print time() - t0

0.0936689376831


In [67]:
# Version 1.1: same computation as Version 1.0, but using safe_sparse_dot for
# the products and naming the intermediate results of the final ratio step.
t0 = time()
alpha = 1.0  # additive smoothing term added to every feature count
nb_doc, voc_length = X.shape
# 0/1 indicator vectors marking the documents labelled 1 and labelled 0
pos_idx = [y==1][0].astype(int)
neg_idx = [y==0][0].astype(int)
#Store the indicator vectors in sparse format to accelerate the computations
pos_idx = sp.csr_matrix(pos_idx.T)
neg_idx = sp.csr_matrix(neg_idx.T)
#Use sparse format dot product to get a weighting vector stored in sparse format
alpha_vec = sp.csr_matrix(alpha*np.ones(voc_length))
# p: smoothed per-feature totals over the label-1 documents, scaled to sum to 1
p = (alpha_vec + safe_sparse_dot(pos_idx,X)) 
norm_p = p.sum()
p = p.multiply(1.0/norm_p)
# q: the same quantity over the label-0 documents
q = (alpha_vec + safe_sparse_dot(neg_idx,X))
norm_q = q.sum()
q = q.multiply(1.0/norm_q)
# Element-wise reciprocal of q, wrapped back into a one-row sparse matrix
inverseQ = sp.csr_matrix(np.expand_dims(q.toarray()[0]**(-1),axis=0))
# Stored values of the element-wise product p * (1 / q)
firstRatio = p.multiply(inverseQ).data
ratio = sp.csr_matrix(np.log(firstRatio))
print "Time: ",time()-t0, "s"

Time:  0.399273872375 s


In [71]:
# Smallest log ratio over all features (log is increasing, so the log of the
# minimum ratio is the minimum of the log ratios).
np.log(np.min(firstRatio))

-4.266833990282862

## 2) NBSVM Class

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import scipy.sparse as sp
from sklearn.preprocessing import binarize
import numpy as np
    
class NBmatrix(BaseEstimator, TransformerMixin):
    """Scale a document-term matrix by per-feature log-count ratios.

    ``fit`` learns, for every feature (column), the log of the ratio between
    its smoothed, normalised total over the documents labelled 1 and over the
    documents labelled 0. ``transform`` multiplies every row of a matrix
    element-wise by that ratio vector.

    Parameters
    ----------
    alpha : float
        Additive smoothing term added to every per-class feature total.
        Must be strictly positive; this is checked in ``fit``.
    bina : bool
        If truthy, ``transform`` binarizes its input (entries > 0 become 1)
        before scaling it.
    n_jobs : int, default 1
        Stored on the instance; not used by any method of this class.

    Attributes
    ----------
    r : scipy.sparse.csr_matrix of shape (1, n_features)
        Log-count ratio learned by ``fit``. It is an empty list until ``fit``
        has been called.
    """

    def __init__(self, alpha, bina, n_jobs = 1):
        self.alpha = alpha
        self.bina = bina
        self.n_jobs = n_jobs
        self.r = []

    def fit(self, X, y):
        """Learn the log-count ratio vector from labelled documents.

        Parameters
        ----------
        X : sparse matrix or array of shape (n_documents, n_features)
            Document-term matrix.
        y : array-like of length n_documents
            Labels. Rows labelled 1 and rows labelled 0 are used; rows with
            any other label are ignored.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``alpha`` is not strictly positive. Without smoothing, a
            feature absent from one class yields a zero probability and the
            log ratio is undefined.
        """
        # NOTE(review): the totals below use X as given, even when `bina` is
        # set and `transform` binarizes its input. Confirm whether the counts
        # should be binarized here as well.
        if np.any(np.asarray(self.alpha) <= 0):
            raise ValueError(
                "alpha must be strictly positive, got %r" % (self.alpha,))

        X = sp.csr_matrix(X)
        # Accept lists and column vectors as well as flat arrays.
        y = np.asarray(y).ravel()

        p = self._smoothed_distribution(X, y == 1)
        q = self._smoothed_distribution(X, y == 0)

        # Keep the ratio as a one-row sparse matrix so that `transform` can
        # broadcast it over the rows of a sparse input.
        self.r = sp.csr_matrix(np.log(p / q))
        return self

    def _smoothed_distribution(self, X, mask):
        """Return smoothed feature totals over the rows in `mask`, scaled to sum to 1."""
        # X.T.dot(indicator) sums the selected rows of X column by column and
        # yields a dense 1-D vector, so no entry can be dropped or shifted.
        totals = np.asarray(X.T.dot(mask.astype(np.float64))).ravel()
        totals = self.alpha + totals
        return totals / totals.sum()

    def transform(self, X):
        """Multiply every row of `X` element-wise by the learned ratio vector.

        If ``bina`` is truthy, `X` is first binarized (entries > 0 become 1).
        Must be called after ``fit``.
        """
        f_hat = binarize(X, threshold = 0.0) if self.bina else X
        return f_hat.multiply(self.r)

    def fit_transform(self, X, y):
        """Fit on `(X, y)` and return the transformed `X`."""
        self.fit(X, y)
        return self.transform(X)