In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

Using TensorFlow backend.


In [2]:
drug_train = pd.read_csv("Data/drugLibTrain_raw.tsv",delimiter='\t',encoding='utf-8')
drug_test = pd.read_csv("Data/drugLibTest_raw.tsv",delimiter='\t',encoding='utf-8')

drug=pd.concat([drug_train,drug_test], axis=0, join='outer',)

In [3]:
drug.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4143 entries, 0 to 1035
Data columns (total 9 columns):
Unnamed: 0           4143 non-null int64
urlDrugName          4143 non-null object
rating               4143 non-null int64
effectiveness        4143 non-null object
sideEffects          4143 non-null object
condition            4142 non-null object
benefitsReview       4143 non-null object
sideEffectsReview    4141 non-null object
commentsReview       4135 non-null object
dtypes: int64(2), object(7)
memory usage: 323.7+ KB


In [4]:
drug=drug.dropna(axis=0,how='any')
drug.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4132 entries, 0 to 1035
Data columns (total 9 columns):
Unnamed: 0           4132 non-null int64
urlDrugName          4132 non-null object
rating               4132 non-null int64
effectiveness        4132 non-null object
sideEffects          4132 non-null object
condition            4132 non-null object
benefitsReview       4132 non-null object
sideEffectsReview    4132 non-null object
commentsReview       4132 non-null object
dtypes: int64(2), object(7)
memory usage: 322.8+ KB


In [5]:
def convert_text(max_words, data):
    
    data['Alltext'] = data['benefitsReview']#+' '+data['benefitsReview']\
                #+' ' +data['commentsReview']+''+data['sideEffectsReview']
    data['Alltext'] = data['Alltext'].apply(lambda x : str(x))
    texts_tr = data.Alltext

    tokenizer_tr = Tokenizer(num_words=max_words)
    tokenizer_tr.fit_on_texts(texts_tr)
    X = tokenizer_tr.texts_to_matrix(texts_tr, mode='binary')
    y=data['rating'].values - 1
    word_index = tokenizer_tr.word_index
    print('Found {} unique tokens.'.format( len(set(word_index))))
    return X,y

In [6]:
X,y = convert_text(500,drug)

Found 9108 unique tokens.


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [8]:
X_train.shape

(3305, 500)

# Gaussian Discriminant Analysis(GDA)

In [9]:
class GaussianDiscriminantAnalysis:
    def __init__(self,
                 epsilon=2e-1
                ):
        
        self.epsilon=epsilon
        
    def ph(self,y):
        phi=[]
        for k in range(self.nb_class):
            phi.append((np.sum(y==k))/len(y))
        return phi   
    
    def mu_computed(self,X,y):
        list_of_mu=[]
        for k in range(self.nb_class):
            list_of_mu.append((np.sum(X[y==k],axis=0))/np.sum(y==k))
        return list_of_mu
    
    def covariance(self,X,y):
        sigma= np.zeros((len(X),len(X)))
        M = np.zeros_like(X)
        for k in range(self.nb_class):
            M[y== k]=self.list_mu[k]
        return (((X-M).T)@(X-M))/len(y)
    
    def prob_class(self,X):
        half_len = X.shape[1]/2
        det_cvar = np.sqrt(np.linalg.det(self.cvar + (self.epsilon*np.eye(len(self.cvar)))))
        inv_cvar = np.linalg.inv(self.cvar + (self.epsilon*np.eye(len(self.cvar))))
        list_prob=[]
        for k in range(self.nb_class):
            A=((X - self.list_mu[k])@inv_cvar)
            #print('dima',A)
            B=(X-self.list_mu[k])
            #print('dimb',B)
            C=np.sum(np.multiply(A,B),axis=1)
            e = np.exp( -0.5*C)
            p = (2*np.pi)**half_len
            #print('cccc',det_cvar)
            list_prob.append(np.exp( -0.5*C)*(1/((2*np.pi)**half_len)*np.sqrt(det_cvar)))
            #print(list_prob)
        return list_prob
    
    def train(self, X, y):
        self.nb_class = len(np.unique(y))
        self.classes = np.unique(y)
        self.fi = self.ph(y)
        self.list_mu = self.mu_computed(X,y)
        self.cvar = self.covariance(X, y)
        
    def predict(self, X):        
        proby=self.fi
        #print(np.array(proby).shape)
        list_probclass= np.array(self.prob_class(X)).T
        predict_prob= list_probclass*proby
        #print(list_probclass)
        #print(predict_prob[0:2])
        if 0 in self.classes:
            return np.argmax(np.array(predict_prob),axis=1) 
        else:
            return np.argmax(np.array(predict_prob),axis=1) +1

In [151]:
GDA = GaussianDiscriminantAnalysis()

In [152]:
GDA.train(X_train,y_train)

In [153]:
prediction=GDA.predict(X_test)+1

cccc 3.3092988843132725e-160
cccc 3.3092988843132725e-160
cccc 3.3092988843132725e-160
cccc 3.3092988843132725e-160
cccc 3.3092988843132725e-160
cccc 3.3092988843132725e-160
cccc 3.3092988843132725e-160
cccc 3.3092988843132725e-160
cccc 3.3092988843132725e-160
cccc 3.3092988843132725e-160


In [156]:
np.sum(prediction==y_test+1)/len(y_test)*100

31.318016928657798

In [157]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [139]:
clf = LinearDiscriminantAnalysis()
clf.fit(X_train, y_train)



LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [140]:
pred=clf.predict(X_test)

In [141]:
np.sum(pred==y_test)/len(y_test)*100

25.755743651753328

#  Naive Bayes (NB)

In [19]:
import ipdb #ipython debogeur

In [10]:
class BernoulliNaiveBayes:
    '''
    multivariate Bernoulli Naive bayes
    '''
    
    def __init__(self):
        pass
        
        
    def phiy(self,y): 
        phi=[]
        for k in self.classes:
            phi.append(np.sum(y==k)/len(y))
        return phi
    
    
    def phix1y(self,X,y):
        phi_list=[]
        for k in self.classes:
            phix=[]
            for i in range(X.shape[1]):
                X_i=X[:,i]
                select_y=X_i[y==k]
                phix.append((np.sum(select_y==1)+1)/(np.sum(y==k)+self.nb_class))
            phi_list.append(phix)
        return phi_list
    
    def phix0y(self,X,y):
        phi_list=[]
        for k in self.classes:
            phix=[]
            for i in range(X.shape[1]):
                X_i=X[:,i]
                select_y=X_i[y==k]
                phix.append((np.sum(select_y==0)+1)/(np.sum(y==k)+self.nb_class))
            phi_list.append(phix)
        return phi_list
    
    def probabxy(self,X):
        
        probxyk=[]
        arr1=np.zeros_like(X)
        for k in range(self.nb_class):
            probxy1=[]
            for i in range(X.shape[0]):
                X_i=X[i,:]
                prob=1
                for j in range(len(X_i)):

                    if X_i[j]==1:
                        prob *=  self.fix1y[int(k)][j]
                    else:
                        
                        prob *= self.fix0y[int(k)][j]
                    arr1[i,j]=prob
                probxy1.append(prob)
            probxyk.append(probxy1)
        return  probxyk 

    def train(self, X,y):
        self.nb_class = len(np.unique(y))
        self.classes = np.unique(y)
        self.fiy = self.phiy(y)
        self.fix1y = np.array(self.phix1y(X,y))
        self.fix0y = np.array(self.phix0y(X,y))
        
    def predict(self,X) :
        proby=self.fiy
        list_probclass= self.probabxy(X)
        
        predict_prob=[]
        predict_prob=np.array(list_probclass).T*proby
        if 0 in self.classes:
            return  np.argmax(predict_prob,axis=1)
        else:
            return  np.argmax(predict_prob,axis=1)+1         

In [166]:
NB = BernoulliNaiveBayes()

In [22]:
NB.train(X_train,y_train)

In [23]:
prediction=NB.predict(X_test)

In [24]:

np.sum(prediction==y_test+1)/len(y_test)*100

15.114873035066505

In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [26]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [27]:
prediction=clf.predict(X_test)

In [28]:

np.sum(prediction==y_test)/len(y_test)*100

29.020556227327692

# Logistic regression


In [11]:
class SoftmaxRegression:
    '''
    Logistic regression for multi class
    '''
    
    
    def __init__(self, 
                 lr=0.0001, 
                 epoch=1,
                 tolerence=10e-8,
                 minibatchsize=30,
                 lambd=0):
        self.lr=lr
        self.epoch=epoch
        self.tolerence=tolerence
        self.minibatchsize=minibatchsize
        self.lambd = lambd
        
    def softmax(self,X,theta):
        z = X@theta
        z -= np.max(z)
        return np.exp(z)/np.sum(np.exp(z), axis=1, keepdims=True)
    
    # compute the grandient     
    def get_gradient(self, X, y, theta):
        n, _ = X.shape
        theta_wo_bias = theta.copy()
        theta_wo_bias[0, :] = 0
        
        grad = (-1 / n) * X.T@(y - self.softmax(X, theta)) + self.lambd * theta_wo_bias
        return grad
     
    #compute the loss
    def loss(self, X, y, theta):
        n, d = X.shape
        y_hat = self.softmax(X, theta)
 
        temp_theta = theta.copy()
        temp_theta[0, :] = 0 # Not including bias in regularization
       
        loss = (-1 / n) * np.sum(y * np.log(y_hat)) \
                + (self.lambd/2)*np.sum(temp_theta*temp_theta)

        return loss
    
    def one_hot_encode(self, y):
        '''
        one hot encode the target
        to be able to work with softmax
        '''
        n = len(y)
        # Turn y into one-hot-labels if number of classes is greater than 2
        y_encode = np.zeros((n, self.nb_classes))
        y_encode[range(n), y] = 1 #numpy advanced indexing
        y = y_encode
        return y
   
    #mini batch gradient descent        
    def fit(self,X,y):
        '''
        This method take tha data, 
        use minibactch gradient descent to update the weight
        and compute the coresponding cost
        '''
        #make a copy of our data
        X_copy=X.copy()
        y_copy=y.copy()
        self.classes = np.unique(y_copy)
        self.nb_classes = len(self.classes)
        
         #add the intercept column
        intercept=np.ones((X_copy.shape[0],1))
        X_copy=np.concatenate((intercept,X_copy),axis=1)
        
        
        #initialise the weight
        self.theta=np.zeros((X_copy.shape[1], self.nb_classes))
        
        # One-hot encode y
        y_copy = self.one_hot_encode(y_copy)
       
        diff=1
        current_iter=1
        #number of minibacth
        minibatch = int(len(X_copy)/self.minibatchsize)
        while (diff >= self.tolerence) and (current_iter<self.epoch) :
            prev_theta=self.theta.copy()
            random_vector=np.random.permutation(X_copy.shape[0])
            X_cop=X_copy[random_vector]
            y_cop=y_copy[random_vector]
            for j in range(minibatch):
                X_=X_cop[j*self.minibatchsize:(j+1)*self.minibatchsize]
                y_=y_cop[j*self.minibatchsize:(j+1)*self.minibatchsize]
                #compute the gradient
                grad = self.get_gradient(X_, y_, self.theta)
                #update the weight
                self.theta = self.theta - self.lr*grad 
            cur_theta=self.theta.copy()
            diff=np.linalg.norm(prev_theta - cur_theta)
            current_iter+=1
            #print('the loss function is ',self.loss(X_,y_,self.theta))

    def predict_proba(self, X):
        """
        Returns probability of predictions.
        """
        X_copy = X.copy()
        intercept=np.ones((X_copy.shape[0],1))
        X_copy=np.concatenate((intercept,X_copy), axis=1)

        return self.softmax(X_copy, self.theta) 
    
    def predict(self, X):
        '''
            gives the prediction using the softmax function
        '''
        prob = self.predict_proba(X)
        y_predict = np.argmax(prob, axis=1)
        return y_predict     
    

In [30]:
logReg = SoftmaxRegression(lr=0.1, 
                           epoch=300,
                           tolerence=10e-8,
                           minibatchsize=50,
                           lambd=0)

In [31]:
logReg.fit(X_train,y_train)

In [32]:
predict=logReg.predict(X_test)+1

In [33]:
np.sum(predict == y_test+1)/ len(y_test)*100

24.18379685610641

In [12]:
def test(size,data):
    max_words=500
    data_copy=data.copy()
    n, _ = data_copy.shape
    #print(n)
    size_data = int((n*size) /100)
    #print(size_data)
    split = size_data-20
    drug_data=data_copy[ :size_data]
    print(drug_data.shape)
    X,y =convert_text(max_words, drug_data)
    print (X.shape)
    #X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)
    X_train = X[:split]
    y_train =y[:split]
    X_test = X[split:]
    y_test =y[split:]  
    
    GDA = GaussianDiscriminantAnalysis()
    NB = BernoulliNaiveBayes()
    logReg = SoftmaxRegression(lr=0.1, 
                           epoch=300,
                           tolerence=10e-8,
                           minibatchsize=50,
                           lambd=0)
    
    
    GDA.train(X_train,y_train)
    predictionGDA = GDA.predict(X_test) +1

    print(y_test.shape)
    print('The accuracy for GDA is ', np.sum(predictionGDA == (y_test+1))/ len(y_test)*100)
    
    NB.train(X_train,y_train)
    predictionNB = NB.predict(X_test) +1
    print('The accuracy for NB is ', np.sum(predictionNB == (y_test+1))/ len(y_test)*100)
    
    logReg.fit(X_train,y_train)
    predictionlogReg = logReg.predict(X_test) + 1
    print('The accuracy for logReg is ', np.sum(predictionlogReg == (y_test+1))/ len(y_test)*100)

In [13]:
test(10,drug)

(413, 10)
Found 2741 unique tokens.
(413, 500)
(20,)
The accuracy for GDA is  35.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


The accuracy for NB is  20.0
The accuracy for logReg is  40.0


In [14]:
test(30,drug)

(1239, 10)
Found 4937 unique tokens.
(1239, 500)
(20,)
The accuracy for GDA is  30.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


The accuracy for NB is  35.0
The accuracy for logReg is  30.0


In [15]:
test(60,drug)

(2479, 10)
Found 7117 unique tokens.
(2479, 500)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


(20,)
The accuracy for GDA is  35.0
The accuracy for NB is  25.0
The accuracy for logReg is  45.0


In [16]:
test(100,drug)

(4132, 10)
Found 9108 unique tokens.
(4132, 500)
(20,)
The accuracy for GDA is  30.0
The accuracy for NB is  20.0
The accuracy for logReg is  25.0
