# GDA ALGORITHM

In [1]:
#import our libraries
import numpy as np
import pandas as pd
import numpy.linalg as LA
from sklearn.feature_extraction.text import CountVectorizer

## LOAD OUR DATA SET
##  as we have two dataset we have to caoncatenate its first

In [2]:
data1 = pd.read_table('drugLibTest_raw.tsv',sep='\t')
data2 = pd.read_table('drugLibTrain_raw.tsv',sep='\t')
data = pd.concat((data1,data2), axis=0)
data.head()

Unnamed: 0.1,Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,1366,biaxin,9,Considerably Effective,Mild Side Effects,sinus infection,The antibiotic may have destroyed bacteria cau...,"Some back pain, some nauseau.",Took the antibiotics for 14 days. Sinus infect...
1,3724,lamictal,9,Highly Effective,Mild Side Effects,bipolar disorder,Lamictal stabilized my serious mood swings. On...,"Drowsiness, a bit of mental numbness. If you t...",Severe mood swings between hypomania and depre...
2,3824,depakene,4,Moderately Effective,Severe Side Effects,bipolar disorder,Initial benefits were comparable to the brand ...,"Depakene has a very thin coating, which caused...",Depakote was prescribed to me by a Kaiser psyc...
3,969,sarafem,10,Highly Effective,No Side Effects,bi-polar / anxiety,It controlls my mood swings. It helps me think...,I didnt really notice any side effects.,This drug may not be for everyone but its wond...
4,696,accutane,10,Highly Effective,Mild Side Effects,nodular acne,Within one week of treatment superficial acne ...,Side effects included moderate to severe dry s...,Drug was taken in gelatin tablet at 0.5 mg per...


then i will group the data according to the length, 
and for that I will use dummies to convert my strings values in
numerical values for the data having a small length and countvectorize for the data having
a long length for the conversion

In [3]:
d1 = data[['effectiveness','sideEffects']]
#data['benefitsReview','commentsReview']= data['benefitsReview','commentsReview'].str.lower()
text= data[['benefitsReview','commentsReview','sideEffectsReview']].astype(str).apply(lambda z: ' '.join(z),axis=1)
text = text.str.lower()
texte= text.replace('[\d+]')#remove numbers
d2 = texte
data3 = pd.get_dummies(d1)
 #list of text documents
text = d2
# create the transform
vectorizer = CountVectorizer( binary = True ,min_df=50,
                            stop_words ='english', max_features =50,
                                                         )
# tokenize and build vocab
tati = vectorizer.fit_transform(text)
data_text=tati.toarray()

 after that i transform it into dataframe and concatenate the both 

In [4]:
data_text = pd.DataFrame(data_text)
D = pd.concat([pd.DataFrame(data3.values),pd.DataFrame(data_text.values)],axis=1)
D

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,1,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,1
3,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,1,0,0,0,...,0,1,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4138,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4139,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,1,1,1,1,1
4140,0,0,0,1,0,0,0,1,0,0,...,1,0,0,0,1,1,0,0,0,0
4141,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


Define my features and target. here i use the whole dataset ie 100%

In [5]:
data['rating'] =data['rating'].apply(lambda x  :x-1)
y=data['rating'].values
x = D.values

## Define the function

In [6]:
#our function
class Gaussian(object):
    
      
    '''
    The GDA algorithm (generative algorithm) is a generative 
    learning algorithm in which we assume p(x|y) is distributed according to a multivariate normal distribution
    '''
    #initialise our variables
    def __init__(self,x,y):
        self.phi = 0
        self.x = x
        self.y = y
     
    #calculate the function phi
    def calculate_phi(self):
        classes = np.unique(self.y)
        n_samples = len(self.y)
        pi = []
        for k in classes:
            pi.append(np.sum(self.y==k)/n_samples)
        return pi
    
     #calculate the mu  
    def calculate_mu(self):
        classes = np.unique(self.y)
        mu = []
        for i in classes:
            mu.append( np.mean(x[y==1],axis=0))
        return mu

    #calculate the sigma
    def calculate_sigma(self):
            x = self.x
            classes = np.unique(self.y)
            n_samples= len(self.y)
            sigma =0

            for i in range(n_samples):
                for k in classes:
                    if self.y[i] ==k:
                        sigma+= (x[i].reshape(1,-1)-mu[k]).T@(x[i].reshape(1,-1)-mu[k].reshape(1,-1))
                        #print(sigma.shape)
            return sigma/n_samples
 
    # define the  multivariate normal distribution
    def multivariate(self,x,sigma,mu):
            dim = len(mu)
            c =  (1./np.sqrt((2*np.pi)**(dim))*(LA.det(sigma)+0.1)**0.5)
            gaussian = c * np.exp((-0.5)*np.dot(np.dot((x-mu), LA.inv(sigma)), (x-mu).T))

            return gaussian

        #prediction
    def predict(self,x,mu,phi,sigma):
        
        pred =[]
        for i in range(len(x)):
                prob =[]
                for j in range(len(mu)):
                    s=self.multivariate(x[i],sigma,mu[j])*phi[j]
                    #print(s)
                    prob.append(s)
                pred.append(np.argmax(prob))    
        return pred

In [7]:
pi = Gaussian(x,y).calculate_phi()
pi

[0.10137581462708183,
 0.032826454260197924,
 0.04706734250543085,
 0.03644701906830799,
 0.05454984310885831,
 0.0518947622495776,
 0.11368573497465605,
 0.18054549843108858,
 0.147960415158098,
 0.23364711561670287]

In [8]:
mu =  Gaussian(x,y).calculate_mu()
mu

[array([0.13970588, 0.08823529, 0.40441176, 0.22058824, 0.14705882,
        0.13970588, 0.13970588, 0.20588235, 0.07352941, 0.44117647,
        0.05882353, 0.13235294, 0.08823529, 0.40441176, 0.29411765,
        0.14705882, 0.31617647, 0.24264706, 0.13970588, 0.27205882,
        0.09558824, 0.11029412, 0.08823529, 0.375     , 0.16911765,
        0.08823529, 0.19852941, 0.09558824, 0.125     , 0.16176471,
        0.06617647, 0.19117647, 0.19117647, 0.13970588, 0.16176471,
        0.05882353, 0.16176471, 0.27205882, 0.13970588, 0.18382353,
        0.21323529, 0.11029412, 0.11029412, 0.19852941, 0.17647059,
        0.15441176, 0.14705882, 0.375     , 0.19852941, 0.13235294,
        0.30147059, 0.22058824, 0.16911765, 0.05882353, 0.125     ,
        0.17647059, 0.08823529, 0.125     , 0.13970588, 0.125     ]),
 array([0.13970588, 0.08823529, 0.40441176, 0.22058824, 0.14705882,
        0.13970588, 0.13970588, 0.20588235, 0.07352941, 0.44117647,
        0.05882353, 0.13235294, 0.08823529, 0.

In [9]:
sigma  =  Gaussian(x,y).calculate_sigma()
sigma

array([[ 0.23484195, -0.07274741, -0.07544071, ..., -0.00405783,
        -0.00175131,  0.00478948],
       [-0.07274741,  0.35385467, -0.14126816, ..., -0.00057947,
        -0.0037721 ,  0.03409986],
       [-0.07544071, -0.14126816,  0.1787304 , ...,  0.00470341,
         0.00688809, -0.02144098],
       ...,
       [-0.00405783, -0.00057947,  0.00470341, ...,  0.10686323,
         0.00764867,  0.00775027],
       [-0.00175131, -0.0037721 ,  0.00688809, ...,  0.00764867,
         0.11309159,  0.00491017],
       [ 0.00478948,  0.03409986, -0.02144098, ...,  0.00775027,
         0.00491017,  0.15266338]])

In [10]:
# X_train = x[20:]
# X_test = x[:20]
# Y_train = y[20:]
# Y_test = y[:20]

In [11]:
X_train = x[20:]
X_test = x[:20]
Y_train = y[20:]
Y_test = y[:20]

In [12]:
pred_test = Gaussian(x,y).predict(X_test,mu,pi,sigma)
pred_test

[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]

## The accuracy

In [13]:
s=0
for i in range(len(Y_test)):
    if Y_test[i]==pred_test[i]:
        s=s+1
                
print((s/len(Y_test)) *100) 
    

30.0


## Using 10% of the data

In [14]:
percentage = int(len(x)* 10/100)
x = x[:percentage,]
y = y[:percentage]
X_train = x[20:]
X_test = x[:20]
Y_train = y[20:]
Y_test = y[:20]

In [15]:
pi = Gaussian(x,y).calculate_phi()
mu =  Gaussian(x,y).calculate_mu()
sigma  =  Gaussian(x,y).calculate_sigma()
pred_test = Gaussian(x,y).predict(X_test,mu,pi,sigma)
pred_test

[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]

In [16]:
s=0
for i in range(len(Y_test)):
    if Y_test[i]==pred_test[i]:
        s=s+1
                
print((s/len(Y_test)) *100) 

30.0


## Using 30% of the data

In [17]:
percentage = int(len(x)* 30/100)
x = x[:percentage,]
y = y[:percentage]
X_train = x[20:]
X_test = x[:20]
Y_train = y[20:]
Y_test = y[:20]

In [18]:
pi = Gaussian(x,y).calculate_phi()
mu =  Gaussian(x,y).calculate_mu()
sigma  =  Gaussian(x,y).calculate_sigma()
pred_test = Gaussian(x,y).predict(X_test,mu,pi,sigma)
pred_test

[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]

In [19]:
s=0
for i in range(len(Y_test)):
    if Y_test[i]==pred_test[i]:
        s=s+1
                
print((s/len(Y_test)) *100) 

30.0


## Using 60% of the data

In [20]:
percentage = int(len(x)* 60/100)
x = x[:percentage,]
y = y[:percentage]
X_train = x[20:]
X_test = x[:20]
Y_train = y[20:]
Y_test = y[:20]

In [21]:
pi = Gaussian(x,y).calculate_phi()
mu =  Gaussian(x,y).calculate_mu()
sigma  =  Gaussian(x,y).calculate_sigma()
pred_test = Gaussian(x,y).predict(X_test,mu,pi,sigma)
pred_test

[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]

In [22]:
s=0
for i in range(len(Y_test)):
    if Y_test[i]==pred_test[i]:
        s=s+1
                
print((s/len(Y_test)) *100) 

30.0


## Observations


we notice that whatever the size of the input data we get a fairly good performance of the algorithm
so in view of all of the above we can say that this model is best suited for solving the problem