In [353]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import preprocessing
import statsmodels.api as sm
from sklearn.metrics import r2_score
import sys

In [198]:
# Load the yeast dataset; the relative path means the CSV must sit in the
# notebook's working directory.
data = pd.read_csv("yeast_csv.csv")

In [199]:
# NOTE(review): pd.read_csv already returns a DataFrame, so this wrapper looks
# redundant — `df` is kept because every later cell refers to it.
df = pd.DataFrame(data)

In [200]:
# Preview the first rows: eight numeric feature columns plus the string label
# column `class_protein_localization`.
df.head()

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc,class_protein_localization
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT


In [233]:
# Baseline model: scikit-learn logistic regression on standardised features.
log_reg = LogisticRegression(max_iter= 10000)
features = ['mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'nuc']
X = df[features].to_numpy()
# fit_transform learns the scaling and applies it in one step, so the scaler
# does not need a separate .fit(X) beforehand (that fitted it twice).
std_scaler = preprocessing.StandardScaler()
X = std_scaler.fit_transform(X)
y = df.class_protein_localization

# Last expression of the cell: fits on the full data set and displays the model.
log_reg.fit(X,y)


LogisticRegression(max_iter=10000)

In [234]:
# Fitted coefficients: one row per class, one column per entry of `features`.
log_reg.coef_

array([[-7.31687008e-01, -6.60376688e-01,  1.29154371e+00,
        -2.93867151e-01,  5.08784894e-02, -6.66106910e-02,
         3.97244118e-02,  1.47086898e-01],
       [ 1.04802028e+00,  4.55905081e-01, -3.92860236e-01,
         6.69825919e-01,  9.08433817e-01, -7.86559780e-04,
         4.32253861e-01, -4.47329662e-02],
       [ 7.39569151e-01,  1.14647228e+00,  6.95083735e-01,
        -1.14537060e-01, -2.47862825e-01, -5.69614879e-02,
        -6.89471746e-01, -8.89693896e-01],
       [ 1.34659737e+00,  1.48747154e+00, -1.18334639e+00,
        -4.42138633e-03, -2.14102910e-01, -2.38753793e-02,
         2.44401849e-02,  4.44772082e-01],
       [ 1.21086290e+00, -5.25064359e-01, -8.26136409e-01,
         1.69299797e-01,  1.72602738e-01, -9.92339504e-02,
        -1.50871031e-01, -1.57366360e-01],
       [-1.40235744e+00, -4.40412259e-01, -2.15669887e+00,
        -4.20763764e-01,  5.69533306e-02, -1.33460507e-01,
         6.46725660e-02,  4.92997569e-01],
       [-6.12000589e-01, -2.631722

In [235]:
# Fitted intercepts, one per class (the recorded output has 10 values).
log_reg.intercept_

array([ 3.94439737, -7.14335468, -1.40500068, -3.75474838, -0.11245909,
        0.8075341 ,  2.90662046,  3.74193869,  0.09158056,  0.92349165])

In [232]:
# Predicted labels for the training matrix.
# NOTE(review): the recorded output below looks stale — its execution count
# (232) precedes the fit cell's (233) and it shows integers although `y` holds
# string labels. Re-run after fitting to refresh it.
log_reg.predict(X)

array([0, 0, 0, ..., 0, 0, 0])

In [237]:
# Mean accuracy on the same X, y the model was fitted on (no held-out split),
# so this is a training score rather than a generalisation estimate.
log_reg.score(X,y)

0.6044474393530997

## Gradient descent for logistic regression 

In [409]:
def gradient_descend (X, y ,n_iteration= 20000, learning_rate = 0.1, seed=None):
    """Fit binary logistic-regression coefficients by batch gradient descent.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix. No intercept is added here; include a constant column
        in ``X`` if one is wanted.
    y : array-like, shape (m,)
        Numeric targets aligned with the rows of ``X`` (expected to be 0/1
        indicators; this is not enforced).
    n_iteration : int, default 20000
        Number of full-batch update steps.
    learning_rate : float, default 0.1
        Step size applied to the averaged gradient.
    seed : int or None, default None
        Seed for the random initialisation. ``None`` keeps the previous
        behaviour of drawing from NumPy's global random state; pass an int
        to make the fit reproducible.

    Returns
    -------
    numpy.ndarray, shape (n,)
        The coefficient vector after ``n_iteration`` updates.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, has no rows, or ``y`` does not have one entry
        per row of ``X`` (also raised by NumPy if ``y`` is not numeric).
    """
    # Convert once up front: a Python list passed as `y` would otherwise be
    # turned into an array again on every iteration of the loop below.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (m, n)")
    m, n = X.shape
    if m == 0:
        raise ValueError("X must contain at least one row")
    if y.shape != (m,):
        raise ValueError("y must be 1-D with one entry per row of X")

    rng = np.random if seed is None else np.random.default_rng(seed)
    beta = rng.uniform(-10, 10, n)

    step = learning_rate * (1 / m)  # loop-invariant scale of each update
    for _ in range(n_iteration):
        error = sigmoid(X, beta) - y
        # X.T @ error is the gradient of the summed log-loss w.r.t. beta.
        beta -= step * (X.T @ error)
    return beta

In [410]:
def sigmoid (X,beta):
    """Return the logistic function of the linear scores ``X @ beta``.

    Parameters
    ----------
    X : array-like
        Design matrix, one row per sample.
    beta : array-like
        Coefficients compatible with ``X @ beta``.

    Returns
    -------
    numpy.ndarray or scalar
        ``1 / (1 + exp(-(X @ beta)))`` evaluated element-wise.
    """
    z = X @ beta
    # 1/(1+exp(-z)) == exp(-log(1+exp(-z))) == exp(-logaddexp(0, -z)).
    # logaddexp never forms exp(-z) on its own, so strongly negative scores no
    # longer overflow (and warn) the way a direct np.exp(-z) does.
    return np.exp(-np.logaddexp(0, -z))

In [411]:
# Add a constant (intercept) column to X. X is rebound here, so every later
# cell — including MultiClassLogistic(X, y) — sees the augmented matrix.
X= sm.add_constant(X)

# NOTE(review): `y1` is not defined in any cell visible in this notebook, so
# this line fails on Restart & Run All. Presumably it was a 0/1 target for a
# single class built in a since-deleted cell — TODO define it explicitly.
beta = gradient_descend(X,y1)


## Class for multiclass logistic regression

In [412]:
class MultiClassLogistic :
    """One-vs-rest multiclass logistic regression fitted with ``gradient_descend``.

    One binary coefficient vector is trained per distinct label in ``y``. A
    sample is assigned the label whose binary model gives the highest sigmoid
    score. Training runs inside the constructor.

    Attributes
    ----------
    X, y
        The training inputs exactly as passed to the constructor.
    set_classes : set
        Distinct labels found in ``y``.
    no_of_classes : int
        Number of distinct labels.
    m : int
        Number of training labels.
    map : dict
        Class index (row of ``betas``) -> label.
    betas : numpy.ndarray, shape (no_of_classes, n_features)
        One fitted coefficient vector per class.
    """

    def __init__(self,X,y):
        """Store the data, build the index -> label map and fit one model per class."""
        self.X = X
        self.y = y
        self.set_classes = set(self.y)
        self.no_of_classes = len(self.set_classes)
        self.m = np.size(self.y)
        self.map = self.set_map()
        self.betas = self.beta_s(self.X)

    def set_map(self):
        """Return ``{class_index: label}`` with labels in order of first appearance in ``y``.

        ``dict.fromkeys`` keeps insertion order, so the mapping is the same on
        every run; iterating a ``set`` of strings is not stable across
        interpreter sessions.
        """
        return dict(enumerate(dict.fromkeys(self.y)))

    def beta_s (self,X):
        """Fit one binary model per class and stack the coefficient vectors row-wise."""
        m,n = np.shape(X)
        betas = np.empty([self.no_of_classes, n])
        for i in range(self.no_of_classes):
            # Hand over a float array so the optimiser does not have to
            # convert a Python list on each of its iterations.
            curr_y = np.asarray(self.get_y_forclass(i), dtype=float)
            betas[i] = gradient_descend(X, curr_y)
        return betas

    def get_y_forclass(self,key):
        """Return a 0/1 list marking which training labels equal class ``key``."""
        needed_class = self.map[key]
        return [1 if item == needed_class else 0 for item in self.y]

    def predict (self,X):
        """Return a list with the predicted label for each row of ``X``.

        ``X`` must be 2-D with the same columns used for training. Ties go to
        the lowest class index, as with the previous strict ``>`` scan.
        """
        linear_scores = np.asarray(X) @ self.betas.T
        # Overflow-safe sigmoid: exp(-logaddexp(0, -z)) == 1 / (1 + exp(-z)).
        class_Scores = np.exp(-np.logaddexp(0, -linear_scores))
        best = np.argmax(class_Scores, axis=1)
        return [self.map[int(k)] for k in best]

    def get_predicted_class(self,class_score):
        """Return the label whose entry in ``class_score`` is largest (first one on ties)."""
        return self.map[int(np.argmax(class_score))]

    def get_score(self,predicted_y):
        """Return the fraction of ``predicted_y`` entries equal to the stored labels.

        Compares against ``self.y`` positionally. The previous version read a
        global ``y`` for the length and indexed ``self.y[i]`` by label, which
        only worked for a Series with a default index.

        Raises
        ------
        ValueError
            If ``predicted_y`` does not have one entry per training label, or
            there are no labels to score.
        """
        actual = np.asarray(self.y)
        predicted = np.asarray(predicted_y)
        if actual.shape != predicted.shape:
            raise ValueError("predicted_y must have one entry per training label")
        n = actual.size
        if n == 0:
            raise ValueError("cannot score an empty label set")
        return np.count_nonzero(actual == predicted) / n

In [413]:
# Training happens inside the constructor: it calls beta_s, which runs
# gradient_descend once per class, so this cell may take a while.
my_model = MultiClassLogistic(X,y)

In [414]:
# Predicted labels for the training matrix.
# NOTE(review): this displays the whole prediction list; consider slicing
# (e.g. the first few entries) to keep the notebook output short.
my_model.predict(X)

['CYT',
 'MIT',
 'CYT',
 'CYT',
 'MIT',
 'POX',
 'MIT',
 'CYT',
 'CYT',
 'CYT',
 'CYT',
 'CYT',
 'CYT',
 'MIT',
 'CYT',
 'CYT',
 'NUC',
 'CYT',
 'CYT',
 'CYT',
 'CYT',
 'CYT',
 'ME2',
 'CYT',
 'CYT',
 'MIT',
 'CYT',
 'CYT',
 'CYT',
 'ME1',
 'CYT',
 'MIT',
 'NUC',
 'MIT',
 'EXC',
 'MIT',
 'CYT',
 'CYT',
 'MIT',
 'CYT',
 'ME1',
 'MIT',
 'ME2',
 'ME3',
 'CYT',
 'NUC',
 'MIT',
 'CYT',
 'CYT',
 'ME3',
 'CYT',
 'NUC',
 'NUC',
 'CYT',
 'CYT',
 'CYT',
 'CYT',
 'CYT',
 'CYT',
 'MIT',
 'MIT',
 'NUC',
 'NUC',
 'CYT',
 'CYT',
 'CYT',
 'CYT',
 'CYT',
 'CYT',
 'ME1',
 'CYT',
 'CYT',
 'MIT',
 'MIT',
 'CYT',
 'MIT',
 'MIT',
 'MIT',
 'CYT',
 'MIT',
 'MIT',
 'MIT',
 'MIT',
 'CYT',
 'POX',
 'ME3',
 'ME3',
 'ME2',
 'NUC',
 'MIT',
 'NUC',
 'CYT',
 'CYT',
 'NUC',
 'CYT',
 'ME3',
 'MIT',
 'MIT',
 'ME1',
 'CYT',
 'ME3',
 'ME3',
 'MIT',
 'NUC',
 'CYT',
 'ME3',
 'CYT',
 'CYT',
 'CYT',
 'CYT',
 'MIT',
 'ME3',
 'NUC',
 'MIT',
 'NUC',
 'MIT',
 'MIT',
 'MIT',
 'NUC',
 'CYT',
 'ME3',
 'MIT',
 'MIT',
 'MIT',
 'ME3',


In [415]:
# Fraction of training rows whose predicted label matches the true label
# (training accuracy of the hand-written model).
my_model.get_score(my_model.predict(X))

0.5923180592991913