# IMPLEMENTING LOGISTIC REGRESSION FROM SCRATCH 

In [1]:
import numpy as np 
import pandas as pd

class logistic_regression:
    def __init__(self,lr,num_iter):
        self.lr=lr
        self.num_iter=num_iter 
        
    def sigmoid(self,z):
        return 1/(1+np.exp(-z))  
    def loss(self,h,y): 
        return (-y*np.log(h)-(1-y)*np.log(1-h)).mean()  
    def gradient(self,x,h,y): 
        return np.dot(x.T,(h-y))/len(y)
    def fit(self,x,y): 
        self.weights=np.zeros(x.shape[1]) 
        for _ in range(self.num_iter):
            z=np.dot(x,self.weights)
            h=self.sigmoid(z)   
            gradient=self.gradient(x,h,y) 
            self.weights-=self.lr*gradient 
    def predict(self,x,weights): 
        return np.round(self.sigmoid(np.dot(x,weights)))
    
            

In [2]:
def train_with_file(data,num_iter): 
        global columns
        columns=[]
        normalization_param={"min":[],"max":[]} 
        columns_names=['Age','Workclass','fnlgwt','Education','Education Num','Marital Status',
           'Occupation','Relationship','Race','Sex','Capital Gain','Capital Loss',
           'Hours/Week','Country','Income']
        train=pd.read_csv(data,sep=",",names=columns_names) 
        target=train["Income"]  
        target=target.astype("category").cat.codes
        train.drop("Income",axis=1,inplace=True) 
        #categorical_features=train.select_dtypes(exclude=["bool","int64","Float64"]).columns 
        train=pd.get_dummies(train) 
        num_features=train.select_dtypes(include=["int64","float64"]).columns 
        #Storing the min,max values
        for num in num_features: 
            normalization_param["min"].append(train[num].min()) 
            normalization_param["max"].append(train[num].max()) 
        #Normalizing using the formula x=(x-min(x))/(max(x)-min(x))
        train[num_features]=train[num_features].apply(lambda x:(x-x.min())/(x.max()-x.min()),axis=0)
        columns=train.columns 
        model=logistic_regression(0.1,1000) 
        model.fit(train,target) 
        return model.weights,normalization_param

In [3]:
def classify(data_file,weights,normalization_param):  
    test=pd.read_csv(data_file,sep=",",names=['Age','Workclass','fnlgwt','Education','Education Num','Marital Status',
           'Occupation','Relationship','Race','Sex','Capital Gain','Capital Loss',
           'Hours/Week','Country','Income']) 
    test.dropna(axis=0,inplace=True)
    test.drop("Income",inplace=True,axis=1)
    num_features=test.select_dtypes(include=["int64","float64"]).columns 
    for i in range(0,len(normalization_param)): 
        test[num_features[0]]=test[num_features[0]].apply(lambda x:(x-normalization_param["min"][i])/(normalization_param["max"][i]-normalization_param["min"][i]))
    test=pd.get_dummies(test)  
    inter_col=np.intersect1d(np.array(test.columns),np.array(columns)) 
    col=set(columns)-set(inter_col)
    test=test[inter_col] 
    for c in col: 
        test[c]=0
    model=logistic_regression(lr=0.1,num_iter=1000) 
    y_pred=model.predict(test,weights) 
    return y_pred

In [4]:
%time weights,normalization_param=train_with_file("adult-training.csv",1000) 
%time y_pred=classify("adult-test.csv",weights,normalization_param)

Wall time: 47.5 s
Wall time: 234 ms
