In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [172]:
class LogisticRegression:
    def __init__(self, x, y, learning_rate = 0.01, iterations = 1000):
        self.X = x
        self.Y = y
        self.learning_rate = learning_rate
        self.iterations = iterations
    
    def initialize_weights(self, dim):
        self.w = np.zeros((dim, 1))
        self.b = np.float64(0)
    
    def get_weights(self):
        return self.w, self.b

    def split_data(self, test_size=0.33, random_state=42):
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.X, self.Y, test_size = test_size, random_state = 42)
        self.x_train = np.array(self.x_train)
        self.x_test = np.array(self.x_test)
        self.y_train = np.array(self.y_train)
        self.y_test = np.array(self.y_test)
        self.m = self.x_train.shape[0]
        print(self.x_train.shape, self.x_test.shape, self.y_train.shape, self.y_test.shape)
    
    def sigmoid(self, z):
        s = 1 / (1 + np.exp(-z))
        return s
    
    def calculate_cost(self, A):
        cost = -1 /self.m * np.sum(self.y_train * np.log(A) + (1 - self.y_train) * np.log(1 - A))
        cost = np.squeeze(np.array(cost))
        return cost
    
    def forward_propogation(self):
        A = self.sigmoid(np.dot(self.w.T,self.x_train)+self.b)
        cost = self.calculate_cost(A)
        return A, cost
    
    def backward_propogation(self, A):
        print(np.dot(X, (A-Y).T).shape)
        dw = 1 / self.m * np.dot(X, (A-Y).T)
        db = 1/ self.m * np.sum(A-Y)
        grads = {"dw": dw,
                 "db": db}
        return grads
    
    def predict(self, X):
        Y_prediction = np.zeros((1, X.shape[0]))
        A = self.sigmoid(np.dot(self.w.T, X) + self.b)
        for i in range(A.shape[1]):
            if A[0, i] > 0.5:
                Y_prediction[0,i] = 1
            else:
                Y_prediction[0,i] = 0
        Y_prediction = np.zeros((1, m))
        return Y_prediction
    
    def optimize(self):
        for i in range(self.iterations):
            A, cost = self.forward_propogation()
            grads = self.backward_propogation(A)
            print(grads["dw"].shape, self.w.shape)
            self.w = self.w - self.learning_rate * grads["dw"]
            self.b = self.b - self.learning_rate * grads["db"]
            if i % 100 == 0:
                self.costs.append(cost)
                print ("Cost after iteration %i: %f" %(i, cost))
        
    def fit(self):
        # initializing variables and splitting data set
        
        self.split_data()
        self.initialize_weights(self.x_train.shape[0])
        self.costs = []
        
        # Applying gradient descent
        
        self.optimize()
        
        # Printing test/train accuracy
        
        self.Y_prediction_train = self.predict(self.x_train)
        train_score = 100 - np.mean(np.abs(self.Y_prediction_train - y_train)) * 100
        
        self.Y_prediction_test = self.predict(self.x_test)
        test_score = 100 - np.mean(np.abs(self.Y_prediction_test - y_test)) * 100
        
        print("Train accuracy: {} %".format(train_score))
        print("Test accuracy: {} %".format(test_score))

In [173]:
df = pd.read_excel('titanic.xls')
columns_to_drop = ['name','ticket', 'home.dest']
df = df.drop(columns_to_drop, axis=1)

df['embarked'] = pd.factorize(df['embarked'])[0]

df['cabin'] = df['cabin'].fillna(df['cabin'].mode())
df['cabin'] = pd.factorize(df['cabin'])[0]

df['sex'].value_counts(dropna=False)
df['sex'] = df['sex'].map(lambda x: 1 if x == 'male' else 0)
df['sex'].value_counts()

df['fare'] = df['fare'].fillna(df['fare'].mean())

df['age'] = df['age'].fillna(df['age'].mean())

df['boat'] = df['boat'].fillna(0)
df['boat'] = pd.factorize(df['boat'])[0]

df['body'] = df['body'].fillna(df['body'].mean())

df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked,boat,body
0,1,1,0,29.0,0,0,211.3375,0,0,0,160.809917
1,1,1,1,0.9167,1,2,151.55,1,0,1,160.809917
2,1,0,0,2.0,1,2,151.55,1,0,2,160.809917
3,1,0,1,30.0,1,2,151.55,1,0,2,135.0
4,1,0,0,25.0,1,2,151.55,1,0,2,160.809917


In [174]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   survived  1309 non-null   int64  
 2   sex       1309 non-null   int64  
 3   age       1309 non-null   float64
 4   sibsp     1309 non-null   int64  
 5   parch     1309 non-null   int64  
 6   fare      1309 non-null   float64
 7   cabin     1309 non-null   int64  
 8   embarked  1309 non-null   int64  
 9   boat      1309 non-null   int64  
 10  body      1309 non-null   float64
dtypes: float64(3), int64(8)
memory usage: 112.6 KB


In [175]:
df.corr()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked,boat,body
pclass,1.0,-0.3124694,0.124617,-0.36637,0.060832,0.018322,-0.558477,-0.503073,0.042356,-0.0626319,-0.01064472
survived,-0.312469,1.0,-0.528693,-0.050199,-0.027825,0.08266,0.244208,0.225741,0.095171,0.7241942,-1.014861e-15
sex,0.124617,-0.5286931,1.0,0.057398,-0.109609,-0.213125,-0.185484,-0.097263,-0.116904,-0.3500616,-0.002509213
age,-0.36637,-0.05019898,0.057398,1.0,-0.190747,-0.130872,0.171521,0.178484,0.032068,-0.08338106,0.01964239
sibsp,0.060832,-0.02782512,-0.109609,-0.190747,1.0,0.373587,0.160224,-0.017956,-0.07211,-0.03674232,-0.02591925
parch,0.018322,0.08265957,-0.213125,-0.130872,0.373587,1.0,0.221522,0.029907,-0.094181,0.03551282,0.01517702
fare,-0.558477,0.2442078,-0.185484,0.171521,0.160224,0.221522,1.0,0.329468,0.058818,0.008847911,-0.01058336
cabin,-0.503073,0.2257414,-0.097263,0.178484,-0.017956,0.029907,0.329468,1.0,0.030111,0.04591269,0.01693983
embarked,0.042356,0.09517116,-0.116904,0.032068,-0.07211,-0.094181,0.058818,0.030111,1.0,0.1179736,0.009080342
boat,-0.062632,0.7241942,-0.350062,-0.083381,-0.036742,0.035513,0.008848,0.045913,0.117974,1.0,-1.649683e-16


In [176]:
Y = np.array(df.iloc[:, 1:2])
X = np.array(df.iloc[:, 2:])

In [177]:
model = LogisticRegression(X, Y, learning_rate = 0.01, iterations = 1000)
model.fit()

(877, 9) (432, 9) (877, 1) (432, 1)
(1309, 1309)
(1309, 1309) (877, 1)


ValueError: operands could not be broadcast together with shapes (877,1) (1309,1309) 