# Simple Logistic Regression Model for Titanic Problem

We will first use six input features that make sense to contribute to the survival of a passenger; they are: pclass, sex, age, sibsp, parch, cabin.
The other features - fare, name and ticket number - seem to have little effect on a passenger's survival.


## Structure of this project

We can break the code into few parts
*Initializing parameters
*Forward Proogation 
*Cost function
*Back Propogation
*Updation of parameters

## Importing kernels and creating helper functions

In [256]:
import numpy as np 
import pandas as pd
import test

## Loading Data

In [381]:
data_train = pd.read_csv("train.csv")
data_train.head(0)
data_train = data_train.dropna(subset = ['Embarked'])
X_train = pd.DataFrame(data_train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']])
Y_train = pd.DataFrame(data_train[['Survived']])
Y_train = Y_train.to_numpy()

#print(X_train.dtypes)
data_test = pd.read_csv("test.csv")
data_test.head()
X_test_orig = pd.DataFrame(data_test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']])




gender = {"male": 1, "female": 2}
X_train.Sex = [gender[item] for item in X_train.Sex]
cab  = {'C': 1, 'Q': 2, 'S': 3}
X_train.Embarked = [cab[item] for item in X_train.Embarked]


Age_mean = X_train.mean(skipna = True)
#print(Age_mean["Age"])
X_train['Age'] = X_train['Age'].fillna(Age_mean["Age"])

#Normalising the data
X_train = (X_train - X_train.mean())/(X_train.std())


X_train = X_train.to_numpy()
X_0 = np.ones((X_train.shape[0], 1))

X_train = np.insert(X_train, 0, 1, axis = 1)
X_test_orig.Sex =  [gender[item] for item in X_test_orig.Sex]
X_test_orig.Embarked = [cab[item] for item in X_test_orig.Embarked]
X_test_orig['Age'] = X_test_orig['Age'].fillna(0)
X_test = X_test_orig.to_numpy()
X_test = np.insert(X_test, 0, 1, axis = 1)
X_train1 = X_train[:700, :]
X_train2 = X_train[700:, :]
Y_train1 = Y_train[:700, :]
Y_train2 = Y_train[700:, :]
print(X_test_orig.Embarked)
#print(Y_train)


0      2
1      3
2      2
3      3
4      3
      ..
413    3
414    1
415    3
416    3
417    1
Name: Embarked, Length: 418, dtype: int64


In [369]:
def sigmoid(z):
    """
    To compute the sigmoid of z
    z is an array
    Returns s -> sigmoid(z)
    """

    s = 1 / (1 + np.exp(-z))
    return s

In [370]:
print(sigmoid(X_train[:,4]))

[0.60613811 0.60613811 0.38344963 0.60613811 0.38344963 0.38344963
 0.38344963 0.90406117 0.38344963 0.60613811 0.60613811 0.38344963
 0.38344963 0.60613811 0.38344963 0.38344963 0.95887819 0.38344963
 0.60613811 0.38344963 0.38344963 0.38344963 0.38344963 0.38344963
 0.90406117 0.60613811 0.38344963 0.90406117 0.38344963 0.38344963
 0.38344963 0.60613811 0.38344963 0.38344963 0.60613811 0.60613811
 0.38344963 0.38344963 0.79202035 0.60613811 0.60613811 0.60613811
 0.38344963 0.60613811 0.38344963 0.38344963 0.60613811 0.38344963
 0.79202035 0.60613811 0.95887819 0.38344963 0.60613811 0.60613811
 0.38344963 0.38344963 0.38344963 0.38344963 0.60613811 0.98296435
 0.38344963 0.60613811 0.90406117 0.38344963 0.60613811 0.38344963
 0.38344963 0.95887819 0.79202035 0.38344963 0.98296435 0.38344963
 0.60613811 0.38344963 0.38344963 0.38344963 0.38344963 0.38344963
 0.38344963 0.38344963 0.38344963 0.38344963 0.38344963 0.38344963
 0.90406117 0.60613811 0.38344963 0.90406117 0.38344963 0.3834

In [371]:
def initialize_paras(dims):
    """
    To create the weights of dimension (n_h, 1) and a scalar b
    """
    
    w = np.zeros(shape=(dims, 1))
    b = 0
    
    return w, b

In [372]:
w, b = initialize_paras(X_train.shape[1])
print(w,b)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]] 0


In [373]:
def forwardProp( X, Y, w, b):
    """
    To perform the forward propogation step
    Returns a dictionary with the gradients and the cost separately
    """
    m = X.shape[0]
    A = sigmoid(np.dot(X,w) + b)
    cost = (-1/m) * np.sum(Y * np.log(A) + (1-Y) * np.log(1-A))
        
    dw = (1/m) * np.dot(X.T, (A-Y))
    db = (1/m) * np.sum(A - Y)
    
    grads = {"dw":dw, "db":db}

    return grads, cost

In [374]:
g, c = forwardProp( X_train, Y_train, w, b)

In [375]:
def GradientDescent( X, Y, w, b, iterations, rate):
    """
    Runs gradient decent to optimize w and b
    Returns optimized parameters, accumulated gradients and all the 
    costs to see if it is decreasing
    """
    
    costs = []
    
    for i in range(iterations):
        
        grads, cost = forwardProp(X, Y, w, b)
        dw = grads["dw"]
        db = grads["db"]
        #Updating weights
        w = w - dw*rate
        b = b - db*rate
        
        #Appending cost to the list
        if i%100 ==0:
            costs.append(cost)
        
    
    #Creating dictionaries to store the optimized parameters
    paras = {"w":w, "b":b}
    grads = {"dw":dw, "db":db}
    
    return paras, grads, costs

In [376]:
GradientDescent(X_train, Y_train, w, b, 100, 0.5)

({'w': array([[-0.32322355],
         [-0.86564023],
         [ 1.29298582],
         [-0.49311783],
         [-0.35264481],
         [-0.08610664],
         [ 0.12460108],
         [-0.17452079]]),
  'b': -0.3232235483739071},
 {'dw': array([[ 2.24417920e-04],
         [ 1.99241936e-03],
         [-7.80068741e-04],
         [ 1.20074580e-03],
         [ 6.31806819e-04],
         [-2.07953466e-04],
         [ 1.35698100e-03],
         [ 4.82237793e-05]]),
  'db': 0.0002244179199354488},
 [0.6931471805599454])

In [377]:
def predict(w, b, X):
    """
    To predict the Y value for a corresponding X value given.
    Returns the prediction for all x in X
    """
    
    m = X.shape[0]# No. of training examples
    Predictions = np.zeros((m, 1))
    
    z = np.dot(X, w) + b
    A = sigmoid(z)
    
    for i in range(m):
        if A[i, 0] > 0.5:
            Predictions[i, 0] = 1
        else:
            Predictions[i, 0] = 0
            
    return Predictions
    

In [393]:
def model(X_train, Y_train, X_test, iterations=2000, rate=0.9):
    """
    Puts together functions created as a single model
    Returns details about the model
    """
    print("Running")
    dims = X_train.shape[1]
    w, b = initialize_paras(dims)
    
    paras, grads, costs = GradientDescent(X_train, Y_train, w, b, iterations, rate)
    w = paras["w"]
    b = paras["b"]
    
    Prediction_test = predict(w, b, X_test)
    Prediction_train = predict(w, b, X_train)
    
    #Calculating accuracies
    Train_accuracy = 100 - np.mean(np.abs(Prediction_train - Y_train)) * 100
    print("Training set accuracy: " + str(Train_accuracy))
    
    details = {"costs":costs, "w":w, "b":b, "rate":rate, "iterations":iterations, 
              "Prediction_test":Prediction_test, "Prediction_train":Prediction_train}
    
    return details
    

In [395]:
details = model(X_train1, Y_train1, X_test)
p2 = predict(w, b, X_train2)
Train_accuracy1 = 100 - np.mean(np.abs(p2 - Y_train2)) * 100
print(Train_accuracy1)
print(details["costs"])
#print(details["Prediction_train"] - Y_train)
P = details["Prediction_test"].T
P = np.squeeze(np.array(P))
#print(P)
submission = pd.DataFrame({'PassengerId': data_test['PassengerId'], 'Survived':P.T})
#submission.to_csv('Titanic Predictions LR1.csv', index=False)

Running
Training set accuracy: 79.71428571428572
63.492063492063494
[0.6931471805599454, 0.45071734403346375, 0.45071604132029286, 0.4507160411483511, 0.4507160411483282, 0.4507160411483282, 0.4507160411483282, 0.45071604114832814, 0.45071604114832814, 0.45071604114832814, 0.45071604114832814, 0.45071604114832814, 0.45071604114832814, 0.45071604114832814, 0.45071604114832814, 0.45071604114832814, 0.45071604114832814, 0.45071604114832814, 0.45071604114832814, 0.45071604114832814]
