In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

In [2]:
train_data = pd.read_csv('train.csv')
train_data #train
test_data = pd.read_csv('test.csv')
test_data #test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [3]:

train = train_data.drop(['Cabin', 'Ticket'], axis=1)
test = test_data.drop(['Cabin', 'Ticket'], axis=1)


In [4]:

### before dropping the name column you may want to extract the title for each person

train['Title'] = train.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

pd.crosstab(train['Title'], train['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [5]:
### replacing the titles to fit 5 catergories

train['Title'] = train['Title'].replace(['Capt', 'Col', 'Countess', 'Lady', 'Col','Don','Dona','Dr','Major','Jonkheer','Rev','Sir'], 'Rare')
train['Title'] = train['Title'].replace('Mlle','Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme','Mrs')
train[['Title', 'Survived']].groupby(['Title']).mean()

Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
Master,0.575
Miss,0.702703
Mr,0.156673
Mrs,0.793651
Rare,0.347826


In [6]:

### changing title into numeric variables

map_title = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
train['Title'] = train['Title'].map(map_title)

In [7]:

### drop the name feature now

train = train.drop(['Name','PassengerId'], axis=1)

In [8]:
### convert the other non-numeric features into numeric

map_sex = {'male': 1 , 'female': 0}
train['Sex'] = train['Sex'].map(map_sex)

map_embark = {'S': 3, 'C':1, 'Q':2}
train['Embarked'] = train['Embarked'].map(map_embark)

In [9]:
train['Age'] = train['Age'].fillna(value=train['Age'].mean())

In [10]:
train['Embarked'] = train['Embarked'].fillna(value=3) ## filling with the most common

In [11]:

x = train.drop('Survived', axis=1)
y = train["Survived"]
x.to_numpy()

array([[ 3.        ,  1.        , 22.        , ...,  7.25      ,
         3.        ,  1.        ],
       [ 1.        ,  0.        , 38.        , ..., 71.2833    ,
         1.        ,  3.        ],
       [ 3.        ,  0.        , 26.        , ...,  7.925     ,
         3.        ,  2.        ],
       ...,
       [ 3.        ,  0.        , 29.69911765, ..., 23.45      ,
         3.        ,  2.        ],
       [ 1.        ,  1.        , 26.        , ..., 30.        ,
         1.        ,  1.        ],
       [ 3.        ,  1.        , 32.        , ...,  7.75      ,
         2.        ,  1.        ]])

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import random
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.10)
print(y_train.shape)
print(y_test.shape)

(801,)
(90,)


In [13]:

### function for data partition train-validation-test

def partition(x,y,train_portion = None):
    """ Partitions the data into train-validatin-test. 
    Inputs  - x : the titanic dataset
            - y : the survived columns 
    Outputs - The data splitted in 3 different parts
    
    """

    # Divide the dataset into train, validation and test splits with the respective portions

    ### putting together the data

    if train_portion is None:
        train_portion = 0.8
    else:
        train_portion = train_portion
    
    valid_portion = 0.1
    test_portion = 0.1

    y = y.to_numpy()
    x = x.to_numpy()

    ### randomise the data set
    idx = [i for i in range(len(x))]
    random.shuffle(idx)
    train_idx, valid_idx, test_idx = np.split(idx,[int(train_portion*len(x)), int((train_portion + test_portion)*len(x))])

    X_train = x[train_idx]
    Y_train = y[train_idx]

    X_valid = x[valid_idx]
    Y_valid = y[valid_idx]

    X_test = x[test_idx]
    Y_test = y[test_idx]

    return X_train, Y_train, X_valid, Y_valid, X_test, Y_test

In [14]:
X_train, Y_train, X_valid, Y_valid, X_test, Y_test=partition(x,y,train_portion=0.8)

In [15]:
print(Y_valid.shape)


(89,)


In [16]:
# Logistic Sigmoid equation 
def sigmoid(a):
    return 1 / (1 + np.exp(-a))

# Cost function equation
def J(h, y):
    return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

In [17]:
# Stochastic gradient descent equation
def gradientdescent(X, y, lmd, alpha, itr, print_cost):

    weights = np.zeros(X.shape[1])
    
    costs = []  
    
    #Calculating across all iterations
    for i in range(itr):
        a = np.dot(X, weights)
        h = sigmoid(a)
        
        # regularizing the data
        reg = lmd / y.size * weights
        reg[0] = 0
        cost = J(h, y)
        
        # Gradient Descent equation
        graddes = np.dot(X.T, (h - y)) / y.size + reg
        weights = weights - alpha * graddes
    
        if print_cost and i % 100 == 0: 
             print('Number of Iterations: ', i, 'Cost : ', cost, 'Weight: ', weights)
        if i % 100 == 0:
            costs.append(cost)
      
    return weights, costs

In [18]:
# Predict Function 
def predict(X_test, weights):
    a = np.dot(X_test, weights)
    return sigmoid(a)

In [19]:
# Logistic Regression function
def logistic(X_train, y_train, X_test, lmd=0, alpha=0.1, itr=30000, print_cost = False):
    
    # Adding intercept to train and test data
    intercept = np.ones((X_train.shape[0], 1))
    X_train = np.concatenate((intercept, X_train), axis=1)
    
    intercept = np.ones((X_test.shape[0], 1))
    X_test = np.concatenate((intercept, X_test), axis=1)

    # Applying the cost function
    p=set(y_train)
    q=[]
    allCosts=[]   
    for s in p:
        # Giving binary values to each data
        ynew = np.array(y_train == s, dtype = int)
        
        weights_Jcost, costs_Jcost = gradientdescent(X_train, ynew, lmd, alpha, itr, print_cost)
        q.append(weights_Jcost)
        
        # Applying costs
        allCosts.append(costs_Jcost)
        
    # Determines the probability of a class membership for train and test

    ptest = np.zeros((len(p),len(X_test)))
    for i in range(len(p)):
        ptest[i,:] = predict(X_test,q[i])
    
    # Select max probability
    prediction_test = np.argmax(ptest, axis=0)
 
    ptrain = np.zeros((len(p),len(X_train)))
    for i in range(len(p)):
        ptrain[i,:] = predict(X_train,q[i])
    
    # Select max probability
    prediction_train = np.argmax(ptrain, axis=0)
    
    pt = prediction_test
        
    
    return pt

In [20]:
# Calling the function
log = logistic(X_train, Y_train, X_test)

  import sys
  import sys
  This is separate from the ipykernel package so we can avoid doing imports until


In [21]:
# Accuracy function used to compare succesfully classified to total samples
def accuracy(Y_predict, Y):
  assert len(Y) == len(Y_predict)
  correct = sum(Y_predict == Y)
  return correct/len(Y)

In [22]:
# Printing the test prediction
Y_test_predict = log
print(Y_test_predict)

[1 0 1 1 1 0 1 1 0 0 0 0 1 0 1 0 1 1 1 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1
 0 0 1 0 1 1 0 1 1 0 1 0 0 0 1 1 0 1 0 1 1 0 0 1 0 0 0 1 1 1 0 1 0 1 0 1 1
 0 1 0 0 0 1 1 1 1 1 0 0 1 1 0 1]


In [23]:
# Accuracy of the created Logistic Regression method
print(accuracy(Y_test_predict, Y_test))


0.7111111111111111
