## Logistic Regression

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [2]:
def get_sigmoid(features_k, beta):
    exp = np.dot(features_k, np.transpose(beta))
    sigmoid = 1/(1+np.exp(-exp))
    return sigmoid

In [3]:
def logistic_regression(features, output, learning_rate, max_iter):
    beta = np.zeros(len(np.transpose(features)))
    gradient_j = 0
    for i in range(max_iter):
        for j in range(len(beta)):
            for k in range(len(features)):
                sigmoid = get_sigmoid(features[k], beta)
                gradient_j -= features[k][j]*(output[k] - sigmoid)
            beta[j] += learning_rate * gradient_j
    return beta

In [17]:
data = pd.read_csv("train_titanic.csv")
data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S,0


In [18]:
test = pd.read_csv("test_titanic.csv")
test.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,"Davies, Master. John Morgan Jr",male,8.0,1,1,C.A. 33112,36.75,,S
1,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.9292,D17,S
2,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
3,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Chr...",female,24.0,2,1,243847,27.0,,S
4,1,"McGough, Mr. James Robert",male,36.0,0,0,PC 17473,26.2875,E25,S


In [19]:
test.dropna(subset=['Embarked'], inplace=True)

In [20]:
data.dropna(subset=['Embarked'], inplace=True)

In [21]:
x_train = data.iloc[:,0:10]
x_test = test.iloc[:,0:10]

In [22]:
x_test.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

In [23]:
x_train.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

In [24]:
encoder = OneHotEncoder()
temp = encoder.fit_transform(np.array(x_test['Sex']).reshape(-1,1)).toarray()
temp = temp[:,1:]
x_test['Male'] = temp.flatten()
x_test.drop(columns=['Sex'], inplace=True)

In [25]:
encoder = OneHotEncoder()
temp = encoder.fit_transform(np.array(x_train['Sex']).reshape(-1,1)).toarray()
temp = temp[:,1:]
x_train['Male'] = temp.flatten()
x_train.drop(columns=['Sex'], inplace=True)

In [26]:
temp = encoder.fit_transform(np.array(x_test['Embarked']).reshape(-1,1)).toarray()
temp = temp[:,1:]
x_test.drop(columns=['Embarked'], inplace =True)
temp
x_test['Q'] = temp[:,0]
x_test['S'] = temp[:,1]

In [27]:
temp = encoder.fit_transform(np.array(x_train['Embarked']).reshape(-1,1)).toarray()
temp = temp[:,1:]
x_train.drop(columns=['Embarked'], inplace =True)
temp

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]])

In [28]:
x_train['Q'] = temp[:,0]
x_train['S'] = temp[:,1]

In [29]:
x_train.tail()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Male,Q,S
663,2,17.0,0,0,10.5,0.0,0.0,1.0
664,3,,0,0,7.75,1.0,1.0,0.0
665,3,32.0,0,0,56.4958,1.0,0.0,1.0
666,3,22.0,0,0,9.8375,0.0,0.0,1.0
667,3,,1,0,15.5,0.0,1.0,0.0


In [30]:
x_test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Male,Q,S
0,2,8.0,1,1,36.75,1.0,0.0,1.0
1,1,49.0,0,0,25.9292,0.0,0.0,1.0
2,3,,0,0,7.7375,1.0,1.0,0.0
3,2,24.0,2,1,27.0,0.0,0.0,1.0
4,1,36.0,0,0,26.2875,1.0,0.0,1.0


In [32]:
x_train['Age'] = x_train['Age'].fillna(value=np.round(np.mean(x_train['Age'])))

In [33]:
x_test['Age'] = x_test['Age'].fillna(value=np.round(np.mean(x_test['Age'])))

In [34]:
x_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Male,Q,S
0,2,29.0,1,0,26.0,0.0,0.0,1.0
1,3,30.0,0,0,8.05,1.0,0.0,1.0
2,2,39.0,0,0,26.0,1.0,0.0,1.0
3,3,29.0,0,4,21.075,0.0,0.0,1.0
4,3,25.0,0,0,7.05,1.0,0.0,1.0


In [35]:
y_train = data.iloc[:,10]
y_train = np.array(y_train)

In [36]:
x_train = np.array(x_train)
x_train

array([[ 2., 29.,  1., ...,  0.,  0.,  1.],
       [ 3., 30.,  0., ...,  1.,  0.,  1.],
       [ 2., 39.,  0., ...,  1.,  0.,  1.],
       ...,
       [ 3., 32.,  0., ...,  1.,  0.,  1.],
       [ 3., 22.,  0., ...,  0.,  0.,  1.],
       [ 3., 30.,  1., ...,  0.,  1.,  0.]])

In [37]:
y_train

array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,

In [45]:
beta = logistic_regression(x_train, y_train, 0.01, 1000)

In [46]:
y_pred = []
for i in range(len(x_train)):
    y_pred.append(1/(1+np.exp(-1*(np.dot(x_train[i], beta)))))

In [47]:
y_pred

[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0

In [38]:
clf = LogisticRegression(random_state=0).fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
y_pred = clf.predict(x_test)

In [44]:
y_pred = pd.DataFrame(y_pred)
y_pred.to_csv('out.csv', index=False)