In [155]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns

In [156]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
print(train_df)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [157]:
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [158]:
# operating for age
print('Percent of missing "Age" records is %.2f%%' %((train_df['Age'].isnull().sum()/train_df.shape[0])*100))

Percent of missing "Age" records is 19.87%


In [159]:
# mean age
print('The mean of "Age" is %.2f' %(train_df["Age"].mean(skipna=True)))
# median age
print('The median of "Age" is %.2f' %(train_df["Age"].median(skipna=True)))

The mean of "Age" is 29.70
The median of "Age" is 28.00


In [160]:
#operating on missing cabin values
print('Percent of missing "Cabin" is %.2f%%' %((train_df['Cabin'].isnull().sum()/train_df.shape[0])*100))

Percent of missing "Cabin" is 77.10%


In [161]:
#operating on missing embarked
print('Percent of missing "Embarked" records is %.2f%%' %((train_df['Embarked'].isnull().sum()/train_df.shape[0])*100))

Percent of missing "Embarked" records is 0.22%


In [162]:
print('The most common boarding port of embarkation is %s.' %train_df['Embarked'].value_counts().idxmax())
#we shall assign most common boarding port to the missing values

The most common boarding port of embarkation is S.


In [163]:
train_d = train_df.copy()
train_d["Age"].fillna(train_df["Age"].median(skipna=True), inplace=True)
train_d["Embarked"].fillna(train_df['Embarked'].value_counts().idxmax(), inplace=True)
train_d.drop('Cabin', axis=1, inplace=True)
# train_d is modified form of train_df

In [164]:
print(train_d)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [165]:
train_d.isnull().sum()
#all values are null, so it is easier to solve

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [166]:



train_d['TravelAlone']=np.where((train_d["SibSp"]+train_d["Parch"])>0, 0, 1)
train_d.drop('SibSp', axis=1, inplace=True)
train_d.drop('Parch', axis=1, inplace=True)

In [167]:
#create categorical variables and drop some variables
training=pd.get_dummies(train_d, columns=["Pclass","Embarked","Sex"])
training.drop('Sex_female', axis=1, inplace=True)
training.drop('PassengerId', axis=1, inplace=True)
training.drop('Name', axis=1, inplace=True)
training.drop('Ticket', axis=1, inplace=True)

final_train = training
print(final_train)

     Survived   Age     Fare  TravelAlone  Pclass_1  Pclass_2  Pclass_3  \
0           0  22.0   7.2500            0         0         0         1   
1           1  38.0  71.2833            0         1         0         0   
2           1  26.0   7.9250            1         0         0         1   
3           1  35.0  53.1000            0         1         0         0   
4           0  35.0   8.0500            1         0         0         1   
..        ...   ...      ...          ...       ...       ...       ...   
886         0  27.0  13.0000            1         0         1         0   
887         1  19.0  30.0000            1         1         0         0   
888         0  28.0  23.4500            0         0         0         1   
889         1  26.0  30.0000            1         1         0         0   
890         0  32.0   7.7500            1         0         0         1   

     Embarked_C  Embarked_Q  Embarked_S  Sex_male  
0             0           0           1        

In [168]:
test_df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [169]:
test_d = test_df.copy()
test_d["Age"].fillna(train_df["Age"].median(skipna=True), inplace=True)
test_d["Fare"].fillna(train_df["Fare"].median(skipna=True), inplace=True)
test_d.drop('Cabin', axis=1, inplace=True)

test_d['TravelAlone']=np.where((test_d["SibSp"]+test_d["Parch"])>0, 0, 1)

test_d.drop('SibSp', axis=1, inplace=True)
test_d.drop('Parch', axis=1, inplace=True)

testing = pd.get_dummies(test_d, columns=["Pclass","Embarked","Sex"])
testing.drop('Sex_female', axis=1, inplace=True)
testing.drop('PassengerId', axis=1, inplace=True)
testing.drop('Name', axis=1, inplace=True)
testing.drop('Ticket', axis=1, inplace=True)

final_test = testing
print(final_test)

      Age      Fare  TravelAlone  Pclass_1  Pclass_2  Pclass_3  Embarked_C  \
0    34.5    7.8292            1         0         0         1           0   
1    47.0    7.0000            0         0         0         1           0   
2    62.0    9.6875            1         0         1         0           0   
3    27.0    8.6625            1         0         0         1           0   
4    22.0   12.2875            0         0         0         1           0   
..    ...       ...          ...       ...       ...       ...         ...   
413  28.0    8.0500            1         0         0         1           0   
414  39.0  108.9000            1         1         0         0           1   
415  38.5    7.2500            1         0         0         1           0   
416  28.0    8.0500            1         0         0         1           0   
417  28.0   22.3583            0         0         0         1           1   

     Embarked_Q  Embarked_S  Sex_male  
0             1        

In [170]:
cols = ["Age","Fare","TravelAlone","Pclass_1","Pclass_2","Embarked_C","Embarked_S","Embarked_Q","Sex_male"] 
x_tr = final_train[cols]
y_tr = final_train['Survived']
print(x_tr)
print(y_tr)

      Age     Fare  TravelAlone  Pclass_1  Pclass_2  Embarked_C  Embarked_S  \
0    22.0   7.2500            0         0         0           0           1   
1    38.0  71.2833            0         1         0           1           0   
2    26.0   7.9250            1         0         0           0           1   
3    35.0  53.1000            0         1         0           0           1   
4    35.0   8.0500            1         0         0           0           1   
..    ...      ...          ...       ...       ...         ...         ...   
886  27.0  13.0000            1         0         1           0           1   
887  19.0  30.0000            1         1         0           0           1   
888  28.0  23.4500            0         0         0           0           1   
889  26.0  30.0000            1         1         0           1           0   
890  32.0   7.7500            1         0         0           0           0   

     Embarked_Q  Sex_male  
0             0        

In [171]:
cols = ["Age","Fare","TravelAlone","Pclass_1","Pclass_2","Embarked_C","Embarked_S","Embarked_Q","Sex_male"] 
x_te = final_test[cols]
#y_tr = final_train['Survived']
print(x_te)
#print(y_tr)

      Age      Fare  TravelAlone  Pclass_1  Pclass_2  Embarked_C  Embarked_S  \
0    34.5    7.8292            1         0         0           0           0   
1    47.0    7.0000            0         0         0           0           1   
2    62.0    9.6875            1         0         1           0           0   
3    27.0    8.6625            1         0         0           0           1   
4    22.0   12.2875            0         0         0           0           1   
..    ...       ...          ...       ...       ...         ...         ...   
413  28.0    8.0500            1         0         0           0           1   
414  39.0  108.9000            1         1         0           1           0   
415  38.5    7.2500            1         0         0           0           1   
416  28.0    8.0500            1         0         0           0           1   
417  28.0   22.3583            0         0         0           1           0   

     Embarked_Q  Sex_male  
0          

In [172]:
test_d = pd.read_csv("gender_submission.csv")

y_te=test_d['Survived']
print (y_te)

0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64


In [173]:
weights=np.zeros([1,len(x_tr.T)], dtype='float')

In [174]:
def sigmoid(x):
    a=np.zeros([len(x), 1], dtype='float')
    for i in range(len(x)):
        a[i, 0]=1/( 1+ (math.exp( -x[i, 0] ) ) )
    return a

In [175]:
def predict(x_tr, weights):
    a=np.dot(x_tr, weights.T)
    return sigmoid(a)

In [176]:
def cost_function(x_tr, y_tr, weights):
    costs=np.zeros([1, len(x_tr)], dtype='float')
    prediction=predict(x_tr, weights)
    sum=0
    for i in range(len(x_tr)):
        costs[0, i]+=-( (y_tr[i, 0]*math.log(prediction[i,0])) + ( (1-y_tr[i,0])*math.log(1-prediction[i,0]) ) )
        sum+=costs[0, i]
    return sum/(len(x_tr))

In [177]:
def update(x_tr, y_tr, weights, learning_rate):
    weightD=np.zeros(np.shape(x_tr), dtype=float)
    prediction=predict(x_tr, weights)#891 X 1
    for i in range(len(x_tr)):
        for j in range(len(x_tr.T)):
            weightD[i,j]=x_tr[i,j]*(prediction[i,0]-y_tr[i, 0])
    sum=np.zeros(len(weightD.T), dtype='float') #this is probably the only 1D array
    for i in range(len(x_tr.T)):
        for j in range(len(x_tr)):
            sum[i]+=weightD[j , i]
        sum[i]=sum[i]/j
    for i in range(len(weights.T)):
        weights[0,i]-=sum[i]*learning_rate
    return weights

In [182]:
def train(x_tr, y_tr, weights, learning_rate, iters):
    a=np.zeros(int(iters/10))
    for i in range(iters):
        weights=update(x_tr, y_tr, weights, learning_rate)
        if i%10 == 0:
            a[(i//10)]=cost_function(x_tr, y_tr, weights)
        if i%50 == 0:
            print(cost_function(x_tr, y_tr, weights))

train(x_tr, y_tr, weights, learning_rate=1.0e-3, iters=500)
predict(x_te, weights)
accuracy=0
for i in range(len(p)):
    if predict[i,0]<0.5:
        predict[i,0]=0
    else:
        predict[i,0]=1
    if p[i, 0] == y_te[i,0]:
        accuracy=accuracy+1

accuracy=accuracy/len(p)
print("Accuracy= ", accuracy)


KeyError: (0, 0)