In [142]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Train data preprocessing

In [143]:
df_train = pd.read_csv('titanic_train.csv')

In [144]:
def impute_age_manual(cols):  # add missing values to age manually with this function (try with imputer too)
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):
        if Pclass == 1:
            return 37
        elif Pclass == 2:
            return 29
        else:
            return 24
    else:
        return Age
    
df_train['Age'] = df_train[['Age', 'Pclass']].apply(impute_age_manual,axis=1)

In [145]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [146]:
df_train.drop(['Sex','Embarked','Name','Ticket','Pclass','Cabin'],axis=1,inplace=True) 
df_train.dropna(inplace=True)
df_train.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare
0,1,0,22.0,1,0,7.25
1,2,1,38.0,1,0,71.2833
2,3,1,26.0,0,0,7.925
3,4,1,35.0,1,0,53.1
4,5,0,35.0,0,0,8.05


## Test data preprocessing

In [147]:
df_test = pd.read_csv('titanic_test.csv')

In [148]:
def impute_age_manual2(cols):  # add missing values to age manually with this function (try with imputer too)
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):
        if Pclass == 1:
            return 37
        elif Pclass == 2:
            return 29
        else:
            return 24
    else:
        return Age
    
df_test['Age'] = df_test[['Age', 'Pclass']].apply(impute_age_manual2,axis=1)

In [149]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [150]:
df_test.drop(['Sex','Embarked','Name','Ticket','Pclass','Cabin'],axis=1,inplace=True) 
df_test.dropna(inplace=True)
df_test.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare
0,892,34.5,0,0,7.8292
1,893,47.0,1,0,7.0
2,894,62.0,0,0,9.6875
3,895,27.0,0,0,8.6625
4,896,22.0,1,1,12.2875


In [151]:
X_train = df_train.drop('Survived',axis=1)
y_train = df_train['Survived']

## Log Regression

In [152]:
from sklearn.linear_model import LogisticRegression

In [153]:
logmodel = LogisticRegression()

In [154]:
logmodel.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [155]:
df_test.columns

Index(['PassengerId', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [156]:
X_test = df_test[['PassengerId', 'Age', 'SibSp', 'Parch', 'Fare']]
predictions = logmodel.predict(X_test)

In [157]:
pred_data = pd.DataFrame(predictions)

In [158]:
df_test = pd.concat([df_test,pred_data],axis=1) 

In [159]:
df_test.drop(['Age', 'SibSp', 'Parch', 'Fare'],axis=1,inplace=True) 

In [160]:
df_test.columns = ['PassengerId', 'Survived']
df_test.head()

Unnamed: 0,PassengerId,Survived
0,892.0,0.0
1,893.0,0.0
2,894.0,0.0
3,895.0,0.0
4,896.0,0.0


In [161]:
df_test.to_csv('Titanic_log_res.csv')

## Random Forest

In [165]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)

In [167]:
rfc_pred_data = pd.DataFrame(rfc_predict)
df_test = pd.concat([df_test,rfc_pred_data],axis=1) 
df_test.head()

Unnamed: 0,PassengerId,Survived,0
0,892.0,0.0,0.0
1,893.0,0.0,0.0
2,894.0,0.0,0.0
3,895.0,0.0,0.0
4,896.0,0.0,0.0


In [168]:
df_test.drop(['Survived'],axis=1,inplace=True)
df_test.columns = ['PassengerId', 'Survived']
df_test.head()

Unnamed: 0,PassengerId,Survived
0,892.0,0.0
1,893.0,0.0
2,894.0,0.0
3,895.0,0.0
4,896.0,0.0


In [169]:
#np.savetxt('output_rand_forest.csv', rfc_predict, delimiter=',')
df_test.to_csv('Titanic_rnd_forest.csv')