In [1]:
import os
import pandas as pd
import numpy as np

# Import files
folder = 'titanic'
file_train = 'train.csv'
file_test = 'test.csv'

df_train_raw = pd.read_csv(os.path.join(os.getcwd(),folder,file_train))
df_test_raw = pd.read_csv(os.path.join(os.getcwd(),folder,file_test))
df_train_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Prepare train data
df_train = df_train_raw.drop(['PassengerId','Name','Ticket','Cabin','Embarked','Sex'],axis=1)

from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
SexCat = df_train_raw[['Sex']]
SexCat_encoded = ordinal_encoder.fit_transform(SexCat)
df_train['SexCat'] = SexCat_encoded

SurvivedCat = df_train_raw[['Survived']]
SurvivedCat_encoded = ordinal_encoder.fit_transform(SurvivedCat)
df_train['SurvivedCat'] = SurvivedCat_encoded

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
imputed_train_array = imputer.fit_transform(df_train)
df_train_imputed = pd.DataFrame(imputed_train_array,columns=df_train.columns,index=df_train.index)
print('# NaN:')
print(df_train_imputed.isnull().sum())

AgeCat = pd.cut(df_train_imputed['Age'],bins=[0,10,np.inf],labels = [0,1])
df_train_imputed['AgeCatHelp'] = AgeCat
AgeCat = df_train_imputed[['AgeCatHelp']]
AgeCat_encoded = ordinal_encoder.fit_transform(AgeCat)
df_train_imputed['AgeCat'] = AgeCat_encoded
df_train_imputed = df_train_imputed.drop(['Age','AgeCatHelp','Survived'],axis=1)

from sklearn.preprocessing import MinMaxScaler
logScaler = MinMaxScaler()
df_train_imputed = pd.DataFrame(logScaler.fit_transform(df_train_imputed),columns=df_train_imputed.columns,index=df_train_imputed.index)

# NaN:
Survived       0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
SexCat         0
SurvivedCat    0
dtype: int64


In [7]:
# Train model (different trials)
X = df_train_imputed.drop(['SurvivedCat'],axis=1)
y = df_train_imputed['SurvivedCat']

from sklearn.model_selection import cross_val_score

def display_scores(scores):
    print('Scores: ', scores)
    print('Mean: ', scores.mean())
    print('Standard deviation: ', scores.std())

# Try random forest model
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
forest_reg.fit(X,y)

rf_rmse_scores = -cross_val_score(forest_reg, X, y, scoring = "neg_mean_absolute_error", cv=3)
print('\n Random Forest (Mean Absolute Error):')
display_scores(rf_rmse_scores)

# Try logistic model
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X,y)

log_reg_scores = -cross_val_score(log_reg, X, y, scoring = "neg_mean_absolute_error", cv=3)
print('\n Logistic Regression (Mean Absolute Error):')
display_scores(log_reg_scores)


 Random Forest (Mean Absolute Error):
Scores:  [0.2738081  0.24468772 0.24698441]
Mean:  0.2551600792592512
Standard deviation:  0.01321943727884219

 Logistic Regression (Mean Absolute Error):
Scores:  [0.1986532  0.18518519 0.21548822]
Mean:  0.19977553310886645
Standard deviation:  0.01239658924487908


In [4]:
# Prepare test data
df_test = df_test_raw.drop(['PassengerId','Name','Ticket','Cabin','Embarked','Sex'],axis=1)

SexCat = df_test_raw[['Sex']]
SexCat_encoded = ordinal_encoder.fit_transform(SexCat)
df_test['SexCat'] = SexCat_encoded

imputed_test_array = imputer.fit_transform(df_test)
df_test_imputed = pd.DataFrame(imputed_test_array,columns=df_test.columns,index=df_test.index)
print('# NaN:')
print(df_test_imputed.isnull().sum())

AgeCat = pd.cut(df_test_imputed['Age'],bins=[0,10,np.inf],labels = [0,1])
df_test_imputed['AgeCatHelp'] = AgeCat
AgeCat = df_test_imputed[['AgeCatHelp']]
AgeCat_encoded = ordinal_encoder.fit_transform(AgeCat)
df_test_imputed['AgeCat'] = AgeCat_encoded
df_test_imputed = df_test_imputed.drop(['Age','AgeCatHelp'],axis=1)

df_test_imputed = pd.DataFrame(logScaler.fit_transform(df_test_imputed),columns=df_test_imputed.columns,index=df_test_imputed.index)
df_test_imputed.head()

# NaN:
Pclass    0
Age       0
SibSp     0
Parch     0
Fare      0
SexCat    0
dtype: int64


Unnamed: 0,Pclass,SibSp,Parch,Fare,SexCat,AgeCat
0,1.0,0.0,0.0,0.015282,1.0,1.0
1,1.0,0.125,0.0,0.013663,0.0,1.0
2,0.5,0.0,0.0,0.018909,1.0,1.0
3,1.0,0.0,0.0,0.016908,1.0,1.0
4,1.0,0.125,0.111111,0.023984,0.0,1.0


In [5]:
# Predict data
X_test = df_test_imputed
y_test_rf = forest_reg.predict(X_test)
y_test_log = log_reg.predict(X_test)

In [6]:
# Output data
output = pd.DataFrame({'PassengerId': df_test_raw.PassengerId, 'Survived': np.round(y_test_rf).astype(int)})
output.to_csv('my_submission_rf.csv', index=False)

output_log = pd.DataFrame({'PassengerId': df_test_raw.PassengerId, 'Survived': y_test_log.astype(int)})
output_log.to_csv('my_submission_log.csv', index=False)
print("Script finished")

Script finished
