In [1]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


def train_set_missing_ages_model(df):
    """Fit a support-vector regressor that predicts Age from other columns.

    Only the rows of ``df`` whose Age is present are used for fitting. The
    predictors are, in this order: Fare, Parch, SibSp, Pclass.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Age, Fare, Parch, SibSp and Pclass.

    Returns
    -------
    The fitted ``SVR(C=1.0, epsilon=0.2)`` instance.

    Notes
    -----
    The model is fit on every labelled row; no hold-out evaluation is run
    here (a train_test_split / mean_absolute_error check can be done by the
    caller if an error estimate is wanted).
    """
    target_column = 'Age'
    predictor_columns = ['Fare', 'Parch', 'SibSp', 'Pclass']

    # Rows with a recorded age, as one array with the target in column 0.
    has_age = df[target_column].notnull()
    labelled = df.loc[has_age, [target_column] + predictor_columns].values

    ages = labelled[:, 0]
    predictors = labelled[:, 1:]

    return SVR(C=1.0, epsilon=0.2).fit(predictors, ages)


def set_missing_ages(df, regressor):
    """Fill missing Age values in ``df`` in place using ``regressor``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Age, Fare, Parch, SibSp and Pclass. The Age
        column is modified in place.
    regressor : object with a ``predict(X)`` method
        Receives one row per missing age with the columns Fare, Parch, SibSp,
        Pclass (in that order) and must return one prediction per row.

    Returns
    -------
    None

    Notes
    -----
    When no Age is missing the function returns without calling the
    regressor: predicting on a zero-row array is an error for scikit-learn
    estimators, and there is nothing to fill anyway.
    """
    missing_age = df['Age'].isnull()
    if not missing_age.any():
        return

    # Same column selection as used for training; column 0 (Age) is dropped
    # before prediction.
    rows = df.loc[missing_age, ['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']].values
    df.loc[missing_age, 'Age'] = regressor.predict(rows[:, 1:])


def set_cabin_type(df):
    """Replace the Cabin column of ``df`` in place with a 'YES'/'NO' flag.

    A row becomes 'YES' when it has a cabin value and 'NO' when the value is
    missing.

    The function is safe to call more than once on the same frame (e.g. when
    a notebook cell is re-run): a value that is already the flag 'NO' is
    treated as "no cabin" rather than as a present cabin, so a second call
    leaves the column unchanged instead of turning every row into 'YES'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a Cabin column; it is modified in place.

    Returns
    -------
    None
    """
    cabin = df['Cabin']
    # Compute the mask once, before any assignment changes the column.
    has_cabin = cabin.notnull() & (cabin != 'NO')
    df.loc[has_cabin, 'Cabin'] = 'YES'
    df.loc[~has_cabin, 'Cabin'] = 'NO'


def fit_age_fare_scaler(df):
    """Fit a ``StandardScaler`` on the Age and Fare columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Age and Fare.

    Returns
    -------
    The fitted scaler, so the same transformation can be applied to other
    frames later.
    """
    numeric_columns = df[['Age', 'Fare']]
    # fit() returns the estimator itself, so it can be returned directly.
    return StandardScaler().fit(numeric_columns)


def scale_features(df, scaler):
    """Add scaled copies of Age and Fare to ``df`` in place.

    ``scaler.transform`` is applied to the two columns (Age first, Fare
    second) and the result is stored in the new columns ``Age_scaled`` and
    ``Fare_scaled``. The original Age and Fare columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Age and Fare; gains two columns.
    scaler : object with a ``transform(X)`` method
        Expected to return a 2-D array with one column per input column.

    Returns
    -------
    The ``scaler`` that was passed in.
    """
    standardized = scaler.transform(df[['Age', 'Fare']])
    for position, target in enumerate(('Age_scaled', 'Fare_scaled')):
        df[target] = standardized[:, position]
    return scaler


def dummy_features(df):
    """Return a copy of ``df`` with categorical columns one-hot encoded.

    Cabin, Embarked, Sex and Pclass are each expanded with
    ``pd.get_dummies`` (prefix = column name) and appended to the frame in
    that order. The source columns, together with Name, Ticket, Age and
    Fare, are then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain Pclass, Name, Sex, Ticket, Cabin, Embarked, Age and
        Fare. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining original columns followed by the
        indicator columns.
    """
    indicator_frames = [
        pd.get_dummies(df[column], prefix=column)
        for column in ('Cabin', 'Embarked', 'Sex', 'Pclass')
    ]
    widened = pd.concat([df] + indicator_frames, axis=1)

    obsolete = ['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age', 'Fare']
    return widened.drop(columns=obsolete)

    
# Status marker: shows that the helper definitions in this cell were executed.
print('done')

done


In [2]:
# Load the training set from the local input directory.
all_data = pd.read_csv('./input/train.csv')

# Fit the age-imputation regressor on this frame, then use it to fill the
# missing Age values in place. The regressor is kept in a global so a later
# cell can apply it to another frame.
set_age_regressor = train_set_missing_ages_model(all_data)
set_missing_ages(all_data, set_age_regressor)
# Turn Cabin into a YES/NO flag (in place).
set_cabin_type(all_data)

# Fit the Age/Fare scaler on this frame and add the Age_scaled / Fare_scaled
# columns in place. The fitted scaler is kept for reuse as well.
scaler = fit_age_fare_scaler(all_data)
scale_features(all_data, scaler)

# One-hot encode the categorical columns; this returns a new frame, so the
# name all_data is rebound to the encoded version from here on.
all_data = dummy_features(all_data)

# Preview of the engineered frame (rendered as the cell output).
all_data.head()
# print('done')

Unnamed: 0,PassengerId,Survived,SibSp,Parch,Age_scaled,Fare_scaled,Cabin_NO,Cabin_YES,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3
0,1,0,1,0,-0.56014,-0.502445,1,0,0,0,1,0,1,0,0,1
1,2,1,1,0,0.663198,0.786845,0,1,1,0,0,1,0,1,0,0
2,3,1,0,0,-0.254305,-0.488854,1,0,0,0,1,1,0,0,0,1
3,4,1,1,0,0.433822,0.42073,0,1,0,0,1,1,0,1,0,0
4,5,0,0,0,0.433822,-0.486337,1,0,0,0,1,0,1,0,0,1


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Keep the label plus the engineered feature columns. DataFrame.filter keeps
# the frame's column order, and 'Survived' is the first matching column, so
# it ends up in position 0 of the selection.
train_data = all_data.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
# Cast explicitly to float. When the dummy columns are bool (the get_dummies
# default in recent pandas) a plain .values on the mixed bool/int/float frame
# is an object array, and an object-dtype label vector is not accepted by the
# classifier. With integer dummies the cast changes nothing.
train_np = train_data.values.astype(float)

X = train_np[:, 1:]
y = train_np[:, 0]

# Hold-out split used only by the optional evaluation at the end of the cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# The classifier used for the submission is trained on all labelled rows.
clf = RandomForestClassifier(max_depth=4, random_state=42)
clf.fit(X, y)


# Optional hold-out evaluation (refits clf on the training split only):
# clf.fit(X_train, y_train)
# pred = clf.predict(X_test)
# accuracy_score(y_test, pred)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [4]:
import numpy as np

# Apply to the test set the same preprocessing that was applied to the
# training set, reusing the age regressor and the scaler fitted there.
submission_data = pd.read_csv('./input/test.csv')
set_missing_ages(submission_data, set_age_regressor)
set_cabin_type(submission_data)

scale_features(submission_data, scaler)

submission_data = dummy_features(submission_data)

submission_df = submission_data.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
# Align the feature matrix with the one the classifier was trained on
# (train_data without its leading 'Survived' column). One-hot encoding is
# done per file, so a category absent from test.csv would otherwise drop a
# column and shift every later feature; such columns are added as all-zero,
# and any column the classifier never saw is discarded.
feature_columns = train_data.columns[1:]
submission_df = submission_df.reindex(columns=feature_columns, fill_value=0)
# Explicit float cast for the same reason as in training: bool dummy columns
# mixed with numeric ones would otherwise give an object array.
submission_features = submission_df.values.astype(float)

pred = clf.predict(submission_features)
result = pd.DataFrame({'PassengerId': submission_data.PassengerId.values, 'Survived': pred.astype(np.int32)})
result.to_csv('./submission.csv', index=False)
print('done')

done
