In [40]:
import pandas as pd
import numpy as np

def _add_title_and_age_features(frame, age_bins, title_map):
    """Return a copy of *frame* with ``Title`` and ``AgeRange`` columns added.

    The honorific is extracted from ``Name`` (the word directly before a
    period), rare titles are merged according to *title_map*, missing ``Age``
    values are filled with the median age of the passenger's
    (``Sex``, ``Title``, ``Pclass``) group, and the age is then bucketed into
    the integer bin index defined by *age_bins*.

    Args:
        frame: Passenger data containing ``Name``, ``Sex``, ``Pclass`` and
            ``Age`` columns.
        age_bins: Monotonically increasing bin edges passed to ``pd.cut``.
        title_map: Mapping from raw title to the grouped title replacing it.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    frame = frame.copy()

    raw_titles = frame['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Assign the result instead of calling ``replace(inplace=True)`` on a
    # column selection: in-place edits of a temporary Series are not
    # guaranteed to reach the parent frame (and never do under Copy-on-Write).
    frame['Title'] = raw_titles.replace(title_map)

    # ``transform`` returns a Series aligned with the frame's own index, so
    # the fill works regardless of how ``groupby.apply`` indexes its result.
    # Rows whose group has no known age (or whose title could not be
    # extracted) keep NaN here and are handled by the pipeline's imputer.
    group_median_age = (
        frame.groupby(['Sex', 'Title', 'Pclass'])['Age'].transform('median')
    )
    frame['Age'] = frame['Age'].fillna(group_median_age)

    # ``labels=False`` yields the numeric bin index directly (NaN stays NaN).
    frame['AgeRange'] = pd.cut(frame['Age'], age_bins, labels=False)
    return frame


def titanic_model():
    """Train a random-forest survival classifier and write test predictions.

    Reads ``train.csv`` and ``test.csv`` from the working directory, adds the
    engineered ``Title`` and ``AgeRange`` features, prints 5-fold
    cross-validated accuracy (per-fold scores, mean and standard deviation),
    fits the pipeline on the full training set, and writes the predictions
    for the test set to a file named ``answerv3`` (CSV content, no index).

    Returns:
        pandas.DataFrame with the columns ``PassengerId`` and ``Survived``
        for every row of ``test.csv``.
    """
    # scikit-learn is only needed by this function, so it is imported locally.
    from sklearn.compose import ColumnTransformer
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.impute import SimpleImputer
    from sklearn.model_selection import cross_val_score
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder

    age_bins = [0, 2, 7, 13, 16, 20, 27, 40, 60, np.inf]

    # Collapse rare honorifics into a handful of broader groups.
    title_dict = {
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
        'Don': 'Royalty', 'Dona': 'Royalty', 'Sir': 'Royalty',
        'Lady': 'Royalty', 'Countess': 'Royalty', 'Jonkheer': 'Royalty',
        'Major': 'Military', 'Capt': 'Military', 'Col': 'Military',
        'Rev': 'Other', 'Dr': 'Other',
    }

    # Features are engineered separately per file, so the group medians of
    # the test set never leak into the training set (and vice versa).
    X_full = _add_title_and_age_features(
        pd.read_csv('train.csv'), age_bins, title_dict)
    X_test_full = _add_title_and_age_features(
        pd.read_csv('test.csv'), age_bins, title_dict)

    y = X_full['Survived']

    numerical_cols = ['AgeRange', 'Pclass', 'Fare', 'SibSp', 'Parch']
    categorical_cols = ['Sex', 'Embarked', 'Title']

    # Preprocessing for numerical data: fill gaps with the column median.
    numerical_transformer = SimpleImputer(strategy='median')

    # Preprocessing for categorical data: fill gaps with the most frequent
    # value, then one-hot encode; categories unseen during fit are ignored.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Bundle preprocessing for numerical and categorical data.
    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ])

    # Keep selected columns only (this also leaves the target out of X).
    my_cols = categorical_cols + numerical_cols
    X_red = X_full[my_cols].copy()
    X_test = X_test_full[my_cols].copy()

    # Original author's note: with n_estimators = 100, max_depth=6,
    # Accuracy 0.821, on test_set= 0.804
    model = RandomForestClassifier(
        n_estimators=100, max_depth=6, n_jobs=-1, random_state=0)

    # Bundle preprocessing and modeling code in a pipeline.
    my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', model)])

    # cross_val_score fits clones of the pipeline, so it neither depends on
    # nor alters the final fit below.
    scores = cross_val_score(my_pipeline, X_red, y, cv=5, scoring='accuracy')
    print('Accuracy:', scores)
    print("Accuracy mean:", scores.mean())
    print('Standard deviation:', scores.std())

    # Fit on the complete training set and predict the held-out test file.
    my_pipeline.fit(X_red, y)
    test_pred = my_pipeline.predict(X_test)

    answer = pd.DataFrame({
        'PassengerId': X_test_full['PassengerId'],
        'Survived': test_pred,
    })

    answer.to_csv('answerv3', index=False)
    return answer
# Run the whole workflow. As the last expression of the notebook cell, the
# DataFrame returned by titanic_model() is rendered as the cell's output.
titanic_model()

Accuracy: [0.82681564 0.81564246 0.83146067 0.78651685 0.84745763]
Accuracy mean: 0.8215786511534382
Standard deviation: 0.02029467844484886


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
