In [187]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.learning_curve import learning_curve
from sklearn.ensemble import BaggingRegressor

In [188]:
# Load the raw competition files. The ori_* frames are never modified below,
# so they remain available as pristine references.
ori_train_df = pd.read_csv('../input/train.csv')
ori_test_df = pd.read_csv('../input/test.csv')

# Working frames: the raw data minus the 'Ticket' column. drop() returns a new
# frame, so later edits to train_df/test_df do not touch the ori_* frames.
train_df = ori_train_df.drop(['Ticket'], axis=1)
test_df = ori_test_df.drop(['Ticket'], axis=1)
all_df = [train_df, test_df]

# Derive two helper columns from 'Name' on both frames:
#   Title    - the word that is preceded by a space and followed by a period,
#              with rare titles collapsed to 'Others' and aliases normalised.
#   LastName - everything before the first comma.
for df in all_df:
    # Raw string: in a plain literal '\.' is an invalid escape sequence, which
    # newer Python versions warn about. The pattern itself is unchanged.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
         'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
        'Others')
    # Fold spelling variants into the main titles.
    df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    df['LastName'] = df['Name'].str.split(',').str[0]

# FamMem = SibSp + Parch; Alone is 1 when that sum is exactly 0, otherwise 0.
for df in all_df:
    relatives = df['SibSp'] + df['Parch']
    df['FamMem'] = relatives
    df['Alone'] = (relatives == 0).astype('int64')

# Replace 'Cabin' with an indicator: 1 where a cabin value is present, 0 where
# it is missing. Building the column in one vectorised step gives it a real
# integer dtype, instead of an object column holding Python ints that would
# later be fed to the model as-is.
for df in all_df:
    df['Cabin'] = df['Cabin'].notnull().astype(int)

# Build 'FamSurvived' from the other passengers that share the same
# (LastName, Fare) pair across train and test combined:
#   2 - at least one other member of the group has Survived == 1
#   0 - otherwise, the smallest known Survived among the others is 0
#   1 - default (singleton group, or no Survived value known for the others)
# Rows coming from test_df presumably carry no Survived value (NaN after the
# concat); max()/min() skip NaN, so they only receive information here.
n_train = len(train_df)

# ignore_index=True is essential: train_df and test_df both start at label 0,
# so keeping the original labels would make drop(ind) and .loc[ind] below hit
# an unrelated row of the other frame that happens to share the label.
# pd.concat also replaces DataFrame.append, which was removed in pandas 2.0.
entire_df = pd.concat([train_df, test_df], ignore_index=True)
entire_df['FamSurvived'] = 1
for _, grp_df in entire_df.groupby(['LastName', 'Fare']):
    if len(grp_df) == 1:
        continue
    for ind in grp_df.index:
        # Survival outcomes of everyone in the group except this passenger.
        others = grp_df['Survived'].drop(ind)
        if others.max() == 1:
            entire_df.loc[ind, 'FamSurvived'] = 2
        elif others.min() == 0:
            entire_df.loc[ind, 'FamSurvived'] = 0
entire_df['FamSurvived'] = entire_df['FamSurvived'].astype(int)

# Split back by position using the measured train length (no hard-coded row
# count). test_df gets its 0-based index back so downstream code sees the
# same labels as before the concat.
train_df = entire_df.iloc[:n_train].copy()
test_df = entire_df.iloc[n_train:].reset_index(drop=True)

# The name and raw family-count columns have served their purpose; remove them
# from both frames and rebuild the shared list around the new objects.
all_df = [frame.drop(['Name', 'LastName', 'SibSp', 'Parch'], axis=1)
          for frame in (train_df, test_df)]
train_df, test_df = all_df

# Integer codes for the categorical columns. A value absent from its mapping
# becomes NaN and makes the astype(int) below raise.
sex_codes = {'female': 1, 'male': 0}
title_codes = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Others': 5}
port_codes = {'S': 1, 'C': 2, 'Q': 3}

for df in all_df:
    df['Sex'] = df['Sex'].map(sex_codes).astype(int)
    df['Title'] = df['Title'].map(title_codes).astype(int)
    # Missing embarkation ports are treated as 'S' before encoding.
    df['Embarked'] = df['Embarked'].fillna('S').map(port_codes).astype(int)

# Fill missing ages, separately within each frame, with the mean age of the
# passenger's (Pclass, Sex) group, rounded to the nearest half year.
for df in all_df:
    for pclass in range(1, 4):
        for sex in range(2):
            in_group = (df['Pclass'] == pclass) & (df['Sex'] == sex)
            mean_age = df.loc[in_group, 'Age'].dropna().mean()
            # int(2m + 0.5) * 0.5 rounds m to a multiple of 0.5.
            fill_value = int(2 * mean_age + 0.5) * 0.5
            df.loc[df['Age'].isnull() & in_group, 'Age'] = fill_value

# Collapse age into integer bands of 16 years each (0 for ages below 16,
# 1 for 16 up to but excluding 32, and so on).
age_band_width = 16
for df in all_df:
    df['Age'] = (df['Age'] // age_band_width).astype(int)

# Fill missing test fares with the mean test fare. Assign the result back to
# the column: calling fillna(..., inplace=True) on test_df['Fare'] operates on
# an intermediate Series and, under pandas Copy-on-Write, leaves test_df
# unchanged, so the later astype(int) on 'Fare' would fail on NaN.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

# Bucket 'Fare' into four ordinal bands with right-closed edges:
#   0: fare <= 7.91,  1: (7.91, 14.454],  2: (14.454, 31],  3: fare > 31
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
for df in all_df:
    # labels=False yields the integer position of the band for each fare.
    df['Fare'] = pd.cut(df['Fare'], bins=fare_edges, labels=False).astype(int)

# Assemble model inputs: every remaining column except the id and the label
# is a feature; the label is 'Survived' cast to int.
non_feature_cols = ['PassengerId', 'Survived']
x = train_df.drop(non_feature_cols, axis=1)
y = train_df['Survived'].astype(int)
x_test = test_df.drop(non_feature_cols, axis=1)

# Empty accumulators (not filled anywhere in the code shown here).
y_preds, classifiers = [], []

# Shared, seeded 5-fold stratified splitter for cross-validation.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

In [189]:
# Grid-search a random forest over the shared stratified folds, scoring by
# accuracy, then predict the test set with the fitted search object.
# random_state pins the forest's own randomness (feature sub-sampling), so the
# selected hyper-parameters and the predictions are reproducible across runs;
# the folds were already seeded, the estimator was not.
rf = RandomForestClassifier(random_state=7)
rf_param_grid = {
    "max_depth": [None],
    # NOTE(review): the largest value must not exceed the number of columns
    # in x - confirm if the feature set changes.
    "max_features": [1, 3, 9],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [False],
    "n_estimators": [100, 300],
    "criterion": ["gini"],
}
rf_gs = GridSearchCV(rf, param_grid=rf_param_grid, cv=kfold, scoring="accuracy", n_jobs=-1)
rf_gs.fit(x, y)
y_pred = rf_gs.predict(x_test)

In [190]:
# Two-column submission file: the test passenger ids alongside the predicted
# labels, written without the index column.
submission = pd.DataFrame({'PassengerId': test_df['PassengerId']})
submission['Survived'] = y_pred
submission.to_csv('submission.csv', index=False)