In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
pd.plotting.register_matplotlib_converters()
import seaborn as sns

# Read the train and test splits; `y` is an independent copy of the target column.
X, X_test = map(pd.read_csv, ('../input/titanic/train.csv', '../input/titanic/test.csv'))
y = X.loc[:, 'Survived'].copy()
# `Survived` is deliberately left inside X: the EDA cells below group on it.
X.head()

# Basic EDA:

In [None]:
# Dimensions of the training frame as (rows, columns); `Survived` is still one of the columns.
X.shape

In [None]:
# Summary statistics: numeric columns are printed as text, object (string)
# columns are shown as the cell's output.
print(X.describe())
X.describe(include = [object])

In [None]:
# Missing-value count per column: train is printed, test is the cell's output.
print('Train:', X.isna().sum(), sep = '\n')
print('Test:')
X_test.isna().sum()

Observations:
* The training set contains 891 rows and 12 columns (11 features plus the `Survived` target).
* The Passenger Ids consist of every value between 1 and 891 inclusive. They are sequential and not arbitrary.
* The training and test data contain missing values in the Age and Cabin columns. The training set also has missing Embarked values, and the test set has a missing Fare value.
* There are duplicate ticket and cabin values. This may help during column imputations.


# Univariate and Bivariate Analysis:

Survival by Pclass

In [None]:
# Survivors and non-survivors per passenger class, shown as one stacked bar per class.
pid_surv = X.groupby('Pclass').Survived.sum()
# Non-survivors = passengers in the class minus survivors.
# Series.rename takes just the new name here: a Series has no axis 1, so no
# `axis` argument is passed (passing axis=1 raises ValueError on current pandas).
pid_notsurv = (X.groupby('Pclass').PassengerId.count() - pid_surv).rename('Not Survived')
X_pid = pd.concat([pid_surv, pid_notsurv], axis = 1)
X_pid.plot(kind = 'bar', stacked = True)

Survival by Sex

In [None]:
# Survivors and non-survivors per sex, shown as one stacked bar per sex.
gen_surv = X.groupby('Sex')['Survived'].sum()
# Everyone in the group who is not counted as a survivor.
gen_nsurv = (X.groupby('Sex')['Survived'].count() - gen_surv).rename("Not Survived")
gen_pd = pd.concat([gen_surv, gen_nsurv], axis = 1)
gen_pd.plot(kind = "bar", stacked = True)

Survival by Age

In [None]:
# Figure 1: overall age distribution as a vertical box.
# The column is passed as `y` by keyword: the meaning of the first positional
# argument of sns.boxplot differs between seaborn releases (x vs. data), so a
# keyword keeps the vertical layout independent of the installed version.
sns.boxplot(y = X['Age'], orient = 'v')
# Figure 2: every passenger's age, split by survival outcome.
plt.figure()
sns.swarmplot(y = X.Age, x = X.Survived)
# Figure 3: age distribution per survival outcome.
plt.figure()
sns.boxplot(x = X.Survived, y = X.Age)

Survival by Fare

In [None]:
# Figure 1: overall fare distribution as a vertical box.
# The column is passed as `y` by keyword: the meaning of the first positional
# argument of sns.boxplot differs between seaborn releases (x vs. data), so a
# keyword keeps the vertical layout independent of the installed version.
sns.boxplot(y = X.Fare, orient = 'v')
# Figure 2: every passenger's fare, split by survival outcome.
plt.figure()
sns.swarmplot(x = X.Survived, y = X.Fare)
# Figure 3: fare distribution per survival outcome.
plt.figure()
sns.boxplot(x = X.Survived, y = X.Fare)

Survival by Embarked

In [None]:
# Survivors and non-survivors per embarkation port, shown as one stacked bar per port.
em_surv = X.groupby('Embarked')['Survived'].sum()
# Everyone in the group who is not counted as a survivor.
em_nsurv = (X.groupby('Embarked')['Survived'].count() - em_surv).rename("Not Survived")
X_em = pd.concat([em_surv, em_nsurv], axis = 1)
X_em.plot(kind = 'bar', stacked = True)

Observations:
* Pclass 3 contained the highest number of passengers in the dataset. It also had the lowest survival rate, whereas Pclass 1 had the highest rate of survival
* Male passengers had significantly lower survival rate than female passengers.
* From the Age vs Survived boxplot, the age of people who survived was overall lower than the age of the people who did not survive. From the swarmplot, very young children (Age < 8?) were more likely to survive than not.
* Passengers who paid a fare higher than roughly 60 were more likely to survive than not.
* Most of the passengers who had free tickets did not survive. Maybe they were crew members and hence stayed on the ship. This tells us we should not impute Fare values that are 0.
* The majority of the passengers embarked from the Southampton port.

# Feature Generation

In [None]:
# Feature engineering works on copies, so X, y and X_test keep their loaded state.
X_train, X_test1 = X.copy(), X_test.copy()
y_train = y.copy()
# Both frames are kept in one list so each feature step can loop over train and test.
tt_data = [X_train, X_test1]

Creating Feature Deck

In [None]:
def setDeck(cabin):
    """Return a deck label for a raw Cabin value.

    Missing values map to 'U'. Otherwise the first character of the cabin
    string is returned when it is one of A-F, and 'Other' for anything else.
    """
    if pd.isnull(cabin):
        return 'U'
    known_decks = ('A', 'B', 'C', 'D', 'E', 'F')
    first_char = str(cabin)[0]
    return first_char if first_char in known_decks else 'Other'
# Add the Deck column to both the train and the test frame.
for dataset in tt_data:
    dataset['Deck'] = dataset['Cabin'].map(setDeck)

# Survivors vs. non-survivors per deck (training data only), as side-by-side bars.
deck_surv = X_train.groupby('Deck')['Survived'].sum()
deck_nsurv = (X_train.groupby('Deck')['PassengerId'].count() - deck_surv).rename("Not Survived")
X_deck = pd.concat([deck_surv, deck_nsurv], axis = 1)
X_deck.plot(kind = 'bar')

Creating Feature Family Size

In [None]:
# Raw family size: siblings/spouses plus parents/children travelling with the passenger.
for dataset in tt_data:
    dataset['Fam_size'] = dataset['SibSp'] + dataset['Parch']

# Mean survival per raw family size, plotted before the counts are bucketed.
X_fam = X_train.groupby('Fam_size')['Survived'].mean()
sns.barplot(x = X_fam.index, y = X_fam)

# Bucket edges for pd.cut (right-closed by default): 0 -> single, 1-3 -> small,
# 4-6 -> medium, 7-10 -> large.
cut_points = [-1, 0, 3, 6, 10]
label_names = ['single', 'small', 'medium', 'large']
for dataset in tt_data:
    dataset['Fam_size'] = pd.cut(dataset['Fam_size'], bins = cut_points, labels = label_names)

Creating Feature Title

In [None]:
import regex as re
def setTitle(name):
    """Extract the title from a passenger name such as 'Braund, Mr. Owen Harris'.

    Returns the text between the first comma-plus-whitespace and the next
    period, e.g. 'Mr' or 'the Countess'.

    Raises:
        ValueError: if `name` does not contain that ', <title>.' pattern.
    """
    # Raw string: `\s` and `\.` are regex escapes, not Python string escapes
    # (a non-raw literal triggers an invalid-escape warning on recent Pythons).
    pat = r",\s(.*?)\."
    match = re.search(pat, name)
    if match is None:
        # Fail with the offending value instead of an opaque AttributeError on None.
        raise ValueError(f"no title found in name: {name!r}")
    return match.group(1)

# Build the Title column on both frames and collapse rare or foreign-language
# titles into the common ones.
for dataset in tt_data:
    # The result is assigned back to the column instead of calling
    # replace(inplace=True) on `dataset.Title`: an in-place call on a column
    # accessor is not guaranteed to write through to the frame (it does not
    # under pandas Copy-on-Write).
    dataset['Title'] = (
        dataset['Name'].map(setTitle)
        # Rare professional / nobility titles share a single bucket.
        .replace(to_replace = ['Dr', 'Rev', 'Col', 'Major', 'Capt', 'Jonkheer', 'Don'], value = 'Other')
        # Remaining variants are folded into Miss / Mrs / Mr.
        .replace({'Mlle':'Miss', 'Ms':'Miss', 'Mme':'Mrs', 'Sir':'Mr',
                  'Lady':'Mrs', 'Dona': 'Mrs', 'the Countess':'Mrs'
                 })
    )

# Imputing Missing Values

In [None]:
# Pairwise correlations between the numeric columns of the training frame.
plt.figure(figsize = (10, 7))
# Restrict to numeric columns explicitly: recent pandas no longer drops
# string columns in DataFrame.corr() and raises on them instead.
sns.heatmap(X_train.select_dtypes(include = 'number').corr(), annot = True)

In [None]:
# Median age for every (Pclass, Title) pair in the training data; used as the
# lookup table when filling missing ages.
age_by_pclass_title = X_train.groupby(['Pclass', 'Title'])['Age'].median()
age_by_pclass_title

# Bin Fare and Age Features
Binning continuous features can reduce overfitting in tree-based classifiers, such as the Random Forest Classifier model we are going to use.

In [None]:
# Age bucket edges and the label for each consecutive pair of edges
# (one label fewer than there are edges).
cut_points = [0, 5, 12, 18, 35, 60, 81]
label_names = ['infant', 'child', 'teenager', 'young_adult', 'adult', 'aged']
def imputeAge(row):
    """Return the row's Age, or a looked-up replacement when Age is missing.

    A missing Age is replaced by the entry of the module-level
    `age_by_pclass_title` table for the row's (Pclass, Title) pair.
    """
    age = row.Age
    if not pd.isnull(age):
        return age
    return age_by_pclass_title[row.Pclass, row.Title]
    
# Fill missing ages row by row, then replace the numeric Age column with its bucket label.
for dataset in tt_data:
    filled_age = dataset.apply(imputeAge, axis = 1)
    dataset['Age'] = pd.cut(filled_age, bins = cut_points, labels = label_names)

In [None]:
# Missing fares are filled with the training-set median, then Fare is replaced
# by its bucket label on both frames.
fare_median = X_train.Fare.median()
label_names = ['free','low','medium','high','very high']
# The first bucket (-1, 0] isolates zero fares as 'free'.
cut_points = [-1,0,37,100,170,700]
for dataset in tt_data:
    # fillna returns a new Series that is fed straight into pd.cut; calling
    # fillna(inplace=True) on `dataset.Fare` is not guaranteed to write back
    # to the frame (it does not under pandas Copy-on-Write), which would leave
    # NaN fares unbinned.
    dataset['Fare'] = pd.cut(dataset['Fare'].fillna(fare_median), cut_points, labels = label_names)

In [None]:
# Fill missing embarkation ports with 'S' on both frames.
for dataset in tt_data:
    # Assign the filled column back: an in-place fillna on `dataset['Embarked']`
    # is a chained operation that is not guaranteed to modify the frame
    # (it does not under pandas Copy-on-Write).
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

# Implement Random Forest ML model
1. Encode Categorical Data with One Hot Encoding. We do not use ordinal encoding as there is no linear ordinal relationship between the different classes in any of the features.
2. Carry out parameter tuning with GridSearchCV

In [None]:
import category_encoders as ce
# Identifier / free-text columns that are not fed to the model.
non_feature_cols = ['Ticket', 'Name', 'Cabin', 'PassengerId']
oh_enc = ce.OneHotEncoder(cols = ['Sex', 'Deck', 'Embarked', 'Title', 'Age', 'Fare', 'Fam_size', 'Pclass'], use_cat_names = True)

# Non-feature columns (and the target) are dropped *before* encoding so the
# train and test inputs have the same columns. The encoder is fitted on the
# training frame only; the test frame is transformed with that fitted encoder
# so both outputs get the same dummy columns in the same order. Re-fitting on
# the test frame would derive the columns from whatever categories happen to
# appear in the test data.
X_train2 = oh_enc.fit_transform(X_train.drop(columns = non_feature_cols + ['Survived']))
X_test2 = oh_enc.transform(X_test1.drop(columns = non_feature_cols))

# The model is fitted and applied by column position/name, so the layouts must match.
assert list(X_train2.columns) == list(X_test2.columns), "train/test feature columns differ"
X_train2.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Grid of hyper-parameters passed to GridSearchCV; each list currently holds a single value.
rf_params = dict(
    max_features = [0.2],
    n_estimators = [100],
    max_depth = [4],
    random_state = [1],
)
model = RandomForestClassifier()

# 4-fold cross-validated search scored on accuracy, fitted on the encoded training features.
grid = GridSearchCV(estimator = model, param_grid = rf_params, scoring = 'accuracy', cv = 4)
grid.fit(X_train2, y_train)

# Predict on the encoded test features and write the submission file.
pred = grid.predict(X_test2)
output = pd.DataFrame({'PassengerId': X_test1['PassengerId'], 'Survived': pred})
output.to_csv('my_submission_rf2.csv', index = False)
print("Your submission was successfully saved!")