In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
pwd

In [None]:
# Load the test set. PassengerId is left as a regular column (the set_index call stays disabled);
# the submission cell later reads test['PassengerId'].
test = pd.read_csv('Data/test.csv')
#test.set_index('PassengerId', inplace = True)
test.head()

In [None]:
# Load the training set and key it by PassengerId in a single chained expression.
train = pd.read_csv('Data/train.csv').set_index('PassengerId')
train.head()

## Explore Train Data

The aim is to explore the training dataset and identify trends and distributions.

### Survival Rate

In [None]:
#Look at survival counts (number of rows per Survived value)
train['Survived'].value_counts()

In [None]:
# Overall survival rate: the mean of the Survived column (vectorised, instead of Python's sum/len).
train['Survived'].mean()
#survival rate is 42% -- TODO confirm this figure against the value displayed by this cell

In [None]:
#Survival by class
# Select the column before aggregating: avoids groupby.apply with a Python lambda
# (slow, and deprecated when it operates on the grouping columns in recent pandas).
train.groupby('Pclass')['Survived'].mean()

In [None]:
#Survival by embarkment point
# Rows with a missing Embarked value are excluded by groupby's default dropna=True.
train.groupby('Embarked')['Survived'].mean()

#High rate in Southampton -- TODO confirm against the output of this cell

In [None]:
# Passenger count for every (Embarked, Pclass) combination.
train.groupby(['Embarked','Pclass']).size()

#High Southampton rate potentially caused by prevalence of third class passengers

In [None]:
# Number of passengers per Sex value.
train['Sex'].value_counts()

In [None]:
# Survival rate per sex (column-wise mean instead of groupby.apply with a lambda).
train.groupby('Sex')['Survived'].mean()

In [None]:
# Passenger count for every (Pclass, Sex) combination.
train.groupby(['Pclass','Sex']).size()

In [None]:
# Survival rate per SibSp value (column-wise mean instead of groupby.apply with a lambda).
train.groupby('SibSp')['Survived'].mean()


## Correlation Plot

In [None]:
#Built quick cabin flag to indicate whether they had a cabin or not, may expand to build a "no cabin", "C Floor", "B Floor"... feature
# NOTE: because the flag is derived from isna(), CabinFlag is 1 when Cabin is MISSING
# and 0 when a cabin value is recorded.
train['CabinFlag'] = train['Cabin'].isna().astype('int')

In [None]:
# Correlation heat-map. Restrict to numeric columns explicitly: DataFrame.corr() no longer
# drops string columns silently in pandas >= 2.0 and raises on them instead.
numeric_corr = train.select_dtypes(include='number').corr()
plt.matshow(numeric_corr)
# Label both axes with the column names so the matrix can be read on its own.
plt.xticks(range(len(numeric_corr.columns)), numeric_corr.columns, rotation=90)
plt.yticks(range(len(numeric_corr.columns)), numeric_corr.columns)
plt.colorbar()
plt.show()

In [None]:
# Correlation table with a colour gradient. Only numeric columns are used, because
# DataFrame.corr() raises on string columns in pandas >= 2.0.
corr = train.select_dtypes(include='number').corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Preview the training frame, which now includes the CabinFlag column.
train.head()

## Perform basic feature engineering

In [None]:
# Target vector. It shares train's PassengerId index, so positional access requires .iloc.
y = train['Survived']

In [None]:
#impute missing Embarked values with the placeholder category 'NA'
# Assign the result back: fillna(inplace=True) on train['Embarked'] is chained assignment,
# which warns in pandas 2.x and does not modify `train` under Copy-on-Write.
train['Embarked'] = train['Embarked'].fillna('NA')

In [None]:
#impute missing Embarked values with the placeholder category 'NA'
# (assigned back rather than chained inplace, which is a no-op under Copy-on-Write)
train['Embarked'] = train['Embarked'].fillna('NA')
# Cast the coded columns to str so they are later one-hot encoded as categories.
train['Pclass'] = train['Pclass'].astype('str')
train['CabinFlag'] = train['CabinFlag'].astype('str')
# Family size = siblings/spouses + parents/children + the passenger.
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1

In [None]:
# First character of the Cabin string; 'NA' when Cabin is missing.
# The fill is chained onto the expression instead of a chained inplace fillna,
# which does not modify `train` under Copy-on-Write.
train['CabinLt'] = train['Cabin'].str[0].fillna('NA')
train.head()

In [None]:
# Columns to one-hot encode; this list is reused for the test set further down.
# Pclass and CabinFlag were cast to str above so get_dummies encodes them as categories.
categoricals = ['Pclass', 'Sex', 'Embarked', 'CabinFlag', 'CabinLt']
trainCat = pd.get_dummies(train[categoricals])
trainCat.head()

In [None]:
#need to impute fares and age: use the median per class, embarkment point and sex
impute_keys = ['Pclass', 'Embarked', 'Sex']
for impute_col in ('Fare', 'Age'):
    # Median of the passenger's own (Pclass, Embarked, Sex) group, aligned to train's rows.
    group_median = train.groupby(impute_keys)[impute_col].transform('median')
    # Only missing entries are filled. Groups with no observed value (or a missing key)
    # fall back to the overall median so no NaN reaches the model.
    train[impute_col] = train[impute_col].fillna(group_median).fillna(train[impute_col].median())
train.head()

In [None]:
# Numeric feature columns; this list is reused when assembling X and the test features.
numericals = ['SibSp', 'Parch', 'Fare', 'Age', 'FamilySize']
# Numeric slice of train, shown here for inspection.
trainNum = train[numericals]
trainNum.head()

In [None]:
# Sanity check: count missing vs. present Embarked values after the 'NA' fill.
train['Embarked'].isna().value_counts()

In [None]:
# Count missing vs. present Age values in the test set (not imputed at this point).
test['Age'].isna().value_counts()

In [None]:
# Mean Age in the test set (missing values are skipped).
test['Age'].mean()

In [None]:
# Mean Age in the training set, for comparison with the test set.
train['Age'].mean()

In [None]:
# Start the feature matrix from a COPY of the dummies, so that adding the numeric
# columns to X afterwards does not also modify trainCat.
X = trainCat.copy()


In [None]:
# Append the numeric features; rows are aligned on the shared PassengerId index.
X[numericals] = train[numericals]

In [None]:
# Preview the assembled feature matrix (dummies + numeric columns).
X.head()

## Quick GBM model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score


In [None]:
# 10-fold splitter. shuffle is left at its default, so rows are not shuffled before splitting.
kf = KFold(n_splits=10)
# Displays the number of splits (10); X is only passed for API compatibility.
kf.get_n_splits(X)

In [None]:
# Show the splitter's configuration.
kf

In [None]:
# Gradient-boosting classifier with a fixed random_state for repeatable fits.
gbm = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    random_state=1,
)
#Best: 0.77914, using {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

In [None]:
# Collects one accuracy score per cross-validation fold.
accuracy_model = list()

In [None]:
# True if any NaN remains anywhere in the feature matrix.
X.isnull().values.any()

In [None]:
# K-fold cross-validation of the GBM; each fold's accuracy (in percent) goes into accuracy_model.
accuracy_model = []  # reset so re-running this cell does not append duplicate folds
for train_index, test_index in kf.split(X):
    # kf.split yields POSITIONAL row numbers, so X and y must both be sliced with .iloc.
    # y is indexed by PassengerId, so plain y[train_index] is a label lookup that pairs
    # feature rows with the wrong passengers (or raises KeyError for an absent label).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # Train the model on this fold's training split
    model = gbm.fit(X_train, y_train)
    # Append to accuracy_model the accuracy of the model on the held-out split
    accuracy_model.append(accuracy_score(y_test, model.predict(X_test), normalize=True)*100)

In [None]:
# Per-fold accuracies, in percent.
print(accuracy_model)

In [None]:
# Same cabin flag as built for train: 1 when Cabin is MISSING, 0 when a cabin is recorded.
test['CabinFlag'] = test['Cabin'].isna().astype('int')

In [None]:
#impute missing Embarked values with the placeholder category 'NA'
# (assigned back rather than chained inplace, which is a no-op under Copy-on-Write)
test['Embarked'] = test['Embarked'].fillna('NA')
# Cast the coded columns to str so they are later one-hot encoded as categories.
test['Pclass'] = test['Pclass'].astype('str')
test['CabinFlag'] = test['CabinFlag'].astype('str')
# Family size = siblings/spouses + parents/children + the passenger.
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

In [None]:
# First character of the Cabin string; 'NA' when Cabin is missing.
# The fill is chained onto the expression instead of a chained inplace fillna,
# which does not modify `test` under Copy-on-Write.
test['CabinLt'] = test['Cabin'].str[0].fillna('NA')

In [None]:
#need to impute fares and age: use the median per class, embarkment point and sex
impute_keys = ['Pclass', 'Embarked', 'Sex']
for impute_col in ('Fare', 'Age'):
    # Median of the passenger's own (Pclass, Embarked, Sex) group, aligned to test's rows.
    group_median = test.groupby(impute_keys)[impute_col].transform('median')
    # Only missing entries are filled. Groups with no observed value (or a missing key)
    # fall back to the overall median so no NaN reaches the model.
    test[impute_col] = test[impute_col].fillna(group_median).fillna(test[impute_col].median())
test.head()

In [None]:
# One-hot encode the test categoricals. The resulting columns depend on the categories
# present in test, so they are not guaranteed to match the training dummies.
testCat = pd.get_dummies(test[categoricals])
testCat.head()

In [None]:
# Assemble the test feature matrix from a copy of the dummies (testCat is left untouched).
XTest = testCat.copy()
XTest[numericals] = test[numericals]
# Train and test dummies were built independently, so a category seen in only one of them
# yields mismatched columns. Align to the training layout: missing dummy columns are added
# as 0, test-only columns are dropped, and the column order matches X.
XTest = XTest.reindex(columns=X.columns, fill_value=0)
XTest.head()

In [None]:
# Inspect test's index (PassengerId was not set as the index for this frame).
test.index

In [None]:
# Refit on the full training set before predicting: after the cross-validation loop,
# `model` only holds the fit from the last fold's training split.
model = model.fit(X, y)
predictions = model.predict(XTest)


In [None]:
# Number of test rows, to compare with the number of predictions.
test.index.shape

In [None]:
# Number of predictions; should match the number of test rows.
predictions.shape

In [None]:
# Display the predicted Survived labels.
predictions

In [None]:
# Build the submission frame in one step. The predictions array is paired with PassengerId
# by position, so the result does not depend on how test happens to be indexed.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions,
})

In [None]:
# Shape of the test frame, after the engineered columns were added.
test.shape

In [None]:
# Write the submission file; index=False keeps the index out of the CSV.
submission.to_csv('Data/submission.csv', index=False)

In [None]:
# Distribution of predicted labels in the submission.
submission['Survived'].value_counts()

## Trial grid search

In [None]:
from sklearn.model_selection import GridSearchCV, KFold

In [None]:
# Rebinds `gbm` to a default-parameter classifier to serve as the grid-search base estimator.
# NOTE(review): no random_state is set here, so fits are not seeded -- confirm this is intended.
gbm = GradientBoostingClassifier()

In [None]:
# Hyper-parameter grid for the search (2 x 3 x 3 = 18 combinations).
gridparams = {
    'learning_rate': [0.1, 0.3],
    'n_estimators': [100, 300, 1000],
    'max_depth': [3, 5, 7],
}
#Best: 0.77914, using {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

In [None]:
# Random seed for the grid-search experiments (not referenced by the cells shown here).
seed = 1

In [None]:
#grid = GridSearchCV(estimator=gbm, param_grid=gridparams,
#                    cv=KFold(n_splits=10), verbose=10)
#
#grid_results = grid.fit(X, y)
#
## Summarize the results in a readable format
#print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))