In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
test = pd.read_csv('Data/test.csv')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,100000,3,"Holliday, Daniel",male,19.0,0,0,24745,63.01,,S
1,100001,3,"Nguyen, Lorraine",female,53.0,0,0,13264,5.81,,S
2,100002,1,"Harris, Heather",female,19.0,0,0,25990,38.91,B15315,C
3,100003,2,"Larsen, Eric",male,25.0,0,0,314011,12.93,,S
4,100004,1,"Cleary, Sarah",female,17.0,0,2,26203,26.89,B22515,C


In [3]:
train = pd.read_csv('Data/train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S
4,4,1,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S


In [4]:
#impute Embarked NA columns
train['Embarked'].fillna('NA', inplace = True)
train['CabinFlag'] = train['Cabin'].isna().astype('int')

test['Embarked'].fillna('NA', inplace = True)
test['CabinFlag'] = test['Cabin'].isna().astype('int')

In [5]:
train['Embarked'].fillna('NA', inplace = True)
train['Pclass'] = train['Pclass'].astype('str')
train['CabinFlag'] = train['CabinFlag'].astype('str')
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
train['CabinLt'] = train['Cabin'].str[0]
train['CabinLt'].fillna('NA', inplace = True)

In [6]:
test['Embarked'].fillna('NA', inplace = True)
test['Pclass'] = test['Pclass'].astype('str')
test['CabinFlag'] = test['CabinFlag'].astype('str')
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1
test['CabinLt'] = test['Cabin'].str[0]
test['CabinLt'].fillna('NA', inplace = True)

In [7]:
#need to imput fares and age, going to imput average per class, per embarkment point
train['Fare'] = train.groupby(['Pclass','Embarked', 'Sex'])['Fare'].transform(lambda x: x.fillna(x.median()))
train['Age'] = train.groupby(['Pclass','Embarked', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))

test['Fare'] = test.groupby(['Pclass','Embarked', 'Sex'])['Fare'].transform(lambda x: x.fillna(x.median()))
test['Age'] = test.groupby(['Pclass','Embarked', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))

In [8]:
categoricals = ['Pclass', 'Sex', 'Embarked', 'CabinFlag', 'CabinLt']
trainCat = pd.get_dummies(train[categoricals])
testCat = pd.get_dummies(test[categoricals])

In [9]:
numericals = ['SibSp', 'Parch', 'Fare', 'Age', 'FamilySize']
trainNum = train[numericals]
testNum = test[numericals]

In [10]:
trainNum

Unnamed: 0,SibSp,Parch,Fare,Age,FamilySize
0,2,0,27.14,42.50,3
1,0,0,13.35,32.00,1
2,1,2,71.29,0.33,4
3,0,0,13.04,19.00,1
4,0,0,7.76,25.00,1
...,...,...,...,...,...
99995,0,0,14.86,62.00,1
99996,0,0,11.15,66.00,1
99997,0,0,9.95,37.00,1
99998,0,1,30.92,51.00,2


In [11]:
x = trainNum.values #returns a numpy array
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
pd.DataFrame(x_scaled, columns = trainNum.columns)

Unnamed: 0,SibSp,Parch,Fare,Age,FamilySize
0,0.250,0.000000,0.035565,0.488035,0.117647
1,0.000,0.000000,0.017030,0.367234,0.000000
2,0.125,0.222222,0.094908,0.002876,0.176471
3,0.000,0.000000,0.016613,0.217671,0.000000
4,0.000,0.000000,0.009516,0.286700,0.000000
...,...,...,...,...,...
99995,0.000,0.000000,0.019060,0.712379,0.000000
99996,0.000,0.000000,0.014073,0.758399,0.000000
99997,0.000,0.000000,0.012460,0.424758,0.000000
99998,0.000,0.111111,0.040646,0.585826,0.058824


In [12]:
nums = trainNum.append(testNum).values
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(nums)
nums_f = pd.DataFrame(x_scaled, columns = trainNum.columns)

In [13]:
trainNumNorm = nums_f[0:100000]
testNumNorm = nums_f[100000::]

In [14]:
X = trainCat
X[numericals] = trainNumNorm

XTest = testCat
XTest[numericals] = testNumNorm

In [15]:
kf = KFold(n_splits=10)
kf.get_n_splits(X)

10

In [16]:
gbm = GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=1, learning_rate = 0.1)
#Best: 0.77914, using {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

In [17]:
accuracy_model = []

In [18]:
X.isnull().values.any()

False

In [19]:
for train_index, test_index in kf.split(X):
    # Split train-test
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train the model
    model = gbm.fit(X_train, y_train)
    # Append to accuracy_model the accuracy of the model
    accuracy_model.append(accuracy_score(y_test, model.predict(X_test), normalize=True)*100)

NameError: name 'y' is not defined

In [None]:
predictions = model.predict(XTest)

In [None]:
submission = pd.DataFrame()
submission['PassengerId'] = test['PassengerId']
submission['Survived'] = pd.DataFrame(predictions)

In [None]:
submission.to_csv('Data/submission.csv', index=False)