In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
from pathlib import Path

# Directory holding the Kaggle Titanic CSVs. Set the TITANIC_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset, the original location is used, so existing runs behave the same.
DATA_DIR = Path(
    os.environ.get(
        "TITANIC_DATA_DIR",
        "/Users/rahulanil/garchomp/projects/kaggle/titanic/data",
    )
)

# Raw competition files: the labelled training set and the unlabelled test set.
train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")


In [3]:
import titanic_preprocessing as tp

# Keep the test-set PassengerId column in its own variable before the
# preprocessing call below rebinds `test_df`; the ids are needed later to
# build the submission file.
test_df_passengeId = test_df["PassengerId"]
# Run the project-local preprocessing helper on both frames and rebind the
# same names to its results. Because inputs and outputs share names,
# re-running this cell without reloading the CSVs passes already-processed
# frames back into the helper.
# NOTE(review): what the helper adds or removes is not visible here -- confirm
# in titanic_preprocessing.
train_df, test_df = tp.generic_perprocessing(train_df, test_df)


In [4]:
# Target vector: the "Survived" label column.
y = train_df["Survived"]
# Feature frame: every remaining column, in its original order.
X = train_df.drop(columns="Survived")

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the four categorical columns; these encoded columns are the
# only features passed on to the model.
#
# Dense output is requested with `sparse_threshold=0` on the ColumnTransformer
# rather than `OneHotEncoder(sparse=False)`: the `sparse` keyword was
# deprecated in scikit-learn 1.2 and later removed, while `sparse_threshold`
# is accepted by both old and new releases.
ct = ColumnTransformer(
    [("onehot", OneHotEncoder(), ["Pclass", "Sex", "Embarked", "Initials"])],
    sparse_threshold=0,
)

# Learn the categories from the training features and encode them in one step.
X_ct = ct.fit_transform(X)
print(f"X_ct shape: {X_ct.shape}")

# Encode the test set with the categories learned from the training data.
test = ct.transform(test_df)
print(f"X_test shape: {test.shape}")

X_ct shape: (889, 16)
X_test shape: (418, 16)


In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Hyper-parameter grid for the gradient-boosting classifier:
# 5 depths x 19 ensemble sizes x 4 learning rates x 8 split thresholds
# = 3040 candidates, each evaluated with 5-fold cross-validation.
param_grid = {
    "max_depth": list(range(1, 6)),
    "n_estimators": list(range(10, 200, 10)),
    "learning_rate": [0.001, 0.01, 0.1, 1.0],
    "min_samples_split": list(range(2, 10)),
}

# The search is only configured here; fitting happens in a later cell.
grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=0),
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)


In [7]:
from sklearn.model_selection import train_test_split

# Carve a hold-out set out of the encoded training data (default split size,
# fixed seed) and run the grid search on the remaining rows only.
X_train, X_test, y_train, y_test = train_test_split(X_ct, y, random_state=0)
grid_search.fit(X_train, y_train)

# Report the hold-out score, the winning parameter combination and its mean
# cross-validation score, one per line.
print(
    f"test scores: {grid_search.score(X_test, y_test)}",
    f"Best parametesrs: {grid_search.best_params_}",
    f"best cross validation score: {grid_search.best_score_}",
    sep="\n",
)

test scores: 0.7757847533632287
Best parametesrs: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 150}
best cross validation score: 0.8423409269442261


In [8]:
# Tabulate the grid search's cross-validation results as a DataFrame for
# inspection.
grid_search_results = pd.DataFrame(data=grid_search.cv_results_)

In [10]:
best_params = grid_search.best_params_
print(best_params)

# Refit on the full encoded training set with the selected hyper-parameters.
# The seed matches the estimator used inside the grid search, so the
# submission is reproducible from run to run.
clf = GradientBoostingClassifier(random_state=0, **best_params)
clf.fit(X_ct, y)

# Predictions get their own name instead of overwriting `y_test`, which holds
# the hold-out labels from the train/test split. The Series is built on the
# PassengerId index so the column-wise concat below lines up row for row and
# a length mismatch fails loudly instead of producing NaNs.
test_predictions = pd.Series(
    clf.predict(test), index=test_df_passengeId.index, name="Survived"
).astype(int)

# Two-column frame (PassengerId, Survived), written to GBC.csv.
result = pd.concat([test_df_passengeId, test_predictions], axis=1)
result.to_csv("GBC.csv", index=False)
display(result)

{'learning_rate': 0.01, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 150}


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
