In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

%matplotlib inline

RANDOM_STATE = 42

In [15]:
# Load the Kaggle Titanic train/test splits.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs as-is where this directory exists.
train_csv = "D:/Titanic/titanic/train.csv"
test_csv = "D:/Titanic/titanic/test.csv"

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [16]:
# Columns that are not used as model features.
# The list is defined once and applied to both frames so they cannot drift apart.
unused_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']

# Re-assign instead of mutating in place, and ignore already-missing columns,
# so re-running this cell does not raise KeyError after the first execution.
df_train = df_train.drop(columns=unused_cols, errors='ignore')
df_test = df_test.drop(columns=unused_cols, errors='ignore')

In [17]:
# Count missing values per column in the training frame.
df_train.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [18]:
# Count missing values per column in the test frame.
df_test.isna().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [19]:
# Impute missing ages.
# The statistic is computed on the training data only and reused for the test
# set, so both frames are filled with the same value and nothing is learned
# from the test distribution.
train_age_mean = df_train['Age'].mean()

# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is deprecated in recent pandas and does
# not modify the frame when Copy-on-Write is enabled.
df_train['Age'] = df_train['Age'].fillna(train_age_mean)
df_test['Age'] = df_test['Age'].fillna(train_age_mean)

# Preview only: the dropna() result is not assigned, so the remaining missing
# values (Embarked in train, Fare in test) are still present in the frames.
df_test.dropna().head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [20]:
one_hot_cols = ['Embarked']


def encode_features(frame):
    """Return a copy of ``frame`` with categorical columns made numeric.

    The columns in ``one_hot_cols`` are one-hot encoded as integer indicator
    columns, and ``Sex`` is mapped to 1 for 'male' and 0 for any other value.
    """
    encoded = pd.get_dummies(data=frame, prefix=one_hot_cols, columns=one_hot_cols, dtype=int)
    encoded['Sex'] = (encoded['Sex'] == 'male').astype(int)
    return encoded


train = encode_features(df_train)
test = encode_features(df_test)

# The two frames are encoded independently, so a category present in only one
# of them would yield different column sets. Force the test frame onto the
# training feature columns (same names, same order); an indicator column that
# is absent from the test data is created and filled with 0.
feature_cols = [col for col in train.columns if col != 'Survived']
test = test.reindex(columns=feature_cols, fill_value=0)

In [21]:
numerical_cols = ['Age', 'Fare']

# Fit the scaler on the training data only, then apply the SAME mean/std to
# both frames. Fitting a second scaler on the test set would put train and
# test features on different scales.
scaler = StandardScaler().fit(train[numerical_cols])
train[numerical_cols] = scaler.transform(train[numerical_cols])
test[numerical_cols] = scaler.transform(test[numerical_cols])

In [22]:
# Every column except the target is a feature. Compare with `!=`: the previous
# `x not in 'Survived'` was a substring test against the string 'Survived' and
# would also have excluded any column whose name is a substring of it.
feature_cols = [col for col in train.columns if col != 'Survived']

X = train[feature_cols].values
# Target is kept as a column vector of shape (n_samples, 1).
y = train['Survived'].values.reshape(-1, 1)

# Hold out 20% of the labelled data for validation of the tuned model.
X_train, X_cv, y_train, y_cv = train_test_split(X, y, train_size=0.8, random_state=RANDOM_STATE)

print(f'X_train: {X_train.shape}')
print(f'X_cv: {X_cv.shape}')
print(f'y_train: {y_train.shape}')
print(f'y_cv: {y_cv.shape}')

X_train: (712, 9)
X_cv: (179, 9)
y_train: (712, 1)
y_cv: (179, 1)


In [23]:
# Base estimator handed to the grid search; only the seed is fixed here.
xgb_model = XGBClassifier(
    random_state=RANDOM_STATE,
)

In [24]:
# Hyper-parameter values explored by the grid search (every combination is tried).
grid_space = dict(
    n_estimators=[10, 30, 60, 80, 100, 125, 150, 175, 190, 200, 240],
    max_depth=[1, 2, 3, 4, 5, 8, 10, 20, 30],
    learning_rate=[0.01, 0.1],
)

In [25]:
# Exhaustive search over `grid_space` with 3-fold cross-validation, ranked by
# accuracy; training-fold scores are recorded alongside the validation scores.
search_options = {
    "estimator": xgb_model,
    "param_grid": grid_space,
    "cv": 3,
    "scoring": "accuracy",
    "verbose": 0,
    "return_train_score": True,
}
grid = GridSearchCV(**search_options)

In [26]:
# Run the search on the training split; `verbose=0` is passed through as a fit parameter.
grid.fit(X=X_train, y=y_train, verbose=0)

In [27]:
# Show the hyper-parameter combination selected by the grid search.
grid.best_params_

{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 240}

In [28]:
# n = int(len(X_train) * 0.8)
# X_train_fit, X_train_eval, y_train_fit, y_train_eval = X_train[:n], X_train[n:], y_train[:n], y_train[n:]

In [29]:
# xgb_model = XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=150, random_state=RANDOM_STATE)
# xgb_model.fit(X_train_fit, y_train_fit, eval_set=[(X_train_eval, y_train_eval)], early_stopping_rounds=10)

In [30]:
# Take the estimator selected by the search and report its accuracy on the
# held-out validation split.
final_model = grid.best_estimator_
accuracy_score(y_cv, final_model.predict(X_cv))

0.8044692737430168

In [31]:
# Persist the full grid-search table, best-ranked configuration first.
df = pd.DataFrame(grid.cv_results_).sort_values("rank_test_score")
df.to_csv("result.csv")

In [32]:
# The model was fitted on a plain NumPy array built from the training feature
# columns, so predict on an array with exactly those columns in the same order
# rather than on the DataFrame itself. A missing feature column now fails
# loudly with a KeyError instead of silently shifting feature positions.
# NOTE(review): passing an array also bypasses xgboost's pandas conversion
# path, which appears to be the source of the deprecation warnings previously
# printed by this cell -- confirm on re-run.
feature_cols = [col for col in train.columns if col != 'Survived']
results = grid.predict(test[feature_cols].values)
results = pd.Series(data=results, name='Survived')

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [33]:
# Take the passenger ids from the test file itself instead of assuming they
# are the hard-coded contiguous range 892..1309. Only the id column is read;
# the in-memory test frame no longer carries it.
passenger_ids = pd.read_csv("D:/Titanic/titanic/test.csv", usecols=['PassengerId'])['PassengerId']

# Guard against a silent misalignment: with mismatched lengths the previous
# concat would have padded the shorter column with NaN.
if len(passenger_ids) != len(results):
    raise ValueError(
        f"Expected one prediction per test passenger, got "
        f"{len(results)} predictions for {len(passenger_ids)} passengers"
    )

submission = pd.DataFrame({
    'PassengerId': passenger_ids.values,
    'Survived': results.values,
})
submission.to_csv('submit_xgboost.csv', index=False)