In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for path in full_paths:
        print(path)


In [None]:
%%time
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns, zipfile, xgboost as xgb, optuna
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score, make_scorer

## **1. Working with data**

In [None]:
# Load the Titanic training CSV from the Kaggle input directory and preview
# its first two rows.
data_train = pd.read_csv('/kaggle/input/titanic/train.csv')
data_train.head(2)

In [None]:
# (number of rows, number of columns) of the training frame.
data_train.shape

In [None]:
# Print column dtypes and non-null counts to see which columns have gaps.
data_train.info()

In [None]:
# Remove the passenger name column; it is not used as a model feature.
data_train = data_train.drop(columns=['Name'])
data_train.head(2)

In [None]:
# Remove the cabin and ticket columns in a single call.
data_train = data_train.drop(columns=['Cabin', 'Ticket'])
data_train.head(2)

In [None]:
# Encode sex as an integer flag (male=1, female=0).  `.map` produces an integer
# column directly, whereas `.replace` on an object column relies on the silent
# object -> int downcast that pandas 2.2 deprecates (FutureWarning).
data_train['Sex'] = data_train['Sex'].map({'male': 1, 'female': 0})
# One-hot encode the port of embarkation.  With the default dummy_na=False a
# missing value simply gets zeros in every indicator column.
data_train = pd.get_dummies(data_train, columns=['Embarked'])
data_train.head(2)

In [None]:
# Cast the one-hot Embarked indicators to plain integers.
#  - Matching on the 'Embarked_' prefix does not assume that every category
#    label is exactly one character long (the `col[:-2]` slice did).
#  - `astype(int)` converts bool (or uint8) indicators explicitly instead of
#    relying on the downcast performed by `replace({True: 1, False: 0})`,
#    which pandas 2.2 deprecates.
embarked_cols = [name for name in data_train.columns if name.startswith('Embarked_')]
data_train = data_train.astype({name: int for name in embarked_cols})
data_train.head(2)

In [None]:
# Report the number of missing values in each column.
missing_counts = data_train.isna().sum()
for name, n_missing in missing_counts.items():
    print(f"{name} : {n_missing}")

In [None]:
# Impute missing ages with the mean of the observed ages.
mean_age = data_train['Age'].mean()
data_train['Age'] = data_train['Age'].fillna(mean_age)
data_train.head(2)

In [None]:
# One figure with two panels side by side: age vs. survival on the left,
# sex vs. survival on the right.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
# Age histogram, coloured by the Survived flag (0 -> pink, 1 -> red).
sns.histplot(data=data_train, x='Age', hue='Survived', palette={0 : 'pink', 1 : 'red'}, ax=axes[0])
# One bar per Sex value; bar height is seaborn's default aggregate of Survived
# (the mean, unless the installed seaborn version changes that default).
sns.barplot(data=data_train, x="Sex", y="Survived", palette=["pink", "red"], ax=axes[1])
axes[0].set_title("Counts of survived via age")
axes[1].set_title("Relation of survived via sex")

In [None]:
# Rescale Age and Fare to the [0, 1] range.
# Both columns go through a single fit so that `scaler` still holds the min/max
# of *both* features afterwards; calling fit_transform twice on the same object
# (once per column) throws away the Age parameters.  MinMaxScaler scales each
# column independently, so the resulting values are the same as before.
scaled_cols = ['Age', 'Fare']
scaler = MinMaxScaler()
data_train[scaled_cols] = scaler.fit_transform(data_train[scaled_cols])
data_train.head(2)

In [None]:
# PassengerId is only a row identifier, so remove it before modelling.
data_train = data_train.drop(columns='PassengerId')
data_train.head(2)

In [None]:
# Hold out 20% of the rows for validation.
#  - Features and target are selected by column name rather than by position,
#    so the split no longer depends on 'Survived' being the first column.
#  - `random_state` pins the shuffle so Restart & Run All reproduces the split.
#  - `stratify` keeps the survived / not-survived ratio the same in both parts.
features = data_train.drop(columns='Survived')
target = data_train['Survived']
X_train, X_valid, y_train, y_valid = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

## **2. Creating model**

In [None]:
# Baseline model: logistic regression with scikit-learn's default settings.
# `fit` returns the estimator itself, so construction and training are chained.
LR = LogisticRegression().fit(X_train, y_train)

In [None]:
# Choose the number of neighbours for KNN with a 5-fold grid search.
# The scorer is negated MSE, so the winning candidate is the one with the
# lowest cross-validated error.
neighbor_grid = {'n_neighbors': [*range(1, 11), 15]}
neg_mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
GS = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=neighbor_grid,
    scoring=neg_mse_scorer,
    cv=5,
    verbose=0,
)
GS.fit(X_train, y_train)
best_param = GS.best_params_['n_neighbors']

In [None]:
# Train the KNN classifier with the neighbour count selected by the grid search.
KNN = KNeighborsClassifier(n_neighbors=best_param).fit(X_train, y_train)

In [None]:
# Single decision tree: entropy splits, sqrt(n_features) candidates per split,
# depth capped at 200, fixed seed.
tree_settings = {
    'criterion': 'entropy',
    'max_depth': 200,
    'max_features': 'sqrt',
    'random_state': 42,
}
tree = DecisionTreeClassifier(**tree_settings).fit(X_train, y_train)

In [None]:
# Random forest of 200 entropy trees.  This is the model used later to produce
# the submission, so it is seeded (same seed as the decision tree and XGBoost
# cells) to make its predictions reproducible from run to run.
forest = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    max_features='sqrt',
    max_depth=150,
    random_state=42,
)
forest.fit(X_train, y_train)
# Bare None keeps the cell from echoing the fitted estimator.
None

In [None]:
# Silence Optuna's per-trial log lines.
optuna.logging.set_verbosity(optuna.logging.CRITICAL)


def objective(trial):
    """Train an XGBoost classifier with trial-sampled hyper-parameters.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Source of the hyper-parameter suggestions for this run.

    Returns
    -------
    float
        Mean squared error of the hard class predictions on the validation
        split (`X_valid`, `y_valid`).  Assuming `Survived` is encoded as 0/1,
        this equals the misclassification rate, so lower is better.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'alpha': trial.suggest_float('alpha', 0, 10),
        'lambda': trial.suggest_float('lambda', 0, 10),
    }
    tuned_boosting = xgb.XGBClassifier(
        **params,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    tuned_boosting.fit(X_train, y_train)
    tb_predictions = tuned_boosting.predict(X_valid)
    # scikit-learn's signature is (y_true, y_pred); MSE is symmetric, so the
    # value is unchanged, but the conventional order avoids confusion.
    return mean_squared_error(y_valid, tb_predictions)


# The objective returns an *error*, so the study must minimize it.  With
# direction='maximize' Optuna would keep the worst hyper-parameters it found.
# The sampler is seeded so repeated runs explore the same trials.
# NOTE: tuning happens on the same validation split that is later used to
# compare the models, so the XGBoost score in that comparison is optimistic.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

In [None]:
# Refit XGBoost on the training split with the best hyper-parameters found by
# the study.  The fixed settings (seed, label-encoder flag, eval_metric) mirror
# the ones used inside `objective`, so this model is configured exactly like
# the trial that won.
best_params = study.best_params
tuned_boost = xgb.XGBClassifier(
    **best_params,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
)
tuned_boost.fit(X_train, y_train)
# Bare None keeps the cell from echoing the fitted estimator.
None

In [None]:
# Predict the validation split with every fitted model, in the order
# LR, KNN, decision tree, random forest, tuned XGBoost.
predictions_1, predictions_2, predictions_3, predictions_4, predictions_5 = (
    fitted_model.predict(X_valid)
    for fitted_model in (LR, KNN, tree, forest, tuned_boost)
)

In [None]:
# Print the validation MSE of each model, one line per model.
labelled_predictions = [
    ('1. LogisticRegression', predictions_1),
    ('2. KNeighbors', predictions_2),
    ('3. Decision Tree', predictions_3),
    ('4. Random Forest', predictions_4),
    ('5. Tuned XGBoost', predictions_5),
]
print('\n'.join(
    f'{label}: {mean_squared_error(preds, y_valid)}'
    for label, preds in labelled_predictions
))

## **3. Submission**

In [None]:
# Load the Titanic test CSV from the Kaggle input directory and preview its
# first two rows.
data_test = pd.read_csv('/kaggle/input/titanic/test.csv')
data_test.head(2)

In [None]:
# Apply the same column clean-up and encoding to the test set as was applied
# to the training set.
data_test = data_test.drop(columns=['Name', 'Cabin', 'Ticket'])
# `.map` yields an integer column directly; `.replace` on an object column
# relies on the silent downcast that pandas 2.2 deprecates.
data_test['Sex'] = data_test['Sex'].map({'male': 1, 'female': 0})
data_test = pd.get_dummies(data_test, columns=['Embarked'])
# Cast the one-hot indicators to int.  Matching on the 'Embarked_' prefix does
# not assume one-character category labels (the `col[:-2]` slice did), and
# `astype(int)` avoids the deprecated downcast in `replace({True: 1, False: 0})`.
embarked_cols = [name for name in data_test.columns if name.startswith('Embarked_')]
data_test = data_test.astype({name: int for name in embarked_cols})
data_test.head()

In [None]:
# Report the number of missing values in each test-set column.
missing_counts_test = data_test.isna().sum()
for name, n_missing in missing_counts_test.items():
    print(f"{name} : {n_missing}")

In [None]:
# Fill the gaps in Age and Fare with the mean of the respective test column.
for numeric_col in ('Age', 'Fare'):
    column_mean = data_test[numeric_col].mean()
    data_test[numeric_col] = data_test[numeric_col].fillna(column_mean)

In [None]:
# Scale Age and Fare with the min/max of the *training* data.
# The models were trained on features scaled by the training ranges; fitting a
# fresh scaler on the test set maps the same raw age or fare to a different
# scaled value, so the models would see shifted inputs.
# The raw training columns are re-read here because the training frame has
# already been scaled in place.  MinMaxScaler disregards NaNs when fitting, and
# mean imputation cannot move a column's min or max, so these are the same
# ranges that were used for the training features.
scaled_cols = ['Age', 'Fare']
train_raw_numeric = pd.read_csv('/kaggle/input/titanic/train.csv', usecols=scaled_cols)
scaler = MinMaxScaler().fit(train_raw_numeric[scaled_cols])
data_test[scaled_cols] = scaler.transform(data_test[scaled_cols])
data_test.head(2)

In [None]:
# Build the test feature matrix by aligning to the training feature columns by
# name.  This drops PassengerId, guarantees the same column order the models
# were fitted on, and adds an all-zero column should a one-hot category be
# absent from the test set (positional `iloc[:, 1:]` guaranteed none of that).
X_test = data_test.reindex(columns=X_train.columns, fill_value=0)
X_test.head(2)

In [None]:
# Predict survival for the test passengers with the random forest model.
preds_test = forest.predict(X_test)

In [None]:
# Assemble the submission table (passenger id + predicted label) and write it
# to submission.csv without the index column.
output = data_test[['PassengerId']].assign(Survived=preds_test)
output.to_csv('submission.csv', index=False)