In [1]:
import sys
sys.path.append('../')

```
train.py -> titanic_pro/pipeline.pipeline      -> titanic_pro/model.xgb_model
                                               -> titanic_pro/custom_transformers.AgeImputer
                                               -> titanic_pro/config.LOG_FORMAT
         -> titanic_pro/train_utils.tune_model -> titanic_pro/pipeline.pipeline -> the rest (as above)
```

## titanic_pro/custom_transformers.py

In [2]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class AgeImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in the ``age`` column with a median age.

    The median is learned from the data passed to ``fit`` and stored as
    ``age_median_``; ``transform`` then applies that stored value. This keeps
    the imputation value fixed between training and inference, so a frame
    that contains only missing ages (e.g. a single row at prediction time)
    is still filled with the value learned during ``fit``.

    Attributes
    ----------
    age_median_ : float
        Median of ``X["age"]`` seen during ``fit``. Only present after
        ``fit`` has been called.
    """

    def fit(self, X, y=None):
        """Learn the median of ``X["age"]``.

        Parameters
        ----------
        X : object supporting ``X["age"]`` (the call site passes a DataFrame
            selection containing an ``age`` column).
        y : ignored; accepted for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.age_median_ = X["age"].median()
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose missing ``age`` values are filled.

        Parameters
        ----------
        X : object supporting ``X.copy()`` and ``X["age"]``.

        Returns
        -------
        A copy of ``X``; the input is not modified.
        """
        # Backward compatibility: when fit() was never called, fall back to
        # the median of the frame being transformed (the previous behaviour).
        fill_value = getattr(self, "age_median_", None)
        if fill_value is None:
            fill_value = X["age"].median()

        age_filled = X.copy()
        # Assign the result back instead of a chained ``inplace=True`` call:
        # the chained form operates on an intermediate object and is
        # deprecated by pandas under Copy-on-Write.
        age_filled["age"] = age_filled["age"].fillna(fill_value)
        return age_filled

## titanic_pro/pipeline.py

In [3]:
import logging
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from titanic_pro.model import xgb_model
from titanic_pro.config import LOG_FORMAT
# Custom transformer
#from titanic_pro.custom_transformers import AgeImputer

# --- Numeric branch: median imputation followed by standard scaling ---------
numeric_features = ["fare"]
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# --- Categorical branch: constant-fill then one-hot encode ------------------
categorical_features = ["embarked", "sex", "pclass"]
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# --- Column routing ----------------------------------------------------------
# Each (name, transformer, columns) entry sends the listed columns through the
# given transformer. AgeImputer is expected to already be in scope (its import
# is commented out above because the class is defined earlier in this notebook).
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("age", AgeImputer(), ["age"]),
    ]
)

# --- Full model: preprocessing followed by the classifier --------------------
pipeline = Pipeline([("preprocessor", preprocessor), ("classifier", xgb_model)])

# --- Logging -----------------------------------------------------------------
# Configure root logging at INFO with the project's format string, and expose
# a module-level logger for the other cells to use.
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)


## titanic_pro/train_utils.py

In [4]:
from sklearn.model_selection import GridSearchCV
#from titanic_pro.pipeline import pipeline


def tune_model(X, y, param_grid=None, cv=5, scoring="accuracy"):
    """Grid-search hyperparameters of the module-level ``pipeline``.

    The search runs over ``pipeline``, which must already be in scope (its
    import is commented out above because an earlier notebook cell defines it).

    Parameters
    ----------
    X : training features, passed unchanged to ``GridSearchCV.fit``.
    y : training target, passed unchanged to ``GridSearchCV.fit``.
    param_grid : dict, optional
        Mapping of pipeline parameter names to candidate values. When omitted,
        the default grid over the classifier's ``learning_rate``,
        ``max_depth`` and ``n_estimators`` is used.
    cv : optional
        Forwarded to ``GridSearchCV(cv=...)``. Defaults to 5.
    scoring : optional
        Forwarded to ``GridSearchCV(scoring=...)``. Defaults to ``"accuracy"``.

    Returns
    -------
    tuple
        ``(best_params, best_score)`` as reported by the fitted search.
    """
    # Build the default grid inside the function so callers never share (and
    # cannot accidentally mutate) a module-level default dictionary.
    if param_grid is None:
        param_grid = {
            "classifier__learning_rate": [0.01, 0.1, 0.2],
            "classifier__max_depth": [3, 5, 7],
            "classifier__n_estimators": [50, 100, 200],
        }

    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        verbose=1,
        n_jobs=-1,
    )

    grid_search.fit(X, y)
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    return best_params, best_score


## train.py

In [5]:
import logging
import pandas as pd
#from titanic_pro.pipeline import pipeline, logger 
#from titanic_pro.train_utils import tune_model
from titanic_pro.utils import load_data
from sklearn.model_selection import train_test_split
from joblib import dump

# Run configuration, gathered in one place so it is easy to find and tune.
TEST_SIZE = 0.2  # fraction of rows held out for validation
RANDOM_STATE = 42  # seed for the train/validation split
MODEL_PATH = "model.joblib"  # where the fitted pipeline is written

logger.info("Loading data...")
train_df, _ = load_data()
logger.info(train_df.columns.values.tolist())
y = train_df["survived"]
X = train_df.drop(columns=["survived"])

logger.info("Splitting data into train and validation sets...")
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

logger.info("Tuning model...")
best_params, best_score = tune_model(X_train, y_train)
# Pass values as logging arguments (lazy %-formatting) instead of f-strings;
# the rendered messages are unchanged.
logger.info("Best parameters: %s", best_params)
logger.info("Best score: %s", best_score)

# Train the final version of the model: apply the tuned hyperparameters to the
# shared pipeline object and refit it on the training split.
logger.info("Training the model with the best parameters...")
pipeline.set_params(**best_params)
pipeline.fit(X_train, y_train)

logger.info("Evaluating the model on the validation set...")
score = pipeline.score(X_val, y_val)
logger.info("Validation accuracy: %s", score)

logger.info("Saving the model...")
dump(pipeline, MODEL_PATH)


2023-04-08 13:19:16,812 - INFO - Loading data...
2023-04-08 13:19:16,827 - INFO - ['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest']
2023-04-08 13:19:16,829 - INFO - Splitting data into train and validation sets...
2023-04-08 13:19:16,829 - INFO - Tuning model...


Fitting 5 folds for each of 27 candidates, totalling 135 fits


2023-04-08 13:19:21,551 - INFO - Best parameters: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__n_estimators': 100}
2023-04-08 13:19:21,551 - INFO - Best score: 0.8237982663514579
2023-04-08 13:19:21,551 - INFO - Training the model with the best parameters...
2023-04-08 13:19:21,632 - INFO - Evaluating the model on the validation set...
2023-04-08 13:19:21,646 - INFO - Validation accuracy: 0.7966101694915254
2023-04-08 13:19:21,646 - INFO - Saving the model...


['model.joblib']