In [19]:
# Work with paths
import os
import sys
sys.path.append('..')

# import implemented modules
from src.models import tree
import src.config as cfg
# from src.data import make_dataset
import src.utils as utils
# from src.visualization import visualize
# from src.features import features

# import other modules
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor, metrics, Pool
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer



# Generate Dataset

In [20]:
# Build the processed training set: run make_dataset.py on the raw train.csv and
# hand it the output paths for the dataset pickle and the encoder pickle.
# NOTE(review): every path is relative ("../..."), so this cell only works when
# the kernel's working directory is the notebook's own directory.
! python3 ../src/data/make_dataset.py                           \
    --input_filepath=../data/raw/train.csv                      \
    --output_data_filepath=../data/processed/train_data.pkl     \
    --output_encoder_filepath=../data/processed/transformer.pkl

2022-10-22 20:19:33,530 - __main__ - INFO - making final data set from raw data
2022-10-22 20:19:33,659 - __main__ - INFO - Dataset saved to ../data/processed/train_data.pkl


In [21]:
# Build the processed validation set: same script, fed the raw test.csv with
# --is_val=True and writing val_data.pkl.
# NOTE(review): the encoder path is the same transformer.pkl used by the training
# run; presumably --is_val makes the script load that encoder instead of
# refitting it — verify in make_dataset.py. If so, the training cell must run first.
! python3 ../src/data/make_dataset.py                           \
    --input_filepath=../data/raw/test.csv                       \
    --output_data_filepath=../data/processed/val_data.pkl       \
    --is_val=True                                               \
    --output_encoder_filepath=../data/processed/transformer.pkl

2022-10-22 20:19:36,387 - __main__ - INFO - making final data set from raw data
2022-10-22 20:19:36,493 - __main__ - INFO - Dataset saved to ../data/processed/val_data.pkl


None

Load dataset

In [22]:
# Locations of the pickles written by the make_dataset.py cells, resolved
# against the kernel's current working directory.
train_pkl_path = os.getcwd() + "/../data/processed/train_data.pkl"
val_pkl_path = os.getcwd() + "/../data/processed/val_data.pkl"

# Load both processed datasets into memory.
train_data = pd.read_pickle(train_pkl_path)
val_data = pd.read_pickle(val_pkl_path)

Visualize data

If we want to predict illness, we are most interested in avoiding false negatives, which is why the main metric is Recall.

$$\text{Recall}=\frac{TP}{TP+FN}$$

Generate train and test splits

In [44]:
# Split the processed training data into features and target.
# The 2-D slicing below takes every column but the last as features and the
# last column as the target.
# NOTE(review): this indexing assumes train_data is a NumPy-style 2-D array
# (a DataFrame would need .iloc) — confirm against make_dataset.py.
features = train_data[:, :-1]
target = train_data[:, -1]

# 80 % of the rows go to training; the seed comes from the project config so
# the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    target,
    train_size=0.8,
    random_state=cfg.RANDOM_STATE,
)

In [48]:
# Wrap both splits in CatBoost Pool containers (features plus labels).
Train_set = Pool(data=X_train, label=Y_train)
Test_set = Pool(data=X_test, label=Y_test)

# Hyper-parameters for the regressor, one per line for easier diffing.
param = {
    'one_hot_max_size': 1024,
    'iterations': 2000,
    'learning_rate': 0.05,
    'use_best_model': True,
    'od_type': 'Iter',
    'od_wait': 100,
    'depth': 7,
    'l2_leaf_reg': 0.13679400187948992,
    'loss_function': 'MAE',
}

# Fit with Test_set as the evaluation set; plot=True renders the interactive
# training chart and verbose=False silences the per-iteration log.
# NOTE(review): Test_set is also what use_best_model selects the iteration on,
# so any metric later computed on X_test is not from an untouched hold-out.
model = CatBoostRegressor(**param)
model.fit(Train_set, eval_set=Test_set, plot=True, verbose=False)

# Predictions for the held-back rows (features only, no labels).
pred = model.predict(Pool(data=X_test))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [54]:
# Scores CatBoost stored under the 'validation' key of best_score_ during
# training. Shown with display() because only the last expression of a cell is
# rendered automatically; as a bare first line this value was silently dropped.
display(model.best_score_.get('validation'))

# Mean absolute error of the fitted CatBoost model on the X_test / Y_test split.
# mean_absolute_error comes from the notebook's top import cell, so this cell no
# longer depends on a later cell having been run first.
mean_absolute_error(Y_test, model.predict(X_test))

17542.550197271823

In [53]:
# Baseline for comparison: ordinary least-squares regression fitted on the same
# training split and scored with the same metric (MAE on X_test / Y_test).
# LinearRegression and mean_absolute_error are imported in the notebook's top
# import cell rather than here, so cell order no longer matters.
# NOTE(review): the recorded MAE for this baseline is ~6.2e13 versus ~1.75e4 for
# CatBoost. An error that large suggests an ill-conditioned design matrix
# (e.g. collinear one-hot columns) rather than a merely weak model — confirm,
# and consider a regularised baseline before drawing conclusions from it.
reg = LinearRegression().fit(X_train, Y_train)
mean_absolute_error(Y_test, reg.predict(X_test))



62469651416710.93

In [12]:
# Persist the fitted CatBoost model via the project's utils.save_model helper.
# `utils` is imported in the notebook's top import cell (src.utils); previously
# that import was commented out, so this cell raised NameError on a fresh kernel.
# NOTE(review): the file name says "Classifier" although `model` is a
# CatBoostRegressor. The name is left unchanged so existing loaders keep
# working — confirm nothing depends on it before renaming.
model_path = os.getcwd() + "/../data/interim/CatboostClassifier.pkl"
utils.save_model(model, model_path)

Accuracy: 0.5392670157068062
Recall: [0.80232558 0.         0.         0.         0.        ]
AUC: [0.74401993 0.5        0.5        0.5        0.5       ]
