In [19]:
# Work with paths
import os
import sys
sys.path.append('..')

# import implemented modules
from src.models import tree
import src.config as cfg

# import other modules
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score
from catboost import CatBoostRegressor, metrics, Pool
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Generate Dataset

In [83]:
# Run the project's make_dataset.py on the raw training CSV. The recorded log
# below reports a "preprocess data" step and that the dataset is saved to the
# --output_data_filepath given here.
# Presumably the fitted transformer is written to --output_encoder_filepath so
# later runs can reuse it — TODO confirm against make_dataset.py.
! python3 ../src/data/make_dataset.py                           \
    --input_filepath=../data/raw/train.csv                      \
    --output_data_filepath=../data/processed/train_data.pkl     \
    --output_encoder_filepath=../data/processed/transformer.pkl

2022-10-22 23:50:16,578 - __main__ - INFO - preprocess data
2022-10-22 23:50:16,719 - __main__ - INFO - Dataset saved to ../data/processed/train_data.pkl


In [84]:
# Run make_split.py on the processed training data; the recorded log reports
# that it splits the data into train and test.
# Presumably both parts end up in the single pickle at --output_selection_path,
# which the training scripts below read — TODO confirm against make_split.py.
! python3 ../src/models/make_split.py                           \
    --input_data_filepath=../data/processed/train_data.pkl      \
    --output_selection_path=../data/interim/train_selection.pkl

2022-10-22 23:50:19,038 - __main__ - INFO - Split data into train and test


# Train model

In [85]:
# Train the CatBoost model with train_model.py on the train/test selection and
# pass it paths for the model, the metrics file and the learning-curve plot.
# NOTE(review): the recorded log reports "metrics saved to
# ../data/external/metrics.txt", which differs from the --output_metrics_filepath
# passed here — presumably a stale log message or a hard-coded path inside
# train_model.py; confirm where the metrics are actually written.
! python3 ../src/models/train_model.py                          \
    --input_data_filepath=../data/interim/train_selection.pkl   \
    --output_model_filepath=../data/external/model.pkl          \
    --output_metrics_filepath=../reports/metrics/metrics.txt      \
    --plot_file=../reports/figures/learning_curve.html

2022-10-22 23:50:29,699 - __main__ - INFO - training catboost model
MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))
2022-10-22 23:50:32,558 - __main__ - INFO - metrics saved to ../data/external/metrics.txt


In [86]:
# Run linear_reg.py on the same train/test selection used for the CatBoost
# model. The recorded log reports a "training model" step and that metrics are
# saved to the --output_metrics_filepath given here.
# Presumably this is a linear-regression baseline for comparison — confirm
# against linear_reg.py.
! python3 ../src/models/linear_reg.py                                   \
    --input_data_filepath=../data/interim/train_selection.pkl           \
    --output_model_filepath=../data/external/model_linear_reg.pkl       \
    --output_metrics_filepath=../reports/metrics/metrics_linear_reg.txt

2022-10-22 23:50:36,206 - __main__ - INFO - training model
2022-10-22 23:50:36,515 - __main__ - INFO - metrics saved to ../reports/metrics/metrics_linear_reg.txt


# Run inference

In [87]:
# Run make_dataset.py again, this time on the raw test CSV with --is_val=True;
# the recorded log reports the dataset is saved to val_data.pkl.
# NOTE(review): the same transformer.pkl is passed via --output_encoder_filepath
# as in the training run. Presumably --is_val makes the script load that
# transformer instead of refitting and overwriting it — confirm against
# make_dataset.py, since refitting on test data would change the encoding.
! python3 ../src/data/make_dataset.py                           \
    --input_filepath=../data/raw/test.csv                       \
    --output_data_filepath=../data/processed/val_data.pkl       \
    --is_val=True                                               \
    --output_encoder_filepath=../data/processed/transformer.pkl

2022-10-22 23:50:40,152 - __main__ - INFO - preprocess data
2022-10-22 23:50:40,259 - __main__ - INFO - Dataset saved to ../data/processed/val_data.pkl


In [88]:
# Run run_inference.py with the processed test data and the saved CatBoost
# model; the recorded log reports the result is saved to the CSV path below.
# NOTE(review): the option name "--csv_outputh_path" and the file name
# "ansvers.csv" look misspelled, but the option presumably has to match the
# script's CLI definition — rename in the script and here together, if at all.
! python3 ../src/models/run_inference.py                         \
    --input_filepath=../data/processed/val_data.pkl             \
    --input_model_filepath=../data/external/model.pkl           \
    --csv_outputh_path=../reports/kaggle/ansvers.csv

2022-10-22 23:50:43,000 - __main__ - INFO - run inference
2022-10-22 23:50:43,245 - __main__ - INFO - saved to file ../reports/kaggle/ansvers.csv


# y

Load dataset

In [22]:
# Load the pickled datasets written to data/processed by the make_dataset.py
# cells. Paths are assembled with os.path.join, relative to the current working
# directory, instead of concatenating strings by hand.
# NOTE: pd.read_pickle unpickles arbitrary objects, so only load files that
# were produced by this project's own pipeline.
processed_dir = os.path.join(os.getcwd(), "..", "data", "processed")
train_data = pd.read_pickle(os.path.join(processed_dir, "train_data.pkl"))
val_data = pd.read_pickle(os.path.join(processed_dir, "val_data.pkl"))

Visualize data

If we want to predict illness, we want to avoid false negatives, which is why the main metric is Recall.

$$\text{Recall}=\frac{TP}{TP+FN}$$

Generate train and test selections

In [44]:
# Split the processed training array into an 80% training part and a held-out
# remainder. The last column is taken as the target and every other column as
# a feature; the seed comes from the project config.
train_features = train_data[:, :-1]
train_target = train_data[:, -1]

X_train, X_test, Y_train, Y_test = train_test_split(
    train_features,
    train_target,
    train_size=0.8,
    random_state=cfg.RANDOM_STATE,
)

In [57]:
# Hyper-parameters for the CatBoost regressor. Training optimises MAE for at
# most 2000 iterations; with use_best_model the iteration that scores best on
# the eval set is kept, and the 'Iter' overfitting detector waits 100
# iterations (od_wait) before stopping.
param = dict(
    one_hot_max_size=1024,
    iterations=2000,
    learning_rate=0.05,
    use_best_model=True,
    od_type='Iter',
    od_wait=100,
    depth=7,
    l2_leaf_reg=0.13679400187948992,
    loss_function='MAE',
)

# Wrap both parts of the split in CatBoost Pools.
Train_set = Pool(data=X_train, label=Y_train)
Test_set = Pool(data=X_test, label=Y_test)

# Fit silently while plotting the learning curve.
# NOTE(review): the held-out split serves as the eval set for best-iteration
# selection here, so metrics later computed on X_test are presumably
# optimistic — confirm whether a separate validation split is wanted.
model = CatBoostRegressor(**param)
model.fit(Train_set, eval_set=Test_set, plot=True, verbose=False)

# Predictions for the held-out features.
pred = model.predict(Pool(X_test))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [54]:
# Best eval-set scores recorded by CatBoost during training. Only the last
# expression of a cell is displayed, so this value is printed explicitly
# instead of being evaluated and silently discarded.
print(model.best_score_.get('validation'))

# Mean absolute error of the fitted model on the held-out split; as the last
# expression it is displayed as the cell result.
mean_absolute_error(Y_test, model.predict(X_test))

17542.550197271823

In [53]:
# Ordinary least-squares model fitted on the same split, scored with MAE on
# the held-out part.
# NOTE(review): the MAE recorded for this cell is about 6e13, which is
# implausible next to the CatBoost score — presumably numerical instability
# from collinear columns; confirm before using this as a baseline.
reg = LinearRegression()
reg.fit(X_train, Y_train)
linear_pred = reg.predict(X_test)
mean_absolute_error(Y_test, linear_pred)



62469651416710.93

In [12]:
# Persist the object bound to `model` under data/interim via the project's
# save_model helper.
# NOTE(review): `utils` is not among the imports visible in this notebook —
# presumably a project module; confirm it is imported before this cell runs.
# NOTE(review): the file name says "CatboostClassifier", while the training
# cell in this notebook binds `model` to a CatBoostRegressor, and the recorded
# output below shows classification metrics — this cell looks stale; confirm
# the intended model and file name.
utils.save_model(model, str(os.getcwd() + "/../data/interim/CatboostClassifier.pkl"))

Accuracy: 0.5392670157068062
Recall: [0.80232558 0.         0.         0.         0.        ]
AUC: [0.74401993 0.5        0.5        0.5        0.5       ]


infer

In [56]:
# Reload the processed test dataset written by make_dataset.py, building the
# path with os.path.join relative to the current working directory instead of
# concatenating strings by hand.
# NOTE: pd.read_pickle unpickles arbitrary objects, so only load files that
# were produced by this project's own pipeline.
val_data = pd.read_pickle(
    os.path.join(os.getcwd(), "..", "data", "processed", "val_data.pkl")
)

# Display the loaded object as the cell result.
val_data

array([[ 0.6457473 ,  0.11076257, -0.34007743, ...,  1.        ,
         1.        ,  0.        ],
       [ 0.67460525,  0.37584984, -0.43943965, ...,  1.        ,
         1.        ,  0.        ],
       [ 0.47259924,  0.33205283,  0.8522693 , ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [ 2.9543874 ,  0.95042276, -0.37319818, ...,  1.        ,
         1.        ,  0.        ],
       [ 0.12630318, -0.00759964,  0.6866656 , ...,  1.        ,
         1.        ,  0.        ],
       [ 0.47259924, -0.08918037,  0.71978635, ...,  1.        ,
         1.        ,  0.        ]], dtype=float32)