In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize

In [2]:
# custom files

import model_best_hyperparameters

In [3]:
# read train data
df = pd.read_csv("../data/train_data.csv")

In [4]:
# Створення нової колонки age_group
age_bins = [0, 30, 60, np.inf]
names = ['Youth', 'Adult', 'Senior']
df['age_group'] = pd.cut(df['age'], age_bins, labels=names)

# Створення нової колонки bmi_group
weight_bins = [0, 18.5, 24.9, 29.9, np.inf]
names = ['Underweight', 'Healthy weight', 'Overweight', 'Obese']
df['bmi_group'] = pd.cut(df['bmi'], weight_bins, labels=names)

# Розбиваємо charges на категорії
charges_bins = [0, df['charges'].quantile(0.33), df['charges'].quantile(0.66), df['charges'].max()]
names = ['Low', 'Medium', 'High']
df['charges_cat'] = pd.cut(df['charges'], charges_bins, labels=names)

In [5]:
# Кодуємо категоріальні змінні
le = LabelEncoder()
for col in ['sex', 'smoker', 'region', 'age_group', 'bmi_group', 'charges_cat']:
    df[col] = le.fit_transform(df[col])

In [6]:
# Видалення непотрібних стовпців
columns_to_drop = ['age', 'bmi', 'children', 'charges']
df = df.drop(columns_to_drop, axis=1)

In [7]:
X = df.drop(['charges_cat'], axis=1)
y = df['charges_cat']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100, stratify=y)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
rf = RandomForestClassifier(**model_best_hyperparameters.params)

# Тренуємо модель
rf.fit(X_train, y_train)

# Робимо прогнози
y_pred = rf.predict(X_test)

# Перетворюємо мітки в бінарний формат для розрахунку roc_auc
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
y_pred_bin = label_binarize(y_pred, classes=[0, 1, 2])

# Обчислюємо метрики
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
roc_auc = roc_auc_score(y_test_bin, y_pred_bin, average='macro', multi_class='ovr')

# Виводимо метрики
print('Accuracy: ', accuracy)
print('Recall: ', recall)
print('F1 Score: ', f1)
print('ROC AUC: ', roc_auc)

Accuracy:  0.8008298755186722
Recall:  0.8023824225584028
F1 Score:  0.8039935398619881
ROC AUC:  0.8517637645087595


In [13]:
filename = 'finalized_model.sav'
pickle.dump(rf, open(filename, 'wb'))