In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize
# Raise pandas' display limits to 500 columns and 500 rows so that
# frames shown below are not truncated.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 500)

In [2]:
# custom files
import model_best_hyperparameters

In [3]:
# Load the new observations that will be scored by the saved model.
# The path is relative to the notebook's working directory.
new_data_path = "../data/new_data.csv"
df = pd.read_csv(new_data_path)
print('new data size', df.shape)

new data size (134, 7)


In [4]:
# Derive age_group from age. pd.cut uses right-closed intervals by default:
# (0, 30] -> Youth, (30, 60] -> Adult, (60, inf) -> Senior.
age_bins = [0, 30, 60, np.inf]
age_labels = ['Youth', 'Adult', 'Senior']
df['age_group'] = pd.cut(df['age'], bins=age_bins, labels=age_labels)

# Derive bmi_group from bmi using the same right-closed convention.
weight_bins = [0, 18.5, 24.9, 29.9, np.inf]
bmi_labels = ['Underweight', 'Healthy weight', 'Overweight', 'Obese']
df['bmi_group'] = pd.cut(df['bmi'], bins=weight_bins, labels=bmi_labels)

# Split charges into three categories at the 33rd and 66th percentiles.
# NOTE(review): the cut points are computed from THIS dataset, not from the
# data the model was trained on -- confirm that this is intended.
charges = df['charges']
charges_bins = [0, charges.quantile(0.33), charges.quantile(0.66), charges.max()]
charges_labels = ['Low', 'Medium', 'High']
df['charges_cat'] = pd.cut(charges, bins=charges_bins, labels=charges_labels)

In [5]:
# Encode the categorical columns as integer codes.
#
# A separate LabelEncoder is fitted and kept for every column, so each
# column's code -> label mapping stays available (encoders[col].classes_).
# Re-fitting one shared encoder would keep only the mapping of the column
# processed last and make later decoding depend on the loop order.
#
# NOTE(review): the encoders are fitted on this new data; the integer codes
# match the ones used at training time only if the same set of categories is
# present in every column -- confirm against the training notebook.
categorical_columns = ['sex', 'smoker', 'region', 'age_group', 'bmi_group', 'charges_cat']
encoders = {}
for col in categorical_columns:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# `le` is explicitly the target encoder; later cells use it to turn predicted
# codes back into the 'Low' / 'Medium' / 'High' labels.
le = encoders['charges_cat']
print(df.head())

   age  sex     bmi  children  smoker  region      charges  age_group  \
0   39    0  31.920         2       0       1   7209.49180          0   
1   42    0  36.195         1       0       1   7443.64305          0   
2   57    1  33.630         1       0       1  11945.13270          0   
3   47    0  36.000         1       0       3   8556.90700          0   
4   31    1  39.490         1       0       2   3875.73410          0   

   bmi_group  charges_cat  
0          1            2  
1          1            2  
2          1            2  
3          1            2  
4          1            1  


In [6]:
# Drop the raw numeric columns; only the encoded categorical columns remain.
columns_to_drop = ['age', 'bmi', 'children', 'charges']
df = df.drop(columns=columns_to_drop)

In [7]:
# Separate the encoded target column from the feature columns.
target_column = 'charges_cat'
y = df[target_column]
X = df.drop(columns=[target_column])

In [8]:
# Load the previously fitted model from disk. The `with` block guarantees the
# file handle is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# model files that come from a trusted source.
with open('finalized_model.sav', 'rb') as model_file:
    knn = pickle.load(model_file)

In [9]:
# Score every row; the model receives a plain ndarray (no column names).
feature_matrix = X.to_numpy()
y_pred = knn.predict(feature_matrix)

In [12]:
# Predict the charges category for every row and map the integer codes back
# to their string labels. `le` must be the encoder fitted on charges_cat.
predictions = knn.predict(X.values)
original_labels = le.inverse_transform(predictions)

# Decode the true classes from `y` (the encoded target taken before this cell)
# rather than from df['charges_cat']: this cell overwrites that column with
# strings, so decoding it again on a re-run would raise. Using `y` keeps the
# cell re-runnable and produces the same values.
original_charges_cat = le.inverse_transform(y)

df['charges_pred'] = original_labels
df['charges_cat'] = original_charges_cat

# Persist the features together with the true and predicted labels.
df.to_csv('prediction_results.csv', index=False)

In [13]:
# Preview the first ten rows, including the decoded true and predicted labels.
df.head(10)

Unnamed: 0,sex,smoker,region,age_group,bmi_group,charges_cat,charges_pred
0,0,0,1,0,1,Medium,Medium
1,0,0,1,0,1,Medium,Medium
2,1,0,1,0,1,Medium,Medium
3,0,0,3,0,1,Medium,Medium
4,1,0,2,0,1,Low,Medium
5,1,1,3,2,1,High,High
6,1,0,2,0,1,Low,Medium
7,0,1,0,0,0,High,High
8,1,0,0,2,0,Low,Low
9,0,0,2,0,2,Medium,Medium
