In [30]:
!pip install --pre pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [31]:
# --- Third-party imports ---------------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import matplotlib as mpl
import seaborn as sns
# NOTE(review): numpy, pyplot, seaborn and get_data are not referenced by any
# cell visible in this notebook — confirm whether they are still needed.
from pycaret.datasets import get_data
# NOTE(review): the star import is what brings setup, compare_models,
# create_model, predict_model, finalize_model and save_model into scope below;
# consider importing those names explicitly.
from pycaret.regression import *
# Raise matplotlib's default figure resolution to 300 DPI for all figures.
mpl.rcParams['figure.dpi'] = 300

## Loading the data
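Wine quality data: the eleven physicochemical measurements (`fixed_acidity`, `volatile_acidity`, `citric_acid`, `residual_sugar`, `chlorides`, `free_sulfur dioxide`, `total_sulfur dioxide`, `density`, `pH`, `sulphates`, and `alcohol`) are the features, and `quality` is the target.

`quality` is the integer quality score recorded for each wine sample.

We will create a model that predicts the `quality` column.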

In [32]:
# Read the wine dataset from a CSV located in the current working directory.
data = pd.read_csv(filepath_or_buffer='wine.csv')

In [33]:
# Show the first ten rows as a quick sanity check of the loaded columns.
data.head(n=10)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur dioxide,total_sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [34]:
# (rows, columns) of the loaded frame.
data.shape

(6497, 12)

In [35]:
# Per-column dtypes and non-null counts; used here to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         6497 non-null   float64
 1   volatile_acidity      6497 non-null   float64
 2   citric_acid           6497 non-null   float64
 3   residual_sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free_sulfur dioxide   6497 non-null   float64
 6   total_sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 609.2 KB


## Data Viz
**TODO:** No visualization cell exists for the wine data yet. The discussion that previously stood here (hue mapping to contrast smokers and non-smokers, and the relationship of age and BMI to charges) was carried over from a health-insurance example and does not apply to this dataset. Add plots of the wine features against `quality` and describe the findings here.

## Initialize the PyCaret

In [36]:
# Initialise the PyCaret regression experiment on the wine data.
#   target='quality'  -> column the models learn to predict
#   train_size=0.8    -> 80 % of rows for training, the rest held out for testing
#   normalize=True    -> scale the numeric features as part of the pipeline
#   session_id=7402   -> fixed seed so the split and the models are reproducible
reg = setup(data=data, target='quality', train_size=0.8, normalize=True, session_id=7402)

Unnamed: 0,Description,Value
0,Session id,7402
1,Target,quality
2,Target type,Regression
3,Original data shape,"(6497, 12)"
4,Transformed data shape,"(6497, 12)"
5,Transformed train set shape,"(5197, 12)"
6,Transformed test set shape,"(1300, 12)"
7,Numeric features,11
8,Preprocess,True
9,Imputation type,simple


In [37]:
# Cross-validate the candidate regressors and rank the leaderboard by RMSE.
# NOTE(review): `best` is not referenced by any later visible cell — the next
# step hard-codes 'et' instead; confirm which one is intended.
best = compare_models(sort='RMSE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.4006,0.3657,0.6041,0.5224,0.0918,0.0725,1.367
rf,Random Forest Regressor,0.4431,0.3809,0.6165,0.5024,0.0936,0.0799,2.785
lightgbm,Light Gradient Boosting Machine,0.4921,0.4209,0.648,0.4503,0.0977,0.088,0.424
xgboost,Extreme Gradient Boosting,0.4686,0.4241,0.6505,0.4458,0.0985,0.084,0.595
gbr,Gradient Boosting Regressor,0.531,0.4701,0.685,0.386,0.1027,0.0946,1.052
knn,K Neighbors Regressor,0.5368,0.51,0.7136,0.3334,0.1074,0.0964,0.153
ada,AdaBoost Regressor,0.571,0.5321,0.7288,0.3052,0.1092,0.102,0.392
lar,Least Angle Regression,0.5709,0.5446,0.7375,0.2884,0.1103,0.1018,0.217
br,Bayesian Ridge,0.571,0.5445,0.7375,0.2884,0.1103,0.1018,0.25
lr,Linear Regression,0.5709,0.5446,0.7375,0.2884,0.1103,0.1018,0.472


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

In [38]:
# Train an Extra Trees regressor ('et') and score it with 10-fold cross-validation.
model = create_model(estimator='et', fold=10)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.376,0.3259,0.5709,0.5752,0.0855,0.0669
1,0.4324,0.4095,0.6399,0.4799,0.0977,0.0795
2,0.4106,0.3984,0.6312,0.5014,0.0984,0.0768
3,0.3775,0.3281,0.5728,0.518,0.0892,0.0703
4,0.3997,0.3583,0.5986,0.5423,0.091,0.0723
5,0.4017,0.3714,0.6095,0.5451,0.0938,0.0742
6,0.3879,0.3301,0.5746,0.5278,0.0867,0.0691
7,0.4127,0.3966,0.6297,0.4959,0.0948,0.0737
8,0.3822,0.3242,0.5694,0.5585,0.0845,0.0667
9,0.4257,0.4149,0.6442,0.4803,0.0965,0.0755


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

## Fine Tuning the Model

## Making Predictions and Saving the Model

In [39]:
# No `data` argument is given, so the model is scored on the hold-out set
# created by setup(); the result carries an extra `prediction_label` column.
predict_model(estimator=model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,0.3627,0.3237,0.569,0.565,0.087,0.0664


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur dioxide,total_sulfur dioxide,density,pH,sulphates,alcohol,quality,prediction_label
6277,6.6,0.285,0.49,11.4,0.035,57.0,137.0,0.99732,3.08,0.54,8.9,6,6.00
931,7.4,0.610,0.01,2.0,0.074,13.0,38.0,0.99748,3.48,0.65,9.8,5,5.00
5308,6.7,0.310,0.09,1.4,0.039,53.0,141.0,0.99206,3.12,0.44,10.1,5,5.20
3767,7.4,0.190,0.30,12.8,0.053,48.5,229.0,0.99860,3.14,0.49,9.1,7,7.00
3791,7.9,0.280,0.41,4.9,0.058,31.0,153.0,0.99660,3.27,0.51,9.7,6,5.21
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5378,6.5,0.320,0.45,7.7,0.022,31.0,97.0,0.99134,3.20,0.70,12.7,7,7.11
1781,6.8,0.280,0.40,22.0,0.048,48.0,167.0,1.00100,2.93,0.50,8.7,5,5.00
3341,6.3,0.250,0.23,14.9,0.039,47.0,142.0,0.99705,3.14,0.35,9.7,6,6.00
4672,6.7,0.240,0.41,2.9,0.039,48.0,122.0,0.99052,3.25,0.43,12.0,5,6.70


In [45]:
# Finalize the chosen model; per the PyCaret docs this refits it on the full
# dataset (train + hold-out) so it is ready for deployment.
final_model = finalize_model(model)
# Persist the preprocessing pipeline together with the model.
# NOTE(review): PyCaret's save_model appends '.pkl' to the given name, so this
# presumably lands on disk as 'wine.sav.pkl' — confirm the intended file name.
save_model(final_model, 'wine.sav')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['fixed_acidity',
                                              'volatile_acidity', 'citric_acid',
                                              'residual_sugar', 'chlorides',
                                              'free_sulfur dioxide',
                                              'total_sulfur dioxide', 'density',
                                              'pH', 'sulphates', 'alcohol'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('normalize', TransformerWrapper(transformer=StandardScaler())),
                 ('clean_column_names',
                  TransformerWrapper(transformer=CleanColumnNames())),
                 ('

In [41]:
!pip install joblib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [44]:
import joblib

# Save the finalized model to disk with joblib.
# `final_model` (from finalize_model) is persisted instead of `model`: `model`
# is the bare estimator returned by create_model, so loading it later would
# skip the imputation/normalisation steps that were applied during training.
joblib.dump(final_model, 'wine.sav')

['wine.sav']

In [47]:
import pickle

filename = 'wine.sav'
# Serialise the finalized model with the standard-library pickle format.
# - `final_model` is written rather than `model`, so the saved artifact
#   includes the preprocessing steps fitted during setup().
# - The `with` block closes the file handle (and flushes the bytes to disk)
#   even if pickling raises; opening in 'wb' mode overwrites any existing file.
with open(filename, 'wb') as model_file:
    pickle.dump(final_model, model_file)