In [1]:
!pip install --pre pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycaret
  Downloading pycaret-3.0.0-py3-none-any.whl (481 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.8/481.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting wurlitzer
  Downloading wurlitzer-3.0.3-py3-none-any.whl (7.3 kB)
Collecting pmdarima!=1.8.1,<3.0.0,>=1.8.0
  Downloading pmdarima-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Collecting tbats>=1.1.0
  Downloading tbats-1.1.3-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import matplotlib as mpl
import seaborn as sns
from pycaret.datasets import get_data
from pycaret.regression import *
# Render every matplotlib figure at 300 dots per inch.
mpl.rcParams.update({'figure.dpi': 300})

## Loading the data
Wholesale data where Channel, Region, Fresh, Milk, Grocery, Frozen, and Detergents_Paper are features and Sale is the target.

Sale is the numeric column we want to estimate for each row from the other features.

We will create a model that will predict the Sale column.

In [3]:
# Read the wholesale CSV (path is relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='wholesale.csv')

In [4]:
# Preview the first ten rows of the loaded frame.
data.head(n=10)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Sale
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185
5,2,3,9413,8259,5126,666,1795,1451
6,2,3,12126,3199,6975,480,3140,545
7,2,3,7579,4956,9426,1669,3321,2566
8,1,3,5963,3648,6192,425,1716,750
9,2,3,6006,11093,18881,1159,7425,2098


In [5]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(440, 8)

In [6]:
# Print column names, non-null counts and dtypes to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Channel           440 non-null    int64
 1   Region            440 non-null    int64
 2   Fresh             440 non-null    int64
 3   Milk              440 non-null    int64
 4   Grocery           440 non-null    int64
 5   Frozen            440 non-null    int64
 6   Detergents_Paper  440 non-null    int64
 7   Sale              440 non-null    int64
dtypes: int64(8)
memory usage: 27.6 KB


## Data Viz
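We use hue mapping to highlight the differences between smokers and non-smokers. As we can see, age is correlated with charges, i.e., people get higher charges as they grow older. In spite of that, being a non-smoker keeps the cost lower for most people, regardless of their age. Furthermore, overweight and obese people don't seem to get significantly higher charges unless they are smokers.

**Note:** this paragraph refers to columns (smoker, age, charges) that are not present in the wholesale data loaded above, and no plot accompanies it in this notebook. TODO: replace it with a visualization and commentary for the wholesale features.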

## Initialize the PyCaret

In [8]:
# Initialize the PyCaret regression experiment on the loaded frame: 'Sale' is the
# target, 80% of the rows form the training split, features are normalized, and a
# fixed session id is supplied.
reg = setup(data=data, target='Sale', train_size=0.8, normalize=True, session_id=7402)

Unnamed: 0,Description,Value
0,Session id,7402
1,Target,Sale
2,Target type,Regression
3,Original data shape,"(440, 8)"
4,Transformed data shape,"(440, 8)"
5,Transformed train set shape,"(352, 8)"
6,Transformed test set shape,"(88, 8)"
7,Numeric features,7
8,Preprocess,True
9,Imputation type,simple


In [9]:
# Compare the candidate regressors with the results ranked by RMSE, and keep the
# returned model in `best`.
best = compare_models(sort='RMSE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,944.0304,6637409.8679,1862.4424,0.1165,1.1861,8.976,0.055
par,Passive Aggressive Regressor,935.2263,6821212.5714,1879.2558,0.1065,1.1727,7.4429,0.05
et,Extra Trees Regressor,1025.898,7179612.1345,1938.7078,0.0364,1.2652,9.2157,0.306
ada,AdaBoost Regressor,1146.0236,7410383.5955,1996.6622,-0.0234,1.4127,12.3623,0.111
gbr,Gradient Boosting Regressor,1061.4345,7262701.0399,2008.8494,-0.1324,1.2899,10.7475,0.218
rf,Random Forest Regressor,1023.7891,7421809.188,2019.207,-0.1167,1.2583,8.9962,0.545
en,Elastic Net,1075.951,7513611.348,2021.618,-0.1133,1.2546,8.1842,0.048
knn,K Neighbors Regressor,1066.7349,7935973.1219,2032.4467,-0.0342,1.2743,8.3559,0.063
dummy,Dummy Regressor,1152.8416,8406790.4954,2083.9214,-0.0543,1.39,8.9056,0.054
xgboost,Extreme Gradient Boosting,1121.5257,7504438.9019,2088.6782,-0.2463,1.3516,8.1683,0.196


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

In [10]:
# Train a Huber Regressor and evaluate it with 10-fold cross-validation.
model = create_model('huber', fold=10)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,979.6144,1934836.9522,1390.9842,0.2405,1.467,12.383
1,769.6103,1490761.9932,1220.9676,0.1406,0.9586,1.3532
2,666.3904,750918.4724,866.5555,0.1506,0.9829,1.9497
3,568.1788,698817.4342,835.953,0.6905,1.4559,19.6741
4,656.3071,777322.8288,881.6591,0.4467,0.8238,1.0434
5,662.8332,911392.4714,954.6688,-0.1332,1.0638,1.9393
6,1263.4734,6765123.416,2600.9851,-0.1295,1.112,1.5561
7,2139.1326,48837004.0853,6988.3477,0.2231,1.8737,46.0809
8,878.136,2552281.1481,1597.586,-0.6959,0.9399,1.2477
9,856.6274,1655639.8781,1286.7167,0.2312,1.1833,2.5332


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

## Fine Tuning the Model

## Making Predictions and Saving the Model

In [11]:
# Score the trained model; no `data` argument is passed here, so PyCaret presumably
# evaluates on the hold-out split created by setup() (verify against the PyCaret docs).
# The returned frame carries the predictions in a `prediction_label` column.
predict_model(model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,1043.2274,4799429.0109,2190.7599,0.2357,1.1636,3.525


Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Sale,prediction_label
267,1,1,20893,1222,2576,3975,737,3628,926.534727
361,1,3,4734,607,864,1206,159,405,468.880704
217,1,1,18044,1475,2046,2532,130,1158,865.566810
103,1,3,56082,3504,8906,18028,1480,2498,2682.717794
146,1,3,7769,1936,2177,926,73,520,666.621360
...,...,...,...,...,...,...,...,...,...
113,1,3,14438,2295,1733,3220,585,1561,878.286636
28,2,3,4113,20484,25957,1158,8604,5206,2431.301050
93,1,3,11314,3090,2062,35009,71,2698,3081.294822
418,2,3,660,8494,18622,133,6740,776,1181.746464


In [12]:
!pip install joblib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
import joblib

# Persist the fitted preprocessing pipeline together with the estimator. setup() was
# called with normalize=True, so the bare estimator on its own expects scaled inputs
# and cannot score raw rows after being reloaded. save_model appends '.pkl' to the name.
save_model(model, 'wholesale_huber_pipeline')

# save the model to disk (estimator only; file name kept unchanged for existing loaders)
joblib.dump(model, 'wolesale.sav')

['wolesale.sav']