In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression  # (For machine learning approach we use sklearn package)
import statsmodels.api as sm # (For econometrics approach we use statsmodels package)

sns.set_theme()  # apply the seaborn default theme to matplotlib plots (sns.set() is only an alias of set_theme())

In [3]:
import warnings
warnings.filterwarnings("ignore")  # Ignore warnings for cleaner output

In [4]:
# Seed shared across the notebook; it is passed as session_id to the PyCaret setup call below.
rand_state: int = 100 # For reproducibility

In [5]:
# Read the dataset from data.csv (path relative to the working directory)
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data.csv')
df.head(n=5)

Unnamed: 0,cut,color,clarity,carat,depth,table,price,x (Premium),z (Very Good),y (Good)
0,Fair,E,VS2,0.22,65.1,61.0,337,3.87,2.49,3.78
1,Fair,E,SI2,0.86,55.1,69.0,2757,6.45,3.52,6.33
2,Fair,F,SI2,0.96,66.3,62.0,2759,6.27,4.07,5.95
3,Fair,F,VS2,0.7,64.5,57.0,2762,5.57,3.58,5.53
4,Fair,F,VS2,0.7,65.3,55.0,2762,5.63,3.66,5.58


### PyCaret

In [6]:
# Double-check which PyCaret version is installed:
from pycaret.utils import version
version()

'3.3.2'

In [7]:
# Display the frame itself; the rendered output is abbreviated to the leading and trailing rows.
df

Unnamed: 0,cut,color,clarity,carat,depth,table,price,x (Premium),z (Very Good),y (Good)
0,Fair,E,VS2,0.22,65.1,61.0,337,3.87,2.49,3.78
1,Fair,E,SI2,0.86,55.1,69.0,2757,6.45,3.52,6.33
2,Fair,F,SI2,0.96,66.3,62.0,2759,6.27,4.07,5.95
3,Fair,F,VS2,0.70,64.5,57.0,2762,5.57,3.58,5.53
4,Fair,F,VS2,0.70,65.3,55.0,2762,5.63,3.66,5.58
...,...,...,...,...,...,...,...,...,...,...
53935,Very Good,E,VS2,0.70,62.8,60.0,2755,5.59,3.53,5.65
53936,Very Good,D,VS1,0.70,63.1,59.0,2755,5.67,3.55,5.58
53937,Very Good,E,VS2,0.70,60.5,59.0,2757,5.71,3.47,5.76
53938,Very Good,E,VS2,0.70,61.2,59.0,2757,5.69,3.49,5.72


In [8]:
from pycaret.regression import *

In [9]:
# Initialise the PyCaret regression experiment: df is the input frame and
# 'price' is the column to predict.
setup(
    data=df,
    target='price',
    # Hold out 20% of the rows as the test set.
    train_size=0.8,
    # 5-fold K-fold cross-validation.
    fold_strategy='kfold',
    fold=5,
    # Z-score normalisation of the features.
    normalize=True,
    normalize_method='zscore',
    # Seed defined earlier in the notebook, for reproducibility.
    session_id=rand_state,
)

Unnamed: 0,Description,Value
0,Session id,100
1,Target,price
2,Target type,Regression
3,Original data shape,"(53940, 10)"
4,Transformed data shape,"(53940, 27)"
5,Transformed train set shape,"(43152, 27)"
6,Transformed test set shape,"(10788, 27)"
7,Numeric features,6
8,Categorical features,3
9,Preprocess,True


<pycaret.regression.oop.RegressionExperiment at 0x30a148ca0>

In [10]:
# List the estimators PyCaret offers for regression: ID, name, reference class and Turbo flag.
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,sklearn.linear_model._base.LinearRegression,True
lasso,Lasso Regression,sklearn.linear_model._coordinate_descent.Lasso,True
ridge,Ridge Regression,sklearn.linear_model._ridge.Ridge,True
en,Elastic Net,sklearn.linear_model._coordinate_descent.Elast...,True
lar,Least Angle Regression,sklearn.linear_model._least_angle.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model._least_angle.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model._omp.OrthogonalMatchingPu...,True
br,Bayesian Ridge,sklearn.linear_model._bayes.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model._bayes.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model._passive_aggressive.Passi...,True


In [11]:
# Build the linear regression model (ID 'lr'); the output grid reports the
# metrics for each fold together with their mean and standard deviation.
linear_model = create_model(estimator='lr')
linear_model

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,746.6244,1308943.6974,1144.0908,0.9183,0.5908,0.3848
1,751.7598,1362764.7336,1167.3751,0.9169,0.6024,0.3965
2,751.5173,1286258.6669,1134.1334,0.9198,0.5978,0.3963
3,728.5556,1238210.9602,1112.7493,0.9202,0.5613,0.3934
4,744.5735,1266689.9162,1125.4732,0.9206,0.5873,0.3958
Mean,744.6061,1292573.5948,1136.7644,0.9192,0.5879,0.3934
Std,8.4927,42094.9294,18.449,0.0014,0.0143,0.0044


In [None]:
# Fit an XGBoost regressor to compare against the linear model.
xgboost_model = create_model('xgboost')

# Compare the available estimators and keep the returned best model
# instead of discarding it.
best_model = compare_models()

# Diagnostic plots for the linear model, each drawn exactly once.
# 'auc' is deliberately absent: it is a classification plot and is not
# among the plots accepted by pycaret.regression.plot_model.
for plot_kind in (
    'residuals',  # residuals plot
    'error',      # prediction error plot
    'feature',    # feature importance
    'learning',   # learning curve
):
    plot_model(linear_model, plot=plot_kind)
# Plot the learning curve