In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [28]:
# Prerequisite: PyCaret must be installed in the kernel's environment (e.g. run `%pip install pycaret` once).

In [29]:
# Wildcard import of PyCaret's regression API; the bare `setup` and
# `compare_models` calls further down get their names from here.
# NOTE(review): a wildcard import hides which names are actually in use —
# consider listing them explicitly once the notebook settles.
from pycaret.regression import *

In [30]:
# Read the modelling dataset (presumably the file written by the EDA notebook).
df = pd.read_csv(filepath_or_buffer='EDA_data.csv')

In [31]:
# Inspect the target column's dtype (the integer cast below stays disabled):
# df['total_salary'] = df['total_salary'].astype(int)
df.dtypes['total_salary']

dtype('float64')

In [32]:
# Frequency of each distinct target value, most common first.
# NOTE(review): the rendered output includes a salary of 90.0 — looks like an
# outlier or a unit mismatch; verify before modelling.
df.loc[:, 'total_salary'].value_counts()

120000.000000    54
90000.000000     53
60000.000000     52
70000.000000     51
80000.000000     50
                 ..
78550.807043      1
102360.000000     1
62800.000000      1
90.000000         1
119421.000000     1
Name: total_salary, Length: 1188, dtype: int64

In [33]:
# Irrelevant features were already removed during EDA, so every remaining
# column is kept; one-hot encode the categorical ones into indicator columns.
df_dummy = pd.get_dummies(data=df)

In [34]:
# Number of columns after one-hot encoding (target included).
df_dummy.shape[1]

385

In [35]:
# Separate the target from the features.
y = df_dummy['total_salary']
X = df_dummy.drop(columns='total_salary')

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
# NOTE(review): the PyCaret `setup` calls in this notebook are passed
# `df_dummy` directly and report their own train/test shapes, so this manual
# split does not appear to feed them — confirm whether it is still needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Initialise the PyCaret regression experiment (functional API) on the encoded
# frame; `session_id` fixes the random seed so runs are repeatable.
s = setup(
    data=df_dummy,
    target='total_salary',
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,total_salary
2,Target type,Regression
3,Original data shape,"(3861, 385)"
4,Transformed data shape,"(3861, 385)"
5,Transformed train set shape,"(2702, 385)"
6,Transformed test set shape,"(1159, 385)"
7,Numeric features,384
8,Preprocess,True
9,Imputation type,simple


In [37]:
# Object-oriented counterpart of the functional API: create an experiment
# object whose methods (`setup`, `compare_models`) are called in the cells below.
from pycaret.regression import RegressionExperiment
exp = RegressionExperiment()

In [38]:
# Show which class the experiment object is an instance of.
exp.__class__

pycaret.regression.oop.RegressionExperiment

In [39]:
# Same configuration as the functional `setup` call, applied to the experiment
# object; the returned experiment is the cell's displayed value.
exp.setup(
    data=df_dummy,
    target='total_salary',
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,total_salary
2,Target type,Regression
3,Original data shape,"(3861, 385)"
4,Transformed data shape,"(3861, 385)"
5,Transformed train set shape,"(2702, 385)"
6,Transformed test set shape,"(1159, 385)"
7,Numeric features,384
8,Preprocess,True
9,Imputation type,simple


<pycaret.regression.oop.RegressionExperiment at 0x1a44d06dc30>

In [40]:
# Cross-validate the available regressors and keep the top one. The ranking
# metric is spelled out here; R2 is the documented default for regression.
best = compare_models(sort='R2')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,42460.3638,7483131364.0172,81808.9534,0.3615,0.5739,3.1525,0.097
llar,Lasso Least Angle Regression,43732.7101,7634676441.2623,82867.0828,0.3417,0.6096,3.1581,0.102
huber,Huber Regressor,36301.4209,7851015910.3136,83668.3782,0.3349,0.5147,2.6569,0.084
omp,Orthogonal Matching Pursuit,44007.3236,7743914614.8553,83429.8044,0.3331,0.5845,3.0864,0.095
ridge,Ridge Regression,44704.9273,7706475855.4378,83420.2512,0.3312,0.6697,3.3653,0.089
lasso,Lasso Regression,46146.5705,7940339242.6377,85111.6958,0.2979,0.6834,3.3937,0.091
par,Passive Aggressive Regressor,37591.6244,8299966881.6554,86497.0602,0.2849,0.5074,2.6665,0.094
rf,Random Forest Regressor,41432.835,8270140824.5445,86615.6095,0.269,0.5305,3.6905,0.133
gbr,Gradient Boosting Regressor,42775.4158,8393056898.7026,87094.3729,0.2656,0.5464,3.4613,0.102
en,Elastic Net,49855.5541,9306469450.1683,92740.8135,0.164,0.6164,3.4736,0.091


Processing:   0%|          | 0/77 [00:00<?, ?it/s]

In [41]:
# Same model comparison through the experiment object, ranked by R2
# (the documented default, written explicitly).
exp.compare_models(sort='R2')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,42460.3638,7483131364.0172,81808.9534,0.3615,0.5739,3.1525,0.084
llar,Lasso Least Angle Regression,43732.7101,7634676441.2623,82867.0828,0.3417,0.6096,3.1581,0.087
huber,Huber Regressor,36301.4209,7851015910.3136,83668.3782,0.3349,0.5147,2.6569,0.085
omp,Orthogonal Matching Pursuit,44007.3236,7743914614.8553,83429.8044,0.3331,0.5845,3.0864,0.086
ridge,Ridge Regression,44704.9273,7706475855.4378,83420.2512,0.3312,0.6697,3.3653,0.093
lasso,Lasso Regression,46146.5705,7940339242.6377,85111.6958,0.2979,0.6834,3.3937,0.086
par,Passive Aggressive Regressor,37591.6244,8299966881.6554,86497.0602,0.2849,0.5074,2.6665,0.084
rf,Random Forest Regressor,41432.835,8270140824.5445,86615.6095,0.269,0.5305,3.6905,0.112
gbr,Gradient Boosting Regressor,42775.4158,8393056898.7026,87094.3729,0.2656,0.5464,3.4613,0.096
en,Elastic Net,49855.5541,9306469450.1683,92740.8135,0.164,0.6164,3.4736,0.097


Processing:   0%|          | 0/77 [00:00<?, ?it/s]