In [52]:
import pandas as pd
from exploratory import Explore
from exploratory import ExploreTrain

# Location of the raw eBay auction data, relative to the notebook.
EBAY_CSV_PATH = 'data/ebay.csv'

# Wrap the CSV in the project's Explore helper, then load it. read_csv hands
# back two values: the data frame and the feature collection used by later cells.
# NOTE(review): the exact effect of index=True is defined in exploratory.Explore — confirm there.
explore_ebay = Explore(EBAY_CSV_PATH)
df_explore, features_explore = explore_ebay.read_csv(index=True)


In [53]:
# Report the frame's dimensions, drop null values via the Explore helper, then
# report the dimensions again so any change in row count is visible.
shape_before = df_explore.shape
print(shape_before)

df_explore = explore_ebay.drop_na(df_explore)

shape_after = df_explore.shape
print(shape_after)


(1000, 8)
(1000, 8)


In [54]:

# List the column names to confirm which fields the loaded frame contains.
df_explore.columns

Index(['item_category', 'item_condition', 'seller_rating', 'auction_duration',
       'starting_price', 'number_of_bidders', 'buy_price', 'ship_price'],
      dtype='object')

In [55]:
# ship_price is the target. describe() summarises every numeric column, so the
# target's statistics are shown alongside those of the numeric features.

df_explore.describe()



Unnamed: 0,seller_rating,auction_duration,starting_price,number_of_bidders,buy_price,ship_price
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,3.00524,72.48,512.03147,49.254,1790.38496,35.807699
std,1.176053,33.630531,295.705893,28.407437,1342.817145,26.856343
min,1.0,24.0,11.32,0.0,18.36,0.3672
25%,2.0,48.0,246.8425,25.0,697.7,13.954
50%,3.0,72.0,500.695,50.0,1433.56,28.6712
75%,4.0525,96.0,790.7975,73.0,2671.375,53.4275
max,4.99,120.0,998.67,100.0,5806.07,116.1214


In [56]:
# Let's look at the distribution of each feature listed in features_explore.
ExploreTrain.check_distribution(df_explore, features_explore)

In [57]:
# Chart the features with 'ship_price' passed as the target column.
# NOTE(review): what is actually drawn is decided inside exploratory.Explore.display_chart — confirm there.
explore_ebay.display_chart(df_explore, features_explore, 'ship_price')

In [58]:
# Run the project's PyCaret exploration helper with 'ship_price' as a regression target.
# The output recorded below consists of a setup summary, a model comparison table,
# per-fold cross-validation metrics, a tuning log and an interactive plot widget.
ExploreTrain.pycaret_explore(df_explore, 'ship_price', 'regression')

Unnamed: 0,Description,Value
0,Session id,123
1,Target,ship_price
2,Target type,Regression
3,Original data shape,"(1000, 8)"
4,Transformed data shape,"(1000, 14)"
5,Transformed train set shape,"(700, 14)"
6,Transformed test set shape,"(300, 14)"
7,Numeric features,5
8,Categorical features,2
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lr,Linear Regression,0.0,0.0,0.0,1.0,0.0,0.0,0.097
llar,Lasso Least Angle Regression,0.0006,0.0,0.0007,1.0,0.0001,0.0001,0.078
omp,Orthogonal Matching Pursuit,0.0,0.0,0.0,1.0,0.0,0.0,0.107
en,Elastic Net,0.0003,0.0,0.0004,1.0,0.0001,0.0,0.11
lar,Least Angle Regression,0.0,0.0,0.0,1.0,0.0,0.0,0.076
ridge,Ridge Regression,0.0,0.0,0.0,1.0,0.0,0.0,0.086
par,Passive Aggressive Regressor,0.026,0.0011,0.0334,1.0,0.0024,0.0017,0.096
br,Bayesian Ridge,0.0,0.0,0.0,1.0,0.0,0.0,0.082
lasso,Lasso Regression,0.0006,0.0,0.0007,1.0,0.0001,0.0001,0.096
huber,Huber Regressor,0.0,0.0,0.0,1.0,0.0,0.0,0.131


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0
6,0.0,0.0,0.0,1.0,0.0,0.0
7,0.0,0.0,0.0,1.0,0.0,0.0
8,0.0,0.0,0.0,1.0,0.0,0.0
9,0.0,0.0,0.0,1.0,0.0,0.0


Fitting 10 folds for each of 2 candidates, totalling 20 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [59]:
# Clearly something is fishy when every model scores R2 = 1.00: there must be leakage in the data.

# Let's sketch the assumed causal paths (a DAG) to see where the leakage could come from:
# seller_rating -> item_condition -> number_of_bidders -> buy_price -> ship_price
# seller_rating -> auction_duration -> number_of_bidders -> buy_price -> ship_price
# seller_rating -> starting_price -> number_of_bidders -> buy_price -> ship_price
# seller_rating -> starting_price -> auction_duration -> number_of_bidders -> buy_price -> ship_price
# item_condition -> starting_price -> auction_duration -> number_of_bidders -> buy_price -> ship_price
# item_category -> starting_price -> auction_duration -> number_of_bidders -> buy_price -> ship_price
# item_condition -> auction_duration -> number_of_bidders -> buy_price -> ship_price
# item_category -> auction_duration -> number_of_bidders -> buy_price -> ship_price
#
# The original note attributed the leakage to "a large number of colliders".
# NOTE(review): each path listed above is a directed chain that reaches ship_price through
# buy_price; the intermediate nodes on these paths are mediators, not colliders.
# NOTE(review): in the describe() output every ship_price statistic equals 0.02 x the matching
# buy_price statistic (e.g. mean 35.807699 vs 1790.38496, max 116.1214 vs 5806.07), which points
# to buy_price as the direct source of the leak — confirm before dropping the other features.
#
# Decision kept from the original analysis: treat seller_rating as the one feature that is not
# causing leakage and drop all the other features.
# NOTE(review): the frame printed by the following cell still contains item_category and
# item_condition although both are listed here — check drop_columns and re-run the notebook
# from a fresh kernel (execution counts jump from 59 to 62).

df_explore = explore_ebay.drop_columns(df_explore,
    columns = ['item_condition', 'item_category', 'number_of_bidders',
    'auction_duration', 'buy_price', 'starting_price'])



In [62]:
# Show the reduced frame that is about to be modelled, then rerun the PyCaret
# exploration on it with 'ship_price' as the regression target.
print(df_explore)
try:
    ExploreTrain.pycaret_explore(df_explore, 'ship_price', 'regression')
except Exception as err:
    # Catch ordinary errors only: a bare `except:` would also intercept
    # KeyboardInterrupt/SystemExit and turn a manual kernel interrupt into a
    # misleading "regression" failure. Chain the original error explicitly so
    # the traceback still shows the root cause.
    raise Exception('There is an issue with the regression or the evaluation of the regression model') from err

    item_category item_condition  seller_rating  ship_price
0             Art            New           2.66     27.0560
1     Electronics    Refurbished           4.69     61.7168
2     Electronics            New           1.35     95.8554
3         Fashion    Refurbished           2.71     62.6490
4        Antiques            New           4.14     36.9570
..            ...            ...            ...         ...
995      Antiques            New           4.91     54.9430
996       Fashion    Refurbished           3.92     49.4106
997   Electronics            New           2.07     34.0174
998           Art           Used           2.73     11.0420
999      Antiques            New           4.81     67.4158

[1000 rows x 4 columns]


Unnamed: 0,Description,Value
0,Session id,123
1,Target,ship_price
2,Target type,Regression
3,Original data shape,"(1000, 4)"
4,Transformed data shape,"(1000, 10)"
5,Transformed train set shape,"(700, 10)"
6,Transformed test set shape,"(300, 10)"
7,Numeric features,1
8,Categorical features,2
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dummy,Dummy Regressor,22.425,738.9958,27.0873,-0.0389,0.9995,2.5625,0.125
omp,Orthogonal Matching Pursuit,22.4288,740.2821,27.1093,-0.0405,0.9996,2.5638,0.094
en,Elastic Net,22.4212,740.1583,27.1085,-0.0405,0.9997,2.5648,0.067
llar,Lasso Least Angle Regression,22.4441,740.8019,27.1208,-0.0415,1.0,2.5644,0.107
lasso,Lasso Regression,22.4441,740.802,27.1208,-0.0415,1.0,2.5644,0.119
br,Bayesian Ridge,22.453,743.6382,27.1735,-0.0456,1.001,2.5678,0.078
lr,Linear Regression,22.3763,745.3411,27.2018,-0.0482,1.0025,2.6056,1.737
ridge,Ridge Regression,22.3884,745.4137,27.2045,-0.0486,1.0029,2.6061,0.076
lar,Least Angle Regression,22.3893,745.5298,27.2066,-0.0488,1.0029,2.6065,0.106
huber,Huber Regressor,21.7898,764.3286,27.4779,-0.067,0.9674,2.251,0.175


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Exception: There is an issue with the regression

In [63]:
# The previous cell ended with an exception, and its comparison table ranked the Dummy Regressor
# first (no model beat the baseline), so let's try one specific model to see if we can glean more insight.
# We choose the Huber model because it is robust to outliers (the data has outliers)
# and can deal with heteroscedasticity.
# Open question: is it a good idea to use this dataset for putting a regression model into a pipeline?
ExploreTrain.use_specific_model_pycaret(df_explore, 'ship_price', 'huber', 'regression')

Unnamed: 0,Description,Value
0,Session id,123
1,Target,ship_price
2,Target type,Regression
3,Original data shape,"(1000, 4)"
4,Transformed data shape,"(1000, 10)"
5,Transformed train set shape,"(700, 10)"
6,Transformed test set shape,"(300, 10)"
7,Numeric features,1
8,Categorical features,2
9,Preprocess,True


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,18.553,533.5129,23.0979,-0.0355,0.8921,1.5834
1,19.2024,630.941,25.1185,0.0491,0.8219,1.4989
2,20.555,637.5374,25.2495,-0.0418,0.9755,2.2444
3,24.458,945.9608,30.7565,-0.1471,1.0743,3.6022
4,24.3775,995.6754,31.5543,-0.0943,0.9749,1.7309
5,21.2002,632.7666,25.1549,0.0297,1.0648,2.9356
6,23.3783,846.7444,29.0989,0.021,1.1003,3.3312
7,19.7692,589.1137,24.2717,-0.0124,1.0132,2.8128
8,21.9053,850.367,29.1611,-0.2433,0.8278,1.0583
9,24.4989,980.667,31.3156,-0.195,0.9296,1.7119


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,19.7586,558.6942,23.6367,-0.0844,0.9352,1.7829
1,20.1142,639.4041,25.2864,0.0363,0.8571,1.6733
2,21.1442,636.8386,25.2357,-0.0407,1.0074,2.4953
3,24.0883,876.8267,29.6113,-0.0633,1.0837,3.9654
4,24.3092,943.3557,30.7141,-0.0368,0.9867,1.9006
5,22.0053,646.1457,25.4194,0.0092,1.1074,3.2898
6,23.9871,841.0356,29.0006,0.0276,1.1384,3.7362
7,21.0863,609.4983,24.688,-0.0474,1.0561,3.1967
8,21.5653,783.1026,27.984,-0.145,0.8228,1.1759
9,23.9416,907.9777,30.1327,-0.1064,0.934,1.8961


Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [None]:
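#clearly, with an R2 value of -0.002 (and the per-fold R2 scores above hovering around or below zero), putting this into production is a bad idea
#there is clearly very little relationship between seller_rating and ship_price, as also evidenced by the dummy regressor ranking first in the earlier comparison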