In [28]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import Lasso, LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.tree import DecisionTreeRegressor

In [29]:
# Load the training data and display it
diamonds = pd.read_csv(filepath_or_buffer='../dapt202011mad/diamonds_train.csv')
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.00
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95
...,...,...,...,...,...,...,...,...,...,...
40450,1.34,Ideal,G,VS1,62.7,57.0,10070,7.10,7.04,4.43
40451,2.02,Good,F,SI2,57.1,60.0,12615,8.31,8.25,4.73
40452,1.01,Ideal,H,SI1,62.7,56.0,5457,6.37,6.42,4.01
40453,0.33,Ideal,J,VS1,61.9,54.3,456,4.45,4.47,2.76


In [30]:
# Engineered features:
#   volume          -> product of the x, y and z columns
#   bright_relation -> ratio of the table column to the depth column
diamonds = diamonds.assign(
    volume=diamonds['x'] * diamonds['y'] * diamonds['z'],
    bright_relation=diamonds['table'] / diamonds['depth'],
)

In [31]:
# Summary statistics of the numeric columns, including the two engineered features
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z,volume,bright_relation
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,0.797706,61.752841,57.446133,3928.444469,5.729392,5.732819,3.537154,129.802259,0.93102
std,0.475544,1.431725,2.233535,3992.416147,1.124453,1.14665,0.697062,78.903997,0.048041
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0,0.0,0.683625
25%,0.4,61.0,56.0,945.0,4.71,4.72,2.91,64.889552,0.898876
50%,0.7,61.8,57.0,2397.0,5.69,5.71,3.52,114.689347,0.923825
75%,1.04,62.5,59.0,5331.0,6.54,6.54,4.035,170.844547,0.955519
max,4.5,79.0,95.0,18823.0,10.23,58.9,8.06,3840.59806,1.62116


In [32]:
# Delete outliers from 'volume' -- step 1: first and third quartiles
q1v, q3v = (diamonds['volume'].quantile(q) for q in (0.25, 0.75))
print(f'Q1: {q1v} | Q3: {q3v}')

Q1: 64.88955200000001 | Q3: 170.84454699999998


In [33]:
# Interquartile range (Q3 - Q1) of 'volume'
iqr_vol = q3v - q1v
print(f'iqr: {iqr_vol:.2f}')

iqr: 105.95


In [34]:
# Fences are placed n interquartile ranges outside the quartiles
n = 3
lower_fence_vol, upper_fence_vol = q1v - n * iqr_vol, q3v + n * iqr_vol
print(f'upper: {upper_fence_vol:.2f}', f'lower: {lower_fence_vol:.2f}')

upper: 488.71 lower: -252.98


In [35]:
# Boolean mask: True for rows whose 'volume' lies between the two fences
volume_outlier_filter = diamonds['volume'].between(
    lower_fence_vol,
    upper_fence_vol,
)

print(f'n_outliers: {len(diamonds) - volume_outlier_filter.sum()}')

n_outliers: 16


In [36]:
# Keep only the rows that passed the 'volume' fence check
diamonds = diamonds[volume_outlier_filter]
diamonds.shape

(40439, 12)

In [37]:
# Delete outliers from 'bright_relation' -- step 1: first and third quartiles
q1b, q3b = (diamonds['bright_relation'].quantile(q) for q in (0.25, 0.75))
print(f'Q1: {q1b} | Q3: {q3b}')

Q1: 0.8988764044943821 | Q3: 0.9555189456342669


In [38]:
# Interquartile range (Q3 - Q1) of 'bright_relation'
iqr_bri = q3b - q1b
print(f'iqr: {iqr_bri:.2f}')

iqr: 0.06


In [39]:
# Fences are placed n interquartile ranges outside the quartiles
n = 3
lower_fence_bri, upper_fence_bri = q1b - n * iqr_bri, q3b + n * iqr_bri
print(f'upper: {upper_fence_bri:.2f}', f'lower: {lower_fence_bri:.2f}')

upper: 1.13 lower: 0.73


In [40]:
# Boolean mask: True for rows whose 'bright_relation' lies between the two fences
bright_outlier_filter = diamonds['bright_relation'].between(
    lower_fence_bri,
    upper_fence_bri,
)

print(f'n_outliers: {len(diamonds) - bright_outlier_filter.sum()}')

n_outliers: 118


In [41]:
# Keep only the rows that passed the 'bright_relation' fence check
diamonds = diamonds[bright_outlier_filter]
diamonds.shape

(40321, 12)

In [42]:
# Load the rows whose price has to be predicted for the submission
diamonds_to_predict = pd.read_csv(filepath_or_buffer='../dapt202011mad/diamonds_test.csv')
diamonds_to_predict

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19
...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45


In [43]:
# Same engineered features as on the training frame:
#   volume          -> product of the x, y and z columns
#   bright_relation -> ratio of the table column to the depth column
diamonds_to_predict = diamonds_to_predict.assign(
    volume=diamonds_to_predict['x'] * diamonds_to_predict['y'] * diamonds_to_predict['z'],
    bright_relation=diamonds_to_predict['table'] / diamonds_to_predict['depth'],
)

In [44]:
# Column roles: numeric inputs, categorical inputs, and the regression target
TARGET = 'price'
NUM_FEATS = [
    'carat',
    'depth',
    'table',
    'volume',
    'bright_relation',
]
CAT_FEATS = [
    'cut',
    'color',
    'clarity',
]
# Numeric columns first, then categorical ones
FEATS = [*NUM_FEATS, *CAT_FEATS]

In [45]:
# Numeric columns: fill missing values with the column mean, then apply RobustScaler
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler()),
])

In [46]:
# Transformer for categorical columns: fill missing values with the literal
# 'missing', then map each category to an integer code.
# OrdinalEncoder only accepts handle_unknown='error' or 'use_encoded_value'
# ('ignore' is a OneHotEncoder option and is rejected by current scikit-learn),
# so categories not seen during fit are encoded explicitly as -1.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

In [47]:
# Route each column group to its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, NUM_FEATS),
    ('cat', categorical_transformer, CAT_FEATS),
])

In [48]:
# Preview the preprocessed feature matrix (this also fits the preprocessor on the full frame)
pd.DataFrame(preprocessor.fit_transform(diamonds[FEATS]))

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.796875,0.428571,0.333333,0.778156,0.104714,3.0,6.0,5.0
1,-0.593750,0.857143,0.000000,-0.587247,-0.332610,4.0,4.0,5.0
2,0.015625,2.642857,-0.666667,-0.011261,-1.483477,0.0,3.0,4.0
3,-0.453125,1.428571,-0.333333,-0.456340,-0.810504,1.0,0.0,2.0
4,0.500000,-0.928571,0.666667,0.507655,0.913370,2.0,3.0,2.0
...,...,...,...,...,...,...,...,...
40316,1.000000,0.642857,0.000000,1.007757,-0.256041,2.0,3.0,4.0
40317,2.062500,-3.357143,1.000000,1.978232,2.250205,1.0,2.0,3.0
40318,0.484375,0.642857,-0.333333,0.465765,-0.538136,2.0,4.0,2.0
40319,-0.578125,0.071429,-0.900000,-0.563611,-0.819730,2.0,6.0,4.0


In [49]:
# Split in Train and Test using train_test_split's default proportions.
# The fixed random_state keeps the partition -- and therefore the train/test
# errors reported below -- identical across Restart & Run All.
diamonds_train, diamonds_test = train_test_split(diamonds, random_state=42)
print(diamonds_train.shape)
print(diamonds_test.shape)

(30240, 12)
(10081, 12)


In [50]:
# Full model: preprocessing followed by a LightGBM regressor with default settings
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LGBMRegressor()),
])

In [51]:
# Separate features and target for both partitions, then train on the train partition
X_train, y_train = diamonds_train[FEATS], diamonds_train[TARGET]
X_test, y_test = diamonds_test[FEATS], diamonds_test[TARGET]

model.fit(X_train, y_train);

In [52]:
# Predict on both partitions so train and test errors can be compared
y_train_predict, y_test_predict = (
    model.predict(features) for features in (X_train, X_test)
)

In [53]:
# Root mean squared error on both partitions.
# The `squared=False` argument of mean_squared_error is deprecated and removed
# in recent scikit-learn releases; taking the square root explicitly gives the
# same value and works on every version.
rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_test_predict))
rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_train_predict))
print(f"test error: {rmse_test}")
print(f"train error: {rmse_train}")

test error: 551.9400145337859
train error: 475.51394641810873


In [54]:
# 10-fold cross-validation over the whole cleaned frame; `scores` holds the
# mean of the per-fold values (negated RMSE, as returned by the scorer)
cv_fold_scores = cross_val_score(
    estimator=model,
    X=diamonds[FEATS],
    y=diamonds[TARGET],
    scoring='neg_root_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
scores = cv_fold_scores.mean()

In [55]:
# Mean cross-validation score; negative because the scorer is 'neg_root_mean_squared_error'
scores

-537.3965108820164

In [56]:
# Optimize the model with a randomized hyper-parameter search.
# Keys use the pipeline's `step__param` naming to reach nested estimators.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
    'regressor__max_depth': [2, 4, 8, 16, 32, 64, 128],
}

# NOTE: the variable keeps the name `grid_search` because later cells use it,
# but this is a RandomizedSearchCV: n_iter candidates are sampled from the
# grid. The fixed random_state makes the sampled candidates (and hence the
# selected model and the submission) reproducible across runs.
grid_search = RandomizedSearchCV(model,
                                 param_grid,
                                 cv=10,
                                 verbose=10,
                                 scoring='neg_root_mean_squared_error',
                                 n_jobs=-1,
                                 n_iter=50,
                                 random_state=42)

grid_search.fit(diamonds[FEATS], diamonds[TARGET])

Fitting 10 folds for each of 50 candidates, totalling 500 fits


RandomizedSearchCV(cv=10,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer()),
                                                                                               ('scaler',
                                                                                                RobustScaler())]),
                                                                               ['carat',
                                                                                'depth',
                                                                                'table',
                                                                                'volume',
                           

In [57]:
# Hyper-parameter combination that obtained the best cross-validated score
grid_search.best_params_

{'regressor__n_estimators': 256,
 'regressor__max_depth': 16,
 'preprocessor__num__imputer__strategy': 'median'}

In [58]:
# Best cross-validated score of the search (negated RMSE, so closer to zero is better)
grid_search.best_score_

-528.0045381277824

In [59]:
# Prepare data submission: predict prices with the search's predict method
X_sub = diamonds_to_predict[FEATS]
y_sub = grid_search.predict(X_sub)

In [60]:
# Submission frame: the 'id' column alongside the predicted 'price'
submission_df = diamonds_to_predict[['id']].assign(price=y_sub)

In [61]:
# Sanity-check the distribution of the predicted prices before exporting
submission_df.describe()

Unnamed: 0,id,price
count,13485.0,13485.0
mean,6742.0,3953.75263
std,3892.928525,3960.961332
min,0.0,311.085216
25%,3371.0,923.719411
50%,6742.0,2454.373047
75%,10113.0,5323.685233
max,13484.0,18683.921125


In [62]:
# Write the submission file without the DataFrame index
submission_df.to_csv(path_or_buf='diamonds_prediction_17.csv', index=False)