In [11]:
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import axes3d
from matplotlib import cm
%matplotlib inline
%config Inlinebackend.figure_format = 'retina'

import seaborn as sns
sns.set_context('poster')
sns.set(rc={'figure.figsize': (16., 9.)})
sns.set_style('whitegrid')

# Modeling libraries
import statsmodels.formula.api as smf # welcome!!
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import set_config
# Ask scikit-learn to use its 'diagram' display mode for estimators in this session.
set_config(display='diagram')

import pandas as pd
# Seed NumPy's global random generator so NumPy-driven randomness in this
# kernel repeats from run to run.
np.random.seed(seed=123)

import warnings
# Install a catch-all 'ignore' filter for every warning category.
# NOTE(review): this also hides deprecation and data-quality warnings from
# pandas / scikit-learn -- consider narrowing it.
warnings.filterwarnings(action='ignore')


In [12]:
# Load the training table; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="../data/final_train.csv")

In [13]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
# Rebinding with errors='ignore' (rather than an in-place drop) makes the cell
# safe to re-run: a second execution is a no-op instead of a KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [14]:
# Preview the first rows to check the load and the remaining columns.
data.head()

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_name,color_name,clarity_name
0,0,0.53,63.4,54.0,5.09,5.13,3.24,7.057,2,7,7
1,1,0.41,63.0,56.0,4.8,4.75,3.01,6.824,4,3,7
2,2,0.32,61.6,56.0,4.37,4.39,2.7,6.107,4,2,6
3,3,0.31,61.2,56.0,4.34,4.37,2.66,6.39,4,4,3
4,4,1.35,60.5,56.0,7.19,7.12,4.33,8.741,3,1,6


In [15]:
# Features are every column except the target; the target is 'price'.
# NOTE(review): 'id' stays in X as a feature. It looks like a row identifier --
# confirm whether the model should really see it (the prediction cells below
# pass frames that also contain 'id', so both sides must change together).
X = data.drop(columns='price')
y = data['price']

# Hold out 20% of the rows for evaluation; fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=123
)

In [16]:
# Base estimator for the search. random_state is set on the estimator itself
# because the fits run in separate worker processes (n_jobs=3), where the
# notebook-level np.random.seed is not a reliable source of determinism;
# feature sub-sampling (max_features < 1.0) is random.
model = GradientBoostingRegressor(n_estimators=3000, random_state=123)

# Hyper-parameter grid: 4 * 2 * 4 * 3 = 96 candidates.
# max_features: a float is a fraction of the features, an int is an absolute
# count. The integer 1 therefore meant "one feature per split", which for the
# 10 feature columns in X is the same setting as 0.1 -- a duplicated third of
# the grid. 1.0 expresses "all features".
params = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
          'max_depth': [4, 6],
          'min_samples_leaf': [3, 5, 9, 17],
          'max_features': [1.0, 0.3, 0.1]}

# 2-fold CV keeps the run affordable with 3000 trees per fit; no `scoring` is
# given, so candidates are ranked by the estimator's default score.
# NOTE(review): this cell is slow (the recorded run took ~39 minutes) --
# consider caching the fitted search.
grid_search = GridSearchCV(model,
                           param_grid=params,
                           cv=2,
                           n_jobs=3,
                           verbose=1)
grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 96 candidates, totalling 192 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  8.9min
[Parallel(n_jobs=3)]: Done 192 out of 192 | elapsed: 39.1min finished


In [17]:
# Show the hyper-parameter combination the search selected.
grid_search.best_params_

{'learning_rate': 0.02,
 'max_depth': 6,
 'max_features': 0.3,
 'min_samples_leaf': 5}

In [18]:
# Ten log-spaced values from 0.3 down to 0.003 (1e-1 .. 1e-3, scaled by 3).
# NOTE(review): the result is not used by any visible cell -- looks like scratch work.
3 * np.logspace(start=-1, stop=-3, num=10)

array([0.3       , 0.17984528, 0.10781441, 0.06463304, 0.03874649,
       0.02322791, 0.01392477, 0.00834768, 0.0050043 , 0.003     ])

In [24]:
# Score the fitted search on the training split. No `scoring` was passed to
# GridSearchCV above, so this is the estimator's default score.
# NOTE(review): this is an in-sample figure; compare with X_test / y_test
# before drawing conclusions.
grid_search.score(X_train, y_train)

0.9957428235495399

In [25]:
# `best_rf` is used by every cell below but was not assigned in any cell of
# this notebook, so a fresh "Restart & Run All" stopped here with a NameError.
# Bind it to the estimator GridSearchCV refit with the best parameters.
# NOTE(review): assumes the missing definition was the search's best estimator
# -- confirm. (Despite the name, the model is gradient boosting, not a random
# forest; the name is kept because later cells reference it.)
best_rf = grid_search.best_estimator_
best_rf.predict(X_train)

array([6.66546422, 7.62700786, 8.67867223, ..., 7.85897969, 6.76601759,
       6.63473151])

In [26]:
# Mean squared error on the training split (in-sample).
mean_squared_error(y_train, best_rf.predict(X_train))

0.004376389783862621

In [27]:
# Mean squared error on the 20% hold-out split.
mean_squared_error(y_test, best_rf.predict(X_test))

0.007877514028777857

In [56]:
# Load the rows to predict; the path is relative to the notebook's working directory.
test_final = pd.read_csv(filepath_or_buffer="../data/test_1.csv")

In [60]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
# Rebinding with errors='ignore' (rather than an in-place drop) makes the cell
# safe to re-run: a second execution is a no-op instead of a KeyError.
test_final = test_final.drop(columns='Unnamed: 0', errors='ignore')

In [61]:
# Feature matrix handed to the model. It must be a separate object from
# test_final: a later cell adds a 'price' column to test_final, and if z were
# the same frame that column would reach predict() as an unexpected feature.
# Dropping 'price' (when present) also keeps this cell correct if it is re-run
# after the predictions have been attached.
z = test_final.drop(columns=['price'], errors='ignore')

In [62]:
# Preview the first rows of the frame to be predicted.
test_final.head()

Unnamed: 0,id,carat,depth,table,x,y,z,cut_name,color_name,clarity_name
0,0,0.33,61.9,55.0,4.44,4.42,2.74,4,4,1
1,1,0.41,61.8,54.0,4.79,4.76,2.95,4,6,6
2,2,0.91,62.5,59.0,6.16,6.23,3.87,2,6,5
3,3,0.42,62.6,57.0,4.76,4.8,2.99,2,7,6
4,4,0.54,61.5,56.0,5.28,5.25,3.24,4,7,1


In [63]:
# Look at the raw predictions before storing them.
best_rf.predict(z)

array([6.80675805, 6.90654101, 8.20330905, ..., 6.73789439, 7.26236723,
       8.14587585])

In [64]:
# Store the model's predictions in a new 'price' column of test_final.
test_final['price'] = best_rf.predict(z)

In [65]:
# Keep only the identifier and the predicted value for the output file.
best_rf_predictions = test_final.loc[:, ['id', 'price']]

In [66]:
# Preview the two-column frame that will be written out.
best_rf_predictions.head()

Unnamed: 0,id,price
0,0,6.806758
1,1,6.906541
2,2,8.203309
3,3,6.707888
4,4,7.725162


In [67]:
# Index by 'id' so the CSV written below starts with the id column.
# Rebinding (instead of inplace=True) avoids mutating a frame selected from
# test_final, and the guard makes the cell re-runnable: once 'id' is the index
# a second set_index('id') would raise KeyError.
if best_rf_predictions.index.name != 'id':
    best_rf_predictions = best_rf_predictions.set_index('id')

In [69]:
# Write the predictions to disk (the index is written as the first column).
best_rf_predictions.to_csv(path_or_buf="../data/boost_predictions.csv")