In [10]:
import pandas as pd
import seaborn as sns
import numpy as np

from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import axes3d
from matplotlib import cm
%matplotlib inline
%config Inlinebackend.figure_format = 'retina'

import seaborn as sns
sns.set_context('poster')
sns.set(rc={'figure.figsize': (16., 9.)})
sns.set_style('whitegrid')

# Modeling libraries
import statsmodels.formula.api as smf # welcome!!
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import set_config
set_config(display='diagram')

import pandas as pd
np.random.seed(123)

import warnings
warnings.filterwarnings('ignore')

In [11]:
# Load the training data from the project's data directory.
# NOTE(review): the preview of this frame shows 'Unnamed: 0.1' and 'Unnamed: 0'
# columns, which look like row indices saved by earlier to_csv calls -- confirm
# whether they (and 'id') should really be carried along as columns.
data = pd.read_csv("../data/final_train.csv")

In [12]:
# Preview the first rows of the training frame.
data.head()

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_name,color_name,clarity_name
0,0,0,0.53,63.4,54.0,5.09,5.13,3.24,7.057,2,7,7
1,1,1,0.41,63.0,56.0,4.8,4.75,3.01,6.824,4,3,7
2,2,2,0.32,61.6,56.0,4.37,4.39,2.7,6.107,4,2,6
3,3,3,0.31,61.2,56.0,4.34,4.37,2.66,6.39,4,4,3
4,4,4,1.35,60.5,56.0,7.19,7.12,4.33,8.741,3,1,6


In [13]:
# Target is the 'price' column; predictors are every other column.
# NOTE(review): X therefore keeps 'id' and the 'Unnamed: ...' columns visible
# in the data preview -- confirm they are meant to be model features.
y = data["price"]
X = data.drop(columns="price")

# Hold out 20% of the rows as a test split (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=123
)

In [16]:
# Exhaustive grid search over random-forest hyper-parameters:
# 3 * 3 * 4 * 3 = 108 candidates, each scored with 5-fold cross-validation.
# This is slow (the recorded run below took about 141 minutes on 4 workers).
#
# random_state is fixed on the estimator so the forests are built the same way
# on every run; the notebook-level np.random.seed call may not reach the
# separate worker processes started by n_jobs=-1.
model = RandomForestRegressor(random_state=123)
parameter_space = {'n_estimators': [100, 300, 1000],
                   'max_features': ['sqrt', 0.5, None],
                   'max_depth': [None, 10, 30, 100],
                   'min_samples_leaf': [1, 3, 10]}

# verbose=1 prints progress lines; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(model,
                           param_grid=parameter_space,
                           verbose=1,
                           n_jobs=-1,
                           cv=5)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 43.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 106.5min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 141.1min finished


In [17]:
# Mean cross-validated score of the best candidate. No `scoring` argument was
# passed to GridSearchCV, so this is the estimator's default score.
grid_search.best_score_

0.9909601073037122

In [19]:
# Keep a handle on the best estimator found by the grid search, then show its
# score on the training split (in-sample, not held-out data).
best_rf = grid_search.best_estimator_
best_rf.score(X_train, y_train)

0.9988262627591733

In [20]:
# Hyper-parameter combination selected by the grid search.
grid_search.best_params_

{'max_depth': None,
 'max_features': None,
 'min_samples_leaf': 1,
 'n_estimators': 1000}

In [34]:
# In-sample predictions for the training split (displayed only, not stored).
best_rf.predict(X_train)

array([6.653146, 7.648314, 8.702438, ..., 7.804022, 6.715008, 6.597863])

In [35]:
# Mean squared error of the tuned forest on the training split.
mean_squared_error(y_train, best_rf.predict(X_train))

0.0012066053003600198

In [36]:
# Mean squared error of the tuned forest on the held-out test split.
mean_squared_error(y_test, best_rf.predict(X_test))

0.00931377035562152

In [58]:
# Load the frame to generate predictions for (its preview has no 'price' column).
test_final = pd.read_csv("../data/test_1.csv")

In [59]:
# Preview the first rows of the prediction frame.
test_final.head()

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,x,y,z,cut_name,color_name,clarity_name
0,0,0,0.33,61.9,55.0,4.44,4.42,2.74,4,4,1
1,1,1,0.41,61.8,54.0,4.79,4.76,2.95,4,6,6
2,2,2,0.91,62.5,59.0,6.16,6.23,3.87,2,6,5
3,3,3,0.42,62.6,57.0,4.76,4.8,2.99,2,7,6
4,4,4,0.54,61.5,56.0,5.28,5.25,3.24,4,7,1


In [60]:
# Feature matrix for prediction, as an independent frame rather than an alias
# of test_final. A later cell adds a 'price' column to test_final; with a plain
# alias that column would also appear in z, and re-running predict(z) would
# then see one feature more than the model was trained on.
# errors="ignore" keeps this cell re-runnable whether or not 'price' has
# already been added.
z = test_final.drop(columns="price", errors="ignore")

In [61]:
# Predict for every row of z and display the resulting array.
best_rf.predict(z)

array([6.897932, 6.914326, 8.203157, ..., 6.714912, 7.213445, 8.168071])

In [62]:
# Store the model's predictions as a new 'price' column on test_final
# (this modifies test_final in place).
test_final['price'] = best_rf.predict(z)

In [63]:
# Show the first ten rows, now including the predicted 'price' column.
test_final.head(10)

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,x,y,z,cut_name,color_name,clarity_name,price
0,0,0,0.33,61.9,55.0,4.44,4.42,2.74,4,4,1,6.897932
1,1,1,0.41,61.8,54.0,4.79,4.76,2.95,4,6,6,6.914326
2,2,2,0.91,62.5,59.0,6.16,6.23,3.87,2,6,5,8.203157
3,3,3,0.42,62.6,57.0,4.76,4.8,2.99,2,7,6,6.754313
4,4,4,0.54,61.5,56.0,5.28,5.25,3.24,4,7,1,7.711399
5,5,5,1.01,60.4,58.0,6.5,6.54,3.94,4,4,7,8.537068
6,6,6,0.3,62.4,57.0,4.24,4.29,2.66,2,6,6,6.45579
7,7,7,1.01,63.5,57.0,6.29,6.33,4.01,1,5,7,8.488982
8,8,8,0.74,63.3,59.0,5.82,5.78,3.67,2,6,5,7.736937
9,9,9,0.53,62.0,54.0,5.22,5.27,3.25,4,4,1,7.56177


In [64]:
# Keep only the row identifier and the predicted price.
best_rf_predictions = test_final[['id', 'price']]

In [65]:
# Preview the two-column predictions frame.
best_rf_predictions.head()

Unnamed: 0,id,price
0,0,6.897932
1,1,6.914326
2,2,8.203157
3,3,6.754313
4,4,7.711399


In [66]:
# Index the predictions by 'id'. The frame is rebuilt from test_final instead
# of calling set_index(..., inplace=True): the in-place call removed the 'id'
# column, so running the cell a second time raised a KeyError, and it mutated
# a column selection taken from another frame. This form gives the same
# result and can be re-run safely.
best_rf_predictions = test_final[['id', 'price']].set_index('id')

In [67]:
# Write the predictions to disk. `index` is not overridden, so the frame's
# index is written alongside the 'price' column.
best_rf_predictions.to_csv("../data/rf_predictions.csv")