In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
from sklearn.model_selection import train_test_split
import sklearn.linear_model as sl
lr = sl.LinearRegression()
import sklearn.tree as tree
dtr = tree.DecisionTreeRegressor()
import sklearn.ensemble as se
rfr = se.RandomForestRegressor(random_state = 42)
gbr = se.GradientBoostingRegressor()
import sklearn.metrics as mt
import math
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
dataset = pd.read_csv("diamonds.csv")
dataset.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,120.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,834,14.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
# duplicate = dataset[dataset.duplicated()]
# for i in duplicate.index:
#         print("index", i) 
#         dataset.drop(index = [i], inplace = True)
#         dataset.reset_index()
# dataset.drop(columns = 'SI.NO', axis = 1, inplace = True)
for col in dataset.select_dtypes(include="object").columns:
        dataset[col] = le.fit_transform(dataset[col])

In [4]:
dataset.to_csv("Daimond_clean.csv")

In [5]:
x = dataset.drop(['price'], axis = 1)
y = dataset['price']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state = 0)

### Before Hyperparameter Tuning

In [6]:
score = []
model = [lr, dtr, rfr, gbr]
    
lr.fit(x_train,y_train)
lrs = lr.score(x_test, y_test)
score.append(lrs)

dtr.fit(x_train,y_train)
dtrs = dtr.score(x_test, y_test)
score.append(dtrs)

rfr.fit(x_train,y_train)
rfrs = rfr.score(x_test, y_test)
score.append(rfrs)

gbr.fit(x_train,y_train)
gbrs = gbr.score(x_test, y_test)
score.append(gbrs)

max_score = max(score)
for i in range(len(score)):
    if score[i] == max_score:
        best_model = model[i]
        
y_predict = best_model.predict(x_test)
MSE = mt.mean_squared_error(y_test, y_predict)
RMSE = math.sqrt(MSE)
MAE = mt.mean_absolute_error(y_test,y_predict)

for i in range(len(model)):
    print(model[i], " : ", score[i])
print("Best_Model :", best_model)
print("MSE : ", MSE)
print("RMSE : ", RMSE)
print("MAE : ", MAE)

LinearRegression()  :  0.8841638998560365
DecisionTreeRegressor()  :  0.9654482648655688
RandomForestRegressor(random_state=42)  :  0.9808351071602934
GradientBoostingRegressor()  :  0.9719700614584699
Best_Model : RandomForestRegressor(random_state=42)
MSE :  305482.28391724883
RMSE :  552.7045177282785
MAE :  270.54244493211803


### After Hyperparameter Tuning (Random Search)

In [7]:
max_depth = [int(x) for x in np.linspace(10, 120, num = 12)] # maximum number of levels allowed in each decision tree
random_grid = {'n_estimators': [5,20,50,100],
               'max_features': ['auto', 'sqrt'],
               'max_depth': max_depth,
               'min_samples_split': [2, 6, 10],
               'min_samples_leaf': [1, 3, 4],
               'bootstrap': [True, False]}
rf_random = RandomizedSearchCV(estimator = rfr, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=35, n_jobs = -1)
rf_random.fit(x_train, y_train)
print("Best Params: ", rf_random.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  warn(


Best Params:  {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 40, 'bootstrap': True}


In [8]:
Best_Params = rf_random.best_params_
Best_Params

{'n_estimators': 100,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 40,
 'bootstrap': True}

In [9]:
randmf = RandomForestRegressor(**Best_Params)
randmf.fit(x_train, y_train)
randmf.score(x_test, y_test)

  warn(


0.9808966458164092

In [10]:
y_pred_rf1 = pd.DataFrame( { "actual": y_test, 
"predicted_prob": randmf.predict( 
( x_test ) ) } ) 
y_pred_rf1

Unnamed: 0,actual,predicted_prob
33127,816,805.193465
52245,2487,2386.047567
3536,3409,2893.494098
32793,804,909.796927
33686,838,955.291421
...,...,...
27835,651,632.390168
17728,614,658.913926
10796,4861,4458.068130
35318,896,824.187774


In [11]:
y_predict = randmf.predict(x_test)
MSE = mt.mean_squared_error(y_test, y_predict)
RMSE = math.sqrt(MSE)
MAE = mt.mean_absolute_error(y_test,y_predict)
print("MSE : ", MSE)
print("RMSE : ", RMSE)
print("MAE : ", MAE)

MSE :  304501.37735144177
RMSE :  551.8164344702337
MAE :  272.3802962335965


### After Hyperparameter Tuning (Grid Search)

In [12]:
estimator = RandomForestRegressor() 
param_grid = { 
            "n_estimators"      : [10,20,30],
            "max_features"      : ["auto", "sqrt", "log2"],
            "min_samples_split" : [2,4,8],
            "bootstrap": [True, False],
            }
grid = GridSearchCV(estimator, param_grid, n_jobs=-1, cv=5)
grid.fit(x_train, y_train)
G_Best_Params = grid.best_params_
print(G_Best_Params)

  warn(


{'bootstrap': True, 'max_features': 'auto', 'min_samples_split': 8, 'n_estimators': 30}


In [13]:
randmf = RandomForestRegressor(**G_Best_Params)
randmf.fit(x_train, y_train)
randmf.score(x_test, y_test)

  warn(


0.980759746925934

In [14]:
y_pred_rf1 = pd.DataFrame( { "actual": y_test, 
"predicted_prob": randmf.predict( 
( x_test ) ) } ) 
y_pred_rf1

Unnamed: 0,actual,predicted_prob
33127,816,804.299055
52245,2487,2386.282520
3536,3409,2874.067547
32793,804,917.298620
33686,838,961.201683
...,...,...
27835,651,640.752113
17728,614,672.318519
10796,4861,4363.052659
35318,896,803.854817
