## Import needed libraries and functions 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.preprocessing import StandardScaler
from random import randint
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV  , cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


## Overview Dataset

In [None]:
train_df = pd.read_csv('../Dataset/train.csv')
test_df = pd.read_csv('../Dataset/test.csv')

In [None]:
print(f"The training set shape is {train_df.shape}, {train_df.shape[0]} rows and {train_df.shape[1]} columns(features).")
print(f"The testing set shape is {test_df.shape}, {test_df.shape[0]} rows and {test_df.shape[1]} columns(features).")

The training set shape is (43152, 11), 43152 rows and 11 columns(features).
The testing set shape is (10788, 10), 10788 rows and 10 columns(features).


In [None]:
train_df.columns, test_df.columns

(Index(['Id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price',
        'x', 'y', 'z'],
       dtype='object'),
 Index(['Id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
        'z'],
       dtype='object'))

In [None]:
train_df.dtypes

Id           int64
carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object

In [None]:
train_df.isnull().sum()

Id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [None]:
(train_df==0).sum(axis=0) 

Id          0
carat       0
cut         0
color       0
clarity     0
depth       0
table       0
price       0
x           5
y           4
z          17
dtype: int64

In [None]:
(train_df==0).sum(axis=0).sum()

26

In [None]:
train_df[train_df.duplicated()]

Unnamed: 0,Id,carat,cut,color,clarity,depth,table,price,x,y,z


In [None]:
train_df.nunique()

Id         43152
carat        266
cut            5
color          7
clarity        8
depth        179
table        121
price      10640
x            546
y            543
z            368
dtype: int64

In [None]:
is_col_categorical = (train_df.dtypes =="object")
is_col_categorical

Id         False
carat      False
cut         True
color       True
clarity     True
depth      False
table      False
price      False
x          False
y          False
z          False
dtype: bool

In [None]:
categorical_cols = list(is_col_categorical[is_col_categorical].index)
categorical_cols

['cut', 'color', 'clarity']

## Preproccessing data and cleaning

In [None]:
#cut = {'Fair':5,'Good':4,'Very Good':3, 'Premium':2, 'Ideal':1}
cut = {'Fair':4,'Good':3,'Very Good':2, 'Premium':1, 'Ideal':0}

In [None]:
#color = {'J':7,'I':6,'H':5, 'G':4, 'F':3, 'E':2, 'D':1}
color = {'J':6,'I':5,'H':4, 'G':3, 'F':2, 'E':1, 'D':0}

In [None]:
#clarity = {'I1':8,'SI2':7,'SI1':6, 'VS2':5, 'VS1':4, 'VVS2':3, 'VVS1':2, 'IF':1}
clarity = {'I1':7,'SI2':6,'SI1':5, 'VS2':4, 'VS1':3, 'VVS2':2, 'VVS1':1, 'IF':0}

In [None]:
train_df.cut.replace(cut, inplace=True)
test_df.cut.replace(cut, inplace=True)

In [None]:
train_df.color.replace(color, inplace=True)
test_df.color.replace(color, inplace=True)

In [None]:
train_df.clarity.replace(clarity, inplace=True)
test_df.clarity.replace(clarity, inplace=True)

## Features Selection 

In [None]:
train_df = train_df.drop('x', axis=1)
train_df = train_df.drop('y', axis=1)
train_df = train_df.drop('z', axis=1)
train_df = train_df.drop('Id', axis=1)

In [None]:
test_df = test_df.drop('x', axis=1)
test_df = test_df.drop('y', axis=1)
test_df = test_df.drop('z', axis=1)
test_df = test_df.drop('Id', axis=1)

In [None]:
X = train_df.drop(columns = ['price'])
y = train_df.price
X_te = test_df.copy()

In [None]:
scaler = StandardScaler()
train_scaled = scaler.fit_transform(X)
train_scaled = pd.DataFrame(train_scaled,columns=X.columns)

test_scaled = scaler.fit_transform(X_te)
test_scaled = pd.DataFrame(test_scaled,columns=X_te.columns)

X = train_scaled
X_te =test_scaled

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
models =  [LinearRegression(), BayesianRidge(), RandomForestRegressor(),GradientBoostingRegressor()]
for model in models:
    print(f'{model} Model')
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    print(f'RMSE = {mean_squared_error(y_pred_train, y_train)**0.5}')
    print("\n")

LinearRegression() Model
RMSE = 1229.1895956672513


BayesianRidge() Model
RMSE = 1229.1895980819459


RandomForestRegressor() Model
RMSE = 214.41418947558213


GradientBoostingRegressor() Model
RMSE = 604.1957358782495




In [None]:
n_estimator =[int(x) for x in np.linspace(start=100,stop=300,num=10,)]
max_features=['auto','sqrt','log2']
max_depth=[5,10]
min_samples_split=[2,5]
min_samples_leaf=[1,2]
max_leaf_nodes = [100,200,500]

In [None]:
param_grid={'n_estimators':n_estimator,
            'max_features':max_features,
            'max_depth':max_depth,
            'min_samples_split':min_samples_split,
            'min_samples_leaf':min_samples_leaf,
            'max_leaf_nodes':max_leaf_nodes
                               }
print(param_grid)

{'n_estimators': [100, 122, 144, 166, 188, 211, 233, 255, 277, 300], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [5, 10], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'max_leaf_nodes': [100, 200, 500]}


In [None]:
#gradient_boosting_model = GradientBoostingRegressor()

In [None]:
#GBgridsearch=GridSearchCV(estimator=gradient_boosting_model,param_grid=param_grid,cv=5,verbose=2,n_jobs=4)

In [None]:
#GBgridsearch.best_params_

In [None]:
#GBgridsearch.best_score_

'''
"""
{'max_depth': 5,
 'max_features': 'auto',
 'max_leaf_nodes': 200,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 277}
"""


'''

'\n"""\n{\'max_depth\': 5,\n \'max_features\': \'auto\',\n \'max_leaf_nodes\': 200,\n \'min_samples_leaf\': 2,\n \'min_samples_split\': 2,\n \'n_estimators\': 277}\n"""\n\n\n'

In [None]:
gradient_boosting_model = GradientBoostingRegressor(max_depth= 5,max_features= 'auto',max_leaf_nodes= 200,
                                                    min_samples_leaf = 1,min_samples_split= 2,n_estimators= 277)

In [None]:
gradient_boosting_model.fit(X,y)

GradientBoostingRegressor(max_depth=5, max_features='auto', max_leaf_nodes=200,
                          n_estimators=277)

In [None]:
y_pred_te2 = gradient_boosting_model.predict(X_test)
mean_squared_error(y_pred_te2, y_test)**0.5

443.3082692378263

In [None]:
#random_forest_model = RandomForestRegressor()

In [None]:
#RFgridsearch=GridSearchCV(estimator=random_forest_model,param_grid=param_grid,cv=5,verbose=2,n_jobs=4)

In [None]:
#RFgridsearch.best_params_

In [None]:
#RFgridsearch.best_score_

'''
max_depth= 10,max_features= 'auto',max_leaf_nodes= 500,min_samples_leaf= 2,min_samples_split= 2,n_estimators= 255

'''

"\nmax_depth= 10,max_features= 'auto',max_leaf_nodes= 500,min_samples_leaf= 2,min_samples_split= 2,n_estimators= 255\n\n"

In [None]:
random_forest_model = RandomForestRegressor(max_depth= 10,max_features= 'auto',max_leaf_nodes= 500,
                                            min_samples_leaf= 2,min_samples_split= 2,n_estimators= 255)

In [None]:
random_forest_model.fit(X,y)

RandomForestRegressor(max_depth=10, max_leaf_nodes=500, min_samples_leaf=2,
                      n_estimators=255)

In [None]:
y_pred_te = random_forest_model.predict(X_test)
mean_squared_error(y_pred_te, y_test)**0.5

500.28313666518596

In [None]:
test_output = pd.read_csv('../Dataset/test.csv')
y_pred_test_rf = random_forest_model.predict(X_te)
test_output['price'] = y_pred_test_rf
submit = test_output[['Id', 'price']]
submit.to_csv('rf.csv', index=False)
submit

Unnamed: 0,Id,price
0,1,784.384194
1,2,2967.674665
2,3,789.685411
3,4,2870.899146
4,5,1247.214755
...,...,...
10783,10784,1592.696424
10784,10785,6575.365022
10785,10786,4255.671162
10786,10787,4683.801708


In [None]:
test_output = pd.read_csv('../Dataset/test.csv')
y_pred_test_gb = gradient_boosting_model.predict(X_te)
test_output['price'] = y_pred_test_gb
submit2 = test_output[['Id', 'price']]
submit2.to_csv('gb.csv', index=False)
submit2

Unnamed: 0,Id,price
0,1,815.978857
1,2,2892.359754
2,3,883.907012
3,4,2838.300796
4,5,1172.061424
...,...,...
10783,10784,1673.141668
10784,10785,6551.892801
10785,10786,4231.883298
10786,10787,4639.954978
