### Import the libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler,OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,cross_val_score,StratifiedKFold

In [4]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from xgboost import XGBRegressor

In [5]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


In [6]:
# Load the gemstone records from the CSV file into a DataFrame.
df_dataset = pd.read_csv(filepath_or_buffer="gemstone.csv")

In [7]:
# Sanity check: confirm that read_csv produced a pandas DataFrame.
type(df_dataset)

pandas.core.frame.DataFrame

In [8]:
# Concise summary of the DataFrame: column names, non-null counts, dtypes
# and memory usage.
df_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193573 entries, 0 to 193572
Data columns (total 11 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       193573 non-null  int64  
 1   carat    193573 non-null  float64
 2   cut      193573 non-null  object 
 3   color    193573 non-null  object 
 4   clarity  193573 non-null  object 
 5   depth    193573 non-null  float64
 6   table    193573 non-null  float64
 7   x        193573 non-null  float64
 8   y        193573 non-null  float64
 9   z        193573 non-null  float64
 10  price    193573 non-null  int64  
dtypes: float64(6), int64(2), object(3)
memory usage: 16.2+ MB


In [9]:
# Display the first five rows of the DataFrame (head() defaults to n=5).
df_dataset.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [10]:
# Display the last five rows of the DataFrame (tail() defaults to n=5).
df_dataset.tail()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
193568,193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67,1130
193569,193569,0.7,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47,2874
193570,193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62,3036
193571,193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81,681
193572,193572,0.71,Good,E,SI2,60.8,64.0,5.73,5.71,3.48,2258


#### Encoding categorical features

In [28]:
# Feature matrix: every column except `id` and the `price` target.
X = df_dataset.drop(columns=['id', 'price'])

In [29]:
# Regression target: the `price` column.
y = df_dataset.loc[:, 'price']

In [30]:
# 70% of the rows go to training, the remainder to testing; the fixed seed
# keeps the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [31]:
# Explicit category orderings handed to the OrdinalEncoder: the position of a
# label in its list becomes that label's integer code.
# NOTE(review): presumably each list runs from lowest to highest grade -- confirm.
cut_enc = [
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
clarity_enc = [
    "I1",
    "SI2",
    "SI1",
    "VS2",
    "VS1",
    "VVS2",
    "VVS1",
    "IF",
]
color_enc = [
    "J",
    "I",
    "H",
    "G",
    "F",
    "E",
    "D",
]

In [32]:
# Columns routed to the numeric branch of the preprocessor.
numerical_columns = ['carat', 'depth', 'table', 'x', 'y', 'z']

# Columns routed to the ordinal-encoding branch. They are listed explicitly
# rather than read from another DataFrame's `.columns`, for two reasons:
#   * the cell no longer depends on a variable created elsewhere, and
#   * the order is pinned to (cut, color, clarity), which is the order in which
#     the OrdinalEncoder receives its category lists
#     (categories=[cut_enc, color_enc, clarity_enc]); the encoder pairs
#     category lists with columns by position.
categorical_columns = ['cut', 'color', 'clarity']

In [33]:
# Numeric branch: a single standardisation step.
num_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
    ]
)

In [34]:
# Categorical branch: replace each label by its position in the matching
# category list, then standardise the resulting codes.
# The category lists are paired with the input columns by position, so the
# columns fed to this pipeline are expected to arrive as cut, color, clarity
# -- TODO confirm against categorical_columns.
cat_pipeline = Pipeline(
    steps=[
        ('ord_enc', OrdinalEncoder(categories=[cut_enc, color_enc, clarity_enc])),
        ('scaler', StandardScaler()),
    ]
)

In [35]:
# Route the numeric and categorical column groups through their own pipelines
# and combine the results into one feature matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_columns),
        ('cat_pipeline', cat_pipeline, categorical_columns),
    ]
)

In [36]:
# Inspect the raw training features before the preprocessor is applied to them.
X_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
11504,0.41,Ideal,E,VVS2,60.6,56.0,4.85,4.80,2.93
95284,1.23,Very Good,H,VS1,59.9,59.0,6.91,7.01,4.19
184777,1.70,Premium,H,VS2,62.0,58.0,7.61,7.66,4.74
5419,0.33,Ideal,F,VVS1,61.2,56.0,4.47,4.44,2.73
45466,0.33,Very Good,I,SI1,62.1,58.0,4.41,4.45,2.75
...,...,...,...,...,...,...,...,...,...
119879,0.50,Very Good,E,SI1,60.2,61.0,5.11,5.15,3.09
103694,1.91,Very Good,F,SI1,62.3,62.0,7.85,7.79,4.87
131932,1.22,Premium,G,VS2,62.8,58.0,6.82,6.74,4.26
146867,0.31,Very Good,G,VVS1,61.1,56.0,4.37,4.40,2.67


In [37]:
# Fit the preprocessor on the training features and transform them.
# NOTE(review): this rebinds X_train to the transformed output, so the raw
# training frame is no longer reachable under this name, and re-running this
# cell on its own would pass already-transformed data back into fit_transform.
X_train=preprocessor.fit_transform(X_train)

In [38]:
# Transform the test features with the preprocessor (transform only -- it is
# not refitted here, so the test rows do not influence the fitted statistics).
# NOTE(review): X_test is rebound to the transformed output as well.
X_test=preprocessor.transform(X_test)

#### Training and testing

In [39]:
# Candidate regressors, keyed by the display name that is also used to look
# up each model's hyper-parameter grid in `param`.
# The tree/ensemble estimators draw random numbers while fitting, so they get a
# fixed random_state (the same seed used for train_test_split); without it the
# reported scores change from run to run.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42),
}

In [40]:
def evaluate_model(true, predicted):
    """Score regression predictions against the observed targets.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Predicted values, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error and
        R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # The MSE is only needed as the input of the square root, so compute it once.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2 = r2_score(true, predicted)

    return mae, rmse, r2

In [42]:
# Hyper-parameter grids for GridSearchCV, keyed by the same names as `models`.
# An empty grid means the estimator is evaluated with its current settings
# only; a single-value list pins that parameter to one setting.
param = {
    "Linear Regression": dict(),
    "Lasso": dict(
        alpha=[0.001, 0.01, 0.1, 0.2, 0.5, 0.8, 1, 5, 10],
        max_iter=[1000, 5000, 10000],
    ),
    "Ridge": dict(
        alpha=[0.001, 0.01, 0.1, 0.2, 0.5, 0.8, 1, 5, 10],
        max_iter=[1000, 5000, 10000],
    ),
    "K-Neighbors Regressor": dict(n_neighbors=[10]),
    "Decision Tree": dict(),
    "Random Forest Regressor": dict(n_estimators=[400]),
    "XGBRegressor": dict(
        n_estimators=[100, 400],
        learning_rate=[0.1, 0.2, 0.3, 0.5],
    ),
    "AdaBoost Regressor": dict(
        learning_rate=[0.001, 0.01, 0.1, 0.2, 0.5],
        n_estimators=[100, 400],
    ),
}

In [43]:
# Tune every candidate model with a 3-fold grid search, then report its
# performance on the training and test sets. `model_list` and `r2_list` collect
# the model names and test-set R2 scores for the results table.
model_list = []
r2_list = []

# `models` and `param` are keyed by the same display names, so iterate over
# (name, estimator) pairs directly instead of indexing rebuilt key/value lists.
for model_name, model in models.items():
    hyp_param = param[model_name]

    gs = GridSearchCV(model, hyp_param, cv=3)
    gs.fit(X_train, y_train)
    print(gs.best_params_)

    # With the default refit=True, GridSearchCV has already retrained the best
    # parameter combination on the full training set, so that fitted estimator
    # is used as-is rather than training the model a second time.
    model = gs.best_estimator_
    # Keep the tuned, fitted estimator available under its name. Only the value
    # of an existing key is replaced, which is safe while iterating.
    models[model_name] = model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

{}
Linear Regression
Model performance for Training set
- Root Mean Squared Error: 1015.0060
- Mean Absolute Error: 676.2366
- R2 Score: 0.9369
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1014.6297
- Mean Absolute Error: 675.0758
- R2 Score: 0.9363


{'alpha': 0.8, 'max_iter': 1000}
Lasso
Model performance for Training set
- Root Mean Squared Error: 1015.1190
- Mean Absolute Error: 677.0891
- R2 Score: 0.9369
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1014.6284
- Mean Absolute Error: 676.0094
- R2 Score: 0.9363


{'alpha': 0.001, 'max_iter': 1000}
Ridge
Model performance for Training set
- Root Mean Squared Error: 1015.0060
- Mean Absolute Error: 676.2366
- R2 Score: 0.9369
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1014.6297
- Mean Absolute Error: 675.0759
- R2 Score: 0.9363


{'n_neighbors': 10}
K-Neighbors Regressor
Model performance fo

#### Results

In [44]:
# Rank the models by their test-set R2 score, best first.
(
    pd.DataFrame(
        list(zip(model_list, r2_list)),
        columns=['Model Name', 'R2_Score'],
    )
    .sort_values(by=["R2_Score"], ascending=False)
)

Unnamed: 0,Model Name,R2_Score
6,XGBRegressor,0.979339
5,Random Forest Regressor,0.977027
3,K-Neighbors Regressor,0.973518
4,Decision Tree,0.954774
1,Lasso,0.936291
0,Linear Regression,0.936291
2,Ridge,0.936291
7,AdaBoost Regressor,0.925208


#### Data Prediction

In [49]:
# Train an XGBoost regressor with a fixed configuration and report its
# test-set R2 score as a percentage.
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1)
xgb_model = xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
score = 100 * r2_score(y_test, y_pred)
print(" R2 score of the model is %.2f" % score)

 R2 score of the model is 97.93
