In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.linear_model import  LinearRegression, Lasso, Ridge
from sklearn.metrics import  mean_squared_error , r2_score , mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostClassifier
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR



In [3]:
df  =  pd.read_csv('gemstone.csv')
df.head() 

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [7]:
X =  df.drop(labels='price', axis=1)
y = df[['price']]

y




Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.preprocessing import  OneHotEncoder , StandardScaler 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

categorical_col  =  X.select_dtypes(include=['object']).columns
numerical_col  =  X.select_dtypes(exclude=['object']).columns

cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(
        categories=[cut_categories, color_categories, clarity_categories],
        sparse_output=False
    )),
    ('scaler', StandardScaler())
])

numerical_transformer  =  Pipeline(steps=[
    ('imputer' ,  SimpleImputer(strategy='median')) ,
    ('scaler' ,  StandardScaler())
])

preprocessor = ColumnTransformer(
    [
        ('cat', categorical_transformer, categorical_col),
        ('num', numerical_transformer, numerical_col)
    ]   )

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
X_train=  pd.DataFrame(preprocessor.fit_transform(X_train) ,  columns=preprocessor.get_feature_names_out())
X_test=  pd.DataFrame(preprocessor.transform(X_test) ,  columns=preprocessor.get_feature_names_out())

X_train.head()


Unnamed: 0,cat__cut_Fair,cat__cut_Good,cat__cut_Very Good,cat__cut_Premium,cat__cut_Ideal,cat__color_D,cat__color_E,cat__color_F,cat__color_G,cat__color_H,...,cat__clarity_VVS2,cat__clarity_VVS1,cat__clarity_IF,num__id,num__carat,num__depth,num__table,num__x,num__y,num__z
0,-0.103267,-0.252942,-0.489974,1.695535,-0.956244,-0.377437,2.096784,-0.464772,-0.54423,-0.435709,...,-0.298021,-0.241366,-0.148766,-0.237549,-1.016395,-0.204317,0.402608,-1.202472,-1.187395,-1.194148
1,-0.103267,-0.252942,-0.489974,1.695535,-0.956244,-0.377437,-0.476921,2.151594,-0.54423,-0.435709,...,-0.298021,-0.241366,-0.148766,1.137312,0.882396,0.720758,-0.118536,0.985177,0.941823,1.036109
2,-0.103267,-0.252942,-0.489974,-0.589784,1.045758,-0.377437,-0.476921,-0.464772,-0.54423,-0.435709,...,-0.298021,-0.241366,-0.148766,0.089219,1.529711,0.350728,-1.160823,1.426308,1.394848,1.441611
3,-0.103267,-0.252942,-0.489974,1.695535,-0.956244,-0.377437,-0.476921,-0.464772,-0.54423,-0.435709,...,-0.298021,-0.241366,-0.148766,1.495425,1.896523,0.073206,0.923751,1.741402,1.711965,1.70229
4,-0.103267,3.953472,-0.489974,-0.589784,-0.956244,-0.377437,-0.476921,-0.464772,-0.54423,2.295109,...,-0.298021,-0.241366,-0.148766,-0.863625,0.450852,1.73834,1.444895,0.562052,0.52504,0.703019


In [11]:
X_test.head()

Unnamed: 0,cat__cut_Fair,cat__cut_Good,cat__cut_Very Good,cat__cut_Premium,cat__cut_Ideal,cat__color_D,cat__color_E,cat__color_F,cat__color_G,cat__color_H,...,cat__clarity_VVS2,cat__clarity_VVS1,cat__clarity_IF,num__id,num__carat,num__depth,num__table,num__x,num__y,num__z
0,-0.103267,-0.252942,-0.489974,-0.589784,1.045758,2.649448,-0.476921,-0.464772,-0.54423,-0.435709,...,-0.298021,-0.241366,-0.148766,-1.464958,-0.628006,0.258221,-0.118536,-0.599293,-0.580341,-0.571414
1,-0.103267,-0.252942,2.040926,-0.589784,-0.956244,-0.377437,-0.476921,-0.464772,1.837458,-0.435709,...,-0.298021,-0.241366,-0.148766,1.231935,2.608569,-2.146973,-0.118536,2.128516,2.201232,1.962969
2,-0.103267,-0.252942,-0.489974,1.695535,-0.956244,-0.377437,2.096784,-0.464772,-0.54423,-0.435709,...,-0.298021,4.143086,-0.148766,-0.000466,-1.124281,-1.221899,0.923751,-1.373523,-1.413907,-1.46931
3,-0.103267,-0.252942,-0.489974,1.695535,-0.956244,-0.377437,-0.476921,-0.464772,-0.54423,-0.435709,...,-0.298021,4.143086,-0.148766,0.873768,-1.016395,-0.574346,0.923751,-1.157458,-1.160213,-1.194148
4,-0.103267,-0.252942,-0.489974,-0.589784,1.045758,-0.377437,-0.476921,-0.464772,-0.54423,2.295109,...,-0.298021,-0.241366,-0.148766,0.392444,0.860819,0.628251,-0.639679,0.949167,0.987125,1.007145


In [12]:
def evaluate_model(true, predicted):
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2score = r2_score(true, predicted)
    mae = mean_absolute_error(true, predicted)
    return rmse, r2score, mae



In [14]:
models = {
    'LinearRegression' : LinearRegression() , 
    'Lasso' : Lasso() ,
    'Ridge' : Ridge() ,
    'KNeighborsRegressor' : KNeighborsRegressor() ,
    'DecisionTreeRegressor' : DecisionTreeRegressor() ,
    'RandomForestRegressor' : RandomForestRegressor() ,
    'AdaBoostRegressor' : AdaBoostClassifier() ,
    'XGBRegressor' : XGBRegressor() ,
    'CatBoostRegressor' : CatBoostRegressor(verbose=0) ,
    'SVR' : SVR()
   

}

model_list = []
r2_list = []

for i in range(len(models)):
    model  =  list(models.values())[i]
    model.fit(X_train, y_train.values.flatten())

    y_train_p = model.predict(X_train)
    y_test_p = model.predict(X_test)

    mse , r2, mae = evaluate_model(y_train, y_train_p)
    r2_list.append(r2)

    print(f"{model} - Training Metrics:")
    print(f"RMSE: {mse}")
    print(f"R2 Score: {r2}")
    print(f"MAE: {mae}")

    test_mse , test_r2, test_mae = evaluate_model(y_test, y_test_p)

    print(f"\n{model} - Testing Metrics:")
    print(f"RMSE: {test_mse}")
    print(f"R2 Score: {test_r2}")
    print(f"MAE: {test_mae}")
  
    print("\n" + "="*50 + "\n")





LinearRegression() - Training Metrics:
RMSE: 951.6890573288946
R2 Score: 0.9444524913754836
MAE: 626.6974475424162

LinearRegression() - Testing Metrics:
RMSE: 944.6613865829004
R2 Score: 0.9447766976776784
MAE: 622.9514614402817


Lasso() - Training Metrics:
RMSE: 951.8489841300666
R2 Score: 0.9444338208197662
MAE: 626.9438628898557

Lasso() - Testing Metrics:
RMSE: 944.9359334949601
R2 Score: 0.9447445939196155
MAE: 623.3574498486574


Ridge() - Training Metrics:
RMSE: 951.6891140981708
R2 Score: 0.944452484748546
MAE: 626.7158080947954

Ridge() - Testing Metrics:
RMSE: 944.666718510831
R2 Score: 0.9447760742849934
MAE: 622.9713864330162


KNeighborsRegressor() - Training Metrics:
RMSE: 608.5680263829735
R2 Score: 0.9772860484062573
MAE: 320.72907954384016

KNeighborsRegressor() - Testing Metrics:
RMSE: 751.417392431091
R2 Score: 0.9650592251006267
MAE: 391.95655947307245


DecisionTreeRegressor() - Training Metrics:
RMSE: 0.0
R2 Score: 1.0
MAE: 0.0

DecisionTreeRegressor() - Testing

KeyboardInterrupt: 

In [None]:
df_result  =  pd.DataFrame(list((model_list, r2_list)).T, columns=['Model', 'R2 Score'])
df_result.sort_values(by='R2 Score', ascending=False, inplace=True)
df_result.reset_index(drop=True, inplace=True)  
df_result.head(10)