# 2.Model Training for Gemstone - Venkata Anil Kumar

### Loading Necessary Libraries:-

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Modelling
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
#from xgboost import XGBRegressor


In [2]:
df = pd.read_csv('./data/gemstone.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
df.isnull().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [4]:
df.duplicated().sum()

0

In [5]:
df=df.drop(labels='id',axis=1)


In [6]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [9]:
### Getting X and Y variables

X=df.drop(labels='price',axis=1)
y=df['price']

In [14]:
print(X.head()),print(y.head())

   carat        cut color clarity  depth  table     x     y     z
0   1.52    Premium     F     VS2   62.2   58.0  7.27  7.33  4.55
1   2.03  Very Good     J     SI2   62.0   58.0  8.06  8.12  5.05
2   0.70      Ideal     G     VS1   61.2   57.0  5.69  5.73  3.50
3   0.32      Ideal     G     VS1   61.6   56.0  4.38  4.41  2.71
4   1.70    Premium     G     VS2   62.6   59.0  7.65  7.61  4.77
0    13619
1    13387
2     2772
3      666
4    14453
Name: price, dtype: int64


(None, None)

### Creating data transformation pipeline



###### Creating pipeline with column Transformer:-

In [23]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_columns=X.select_dtypes(include='object').columns
numerical_columns=X.select_dtypes(exclude='object').columns

print(f"Categorical Columns: {categorical_columns}")
print(f'Numerical Columns: {numerical_columns}')

# Define the custom ranking for each ordinal variable
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']


from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder,StandardScaler


## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
                     )

## Categorical pipeline

cat_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('encoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ('scaler',StandardScaler())
    ]
                    )

## COlumnTransformer Pipeline
preprocessor=ColumnTransformer(
    transformers=[
        ('num_pipeline',num_pipeline,numerical_columns),
        ('cat_pipeline',cat_pipeline,categorical_columns)
    ]
                          )



Categorical Columns: Index(['cut', 'color', 'clarity'], dtype='object')
Numerical Columns: Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')


In [27]:
# Train Test split

from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2,random_state=42)

In [32]:
xtrain.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
83475,0.32,Premium,E,SI1,61.6,58.0,4.38,4.41,2.71
160324,1.2,Premium,F,VS2,62.6,57.0,6.81,6.76,4.25
101740,1.5,Ideal,I,VS2,62.2,55.0,7.3,7.26,4.53
180341,1.67,Premium,I,SI2,61.9,59.0,7.65,7.61,4.71
48480,1.0,Good,H,VS2,63.7,60.0,6.34,6.3,4.02


### Transforming data with pipeline created:-

In [33]:
xtrain=pd.DataFrame(preprocessor.fit_transform(xtrain),columns=preprocessor.get_feature_names_out())
xtest=pd.DataFrame(preprocessor.transform(xtest),columns=preprocessor.get_feature_names_out())



In [36]:
preprocessor.get_feature_names_out()

array(['num_pipeline__carat', 'num_pipeline__depth',
       'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
       'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
       'cat_pipeline__clarity'], dtype=object)

In [38]:
xtrain.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-1.016395,-0.204317,0.402608,-1.202472,-1.187395,-1.194148,-0.132842,-0.936018,-0.64895
1,0.882396,0.720758,-0.118536,0.985177,0.941823,1.036109,-0.132842,-0.320002,0.017052
2,1.529711,0.350728,-1.160823,1.426308,1.394848,1.441611,0.872563,1.528047,0.017052
3,1.896523,0.073206,0.923751,1.741402,1.711965,1.70229,-0.132842,1.528047,-1.314953
4,0.450852,1.73834,1.444895,0.562052,0.52504,0.703019,-2.143651,0.912031,0.017052


In [37]:
xtest.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.628006,0.258221,-0.118536,-0.599293,-0.580341,-0.571414,0.872563,-1.552034,-0.64895
1,2.608569,-2.146973,-0.118536,2.128516,2.201232,1.962969,-1.138246,0.296015,-1.314953
2,-1.124281,-1.221899,0.923751,-1.373523,-1.413907,-1.46931,-0.132842,-0.936018,2.015061
3,-1.016395,-0.574346,0.923751,-1.157458,-1.160213,-1.194148,-0.132842,1.528047,2.015061
4,0.860819,0.628251,-0.639679,0.949167,0.987125,1.007145,0.872563,0.912031,-0.64895


#### model training baseline models:

#### create an evaluate function to give all metrics after model training:-

In [39]:
def evaluate_function(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2_square=r2_score(true,predicted)
    return mae,rmse,r2_square


### Trainig Various Models:-

In [50]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
models_list=[]
r2_list=[]
for i in range(len(list(models))):
    model=list(models.values())[i]
    
    model.fit(xtrain,ytrain.values.flatten()) ## train model
    
    ## make predictions
    
    y_train_pred=model.predict(xtrain)
    y_test_pred=model.predict(xtest)
    
     # Evaluate Train and Test dataset
        
    model_train_mae,model_train_rmse,model_train_r2=evaluate_function(ytrain,y_train_pred)
    model_test_mae,model_test_rmse,model_test_r2=evaluate_function(ytest,y_test_pred)
    
    print(list(models.keys())[i])
    models_list.append(list(models.keys())[i])
    
    print("Model performance for Training set")
    print("-Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("-mean absolute erroor: {:.4f}".format(model_train_mae))
    print("R2 score: {:.4f}".format(model_train_r2))
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    
    
    print("="*35)
    print('\n')
    
    
    


Linear Regression
Model performance for Training set
-Root Mean Squared Error: 1016.9490
-mean absolute erroor: 677.1656
R2 score: 0.9366
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1006.6010
- Mean Absolute Error: 671.5856
- R2 Score: 0.9373


Lasso
Model performance for Training set
-Root Mean Squared Error: 1017.0718
-mean absolute erroor: 678.3145
R2 score: 0.9366
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1006.8716
- Mean Absolute Error: 672.8635
- R2 Score: 0.9373


Ridge
Model performance for Training set
-Root Mean Squared Error: 1016.9491
-mean absolute erroor: 677.1925
R2 score: 0.9366
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1006.6062
- Mean Absolute Error: 671.6137
- R2 Score: 0.9373


K-Neighbors Regressor
Model performance for Training set
-Root Mean Squared Error: 545.6563
-mean absolute erroor: 285.8181
R2 score: 0.9817
-

### Results:-

In [52]:
df_results=pd.DataFrame(list(zip(models_list,r2_list)),columns=['Model_list','R2_score']).sort_values(by=['R2_score'],ascending=False)

In [53]:
df_results

Unnamed: 0,Model_list,R2_score
6,CatBoosting Regressor,0.979186
5,Random Forest Regressor,0.977173
3,K-Neighbors Regressor,0.972114
4,Decision Tree,0.956874
0,Linear Regression,0.937298
2,Ridge,0.937297
1,Lasso,0.937264
7,AdaBoost Regressor,0.835852
