# Diamond Price Prediction - Model Training

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [64]:
# Read the cleaned gemstone data (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='data/cleaned_gemstone.csv')
# Preview the first five rows as the cell's output.
df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
4,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


#### Independent and Dependent Features

In [65]:
# Feature matrix: every column except the target.
X = df.drop(columns=['price'])
# Target kept as a one-column DataFrame (list selector), not a Series.
y = df.loc[:, ['price']]

In [66]:
y

Unnamed: 0,price
0,499
1,984
2,6289
3,1082
4,779
...,...
26928,5408
26929,1114
26930,1656
26931,682


#### Segregating numerical and categorical columns

In [67]:
# Split the feature names by dtype: object columns are treated as categorical,
# every other column as numerical.
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

#### Define the custom ranking for each ordinal variable 

In [68]:
# Category orderings passed to the OrdinalEncoder; the position of a value in its
# list is the integer code it is encoded to.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = list('DEFGHIJ')
clarity_categories = [
    'I1',
    'SI2',
    'SI1',
    'VS2',
    'VS1',
    'VVS2',
    'VVS1',
    'IF',
]

In [69]:
from sklearn.impute import SimpleImputer  # Handling missing values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding

# Pipelining
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


### Data Transformation

#### Building Pipeline

##### Numerical Pipeline

In [70]:
# Numeric features: fill missing values with the column median, then standardise.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

##### Categorical Pipeline

In [71]:
# Categorical features: impute with the most frequent value, encode each column with
# its custom category order, then standardise the resulting integer codes.
# The `categories` list is positional, so it has to line up with the column order of
# categorical_cols (cut, color, clarity).
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories]),
    ),
    ('scaler', StandardScaler()),
])

In [72]:
# Route each column group through its own pipeline. The transformer names given here
# become the prefixes of the output feature names (e.g. numerical_pipeline__carat).
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_pipeline', numerical_pipeline, numerical_cols),
        ('categorical_pipeline', categorical_pipeline, categorical_cols),
    ]
)

 ### Train Test Split

In [73]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [74]:
# Fit the preprocessor on the training split only, then apply the fitted transformer
# to the test split, so no test-set statistics reach the imputers or scalers.
# The transformed arrays are wrapped back into DataFrames that keep the original row
# labels, so X_train / X_test stay index-aligned with y_train / y_test.
# NOTE: this cell rebinds X_train and X_test to the transformed frames; re-run the
# train/test split cell before re-running this one.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index,
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index,
)

In [75]:
X_train.shape, X_test.shape

((18853, 9), (8080, 9))

In [76]:
X_train.head()

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,0.968084,1.046385,-0.210369,1.001667,0.907752,1.120286,0.981224,1.99003,-0.038842
1,0.56969,-0.820838,-0.210369,0.78883,0.789512,-3.531862,0.981224,-0.357598,-0.645538
2,0.46485,-1.754449,0.682825,0.735621,0.730392,0.519085,-0.815844,1.403123,-0.038842
3,1.513255,0.328222,0.682825,1.409604,1.389157,1.478143,0.08269,1.99003,0.567855
4,-0.625492,-0.246308,0.236228,-0.523663,-0.54491,-0.554487,0.981224,1.99003,0.567855


### Model Training

In [93]:
from sklearn.linear_model import LinearRegression, Lasso,Ridge, ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error, mean_squared_error

In [94]:
# Baseline: a plain linear regression fitted on the preprocessed training split.
regressor = LinearRegression()
# Kept as the cell's last expression: fit() returns the estimator, which the notebook
# displays as the cell output.
regressor.fit(X_train,y_train)

In [95]:
regressor.intercept_

array([3937.95767252])

### Model Evaluation

In [96]:
def evaluate_model(true, predicted):
    """Score predictions against ground-truth target values.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions for the same samples.

    Returns
    -------
    tuple
        ``(mae, r2, rmse)``: mean absolute error, R^2 score and root mean
        squared error, in that order.
    """
    mean_abs_err = mean_absolute_error(true, predicted)
    # RMSE is the square root of the mean squared error.
    root_mse = np.sqrt(mean_squared_error(true, predicted))
    r_squared = r2_score(true, predicted)
    return mean_abs_err, r_squared, root_mse

#### Training multiple models

In [97]:
# Candidate linear regressors, all with default hyperparameters, keyed by the label
# that is printed and recorded when each one is evaluated.
models = {
    'Liner Regression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Elasticnet': ElasticNet(),
}



In [98]:
# Parallel result lists, one entry per candidate, in the iteration order of `models`.
trained_model_list = []  # fitted estimator objects
model_list = []          # display names
r2_list = []             # R^2 on the test split (as a fraction, not a percentage)

# Fit every candidate on the training split and score it on the held-out test split.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    # Keep the fitted estimator so it can be reused after the loop.
    trained_model_list.append(model)

    y_pred = model.predict(X_test)
    mae, r2, rmse = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE: the banner says "Training", but the metrics below are computed on
    # X_test / y_test, i.e. they are test-set scores.
    print('Model Training Performance')
    print("MAE: ", mae)
    print("RMSE: ", rmse)
    print('R2 Score: ', r2*100)

    r2_list.append(r2)

    print("-"*35)
    print('\n')
    

Liner Regression
Model Training Performance
MAE:  815.7734328988273
RMSE:  1232.8750456243995
R2 Score:  90.79007870355092
-----------------------------------


Lasso
Model Training Performance
MAE:  817.0429814716317
RMSE:  1232.4570491608395
R2 Score:  90.79632274578006
-----------------------------------


Ridge
Model Training Performance
MAE:  816.0153190188635
RMSE:  1233.0003114486778
R2 Score:  90.78820706701018
-----------------------------------


Elasticnet
Model Training Performance
MAE:  1092.9221711388586
RMSE:  1680.3075309883682
R2 Score:  82.89215360402223
-----------------------------------




In [99]:
model_list

['Liner Regression', 'Lasso', 'Ridge', 'Elasticnet']

# END