### Model Training

In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("data/output.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,4,3,4,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,3,7,2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,5,4,5,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,5,4,5,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,4,4,4,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
df=df.drop(labels='Unnamed: 0',axis=1)

In [4]:
X=df.drop(labels=['price'],axis=1)
y=df['price']

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [6]:
cut_map={"Fair":1,
 "Good":2,
 "Very Good":3,
 "Premium":4,
 "Ideal":5}

clarity_map={"I1":1,
"SI2":2,
"SI1":3,
"VS2":4,
"VS1":5,
"VVS2":6,
"VVS1":7,
"IF":8}

color_map={"D":1,
           "E":2,
           "F":3,
           "G":4,
           "H":5,
           "I":6,
           "J":7}

In [7]:
numerical_columns=X.columns[X.dtypes!='O']
categorical_columns=X.columns[X.dtypes=='O']
numerical_columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [8]:
# NUmerical Pipeline
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())   
    ]
)

# Categorical Pipeline
cat_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinal_encoder',OrdinalEncoder(categories=[cut_map,color_map,clarity_map])),
        ('scaler',StandardScaler())
    ]
)

preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_columns),
    ('cat_pipeline',cat_pipeline,categorical_columns)
])

In [9]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=42)

In [10]:
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
11504,0.41,5,2,6,60.6,56.0,4.85,4.8,2.93
95284,1.23,3,5,5,59.9,59.0,6.91,7.01,4.19
184777,1.7,4,5,4,62.0,58.0,7.61,7.66,4.74
5419,0.33,5,3,7,61.2,56.0,4.47,4.44,2.73
45466,0.33,3,6,3,62.1,58.0,4.41,4.45,2.75


In [11]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [12]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__cut,num_pipeline__color,num_pipeline__clarity,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z
0,-0.823144,0.8741,-0.936747,1.350746,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024
1,0.945023,-1.137644,0.910853,0.684455,-1.777823,0.921902,1.073226,1.166389,0.946633
2,1.958484,-0.131772,0.910853,0.018164,0.165682,0.400636,1.703116,1.755063,1.742237
3,-0.995648,0.8741,-0.32088,2.017037,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334
4,-0.995648,-1.137644,1.52672,-0.648127,0.25823,0.400636,-1.176382,-1.152082,-1.136403


In [13]:
X_test.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__cut,num_pipeline__color,num_pipeline__clarity,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z
0,-0.629077,0.8741,-1.552614,-0.648127,0.25823,-0.12063,-0.600482,-0.581521,-0.572248
1,2.605374,-1.137644,0.294987,-1.314417,-2.148014,-0.12063,2.126042,2.198832,1.959219
2,-1.125026,-0.131772,-0.936747,2.017037,-1.222536,0.921902,-1.374347,-1.414721,-1.46911
3,-1.017211,-0.131772,1.52672,2.017037,-0.574701,0.921902,-1.158385,-1.161138,-1.194265
4,0.858771,0.8741,0.910853,-0.648127,0.628421,-0.641897,0.947248,0.985258,1.004495


In [14]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

from sklearn.metrics import r2_score,mean_absolute_error, mean_squared_error

In [15]:
regression=LinearRegression()

regression.fit(X_train,y_train)

In [16]:
regression.coef_

array([ 6432.97591819,    68.80035873,  -464.67990411,   652.10059539,
        -132.34206204,   -70.48787525, -1701.38593925,  -494.17005097,
         -76.32351645])

In [17]:
regression.intercept_

3976.8787389023005

In [32]:
import numpy as np
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mean_absolute_error(true,predicted))
    r2_square=r2_score(true,predicted)
    return mae,mse,rmse,r2_square

In [38]:
# Train multiple models

models= {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet()
}

model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    y_pred=model.predict(X_test)

    mae,mse,rmse,r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Training Preference")
    print("RMSE: ",rmse)
    print("MAE: ",mae)
    print("R2 SCORE",r2_square)

    r2_list.append(r2_score)

    print('*'*35)
    print("\n")

LinearRegression
Model Training Preference
RMSE:  25.98222136397781
MAE:  675.075827006745
R2 SCORE 0.9362906819996045
***********************************


Lasso
Model Training Preference
RMSE:  26.004619823831625
MAE:  676.2402521820167
R2 SCORE 0.9362869797815188
***********************************


Ridge
Model Training Preference
RMSE:  25.982835930246992
MAE:  675.1077629781341
R2 SCORE 0.9362900967491631
***********************************


ElasticNet
Model Training Preference
RMSE:  32.572158287172265
MAE:  1060.9454954846049
R2 SCORE 0.8544952769396246
***********************************


