### Model Training

In [290]:
import pandas as pd

In [291]:
df = pd.read_csv("data/gemstone.csv")
df.head()


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [292]:
df = df.drop(labels=["id"], axis = 1)

In [293]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [294]:
### Independent and dependent features
X = df.drop(labels=["price"], axis= 1)
y = df[["price"]]

In [295]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77


In [296]:
y.head()

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453


In [297]:
### Define which columns should be ordinal-encoded and which should be scaled
categorical_columns = X.select_dtypes(include = "object").columns
numerical_columns = X.select_dtypes(exclude = "object").columns


In [298]:
categorical_columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [299]:
numerical_columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [300]:
### Define the custom ranking for each ordinal variable

cut_categories=["Fair", "Good", "Very Good", "Premium", "Ideal"]
color_categories = ["D" ,"E" ,"F" , "G" ,"H" , "I", "J"]
clarity_categories = ["I1", "SI2", "SI1" ,"VS2" , "VS1" , "VVS2" , "VVS1" ,"IF"]


In [301]:
from sklearn.impute import SimpleImputer ### Handling Missing Values
from sklearn.preprocessing import StandardScaler ### Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder ### Ordinal Encoding
### pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [302]:
### Numerical Pipeline
num_pipeline = Pipeline(
steps=[
("imputer", SimpleImputer(strategy="median")),
("scaler", StandardScaler())
]
)

### category Pipeline
cat_pipeline=Pipeline(
    steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordinalencoder", OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories])),
    ("scaler", StandardScaler())
    ]
)

preprocessor = ColumnTransformer([
("num_pipeline", num_pipeline, numerical_columns),
("cat_pipeline", cat_pipeline, categorical_columns)

])

In [303]:
### Train Test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [304]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train))
X_test = pd.DataFrame(preprocessor.transform(X_test))

In [305]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.8741,-0.936747,1.350746
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-1.137644,0.910853,0.684455
2,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-0.131772,0.910853,0.018164
3,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,0.8741,-0.32088,2.017037
4,-0.995648,0.25823,0.400636,-1.176382,-1.152082,-1.136403,-1.137644,1.52672,-0.648127


In [306]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.629077,0.25823,-0.12063,-0.600482,-0.581521,-0.572248,0.8741,-1.552614,-0.648127
1,2.605374,-2.148014,-0.12063,2.126042,2.198832,1.959219,-1.137644,0.294987,-1.314417
2,-1.125026,-1.222536,0.921902,-1.374347,-1.414721,-1.46911,-0.131772,-0.936747,2.017037
3,-1.017211,-0.574701,0.921902,-1.158385,-1.161138,-1.194265,-0.131772,1.52672,2.017037
4,0.858771,0.628421,-0.641897,0.947248,0.985258,1.004495,0.8741,0.910853,-0.648127


In [307]:
### Model Training

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [308]:
regression= LinearRegression()


In [309]:
regression.fit(X_train, y_train)

LinearRegression()

In [310]:
regression.coef_

array([[ 6432.97591819,  -132.34206204,   -70.48787525, -1701.38593925,
         -494.17005097,   -76.32351645,    68.80035873,  -464.67990411,
          652.10059539]])

In [311]:
regression.intercept_

array([3976.8787389])

In [312]:
y_pred = regression.predict(X_test)

In [313]:
y_pred 

array([[ 1616.03275998],
       [15104.13631181],
       [ 1727.49228115],
       ...,
       [ 1878.27425152],
       [ 6295.06951547],
       [ 5976.94207688]])

In [314]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [315]:
evaluate_model(y_test,y_pred)

(675.0758270067446, 1014.6296630375483, 0.9362906819996045)

In [316]:
### Train multiple models

models={
"LinearRegression":LinearRegression(),
"Lasso": Lasso(),
"Ridge": Ridge(),
"ElasticNet":ElasticNet()
}

model_list=[]
r2_list=[]
trained_model_list=[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    ### Make predictions
    y_pred= model.predict(X_test)
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Training Performane")
    print("RMSE", rmse)
    print("MAE", mae)
    print("R2 Score", r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performane
RMSE 1014.6296630375483
MAE 675.0758270067446
R2 Score 93.62906819996046


Lasso
Model Training Performane
RMSE 1014.659130275064
MAE 676.2421173665508
R2 Score 93.62869814082755


Ridge
Model Training Performane
RMSE 1014.6343233534396
MAE 675.107762978125
R2 Score 93.62900967491635


ElasticNet
Model Training Performane
RMSE 1533.3541245902313
MAE 1060.9432977143008
R2 Score 85.44967219374031




In [317]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']

In [318]:
 model = list(models.values())

In [319]:
 model

[LinearRegression(), Lasso(), Ridge(), ElasticNet()]