In [1]:
#import the required libaries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

#### Model Training

In [2]:
df = pd.read_csv("data\gemstone.csv")
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
#drop the 'id' columns
df = df.drop(labels = ["id"], axis = 1)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
#create independent and dependent dataset
X = df.drop(labels = ["price"], axis = 1)
y = df.price

In [5]:
X.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77


In [6]:
y

0         13619
1         13387
2          2772
3           666
4         14453
          ...  
193568     1130
193569     2874
193570     3036
193571      681
193572     2258
Name: price, Length: 193573, dtype: int64

In [7]:
#defining the categorical and numerical data from X dataset
categorical_cols = X.select_dtypes(include= "object").columns
numerical_cols = X.select_dtypes(exclude= "object").columns

In [8]:
categorical_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [9]:
numerical_cols

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [10]:
#defining the custom ranking for each ordinal variables
cut_categorical = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
color_categorical = ["D", "E", "F", "G", "H", "I", "J"]
clarity_categorical = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]

#### PipiLine Transformation

In [11]:
#import the required libararies
from sklearn.impute import SimpleImputer #handling the missing values
from sklearn.preprocessing import StandardScaler #feature scaling
from sklearn.preprocessing import OrdinalEncoder #ordinal encoding of categorical values

#pipeline libraries
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [12]:
#Numerical Pipeline
num_pipeline = Pipeline(
    steps = [
        ("impute", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

#categorical Pipeline
cat_pipeline = Pipeline(
    steps = [
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("ordinalencoder", OrdinalEncoder(categories=[cut_categorical, color_categorical, clarity_categorical])),
        ("scaler", StandardScaler())
    ]
)

#Joining two pipeline by column Transformer
preprocessor = ColumnTransformer([
    ("num_pipeline", num_pipeline, numerical_cols),
    ("cat_pipeline", cat_pipeline, categorical_cols)
])

In [13]:
#convert into train and test data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 47)

In [14]:
#renaming the columns name and fit, transform the train and test dataset
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns= preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns= preprocessor.get_feature_names_out())

X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.496226,-0.392081,-1.15867,0.698922,0.7444,0.675387,0.871832,0.297813,0.016456
1,1.533583,2.577621,0.406472,1.293757,1.243259,1.573688,-3.153558,-1.549219,-0.648565
2,0.8204,0.071935,-1.15867,0.924238,0.953014,0.936184,-1.140863,2.144845,-1.313586
3,-0.994976,0.628754,-1.15867,-1.175709,-1.160337,-1.106726,0.871832,-0.933542,-0.648565
4,0.236886,0.907164,1.971613,0.347429,0.318101,0.429079,-1.140863,0.297813,-0.648565


In [15]:
X_test.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.994976,0.721558,0.928186,-1.184722,-1.151267,-1.106726,-0.134515,-0.933542,-0.648565
1,-0.152123,-0.67049,-0.636956,0.095074,0.045996,0.008906,-0.134515,1.529167,0.681477
2,-0.152123,-0.484884,-0.636956,0.040998,0.018786,-0.020071,0.871832,-0.317864,-1.313586
3,-0.497909,-1.041703,-0.636956,-0.346546,-0.316811,-0.411267,0.871832,-1.549219,0.016456
4,1.965815,-0.577687,-1.15867,1.762415,1.723979,1.631643,0.871832,1.529167,0.016456


In [16]:
#model training
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [17]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_absolute_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [18]:
#train multiple model at time
models = {
    "LinearRegression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "ElasticNet":ElasticNet()
}

trained_model_list = []
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    #make prediction
    y_pred = model.predict(X_test)

    #finding the metrics
    mae, rmse, r2_square = evaluate_model(y_pred, y_test)

    print(list(models.keys())[i],":")
    model_list.append(list(models.keys())[i])

    #printing the metrics data
    print("Model Training Performace: ")
    print("RMSE :", rmse)
    print("MAE :", mae)
    print("R2 score: ", (r2_square*100))

    r2_list.append(r2_square)

    print('-'*35)

LinearRegression :
Model Training Performace: 
RMSE : 1025.5632164694373
MAE : 678.0040409067242
R2 score:  93.09284329617297
-----------------------------------
Lasso :
Model Training Performace: 
RMSE : 1025.683728638837
MAE : 679.1902032120987
R2 score:  93.08226515698544
-----------------------------------
Ridge :
Model Training Performace: 
RMSE : 1025.5461094402797
MAE : 678.0291274020684
R2 score:  93.09274313879563
-----------------------------------
ElasticNet :
Model Training Performace: 
RMSE : 1548.6383105790273
MAE : 1067.8673921028178
R2 score:  77.56363758581038
-----------------------------------
