In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the gemstone dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="data/gemstone.csv")

In [3]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
# Number of rows in the raw frame.
data.shape[0]

193573

In [5]:
# Drop the `id` column before building the feature matrix.
# `data` is reassigned here, so re-running this cell would otherwise raise
# KeyError (the column is already gone); errors="ignore" keeps it idempotent.
data = data.drop(columns="id", errors="ignore")

In [6]:
# Feature matrix: every column except the target `price`.
X = data.drop(columns=["price"])

In [7]:
# Target, kept as a single-column DataFrame (2-D) rather than a Series.
y = data.loc[:, ["price"]]

In [9]:
# Names of the object-dtype feature columns (treated as categorical below).
cat_cols = X.select_dtypes(include=["object"]).columns

In [10]:
# Names of every remaining (non-object) feature column.
num_cols = X.select_dtypes(exclude=["object"]).columns

In [11]:
# Show both column groups, one per line.
print(cat_cols, num_cols, sep="\n")

Index(['cut', 'color', 'clarity'], dtype='object')
Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')


In [12]:
# Explicit category order for each ordinal feature (ranking taken from the EDA).
# The OrdinalEncoder built later maps each value to its position in the
# corresponding list, so the list order defines the encoded rank.
cut_categories = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
color_categories = list("DEFGHIJ")
clarity_categories = [
    "I1", "SI2", "SI1", "VS2",
    "VS1", "VVS2", "VVS1", "IF",
]

In [13]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler  # handling featurre scaling 
from sklearn.preprocessing import OrdinalEncoder   # if feature are having ranks 
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [31]:

#list(data.columns[data.dtypes!="O"])

In [14]:
# Numeric features: fill missing values with the column median, then standardize.
num_pipline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scalar", StandardScaler()),
])

# Categorical features: fill missing values with the most frequent value, map
# each category to its rank, then standardize the resulting codes.
# The `categories` lists are positional, so their order must match the order of
# the columns this pipeline receives (cut, color, clarity).
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "Ordinalencoder",
        OrdinalEncoder(
            categories=[cut_categories, color_categories, clarity_categories]
        ),
    ),
    ("scalar", StandardScaler()),
])

In [15]:
# Route each column group through its own pipeline. The transformed output is
# laid out in the order listed here: numeric columns first, then categorical.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipline", num_pipline, num_cols),
        ("cat_pipeline", cat_pipeline, cat_cols),
    ]
)

In [70]:
# Train/test split: hold out 30% of the rows, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=32
)

In [27]:
##feature_names_out = column_transformer.named_transformers_['numeric'].named_steps['scaler'].get_feature_names_out()

In [26]:
#
# #preprocessor.named_transformers_["num_pipline"].named_steps["scalar"].named_transformers_["imputer"].get_feature_names_out()

In [40]:
##preprocessor.get_feature_names_out

In [52]:
# Numeric column names as a plain Python list.
num_cols.tolist()

['carat', 'depth', 'table', 'x', 'y', 'z']

In [None]:
# Column labels for the preprocessed matrix. The ColumnTransformer emits the
# numeric block first and the categorical block second, so build the labels from
# the same two column sets instead of re-typing the categorical names by hand
# (hand-typed names silently go stale if the selected columns change).
L = [*num_cols, *cat_cols]

In [71]:
# Fit the preprocessor on the training rows only, then apply that fitted
# transform to the test rows so no test statistics leak into the scaling.
# `index=` carries over the original row labels; without it the new frames get
# a fresh 0..n-1 index and no longer align by label with y_train / y_test.
# NOTE: this overwrites X_train / X_test with their transformed versions, so
# re-run the train/test split cell before re-running this one.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train), columns=L, index=X_train.index
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test), columns=L, index=X_test.index
)

#### Model Training

In [75]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [76]:
# Fit an ordinary linear regression on the preprocessed training data.
regression = LinearRegression()
regression.fit(X=X_train, y=y_train)

LinearRegression()

In [80]:
# Inspect the fitted coefficients of the linear model.
regression.coef_

array([[ 6432.15885064,  -130.04060851,   -64.75184961, -1905.11295757,
         -292.6220071 ,   -79.7619032 ,    74.41515792,  -465.98515078,
          652.73563718]])

In [81]:
# Predict prices for the held-out rows.
y_pred = regression.predict(X=X_test)

In [82]:
# Mean squared error of the linear model on the test split.
mean_squared_error(y_true=y_test, y_pred=y_pred)

1021471.2631449206

In [83]:
# Candidate regressors to compare, all with default hyperparameters, keyed by name.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "ElasticNet": ElasticNet(),
}

In [91]:

# Fit every candidate on the training split and print its test-split metrics
# (RMSE, MAE, and R2 scaled to a percentage).
# Iterate the dict's values directly: the previous version rebuilt
# list(models.values()) on every pass just to index it by position.
for model in models.values():
    model.fit(X_train, y_train)

    # Note: this reuses (and overwrites) the notebook-level name `y_pred`.
    y_pred = model.predict(X_test)
    print(f"For {model}")

    print(f"Root Mean squared error is {np.sqrt(mean_squared_error(y_test,y_pred))}")
    print(f"mean_absolute_error is {mean_absolute_error(y_test,y_pred)}")
    print(f"R2 square is {r2_score(y_test,y_pred)*100}")

    print("\n")

For LinearRegression()
Root Mean squared error is 1010.6786151615757
mean_absolute_error is 677.2021094891849
R2 square is 93.71701066078352


For Lasso()
Root Mean squared error is 1010.9051605753814
mean_absolute_error is 678.4035104917498
R2 square is 93.71419365857274


For Ridge()
Root Mean squared error is 1010.6730171043736
mean_absolute_error is 677.2308920416449
R2 square is 93.71708026240718


For ElasticNet()
Root Mean squared error is 1526.0636865619363
mean_absolute_error is 1058.8702566849374
R2 square is 85.67530162691462


