In [1]:
import pandas as pd

In [2]:
# Load the gemstone dataset from the project-relative data directory.
df = pd.read_csv(filepath_or_buffer=r"data/gemstone.csv")

In [3]:
# Remove the 'id' column from the frame.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after 'id' has already been
# removed is a no-op rather than a KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [4]:
# Independent and dependent features
# X: every column except the target; y: the 'price' column kept as a
# one-column DataFrame (double brackets), not a Series.
X = df.drop(columns=['price'])
y = df[['price']]


In [5]:
from sklearn.model_selection import train_test_split
# Partition the feature column names by dtype: object-typed columns are
# treated as categorical, everything else as numerical.
categorical_variables = X.select_dtypes(include=['object']).columns
numerical_variables = X.select_dtypes(exclude=['object']).columns

In [6]:
# Display the names of the object-typed (categorical) feature columns.
categorical_variables

Index(['cut', 'color', 'clarity'], dtype='object')

In [7]:
from sklearn.impute import SimpleImputer # this is to treat missing values
from sklearn.preprocessing import  StandardScaler

from sklearn.preprocessing import OrdinalEncoder
#pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [8]:
# Custom ranking for each ordinal variable.
# The position of a label in its list is the integer code the
# OrdinalEncoder assigns to it (first entry -> 0, second -> 1, ...).
# NOTE(review): the direction of each ranking (best-to-worst vs.
# worst-to-best) is a domain choice; confirm it matches the intended grading.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = [
    'D', 'E', 'F', 'G', 'H', 'I', 'J',
]
clarity_categories = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]

In [9]:
# Numerical pipeline: fill missing values with the column median, then
# standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: fill missing values with the most frequent label,
# map labels to integer ranks using the custom category orderings, then
# standardize the resulting codes.
# The `categories` list is positional: its entries must line up with the
# order of the categorical columns (cut, color, clarity).
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        OrdinalEncoder(
            categories=[cut_categories, color_categories, clarity_categories]
        ),
    ),
    ('scaler', StandardScaler()),
])

# Route each group of columns through its own pipeline. The transformer
# names below become the prefixes reported by get_feature_names_out().
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_variables),
        ('cat_pipeline', cat_pipeline, categorical_variables),
    ]
)

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split (no statistics are learned from test data).
# The original row index is passed through so the transformed frames stay
# label-aligned with y_train / y_test, which keep the index produced by the
# split; without it the new frames would get a fresh 0..n-1 RangeIndex.
# NOTE: this cell overwrites X_train / X_test with their transformed
# versions, so re-run the train/test split cell before re-running this one.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index,
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index,
)


In [14]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error

# Baseline model: linear regression fitted on the preprocessed training
# features. The fit call is left as the cell's last expression so the
# fitted estimator is displayed.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [71]:
# Display the fitted intercept of the baseline linear regression.
regressor.intercept_

array([3976.8787389])

In [23]:
import numpy as np
def evaluate_model(true, predicted):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, comparable element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r_square)`` -- mean absolute error, mean squared
        error, root mean squared error and R^2 score, each as a plain
        scalar value.
    """
    # No trailing commas on these assignments: a trailing comma would wrap
    # each metric in a 1-tuple (e.g. ``(1014.63,)``) instead of a number.
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time
    r_square = r2_score(true, predicted)

    return mae, mse, rmse, r_square

In [35]:
# Train multiple models

# Candidate regressors, keyed by the label used in the printed report.
models = {
    'linearRegressor': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'Elastic': ElasticNet()
}
trained_model_list = []  # fitted estimators, in the same order as model_list
model_list = []          # model labels, in training order
r2_list = []             # R^2 score of each model, in training order

# Iterate over (label, estimator) pairs directly rather than indexing into
# freshly built key/value lists on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    trained_model_list.append(model)
    y_pred = model.predict(X_test)

    # NOTE: the metrics are computed on the held-out test split (y_test vs.
    # predictions for X_test), even though the heading printed below says
    # "Training".
    mae, mse, rmse, r_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model Training Performance')
    print('RMSE :', rmse)
    print('mae :', mae)
    print('mse :', mse)
    print('r2_score :', r_square*100)  # reported as a percentage

    r2_list.append(r_square)

    print("="*35)
    print("\n")



    

linearRegressor
Model Training Performance
RMSE : (1014.6296630375463,)
mae : (675.0758270067483,)
mse : (1029473.3531156846,)
r2_score : 93.62906819996049


Ridge
Model Training Performance
RMSE : (1014.634323353441,)
mae : (675.1077629781341,)
mse : (1029482.8101268951,)
r2_score : 93.62900967491632


Lasso
Model Training Performance
RMSE : (1014.659130275064,)
mae : (676.2421173665509,)
mse : (1029533.1506505491,)
r2_score : 93.62869814082755


Elastic
Model Training Performance
RMSE : (1533.3541245902313,)
mae : (1060.9432977143008,)
mse : (2351174.871397875,)
r2_score : 85.44967219374031




In [33]:
# Display the labels of the models that were trained, in training order.
model_list

['linearRegressor', 'Ridge', 'Lasso', 'Elastic']