In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.model_selection import RandomizedSearchCV,train_test_split
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [2]:
df=pd.read_csv("gemstone.csv")

In [3]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
df.drop(labels=["id"],axis=1,inplace=True)

In [5]:
# Independent and dependent feature
X=df.drop(labels=["price"],axis=1)
Y=df["price"]

In [6]:
Y

0         13619
1         13387
2          2772
3           666
4         14453
          ...  
193568     1130
193569     2874
193570     3036
193571      681
193572     2258
Name: price, Length: 193573, dtype: int64

In [7]:
# Segregating numerical and categorical variable
categorical_cols=X.select_dtypes(include="object").columns
numerical_cols=X.select_dtypes(exclude="object").columns

In [8]:
numerical_cols

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [9]:
# Define the custom ranking for each ordinal variable
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [10]:
from sklearn.impute import SimpleImputer # Handling missing values
from sklearn.preprocessing import StandardScaler # Feature scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [11]:
## Numerical Pipeline
num_pipeline=Pipeline( 
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [12]:
# train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.33,random_state=42)

In [13]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [14]:
preprocessor.get_feature_names_out()

array(['num_pipeline__carat', 'num_pipeline__depth',
       'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
       'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
       'cat_pipeline__clarity'], dtype=object)

In [15]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,2.626061,-2.888129,0.400868,2.233112,2.216066,1.856561,-0.130933,1.525655,-1.314696
1,-0.845291,0.164716,0.922458,-0.915966,-0.908068,-0.890852,-0.130933,-0.937159,-0.648656
2,-0.845291,-1.500472,1.96564,-0.843987,-0.899013,-0.963153,-0.130933,-0.321455,-0.648656
3,-0.694363,-0.667878,-0.642314,-0.637048,-0.636405,-0.673951,0.874463,-0.937159,-1.314696
4,1.548002,-0.482857,1.444049,1.477333,1.455407,1.393839,-0.130933,1.525655,0.683424


In [16]:
regressor=LinearRegression()
regressor.fit(X_train,y_train)

In [17]:
regressor.coef_

array([ 6432.59272318,  -133.11853452,   -70.36485019, -1713.18964719,
        -490.48291102,   -68.02812257,    68.36709467,  -464.25812278,
         651.94096231])

In [18]:
regressor.intercept_


3979.27372333125

# Model Training

In [19]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate_model(y_test, y_pred):
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_square = r2_score(y_test, y_pred)
    
    return mae, rmse, r2_square


In [21]:

models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

trained_model_list = []
model_list = []
r2_list = []

# Loop to train and evaluate each model
for i in range(len(list(models))):
    model_name = list(models.keys())[i]
    model = models[model_name]
    model.fit(X_train, y_train)

    # Make Predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    # Print results
    print(model_name)
    model_list.append(model_name)

    print('Model Training Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score:", r2_square * 100)

    r2_list.append(r2_square)

    print('=' * 35)
    print('\n')

Linear Regression
Model Training Performance
RMSE: 1014.274933056864
MAE: 674.7352796098307
R2 score: 93.63893549824441


Lasso
Model Training Performance
RMSE: 1014.33661582731
MAE: 675.8986621286323
R2 score: 93.63816178295377


Ridge
Model Training Performance
RMSE: 1014.2792052203284
MAE: 674.7687088427434
R2 score: 93.6388819120545


K-Neighbors Regressor
Model Training Performance
RMSE: 673.3829543634894
MAE: 351.4798747651847
R2 score: 97.19622827328664


Decision Tree
Model Training Performance
RMSE: 842.1295408833595
MAE: 425.5209950949697
R2 score: 95.61493314658762


Random Forest Regressor
Model Training Performance
RMSE: 612.6285963033545
MAE: 310.7790544740386
R2 score: 97.67933237002114


XGBRegressor
Model Training Performance
RMSE: 590.8313443876792
MAE: 298.09049175790346
R2 score: 97.84153272361638


CatBoosting Regressor
Model Training Performance
RMSE: 577.5641573416863
MAE: 294.7098771010348
R2 score: 97.93738162578012


AdaBoost Regressor
Model Training Performan

In [22]:
max(r2_list)

0.9793738162578012