MODEL TRAINING

In [1]:
import pandas as pd
print(pd.__version__, pd.__file__)

df = pd.read_csv("C:\\Diamond Price Prediction\\notebook\\data\\gemstone.csv")
df.head()

2.3.1 c:\Users\rajat\anaconda\Lib\site-packages\pandas\__init__.py


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [2]:
df = df.drop(labels=['id'],axis=1)

In [3]:
## independent and dependent features
X = df.drop(columns=['price'])  # price is your target
y = df['price']


In [4]:
y

0         13619
1         13387
2          2772
3           666
4         14453
          ...  
193568     1130
193569     2874
193570     3036
193571      681
193572     2258
Name: price, Length: 193573, dtype: int64

In [5]:
## train test split
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=30)

In [6]:
## define which columns should be ordinal -encoded and which should be scaled
categorical_cols = x_train.select_dtypes(include='object').columns
numerical_cols = x_train.select_dtypes(exclude='object').columns

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

In [8]:
##  define custom ranking for each ordinal variable

cut_categories = ['Fair', 'Good','Very Good','Premium','Ideal']
color_categories = ['D','E','F','G','H','I','J']
clarity_categories = {"I1","SI2","SI1","VS2","VS1","VVS2","VVS1","IF"}

In [13]:
categorical_cols = ['cut','color','clarity']
numerical_cols = [col for col in x_train.columns if col not in categorical_cols]

In [27]:
# Numeric pipeline
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
    ]

)

# Categorical pipeline
categorical_pipeline = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1))
    ]

)



# Column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols)
])


In [28]:
for col in categorical_cols:
    print(f"{col} unique values:", x_train[col].unique())
    print(f"{col} NaN count:", x_train[col].isna().sum())
    print("-" * 40)

cut unique values: ['Ideal' 'Good' 'Premium' 'Very Good' 'Fair']
cut NaN count: 0
----------------------------------------
color unique values: ['I' 'E' 'G' 'J' 'F' 'H' 'D']
color NaN count: 0
----------------------------------------
clarity unique values: ['VVS2' 'SI1' 'VS1' 'VS2' 'VVS1' 'SI2' 'IF' 'I1']
clarity NaN count: 0
----------------------------------------


In [29]:
print("X_train columns:", list(x_train.columns))
print("Numerical cols list:", numerical_cols)
print("Categorical cols list:", categorical_cols)

# Check if any are missing
missing_num = [col for col in numerical_cols if col not in x_train.columns]
missing_cat = [col for col in categorical_cols if col not in x_train.columns]
print("Missing numerical cols:", missing_num)
print("Missing categorical cols:", missing_cat)

X_train columns: ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']
Numerical cols list: ['carat', 'depth', 'table', 'x', 'y', 'z']
Categorical cols list: ['cut', 'color', 'clarity']
Missing numerical cols: []
Missing categorical cols: []


In [37]:
# 1️⃣ Run transformations
x_train_transformed = preprocessor.fit_transform(x_train)
x_test_transformed = preprocessor.transform(x_test)

# 2️⃣ Check the shapes vs expected feature count
print("Transformed X_train shape:", x_train_transformed.shape)
print("Expected feature count:", len(numerical_cols) + len(categorical_cols))
print("Numerical cols:", numerical_cols)
print("Categorical cols:", categorical_cols)

# 3️⃣ Fix feature names length mismatch
feature_names = list(numerical_cols) + list(categorical_cols)
if x_train_transformed.shape[1] != len(feature_names):
    print("⚠ Feature count mismatch — adjusting names...")
    feature_names = [f"feature_{i}" for i in range(x_train_transformed.shape[1])]

# 4️⃣ Create DataFrames
x_train = pd.DataFrame(x_train_transformed, columns=feature_names)
x_test = pd.DataFrame(x_test_transformed, columns=feature_names)

Transformed X_train shape: (135501, 9)
Expected feature count: 9
Numerical cols: ['carat', 'depth', 'table', 'x', 'y', 'z']
Categorical cols: ['cut', 'color', 'clarity']


In [38]:
# Apply transformations
x_train_transformed = preprocessor.fit_transform(x_train)
x_test_transformed = preprocessor.transform(x_test)
feature_names = list(numerical_cols) + list(categorical_cols)
x_train = pd.DataFrame(x_train_transformed,  columns=feature_names)
x_test = pd.DataFrame(x_test_transformed, columns=feature_names)

In [39]:
x_train.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
0,-0.975439,-0.849607,-0.121531,-1.042757,-1.08097,-1.12315,2.0,5.0,7.0
1,0.235195,1.833637,-0.121531,0.318447,0.279859,0.485354,1.0,1.0,2.0
2,0.494617,0.815855,0.3998,0.570855,0.606458,0.673737,3.0,3.0,4.0
3,-1.018676,0.260701,0.921131,-1.214034,-1.24427,-1.195605,3.0,3.0,5.0
4,-0.953821,-0.664555,-0.642862,-1.069801,-1.044681,-1.094168,2.0,6.0,7.0


In [40]:
## model training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [41]:
regression = LinearRegression()
regression.fit(x_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [45]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2_square = r2_score(true,predicted)
    return mae, rmse, r2_square 

In [None]:
## train multiple models

models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    "Ridge":Ridge(),
    'ElasticNet':ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(x_train,y_train)
    
    ## make prediction
    y_pred = model.predict(x_test)
    
    mae, rmse, r2_square = evaluate_model(y_test,y_pred)
    
    print('Model Training Performance')
    print('RMSE',rmse)
    print('MAE',mae)
    print('R2 score',r2_square*100)
    
    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')
    
    


In [46]:
## train multiple models

models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    "Ridge":Ridge(),
    'ElasticNet':ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(x_train,y_train)
    
    ## make prediction
    y_pred = model.predict(x_test)
    
    mae, rmse, r2_square = evaluate_model(y_test,y_pred)
    
    print('Model Training Performance')
    print('RMSE',rmse)
    print('MAE',mae)
    print('R2 score',r2_square*100)
    
    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')
    
    


Model Training Performance
RMSE 1110.8722516130488
MAE 708.9493740926935
R2 score 92.42423558860466


Model Training Performance
RMSE 1110.7991387156956
MAE 709.8924383102544
R2 score 92.42523276511665


Model Training Performance
RMSE 1110.8745030060365
MAE 708.9704779173325
R2 score 92.42420488113109


Model Training Performance
RMSE 1577.0672778896103
MAE 1094.3974389350303
R2 score 84.7314194407832


