In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [2]:
df=pd.read_csv("C:\\Users\\User\\Downloads\\diamonds.csv")

In [3]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [4]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [4]:
y=df['price']
X=df.drop(['price','x','y','z'],axis=1)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [6]:
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.75,random_state=24)

In [7]:
num_col=X_train.select_dtypes(include=['int64','float64']).columns
cat_col=X_train.select_dtypes(include=['object']).columns

In [21]:
num_col

Index(['carat', 'depth', 'table'], dtype='object')

In [8]:
cat_col

Index(['cut', 'color', 'clarity'], dtype='object')

In [9]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(40455, 6)
(40455,)
(13485, 6)
(13485,)


In [10]:
transformer=ColumnTransformer(transformers=[('t1',OrdinalEncoder(),cat_col),
                                            ('t2',StandardScaler(),num_col)])

In [11]:
X_train_trans=transformer.fit_transform(X_train)
X_test_trans=transformer.transform(X_test)

In [12]:
X_train_trans=pd.DataFrame(X_train_trans)
X_test_trans=pd.DataFrame(X_test_trans)

In [13]:
knn=KNeighborsRegressor(n_neighbors=7,weights='uniform',metric='minkowski',p=2)

In [14]:
knn.fit(X_train_trans,y_train)

In [15]:
y_test_pred=knn.predict(X_test_trans)
y_train_pred=knn.predict(X_train_trans)

In [16]:
r2_test=r2_score(y_test,y_test_pred)
r2_train=r2_score(y_train,y_train_pred)
mae_test=mean_absolute_error(y_test,y_test_pred)
rmse_test=np.sqrt(mean_squared_error(y_test,y_test_pred))

In [17]:
print(y_train_pred.shape)
print(y_test_pred.shape)

(40455,)
(13485,)


In [18]:
print(f"Test R2 Score:{r2_test:.4f}")
print(f"Train R2 Score:{r2_train:.4f}")
print(f"Test MAE:{mae_test:.2f}")
print(f"Test RMSE:{rmse_test:.2f}")

Test R2 Score:0.9492
Train R2 Score:0.9633
Test MAE:510.48
Test RMSE:898.64


In [22]:
import joblib
from sklearn.pipeline import Pipeline
diamond_pipeline = Pipeline(steps=[
    ('preprocessor', transformer), 
    ('model', knn)
])
joblib.dump(diamond_pipeline, "diamond_model.pkl")

['diamond_model.pkl']

In [20]:
import sklearn
print(sklearn.__version__)

1.6.1
