# Project Title:Diamond Price Prediction using KNN & Streamlit 
Deployment

# Part 1: End-to-End ML Pipeline

# 1 Load the dataset using pandas

In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [15]:
df = pd.read_csv('diamonds.csv')

In [16]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [18]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [21]:
df.duplicated().sum()

0

In [20]:
df = df.drop_duplicates()

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53794 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53794 non-null  float64
 1   cut      53794 non-null  object 
 2   color    53794 non-null  object 
 3   clarity  53794 non-null  object 
 4   depth    53794 non-null  float64
 5   table    53794 non-null  float64
 6   price    53794 non-null  int64  
 7   x        53794 non-null  float64
 8   y        53794 non-null  float64
 9   z        53794 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.5+ MB


# 2 Identify input features (X) and target variable (price)

In [23]:
y = df['price']

In [24]:
X = df.drop('price',axis=1)

# 3 Split the dataset into train and test (75:25 ratio)

In [29]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

In [30]:
print(X_train.shape)
print(X_test.shape)

(40345, 9)
(13449, 9)


In [31]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53794 entries, 0 to 53939
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53794 non-null  float64
 1   cut      53794 non-null  object 
 2   color    53794 non-null  object 
 3   clarity  53794 non-null  object 
 4   depth    53794 non-null  float64
 5   table    53794 non-null  float64
 6   x        53794 non-null  float64
 7   y        53794 non-null  float64
 8   z        53794 non-null  float64
dtypes: float64(6), object(3)
memory usage: 4.1+ MB


# 4 Perform data preprocessing: categorical encoding and numerical scaling

In [27]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [32]:
ts = ColumnTransformer(transformers=[('t1',OrdinalEncoder(),[1,2,3]),
                                    ('t2',StandardScaler(),[0,4,5,6,7,8])])

In [33]:
X_train_trans = ts.fit_transform(X_train)
X_train_trans = pd.DataFrame(X_train_trans)

In [34]:
X_train_trans.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,3.0,5.0,3.0,1.484443,0.316883,1.577255,1.346911,1.273994,1.343714
1,3.0,6.0,2.0,-0.923934,-0.10178,1.131703,-1.062547,-1.009652,-1.029124
2,3.0,4.0,4.0,1.632325,-0.032003,0.240601,1.552161,1.597729,1.555575
3,4.0,1.0,3.0,0.449263,1.014656,0.686152,0.579454,0.530278,0.679884
4,4.0,4.0,4.0,-0.163394,0.735547,-0.650501,-0.009525,0.031551,0.100799


In [35]:
X_test_trans = ts.transform(X_test)
X_test_trans = pd.DataFrame(X_test_trans)

In [36]:
X_test_trans.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,2.0,4.0,7.0,-1.071817,-0.032003,-0.650501,-1.285645,-1.228392,-1.240984
1,2.0,3.0,4.0,0.554894,-0.171557,-0.20495,0.722237,0.722769,0.694008
2,2.0,5.0,3.0,1.547821,-0.311112,0.686152,1.507541,1.448986,1.414334
3,4.0,3.0,1.0,0.850659,-1.63688,1.577255,1.061345,1.064003,0.821125
4,3.0,1.0,4.0,-0.923934,-0.311112,0.240601,-1.044699,-1.009652,-1.043248


# 5 Train a KNN Regressor model using sklearn

In [37]:
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor(n_neighbors=5)

In [38]:
knr.fit(X_train_trans,y_train)

In [39]:
# training phase Prediction
y_pred_train = knr.predict(X_train_trans)

In [40]:
from sklearn.metrics import r2_score
score = r2_score(y_pred_train,y_train)
print(score)

0.9748486910557114


In [41]:
# Testing Phase Prediction
y_pred = knr.predict(X_test_trans)

# 6 Evaluate the model using MAE, RMSE, and R2 score

In [42]:
from sklearn.metrics import r2_score
score = r2_score(y_pred,y_test)
print(score)

0.9615147260394901


# 7 Save the trained pipeline using pickle

In [44]:
import pickle


with open("Diamond.pkl", "wb") as file:
    pickle.dump(knr, file)

print("Model saved successfully as model.pkl")

Model saved successfully as model.pkl
