### Task - KNN from Scratch


#### 1.Load data

In [2]:
import pandas as pd

In [3]:
df=pd.read_csv(r"C:\Users\sharw\OneDrive\Desktop\ML\diamonds.csv")

In [4]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [5]:
df.shape

(53940, 10)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


#### Exploratory data analysis

In [7]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [8]:
df.duplicated().sum()

146

In [9]:
df=df.drop_duplicates()

In [10]:
df.duplicated().sum()

0

In [11]:
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53794.0,53794.0,53794.0,53794.0,53794.0,53794.0,53794.0
mean,0.79778,61.74808,57.458109,3933.065082,5.731214,5.734653,3.538714
std,0.47339,1.429909,2.233679,3988.11446,1.120695,1.141209,0.705037
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,951.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5326.75,6.54,6.54,4.03
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


#### 3.Segregating features and target

In [12]:
X=df.drop('price',axis=1)
y=df['price']

#### 4.Splitting data into train and test

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [15]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((43035, 9), (10759, 9), (43035,), (10759,))

#### 5. Train Data Preprocessing

In [17]:
#ordinal encoding 
from sklearn.preprocessing import OrdinalEncoder

In [18]:
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [19]:
df['color'].unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [20]:
df['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [21]:
#order 
# cut 
cut_order= ['Fair','Good','Very Good','Premium','Ideal']
#clarity 
clarity_order = ['I1','SI1','SI2','VS1', 'VS2','VVS1','VVS2','IF']
#color 
color_order= ['D','E','F','G','H','I','J']

In [22]:
oe=OrdinalEncoder(categories=[cut_order,color_order,clarity_order])

In [23]:
oe

In [24]:
from sklearn.preprocessing import RobustScaler

In [25]:
#Robust Scalar 
scaler=RobustScaler()
scaler

In [26]:
from sklearn.compose import ColumnTransformer

In [27]:
ct=ColumnTransformer(
    [('Scale',scaler,['carat','depth','table','x','y','z']),
    ('Ordinal',oe,['cut',"color","clarity"])],
    remainder='passthrough', #gives value for remaining column names
    verbose_feature_names_out=False #no prefix to column names
).set_output(transform='pandas')

In [28]:
ct

In [29]:
X_train_transformed = ct.fit_transform(X_train)
X_train_transformed.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
33544,-0.546875,0.133333,-0.666667,-0.666667,-0.648352,-0.646018,4.0,0.0,4.0
26752,2.359375,0.133333,0.0,1.459016,1.428571,1.469027,4.0,0.0,2.0
45914,-0.265625,-1.2,0.0,-0.245902,-0.225275,-0.318584,4.0,1.0,4.0
3071,0.15625,-0.066667,0.333333,0.147541,0.120879,0.132743,3.0,0.0,1.0
43829,-0.453125,-0.2,-0.333333,-0.480874,-0.489011,-0.495575,4.0,1.0,5.0


In [30]:
X_test_transformed= ct.transform(X_test)
X_test_transformed.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
43657,0.015625,2.066667,-1.0,-0.032787,-0.098901,0.088496,0.0,6.0,4.0
4274,0.3125,-0.533333,0.666667,0.245902,0.258242,0.212389,2.0,1.0,2.0
47412,-0.203125,0.2,-0.333333,-0.213115,-0.208791,-0.19469,4.0,3.0,3.0
44437,-0.296875,0.466667,1.0,-0.333333,-0.335165,-0.300885,3.0,1.0,4.0
13975,0.765625,0.333333,-0.666667,0.606557,0.60989,0.637168,4.0,5.0,1.0


#### KNN Scratch Implementation

In [32]:
query = X_test_transformed.sample(n=1, random_state=42)
query

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
31605,-0.625,-0.133333,1.333333,-0.770492,-0.758242,-0.769912,2.0,6.0,4.0


In [33]:
from sklearn.metrics import euclidean_distances

In [38]:
distances=pd.DataFrame(euclidean_distances(X_train_transformed,query))
distances

Unnamed: 0,0
0,6.641944
1,8.337931
2,5.727796
3,7.074265
4,5.748338
...,...
43030,3.457461
43031,5.393974
43032,4.142837
43033,3.863917


In [40]:
dist=distances.sort_values(by=0,ascending=True)
dist.head()

Unnamed: 0,0
40755,0.574844
38190,0.717426
1351,0.803701
35006,0.857724
42556,0.8734


In [41]:
# 5-similar points
k_5 = dist.iloc[:5, :]
k_5

Unnamed: 0,0
40755,0.574844
38190,0.717426
1351,0.803701
35006,0.857724
42556,0.8734


In [42]:
neighbor_indices = k_5.index
neighbor_indices

Index([40755, 38190, 1351, 35006, 42556], dtype='int64')

In [44]:
#finding targets classes of k_5
neighbor_labels = y_train.iloc[neighbor_indices]
neighbor_labels

38251    380
11711    596
11710    596
37444    984
3058     566
Name: price, dtype: int64

In [47]:
prediction = y_train[neighbor_indices].mean()

In [48]:
prediction

1470.0

#### 6. Model Building

In [51]:
from sklearn.neighbors import KNeighborsRegressor

In [52]:
knn=KNeighborsRegressor()
knn

In [53]:
knn.fit(X_train_transformed,y_train)

In [54]:
knn.score(X_train_transformed,y_train)

0.9744443614117048

#### 8. Model or Performance Evaluation

In [55]:
y_pred= knn.predict(X_test_transformed)
y_pred

array([ 2404.4,  3701.2,  1930.6, ...,   894.4, 14627. ,   810. ])

In [56]:
from sklearn.metrics import r2_score

In [57]:
r2_score(y_pred,y_test)

0.9563725512907589