In [55]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder,StandardScaler

In [56]:
# Load the diamonds dataset from a CSV file in the current working directory
# and display the resulting DataFrame as the cell output.
data=pd.read_csv('diamonds.csv')
data

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


## price: price in US dollars (326--18,823)
## carat: weight of the diamond (0.2--5.01)
## cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
## color: diamond colour, from J (worst) to D (best)
## clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))
## x: length in mm (0--10.74)
## y: width in mm (0--58.9)
## z: depth in mm (0--31.8)
## depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)
## table: width of top of diamond relative to widest point (43--95)

In [57]:
# List the dtype of every column in the freshly loaded frame.
data.dtypes

Unnamed: 0      int64
carat         float64
cut            object
color          object
clarity        object
depth         float64
table         float64
price           int64
x             float64
y             float64
z             float64
dtype: object

In [58]:
# Count the missing values in each column.
data.isna().sum()

Unnamed: 0    0
carat         0
cut           0
color         0
clarity       0
depth         0
table         0
price         0
x             0
y             0
z             0
dtype: int64

In [59]:
# Drop the leftover row-number column that came in from the CSV.
# `errors='ignore'` keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising a KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')
data

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [60]:
# List the column dtypes again now that the 'Unnamed: 0' column has been dropped.
data.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object

In [61]:
# Show the distinct labels present in the `cut` column.
data['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [62]:
# Show the distinct labels present in the `clarity` column.
data['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [63]:
# Ordinal encodings for the three categorical columns: every label is paired
# with an integer rank. For color (J worst .. D best) and clarity (I1 worst ..
# IF best) a larger rank means a better grade; cut is ranked Fair=1 .. Ideal=5.
cut = dict(zip(['Ideal', 'Premium', 'Very Good', 'Good', 'Fair'], range(5, 0, -1)))
color = dict(zip(['D', 'E', 'F', 'G', 'H', 'I', 'J'], range(7, 0, -1)))
clarity = dict(
    zip(['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'], range(1, 9))
)

In [64]:
# Replace each cut label with its integer rank from the `cut` mapping.
# Guarded on dtype so the cell is safe to re-run: once the column is numeric,
# pushing it through `.map(cut)` again would find none of the integers among
# the dict's string keys and silently turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(data['cut']):
    data['cut'] = data['cut'].map(cut)

In [65]:
# Replace the color and clarity labels with their integer ranks.
# Each mapping is guarded on dtype so the cell is safe to re-run: an already
# encoded (numeric) column would otherwise be mapped a second time, and since
# its integers are not keys of the mapping dict every value would become NaN.
if not pd.api.types.is_numeric_dtype(data['color']):
    data['color'] = data['color'].map(color)
if not pd.api.types.is_numeric_dtype(data['clarity']):
    data['clarity'] = data['clarity'].map(clarity)

In [66]:
# List the column dtypes after the cut / color / clarity mapping cells.
data.dtypes

carat      float64
cut          int64
color        int64
clarity      int64
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object

In [67]:
# Separate the regression target (price) from the remaining feature columns.
y = data['price']
x = data.drop(columns=['price'])

In [68]:
# Display the feature frame (every column except price).
x

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,5,6,2,61.5,55.0,3.95,3.98,2.43
1,0.21,4,6,3,59.8,61.0,3.89,3.84,2.31
2,0.23,2,6,5,56.9,65.0,4.05,4.07,2.31
3,0.29,4,2,4,62.4,58.0,4.20,4.23,2.63
4,0.31,2,1,2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
53935,0.72,5,7,3,60.8,57.0,5.75,5.76,3.50
53936,0.72,2,7,3,63.1,55.0,5.69,5.75,3.61
53937,0.70,3,7,3,62.8,60.0,5.66,5.68,3.56
53938,0.86,4,3,2,61.0,58.0,6.15,6.12,3.74


In [69]:
# Display the target series (price).
y

0         326
1         326
2         327
3         334
4         335
         ... 
53935    2757
53936    2757
53937    2757
53938    2757
53939    2757
Name: price, Length: 53940, dtype: int64

In [70]:
# Split features and target into train / test parts, holding out 20% for
# testing; the fixed random_state makes the split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=6,
)

In [71]:
# First k-nearest-neighbours regressor (k=20); the following cells fit and
# score it on the features exactly as produced by the train/test split.
model=KNeighborsRegressor(n_neighbors=20)

In [72]:
# Fit the first model on the unscaled training split.
model.fit(x_train,y_train)

In [73]:
# Score the first model on the held-out, unscaled test split.
model.score(x_test,y_test)

0.9504498578432279

In [74]:
# Standardize the features. The scaler is fitted on the training split only
# and that same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [75]:
# Second k-nearest-neighbours regressor, same k=20 as the first model; the
# following cells fit and score it on the StandardScaler output.
model2=KNeighborsRegressor(n_neighbors=20)

In [76]:
# Fit the second model on the scaled training features (in a top-to-bottom
# run, `x_train_scaled` holds the StandardScaler output at this point).
model2.fit(x_train_scaled,y_train)

In [77]:
# Score the second model on the scaled test features (in a top-to-bottom
# run, `x_test_scaled` holds the StandardScaler output at this point).
model2.score(x_test_scaled,y_test)

0.9652204879663541

In [78]:
scaler1=MinMaxScaler()
x_train_scaled=scaler1.fit_transform(x_train)
x_test_scaled=scaler1.transform(x_test)

In [79]:
Model3=KNeighborsRegressor(n_neighbors=20)

In [80]:
Model3.fit(x_train_scaled,y_train)

In [81]:
Model3.score(x_test_scaled,y_test)

0.9575767426770042