In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [2]:
df=pd.read_csv("cubic_zirconia.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,3,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,4,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
4,5,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


In [4]:
df.isna().sum()

Unnamed: 0      0
carat           0
cut             0
color           0
clarity         0
depth         697
table           0
x               0
y               0
z               0
price           0
dtype: int64

In [5]:
df.duplicated().sum()

0

In [6]:
import seaborn as sns

In [7]:
sns.heatmap(df.corr(),annot=True)

ValueError: could not convert string to float: 'Ideal'

In [11]:
df['depth'].min(),df['depth'].max()

(50.8, 73.6)

In [12]:
x=df[["depth","x","y","z"]]

In [13]:
from sklearn.impute import KNNImputer
impute_knn = KNNImputer(n_neighbors=10)
x=impute_knn.fit_transform(x)

In [14]:
x=pd.DataFrame(x,columns=["depth","x","y","z"])

In [15]:
x.isna().sum()

depth    0
x        0
y        0
z        0
dtype: int64

In [16]:
df.drop(["Unnamed: 0","depth","x","y","z"],axis=1,inplace=True)

In [17]:
df.head()

Unnamed: 0,carat,cut,color,clarity,table,price
0,0.3,Ideal,E,SI1,58.0,499
1,0.33,Premium,G,IF,58.0,984
2,0.9,Very Good,E,VVS2,60.0,6289
3,0.42,Ideal,F,VS1,56.0,1082
4,0.31,Ideal,F,VVS1,59.0,779


In [18]:
main_df=pd.concat([df,x],axis=1)

In [19]:
main_df.shape

(26967, 10)

In [20]:
main_df.isna().sum()

carat      0
cut        0
color      0
clarity    0
table      0
price      0
depth      0
x          0
y          0
z          0
dtype: int64

In [None]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-1.7.5-py3-none-win_amd64.whl (70.9 MB)
     ---------------------------------------- 70.9/70.9 MB 4.1 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-1.7.5



[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [22]:
x=main_df.drop(["price"],axis=1)
y=df.price

In [23]:
# Carat -> Carat weight of the cubic zirconia.

# Cut ->  Describe the cut quality of the cubic zirconia. Quality is increasing order Fair,Good,Very Good,Premium,Ideal.

# Color ->  Colour of the cubic zirconia.With D being the best and J the worst.

# Clarity ->  cubic zirconia Clarity refers to the absence of the Inclusions and Blemishes. (In order from Best to Worst, FL= flawless, I3= level 3 inclusions) FL, IF, VVS1, VVS2, VS1, VS2, SI1, SI2, I1, I2, I3

# Depth ->  The Height of a cubic zirconia, measured from the Culet to the table, divided by its average Girdle Diameter.

# Table ->  The Width of the cubic zirconia's Table expressed as a Percentage of its Average Diameter.

# Price ->  the Price of the cubic zirconia.

# X -> Length of the cubic zirconia in mm.

# Y ->  Width of the cubic zirconia in mm.

# Z ->  Height of the cubic zirconia in mm.

In [24]:
cut={'Ideal':4, 'Premium':3, 'Very Good':2, 'Good':1, 'Fair':0}
clarity={'SI1':2, 'IF':7, 'VVS2':5, 'VS1':4, 'VVS1':6, 'VS2':3, 'SI2':1, 'I1':0}
color={'E':1, 'G':3, 'F':2, 'D':0, 'H':4, 'J':6, 'I':5}

In [25]:
x['carat'].max()

4.5

In [26]:
x['color']=x['color'].map(color)
x['cut']=x['cut'].map(cut)
x['clarity']=x['clarity'].map(clarity)

In [27]:
x.tail()
# x.z.describe()

Unnamed: 0,carat,cut,color,clarity,table,depth,x,y,z
26962,1.11,3,3,2,58.0,62.3,6.61,6.52,4.09
26963,0.33,4,4,7,55.0,61.9,4.44,4.42,2.74
26964,0.51,3,1,3,58.0,61.7,5.12,5.15,3.17
26965,0.27,2,2,5,56.0,61.8,4.19,4.2,2.6
26966,1.25,3,6,2,58.0,62.0,6.9,6.88,4.27


In [28]:
Xtrain,Xtest,Ytrain,Ytest = train_test_split(x,y,test_size=0.2)

In [29]:
from sklearn.linear_model import LinearRegression

In [30]:
xgbreg = xgb.XGBRegressor()

In [31]:
xgbreg.fit(Xtrain,Ytrain)

In [32]:
from sklearn.metrics import r2_score

In [33]:
xgbreg.predict(Xtest)

array([8996.693 , 3493.843 , 8163.147 , ...,  605.4413, 4653.68  ,
       3704.3528], dtype=float32)

In [34]:
r2_score(Ytest,xgbreg.predict(Xtest))

0.9789796186061188

In [95]:
l1=[]
l2=[]

In [98]:
for i in range(10):
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(x,y,test_size=0.2)
    xgbreg.fit(Xtrain,Ytrain)
    l1.append(r2_score(Ytest,xgbreg.predict(Xtest)))
    l2.append(r2_score(Ytrain,xgbreg.predict(Xtrain)))

In [99]:
l1,l2

([0.9812337955297017,
  0.9808955110278388,
  0.9825984704320396,
  0.9815681880487179,
  0.9808609331310945,
  0.9814701269336301,
  0.9804337455058934,
  0.9828658101256978,
  0.9816814820971254,
  0.9818907134276418,
  0.9815370142040669,
  0.981157013718071,
  0.9792735304086286,
  0.9801785177367129,
  0.9803159278347907,
  0.9826404584169722,
  0.9820510155458962,
  0.9800522700346112,
  0.9812804436610114,
  0.981205142768455],
 [0.9938926392805977,
  0.9938297728071335,
  0.9940947268760075,
  0.9940376962380354,
  0.9937870763047983,
  0.9936902622054185,
  0.9937001583437296,
  0.9937888674217942,
  0.9936956092331871,
  0.9936935316603408])

In [93]:
from sklearn.linear_model import LinearRegression

In [94]:
model=LinearRegression()

In [100]:
for i in range(10):
    model.fit(Xtrain,Ytrain)
    l2.append(r2_score(Ytest,model.predict(Xtest)))
    l1.append(r2_score(Ytest,xgbreg.predict(Xtest)))
    l2.append(r2_score(Ytrain,xgbreg.predict(Xtrain)))

In [101]:
l1

[0.9812337955297017,
 0.9808955110278388,
 0.9825984704320396,
 0.9815681880487179,
 0.9808609331310945,
 0.9814701269336301,
 0.9804337455058934,
 0.9828658101256978,
 0.9816814820971254,
 0.9818907134276418,
 0.9815370142040669,
 0.981157013718071,
 0.9792735304086286,
 0.9801785177367129,
 0.9803159278347907,
 0.9826404584169722,
 0.9820510155458962,
 0.9800522700346112,
 0.9812804436610114,
 0.981205142768455,
 0.981205142768455,
 0.981205142768455,
 0.981205142768455,
 0.981205142768455,
 0.981205142768455,
 0.981205142768455,
 0.981205142768455,
 0.981205142768455,
 0.981205142768455,
 0.981205142768455]

In [102]:
l2

[0.9938926392805977,
 0.9938297728071335,
 0.9940947268760075,
 0.9940376962380354,
 0.9937870763047983,
 0.9936902622054185,
 0.9937001583437296,
 0.9937888674217942,
 0.9936956092331871,
 0.9936935316603408,
 0.9020672078713383,
 0.9936935316603408,
 0.9020672078713383,
 0.9936935316603408,
 0.9020672078713383,
 0.9936935316603408,
 0.9020672078713383,
 0.9936935316603408,
 0.9020672078713383,
 0.9936935316603408,
 0.9020672078713383,
 0.9936935316603408,
 0.9020672078713383,
 0.9936935316603408,
 0.9020672078713383,
 0.9936935316603408,
 0.9020672078713383,
 0.9936935316603408,
 0.9020672078713383,
 0.9936935316603408]

In [107]:
from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler

In [108]:
pipeline=Pipeline([
    ("Model",xgb.XGBRegressor())
]
)

In [110]:
pipeline.fit(Xtrain,Ytrain)

Pipeline(steps=[('Model',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=None, early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, gamma=None, gpu_id=None,
                              grow_policy=None, importance_type=None,
                              interaction_constraints=None, learning_rate=None,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=None, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, n_estimators=100,
                              n_jobs=None, num_parallel_tree=None,
                          

In [111]:
r2_score(Ytest,pipeline.predict(Xtest))

0.981205142768455

In [112]:
!pip install pickle

ERROR: Could not find a version that satisfies the requirement pickle (from versions: none)
ERROR: No matching distribution found for pickle

[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [113]:
import pickle

In [130]:
pickle.dump(pipeline,open("gemstonereg.pkl","wb"))

In [117]:
df.color.unique()

array(['E', 'G', 'F', 'D', 'H', 'J', 'I'], dtype=object)

In [133]:
d1={}

for i,j in enumerate(df["color"].unique()):
    d1[i]=j

In [134]:
d1

{0: 'E', 1: 'G', 2: 'F', 3: 'D', 4: 'H', 5: 'J', 6: 'I'}

In [None]:
d1[pipe.predict(input)]