### **DIAMOND DATASET**

In [363]:
# All Import statements

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [364]:
df = pd.read_csv('/content/diamonds.csv')

drop_col = 'Unnamed: 0'
df.drop(drop_col, axis=1,inplace=True)
df.head()

Unnamed: 0,cut,color,clarity,carat_weight,cut_quality,lab,symmetry,polish,eye_clean,culet_size,...,meas_depth,girdle_min,girdle_max,fluor_color,fluor_intensity,fancy_color_dominant_color,fancy_color_secondary_color,fancy_color_overtone,fancy_color_intensity,total_sales_price
0,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,unknown,N,...,1.79,M,M,unknown,,unknown,unknown,unknown,unknown,200.0
1,Round,E,VVS2,0.09,Very Good,IGI,Very Good,Very Good,unknown,N,...,1.78,STK,STK,unknown,,unknown,unknown,unknown,unknown,200.0
2,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,unknown,unknown,...,1.77,TN,M,unknown,,unknown,unknown,unknown,unknown,200.0
3,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,unknown,unknown,...,1.78,M,STK,unknown,,unknown,unknown,unknown,unknown,200.0
4,Round,E,VVS2,0.09,Very Good,IGI,Very Good,Excellent,unknown,N,...,1.82,STK,STK,unknown,,unknown,unknown,unknown,unknown,200.0


In [365]:
df.isnull().sum()

cut                            0
color                          0
clarity                        0
carat_weight                   0
cut_quality                    0
lab                            0
symmetry                       0
polish                         0
eye_clean                      0
culet_size                     0
culet_condition                0
depth_percent                  0
table_percent                  0
meas_length                    0
meas_width                     0
meas_depth                     0
girdle_min                     0
girdle_max                     0
fluor_color                    0
fluor_intensity                0
fancy_color_dominant_color     0
fancy_color_secondary_color    0
fancy_color_overtone           0
fancy_color_intensity          0
total_sales_price              1
dtype: int64

In [366]:
# there is only 1 null feature having 1 null value replace it by taking mode

mode_for_total_sales_price = df['total_sales_price'].mode().iloc[0]

df['total_sales_price'].fillna(mode_for_total_sales_price, inplace=True)

In [367]:
df.describe(include='O').T

Unnamed: 0,count,unique,top,freq
cut,156488,11,Round,125598
color,156488,11,E,26234
clarity,156488,11,VS2,26812
cut_quality,156488,6,Excellent,98503
lab,156488,3,GIA,144236
symmetry,156488,5,Excellent,97030
polish,156488,5,Excellent,125117
eye_clean,156488,5,unknown,105894
culet_size,156488,8,N,92191
culet_condition,156488,4,unknown,147466


In [368]:
df.columns

Index(['cut', 'color', 'clarity', 'carat_weight', 'cut_quality', 'lab',
       'symmetry', 'polish', 'eye_clean', 'culet_size', 'culet_condition',
       'depth_percent', 'table_percent', 'meas_length', 'meas_width',
       'meas_depth', 'girdle_min', 'girdle_max', 'fluor_color',
       'fluor_intensity', 'fancy_color_dominant_color',
       'fancy_color_secondary_color', 'fancy_color_overtone',
       'fancy_color_intensity', 'total_sales_price'],
      dtype='object')

In [369]:
# as instead of having null value there is unknown value in every columns

def count_for_null_values(columns):
  for col in columns:
    counts = (df[col] == 'unknown').sum()
    print(f'{col}:  {counts}')

print('No of Rows: ', df.shape[0])
print('--------------------------')
count_for_null_values(df.columns)

No of Rows:  156488
--------------------------
cut:  0
color:  3023
clarity:  0
carat_weight:  0
cut_quality:  30711
lab:  0
symmetry:  0
polish:  0
eye_clean:  105894
culet_size:  63187
culet_condition:  147466
depth_percent:  0
table_percent:  0
meas_length:  0
meas_width:  0
meas_depth:  0
girdle_min:  63086
girdle_max:  63404
fluor_color:  147377
fluor_intensity:  17
fancy_color_dominant_color:  153463
fancy_color_secondary_color:  156179
fancy_color_overtone:  155640
fancy_color_intensity:  153465
total_sales_price:  0


### **Handling the unknown values**

In [370]:
temp_df = df.copy()

In [371]:
# color feature
# for color columns we will take the mode value

mode_for_feature_color = temp_df['color'].mode()[0]
temp_df['color'].replace('unknown', mode_for_feature_color, inplace=True)
(temp_df['color'] == 'unknown').sum() # check for null or unknown value

0

In [372]:
# cut_quality feature
# obtain the values either through the 4 C's of diamond or -- > #1
# through the features like Symmetry, Polish, Clarity --- > #2

import pandas as pd

diamonds = temp_df
diamonds['cut_quality'] = diamonds['cut_quality'].replace('unknown', pd.NA)

# Calculate the overall mode value of 'cut_quality' for non-null values
overall_mode_for_cut_quality = diamonds['cut_quality'].dropna().mode().iloc[0]

# Filter out rows with null 'cut_quality' values
filtered_diamonds = diamonds.dropna(subset=['cut_quality'])

# Create a dictionary to map unique combinations to 'cut_quality'
unique_combinations = filtered_diamonds[['symmetry', 'polish', 'clarity', 'cut_quality']].drop_duplicates()

# Create a dictionary to map unique combinations to 'cut_quality'
unique_combinations = diamonds[['symmetry', 'polish', 'clarity', 'cut_quality']].drop_duplicates()

# Convert NumPy arrays to tuples for use as dictionary keys
unique_combinations['key'] = list(zip(unique_combinations['symmetry'],
                                      unique_combinations['polish'],
                                      unique_combinations['clarity'],))

combination_to_cut_quality = dict(zip(unique_combinations['key'], unique_combinations['cut_quality']))

# Define a function to fill missing 'cut_quality' values
def fill_missing_cut_quality(row):
    if pd.isna(row['cut_quality']):
        key = (row['symmetry'], row['polish'], row['clarity'])
        if key in combination_to_cut_quality:
            if pd.notna(combination_to_cut_quality.get(key)):
                return combination_to_cut_quality[key]
        return overall_mode_for_cut_quality
    return row['cut_quality']

# Apply the function to fill missing 'cut_quality' values
diamonds['cut_quality'] = diamonds.apply(fill_missing_cut_quality, axis=1)
diamonds['cut_quality'] = diamonds['cut_quality'].replace(pd.NA, 'unknown')
temp_df = diamonds

In [373]:
temp_df['cut_quality']

0         Excellent
1         Very Good
2         Excellent
3         Excellent
4         Very Good
            ...    
156483    Excellent
156484    Very Good
156485    Excellent
156486    Excellent
156487    Excellent
Name: cut_quality, Length: 156488, dtype: object

In [374]:
# culet_size
# we will take the mode for it

mode_for_feature_culet_size = temp_df['culet_size'].mode()[0]
temp_df['culet_size'].replace('unknown', mode_for_feature_culet_size, inplace=True)
(temp_df['culet_size'] == 'unknown').sum() # check for null or unknown value

0

In [375]:
# griddle_min/max
# as this feature can only be derived from the physical attributes such as
# measurements, proportions, analysis or detailed gemological analysis
# we will choose the mode value of the feature to fill the null value or unknown value

gridle_min_without_unknown = temp_df[temp_df['girdle_min'] != 'unknown']
gridle_max_without_unknown = temp_df[temp_df['girdle_max'] != 'unknown']

mode_for_feature_girdle_min = gridle_min_without_unknown['girdle_min'].mode()[0]
mode_for_feature_girdle_max = gridle_max_without_unknown['girdle_max'].mode()[0]

temp_df['girdle_min'].replace('unknown', mode_for_feature_girdle_min, inplace=True)
temp_df['girdle_max'].replace('unknown', mode_for_feature_girdle_max, inplace=True)

print("Unknown in Girdle Min", (temp_df['girdle_min'] == 'unknown').sum()) # check for null or unknown value for girdle_min
print("Unknown in Girdle Max", (temp_df['girdle_min'] == 'unknown').sum()) # check for null or unknown value for girdle_max


Unknown in Girdle Min 0
Unknown in Girdle Max 0


In [376]:
# Removal of the below feature is beacause of presence of null values more than half of the total entites.
# eye_clean is also remove because of the as it is almost null for half of entites and also that is indepenent from person to person.
# In below there are also the features which are dependent feature of removed feature so we must have to remove those.

temp_df = temp_df.drop(columns=['eye_clean',
                      'culet_condition',
                      'fluor_color',
                      'fluor_intensity',
                      'fancy_color_dominant_color',
                      'fancy_color_secondary_color',
                      'fancy_color_overtone',
                      'fancy_color_intensity'])
temp_df

Unnamed: 0,cut,color,clarity,carat_weight,cut_quality,lab,symmetry,polish,culet_size,depth_percent,table_percent,meas_length,meas_width,meas_depth,girdle_min,girdle_max,total_sales_price
0,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,N,62.7,59.0,2.85,2.87,1.79,M,M,200.0
1,Round,E,VVS2,0.09,Very Good,IGI,Very Good,Very Good,N,61.9,59.0,2.84,2.89,1.78,STK,STK,200.0
2,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,N,61.1,59.0,2.88,2.90,1.77,TN,M,200.0
3,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,N,62.0,59.0,2.86,2.88,1.78,M,STK,200.0
4,Round,E,VVS2,0.09,Very Good,IGI,Very Good,Excellent,N,64.9,58.5,2.79,2.83,1.82,STK,STK,200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156483,Cushion Modified,E,IF,0.53,Excellent,GIA,Good,Good,N,68.7,65.0,4.70,4.31,2.97,STK,XTK,4198.0
156484,Radiant,E,SI2,0.88,Very Good,GIA,Good,Very Good,N,61.5,69.0,5.66,5.39,3.32,XTK,STK,4198.0
156485,Round,G,SI1,0.81,Excellent,GIA,Excellent,Excellent,N,58.8,61.0,6.07,6.10,3.58,M,M,4199.0
156486,Round,G,SI1,0.80,Excellent,GIA,Excellent,Excellent,N,62.4,60.0,5.89,5.91,3.68,TN,STK,4199.0


## **Encode categorical variables using techniques like one-hot encoding or label encoding**

In [377]:
# encoding can be done in two different ways
# for nominal attributes we use one-hot encoding
# for ordinal attributes we use label encoding


# below is the one-hot encoding
temp_df = pd.get_dummies(temp_df, columns=['cut', 'lab'], prefix=["cut", "lab"])

temp_df.head()

Unnamed: 0,color,clarity,carat_weight,cut_quality,symmetry,polish,culet_size,depth_percent,table_percent,meas_length,...,cut_Heart,cut_Marquise,cut_Oval,cut_Pear,cut_Princess,cut_Radiant,cut_Round,lab_GIA,lab_HRD,lab_IGI
0,E,VVS2,0.09,Excellent,Very Good,Very Good,N,62.7,59.0,2.85,...,0,0,0,0,0,0,1,0,0,1
1,E,VVS2,0.09,Very Good,Very Good,Very Good,N,61.9,59.0,2.84,...,0,0,0,0,0,0,1,0,0,1
2,E,VVS2,0.09,Excellent,Very Good,Very Good,N,61.1,59.0,2.88,...,0,0,0,0,0,0,1,0,0,1
3,E,VVS2,0.09,Excellent,Very Good,Very Good,N,62.0,59.0,2.86,...,0,0,0,0,0,0,1,0,0,1
4,E,VVS2,0.09,Very Good,Very Good,Excellent,N,64.9,58.5,2.79,...,0,0,0,0,0,0,1,0,0,1


In [378]:
# below is the label encoding for the ordinal attributes
features_to_encode = ['color', 'clarity', 'cut_quality','symmetry', 'polish', 'culet_size', 'girdle_min', 'girdle_max']

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Loop through the selected features and apply label encoding
for feature in features_to_encode:
    temp_df[feature] = label_encoder.fit_transform(temp_df[feature])

temp_df.head()

Unnamed: 0,color,clarity,carat_weight,cut_quality,symmetry,polish,culet_size,depth_percent,table_percent,meas_length,...,cut_Heart,cut_Marquise,cut_Oval,cut_Pear,cut_Princess,cut_Radiant,cut_Round,lab_GIA,lab_HRD,lab_IGI
0,1,10,0.09,0,4,4,3,62.7,59.0,2.85,...,0,0,0,0,0,0,1,0,0,1
1,1,10,0.09,4,4,4,3,61.9,59.0,2.84,...,0,0,0,0,0,0,1,0,0,1
2,1,10,0.09,0,4,4,3,61.1,59.0,2.88,...,0,0,0,0,0,0,1,0,0,1
3,1,10,0.09,0,4,4,3,62.0,59.0,2.86,...,0,0,0,0,0,0,1,0,0,1
4,1,10,0.09,4,4,0,3,64.9,58.5,2.79,...,0,0,0,0,0,0,1,0,0,1


In [379]:
temp_df.describe()

Unnamed: 0,color,clarity,carat_weight,cut_quality,symmetry,polish,culet_size,depth_percent,table_percent,meas_length,...,cut_Heart,cut_Marquise,cut_Oval,cut_Pear,cut_Princess,cut_Radiant,cut_Round,lab_GIA,lab_HRD,lab_IGI
count,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,...,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0
mean,2.975762,6.708233,0.42458,0.939976,1.495968,0.794291,3.015119,61.252608,57.15326,4.797847,...,0.017305,0.013797,0.043824,0.035076,0.023165,0.012704,0.802605,0.921706,0.006448,0.071846
std,2.328827,2.497797,0.176764,1.624331,1.923883,1.590969,0.211171,9.972082,10.099293,0.997598,...,0.130405,0.116646,0.204705,0.183973,0.150427,0.111993,0.398034,0.268634,0.080039,0.258233
min,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,5.0,0.3,0.0,0.0,0.0,3.0,61.2,56.5,4.28,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
50%,3.0,7.0,0.36,0.0,0.0,0.0,3.0,62.4,58.0,4.59,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
75%,5.0,9.0,0.5,2.0,4.0,0.0,3.0,63.1,60.0,5.16,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
max,9.0,10.0,2.19,4.0,4.0,4.0,6.0,98.7,94.0,93.66,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## **EDA**

In [381]:
temp_df.describe()

Unnamed: 0,color,clarity,carat_weight,cut_quality,symmetry,polish,culet_size,depth_percent,table_percent,meas_length,...,cut_Heart,cut_Marquise,cut_Oval,cut_Pear,cut_Princess,cut_Radiant,cut_Round,lab_GIA,lab_HRD,lab_IGI
count,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,...,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0,156488.0
mean,2.975762,6.708233,0.42458,0.939976,1.495968,0.794291,3.015119,61.252608,57.15326,4.797847,...,0.017305,0.013797,0.043824,0.035076,0.023165,0.012704,0.802605,0.921706,0.006448,0.071846
std,2.328827,2.497797,0.176764,1.624331,1.923883,1.590969,0.211171,9.972082,10.099293,0.997598,...,0.130405,0.116646,0.204705,0.183973,0.150427,0.111993,0.398034,0.268634,0.080039,0.258233
min,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,5.0,0.3,0.0,0.0,0.0,3.0,61.2,56.5,4.28,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
50%,3.0,7.0,0.36,0.0,0.0,0.0,3.0,62.4,58.0,4.59,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
75%,5.0,9.0,0.5,2.0,4.0,0.0,3.0,63.1,60.0,5.16,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
max,9.0,10.0,2.19,4.0,4.0,4.0,6.0,98.7,94.0,93.66,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
