### **DIAMOND DATASET**

In [None]:
# All Import statements

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the raw diamonds table and discard the leftover CSV index column
# in a single expression, then preview the first rows.
drop_col = 'Unnamed: 0'
df = pd.read_csv('/content/diamonds.csv').drop(columns=[drop_col])
df.head()

Unnamed: 0,cut,color,clarity,carat_weight,cut_quality,lab,symmetry,polish,eye_clean,culet_size,...,meas_depth,girdle_min,girdle_max,fluor_color,fluor_intensity,fancy_color_dominant_color,fancy_color_secondary_color,fancy_color_overtone,fancy_color_intensity,total_sales_price
0,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,unknown,N,...,1.79,M,M,unknown,,unknown,unknown,unknown,unknown,200
1,Round,E,VVS2,0.09,Very Good,IGI,Very Good,Very Good,unknown,N,...,1.78,STK,STK,unknown,,unknown,unknown,unknown,unknown,200
2,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,unknown,unknown,...,1.77,TN,M,unknown,,unknown,unknown,unknown,unknown,200
3,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,unknown,unknown,...,1.78,M,STK,unknown,,unknown,unknown,unknown,unknown,200
4,Round,E,VVS2,0.09,Very Good,IGI,Very Good,Excellent,unknown,N,...,1.82,STK,STK,unknown,,unknown,unknown,unknown,unknown,200


In [None]:
# Number of missing (NaN/None) entries in each column.
df.isna().sum()

cut                            0
color                          0
clarity                        0
carat_weight                   0
cut_quality                    0
lab                            0
symmetry                       0
polish                         0
eye_clean                      0
culet_size                     0
culet_condition                0
depth_percent                  0
table_percent                  0
meas_length                    0
meas_width                     0
meas_depth                     0
girdle_min                     0
girdle_max                     0
fluor_color                    0
fluor_intensity                0
fancy_color_dominant_color     0
fancy_color_secondary_color    0
fancy_color_overtone           0
fancy_color_intensity          0
total_sales_price              0
dtype: int64

In [None]:
# Replace any missing total_sales_price with the column's most frequent value.
# NOTE(review): the df.isnull().sum() output shown earlier in this notebook lists
# 0 missing values for every column, so on this data the fill is presumably a
# no-op kept as a safeguard -- confirm whether it is still needed.

mode_for_total_sales_price = df['total_sales_price'].mode().iloc[0]

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the df[...] selection: the in-place form acts on an intermediate object and is
# not guaranteed to update df under pandas Copy-on-Write.
df['total_sales_price'] = df['total_sales_price'].fillna(mode_for_total_sales_price)

In [None]:
# Summary (count / unique / top / freq) of the object-dtype columns,
# transposed so that each column becomes one row.
df.describe(include='O').transpose()

Unnamed: 0,count,unique,top,freq
cut,19731,10,Round,16611
color,19731,11,E,3229
clarity,19731,11,SI2,4034
cut_quality,19731,5,Excellent,12421
lab,19731,3,GIA,16766
symmetry,19731,4,Excellent,11530
polish,19731,4,Excellent,14490
eye_clean,19731,5,unknown,12563
culet_size,19731,5,N,10965
culet_condition,19731,4,unknown,17797


In [None]:
# Show the column labels currently present in df.
df.columns

Index(['cut', 'color', 'clarity', 'carat_weight', 'cut_quality', 'lab',
       'symmetry', 'polish', 'eye_clean', 'culet_size', 'culet_condition',
       'depth_percent', 'table_percent', 'meas_length', 'meas_width',
       'meas_depth', 'girdle_min', 'girdle_max', 'fluor_color',
       'fluor_intensity', 'fancy_color_dominant_color',
       'fancy_color_secondary_color', 'fancy_color_overtone',
       'fancy_color_intensity', 'total_sales_price'],
      dtype='object')

In [None]:
# Missing data in this dataset is recorded as the literal string 'unknown'
# rather than as NaN, so isnull() cannot see it; count the placeholder instead.

def count_for_null_values(columns, frame=None, placeholder='unknown'):
  """Print, for each column, how many entries equal the missing-value placeholder.

  Args:
    columns: iterable of column labels to inspect.
    frame: DataFrame to inspect. When omitted, the notebook-level ``df`` is used,
      which keeps existing ``count_for_null_values(df.columns)`` calls working.
    placeholder: value that marks a missing entry (``'unknown'`` by default).

  Returns:
    None. One ``"<column>:  <count>"`` line is printed per column.
  """
  if frame is None:
    frame = df  # fall back to the notebook-level DataFrame
  for col in columns:
    counts = (frame[col] == placeholder).sum()
    print(f'{col}:  {counts}')

# Print the total row count first so the per-column 'unknown' counts
# that follow can be read against it.
print('No of Rows: ', len(df))
print('--------------------------')
count_for_null_values(df.columns)

No of Rows:  19731
--------------------------
cut:  0
color:  30
clarity:  0
carat_weight:  0
cut_quality:  3114
lab:  0
symmetry:  0
polish:  0
eye_clean:  12563
culet_size:  8645
culet_condition:  17797
depth_percent:  0
table_percent:  0
meas_length:  0
meas_width:  0
meas_depth:  0
girdle_min:  8315
girdle_max:  8356
fluor_color:  18656
fluor_intensity:  1
fancy_color_dominant_color:  19700
fancy_color_secondary_color:  19727
fancy_color_overtone:  19714
fancy_color_intensity:  19701
total_sales_price:  0


### **Handling the unknown values**

In [None]:
# Work on an independent (deep) copy so the loaded frame df stays untouched.
temp_df = df.copy(deep=True)

In [None]:
# color feature
# 'unknown' entries are replaced with the most frequent real color.
# The placeholder is excluded before taking the mode so that it can never be
# selected as the fill value itself.

known_colors = temp_df.loc[temp_df['color'] != 'unknown', 'color']
mode_for_feature_color = known_colors.mode().iloc[0]

# Assign the result back rather than using replace(..., inplace=True) on the
# temp_df[...] selection, which is not guaranteed to update temp_df under
# pandas Copy-on-Write.
temp_df['color'] = temp_df['color'].replace('unknown', mode_for_feature_color)
(temp_df['color'] == 'unknown').sum() # check for null or unknown value

0

In [None]:
# cut_quality feature
# A missing grade could be inferred from the 4 C's of the diamond (#1) or from
# symmetry, polish and clarity (#2). This cell implements approach #2:
# each 'unknown' cut_quality is replaced with the most frequent known grade
# among diamonds sharing the same (symmetry, polish, clarity); combinations that
# never occur with a known grade fall back to the overall most frequent grade.
# (pandas is already imported as `pd` in the first cell.)

CUT_QUALITY_KEY_COLUMNS = ['symmetry', 'polish', 'clarity']

# Work on a copy; temp_df is rebound only once the column is completely filled.
diamonds = temp_df.copy()
diamonds['cut_quality'] = diamonds['cut_quality'].replace('unknown', pd.NA)

# Rows whose cut_quality is actually known. Every statistic below is derived
# from these rows only, so a missing grade can never end up in the lookup.
filtered_diamonds = diamonds.dropna(subset=['cut_quality'])

# Global fallback: the most frequent known grade.
overall_mode_for_cut_quality = filtered_diamonds['cut_quality'].mode().iloc[0]

# (symmetry, polish, clarity) -> most frequent known grade for that combination.
# Using the per-combination mode makes the mapping independent of row order.
combination_to_cut_quality = (
    filtered_diamonds
    .groupby(CUT_QUALITY_KEY_COLUMNS)['cut_quality']
    .agg(lambda grades: grades.mode().iloc[0])
    .to_dict()
)

def fill_missing_cut_quality(row):
    """Return the cut_quality to store for one row.

    A known grade is returned unchanged. A missing grade is looked up by the
    row's (symmetry, polish, clarity) tuple in ``combination_to_cut_quality``;
    if that combination has no known grade, ``overall_mode_for_cut_quality``
    is returned instead.
    """
    if pd.isna(row['cut_quality']):
        key = (row['symmetry'], row['polish'], row['clarity'])
        return combination_to_cut_quality.get(key, overall_mode_for_cut_quality)
    return row['cut_quality']

# Apply the function to fill missing 'cut_quality' values
diamonds['cut_quality'] = diamonds.apply(fill_missing_cut_quality, axis=1)

# Every missing grade receives either a combination-specific or the global
# fallback value, so nothing may be left unfilled at this point.
assert diamonds['cut_quality'].notna().all()
temp_df = diamonds

In [None]:
# Display the cut_quality column of temp_df for a visual check.
temp_df['cut_quality']

0        Excellent
1        Very Good
2        Excellent
3        Excellent
4        Very Good
           ...    
19726    Excellent
19727    Excellent
19728    Excellent
19729    Excellent
19730    Excellent
Name: cut_quality, Length: 19731, dtype: object

In [None]:
# culet_size
# 'unknown' entries are replaced with the most frequent real culet size.
# The placeholder is excluded before taking the mode so that it can never be
# selected as the fill value itself.

known_culet_sizes = temp_df.loc[temp_df['culet_size'] != 'unknown', 'culet_size']
mode_for_feature_culet_size = known_culet_sizes.mode().iloc[0]

# Assign the result back rather than using replace(..., inplace=True) on the
# temp_df[...] selection, which is not guaranteed to update temp_df under
# pandas Copy-on-Write.
temp_df['culet_size'] = temp_df['culet_size'].replace('unknown', mode_for_feature_culet_size)
(temp_df['culet_size'] == 'unknown').sum() # check for null or unknown value

0

In [None]:
# girdle_min / girdle_max
# These features can only be derived from physical attributes such as
# measurements, proportions or a detailed gemological analysis, so the
# 'unknown' entries are filled with the mode of each column's known values.

gridle_min_without_unknown = temp_df[temp_df['girdle_min'] != 'unknown']
gridle_max_without_unknown = temp_df[temp_df['girdle_max'] != 'unknown']

mode_for_feature_girdle_min = gridle_min_without_unknown['girdle_min'].mode()[0]
mode_for_feature_girdle_max = gridle_max_without_unknown['girdle_max'].mode()[0]

# Assign the results back rather than using replace(..., inplace=True) on the
# temp_df[...] selections, which is not guaranteed to update temp_df under
# pandas Copy-on-Write.
temp_df['girdle_min'] = temp_df['girdle_min'].replace('unknown', mode_for_feature_girdle_min)
temp_df['girdle_max'] = temp_df['girdle_max'].replace('unknown', mode_for_feature_girdle_max)

# Each check inspects its own column (the max check previously re-read girdle_min).
print("Unknown in Girdle Min", (temp_df['girdle_min'] == 'unknown').sum()) # check for null or unknown value for girdle_min
print("Unknown in Girdle Max", (temp_df['girdle_max'] == 'unknown').sum()) # check for null or unknown value for girdle_max


Unknown in Girdle Min 0
Unknown in Girdle Max 0


In [None]:
# The features below are removed because more than half of their entries are
# missing ('unknown').
# eye_clean is removed as well: it is missing for more than half of the entries
# and the assessment differs from person to person.
# The list also contains features that depend on a removed feature, so they
# have to be removed together with it.

columns_to_drop = [
    'eye_clean',
    'culet_condition',
    'fluor_color',
    'fluor_intensity',
    'fancy_color_dominant_color',
    'fancy_color_secondary_color',
    'fancy_color_overtone',
    'fancy_color_intensity',
]
temp_df = temp_df.drop(columns_to_drop, axis='columns')
temp_df

Unnamed: 0,cut,color,clarity,carat_weight,cut_quality,lab,symmetry,polish,culet_size,depth_percent,table_percent,meas_length,meas_width,meas_depth,girdle_min,girdle_max,total_sales_price
0,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,N,62.7,59.0,2.85,2.87,1.79,M,M,200
1,Round,E,VVS2,0.09,Very Good,IGI,Very Good,Very Good,N,61.9,59.0,2.84,2.89,1.78,STK,STK,200
2,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,N,61.1,59.0,2.88,2.90,1.77,TN,M,200
3,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,N,62.0,59.0,2.86,2.88,1.78,M,STK,200
4,Round,E,VVS2,0.09,Very Good,IGI,Very Good,Excellent,N,64.9,58.5,2.79,2.83,1.82,STK,STK,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19726,Round,K,SI2,0.45,Excellent,GIA,Excellent,Excellent,N,62.4,58.0,4.88,4.91,3.06,STK,M,720
19727,Round,J,VS2,0.30,Excellent,GIA,Excellent,Excellent,N,59.0,59.0,4.41,4.43,2.61,M,TN,720
19728,Round,K,VVS2,0.30,Excellent,GIA,Excellent,Excellent,N,62.6,57.0,4.25,4.28,2.67,M,STK,720
19729,Round,I,SI1,0.30,Excellent,GIA,Excellent,Excellent,N,62.7,57.0,4.28,4.30,2.69,M,STK,720


## **Encode categorical variables using techniques like one-hot encoding or label encoding**

In [None]:
# Two encoding schemes are used in this notebook:
#   - nominal attributes -> one-hot encoding
#   - ordinal attributes -> label encoding


# One-hot encoding: each nominal column is expanded into indicator columns
# prefixed with the column's own name.
nominal_columns = ['cut', 'lab']
temp_df = pd.get_dummies(temp_df, columns=nominal_columns, prefix=nominal_columns)

temp_df.head()

Unnamed: 0,color,clarity,carat_weight,cut_quality,symmetry,polish,culet_size,depth_percent,table_percent,meas_length,...,cut_Heart,cut_Marquise,cut_Oval,cut_Pear,cut_Princess,cut_Radiant,cut_Round,lab_GIA,lab_HRD,lab_IGI
0,E,VVS2,0.09,Excellent,Very Good,Very Good,N,62.7,59.0,2.85,...,0,0,0,0,0,0,1,0,0,1
1,E,VVS2,0.09,Very Good,Very Good,Very Good,N,61.9,59.0,2.84,...,0,0,0,0,0,0,1,0,0,1
2,E,VVS2,0.09,Excellent,Very Good,Very Good,N,61.1,59.0,2.88,...,0,0,0,0,0,0,1,0,0,1
3,E,VVS2,0.09,Excellent,Very Good,Very Good,N,62.0,59.0,2.86,...,0,0,0,0,0,0,1,0,0,1
4,E,VVS2,0.09,Very Good,Very Good,Excellent,N,64.9,58.5,2.79,...,0,0,0,0,0,0,1,0,0,1


In [None]:
# Label encoding for the attributes treated as ordinal.
# NOTE(review): the integer codes follow the sorted order of the labels, not
# their quality rank -- the output below shows 'Excellent' -> 0 and
# 'Very Good' -> 3 for cut_quality. If a true ordinal scale is required, an
# explicit category order per feature has to be supplied instead.
features_to_encode = ['color', 'clarity', 'cut_quality','symmetry', 'polish', 'culet_size', 'girdle_min', 'girdle_max']

# One fitted encoder is kept per feature so each label <-> code mapping can be
# looked up later (classes_ / inverse_transform). Re-fitting a single shared
# encoder would retain only the mapping of the last feature.
label_encoders = {}

# Loop through the selected features and apply label encoding
for feature in features_to_encode:
    label_encoder = LabelEncoder()
    temp_df[feature] = label_encoder.fit_transform(temp_df[feature])
    label_encoders[feature] = label_encoder

temp_df.head()

Unnamed: 0,color,clarity,carat_weight,cut_quality,symmetry,polish,culet_size,depth_percent,table_percent,meas_length,...,cut_Heart,cut_Marquise,cut_Oval,cut_Pear,cut_Princess,cut_Radiant,cut_Round,lab_GIA,lab_HRD,lab_IGI
0,1,10,0.09,0,3,3,1,62.7,59.0,2.85,...,0,0,0,0,0,0,1,0,0,1
1,1,10,0.09,3,3,3,1,61.9,59.0,2.84,...,0,0,0,0,0,0,1,0,0,1
2,1,10,0.09,0,3,3,1,61.1,59.0,2.88,...,0,0,0,0,0,0,1,0,0,1
3,1,10,0.09,0,3,3,1,62.0,59.0,2.86,...,0,0,0,0,0,0,1,0,0,1
4,1,10,0.09,3,3,0,1,64.9,58.5,2.79,...,0,0,0,0,0,0,1,0,0,1


In [None]:
# Summary statistics of temp_df after encoding.
temp_df.describe()

Unnamed: 0,color,clarity,carat_weight,cut_quality,symmetry,polish,culet_size,depth_percent,table_percent,meas_length,...,cut_Heart,cut_Marquise,cut_Oval,cut_Pear,cut_Princess,cut_Radiant,cut_Round,lab_GIA,lab_HRD,lab_IGI
count,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,...,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0
mean,3.819624,6.32122,0.276109,0.711064,1.246009,0.796513,1.008008,59.908748,56.471711,4.199325,...,0.00522,0.016928,0.034007,0.043384,0.015863,0.001977,0.841873,0.849729,0.006234,0.144037
std,2.703858,2.672229,0.060873,1.270544,1.477897,1.324608,0.125092,12.354768,11.419555,0.814594,...,0.072064,0.129004,0.181253,0.203724,0.12495,0.044416,0.364869,0.357346,0.07871,0.351136
min,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,4.0,0.23,0.0,0.0,0.0,1.0,60.8,57.0,3.89,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
50%,4.0,7.0,0.3,0.0,0.0,0.0,1.0,62.1,58.0,4.25,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
75%,6.0,9.0,0.3,0.0,3.0,3.0,1.0,62.9,60.0,4.36,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
max,9.0,10.0,0.7,3.0,3.0,3.0,3.0,91.2,94.0,8.54,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## **EDA**

In [None]:
# Summary statistics of temp_df.
# NOTE(review): this repeats the describe() call of the preceding cell.
temp_df.describe()

Unnamed: 0,color,clarity,carat_weight,cut_quality,symmetry,polish,culet_size,depth_percent,table_percent,meas_length,...,cut_Heart,cut_Marquise,cut_Oval,cut_Pear,cut_Princess,cut_Radiant,cut_Round,lab_GIA,lab_HRD,lab_IGI
count,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,...,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0,19731.0
mean,3.819624,6.32122,0.276109,0.711064,1.246009,0.796513,1.008008,59.908748,56.471711,4.199325,...,0.00522,0.016928,0.034007,0.043384,0.015863,0.001977,0.841873,0.849729,0.006234,0.144037
std,2.703858,2.672229,0.060873,1.270544,1.477897,1.324608,0.125092,12.354768,11.419555,0.814594,...,0.072064,0.129004,0.181253,0.203724,0.12495,0.044416,0.364869,0.357346,0.07871,0.351136
min,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,4.0,0.23,0.0,0.0,0.0,1.0,60.8,57.0,3.89,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
50%,4.0,7.0,0.3,0.0,0.0,0.0,1.0,62.1,58.0,4.25,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
75%,6.0,9.0,0.3,0.0,3.0,3.0,1.0,62.9,60.0,4.36,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
max,9.0,10.0,0.7,3.0,3.0,3.0,3.0,91.2,94.0,8.54,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# 'total_sales_price' is the target variable; every other column is a feature.
# Both are taken from temp_df so that features and target always come from the
# same rows, even if rows are later removed from temp_df.
X = temp_df.drop(columns=['total_sales_price'])  # Features
y = temp_df['total_sales_price']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (recommended for SGD). The scaler is fitted on the
# training split only and then applied to the test split.
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

In [None]:
# Create an instance of the SGDRegressor
sgd_regressor = SGDRegressor(max_iter=1000, learning_rate='constant', eta0=0.01, random_state=42)

# Train the model on the training data
sgd_regressor.fit(X_train, y_train)

In [None]:
y_pred = sgd_regressor.predict(X_test)

In [None]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r_squared = sgd_regressor.score(X_test, y_test)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r_squared}')

Mean Squared Error: 5.6710962617511415e+25
Root Mean Squared Error: 7530668138851.387
R-squared: -6.662232692336547e+21
