In [None]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the diamonds CSV. Set the DIAMONDS_CSV environment variable to
# point at your own copy of the file; when it is not set, fall back to the
# path the notebook was originally written against.
DATA_PATH = os.environ.get(
    'DIAMONDS_CSV',
    '/Users/riteshkumar/Downloads/ML projects/Diamond Price Prediction/diamonds.csv',
)
data = pd.read_csv(DATA_PATH)
data

## **Dataset Information**

The dataset contains the following features:

- **price:** Price in US dollars  
- **carat:** Weight of the diamond  
- **cut:** Quality of the cut (Fair, Good, Very Good, Premium, Ideal)  
- **color:** Diamond color, from J (worst) to D (best)  
- **clarity:** Measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))  
- **x:** Length in mm  
- **y:** Width in mm
- **z:** Depth in mm
- **depth:** Total depth percentage
- **table:** Width of the top of the diamond relative to its widest point

## First Look

In [None]:
# Print the column names, non-null counts and dtypes of the freshly loaded frame.
data.info()

# Drop unnecessary columns #1

In [None]:
# 'Unnamed: 0' carries no predictive information (presumably the row index that
# was written into the CSV -- confirm against the file).
# Rebinding with errors='ignore' instead of dropping in place keeps this cell
# idempotent: running it a second time no longer raises KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')
data

# Summary statistics

In [None]:
# Summary statistics, one row per column for easier reading.
data.describe().T

In [None]:
# Distinct labels of each graded column. Grading scales, worst =====> best:
#   clarity: I1, SI2, SI1, VS2, VS1, VVS2, VVS1, IF
#   cut:     Fair, Good, Very Good, Premium, Ideal
#   color:   J, I, H, G, F, E, D
for graded_column in ('clarity', 'cut', 'color'):
    print(data[graded_column].unique())

# Mapping "clarity, cut, color" column to integer values (Ordinal Encoding)

In [None]:
# Ordinal encodings: every grading scale is numbered from worst (1) to best.
clarity_map = {
    'I1': 1, 'SI2': 2, 'SI1': 3,
    'VS2': 4, 'VS1': 5,
    'VVS2': 6, 'VVS1': 7, 'IF': 8
}

cut_map = {
    'Fair': 1, 'Good': 2, 'Very Good': 3,
    'Premium': 4, 'Ideal': 5
}

color_map = {
    'J': 1, 'I': 2, 'H': 3,
    'G': 4, 'F': 5, 'E': 6, 'D': 7
}

# Encode a column only while it still holds the raw string labels. Calling
# .map() on a column that is already numeric finds none of the string keys and
# replaces every value with NaN, so re-running this cell must be a no-op.
for column, mapping in (
    ('clarity', clarity_map),
    ('cut', cut_map),
    ('color', color_map),
):
    if not pd.api.types.is_numeric_dtype(data[column]):
        data[column] = data[column].map(mapping)

data     # mapped


In [None]:
# Show the distinct values of the three encoded columns, each followed by a
# blank line.
print(
    data['clarity'].unique(),
    data['cut'].unique(),
    data['color'].unique(),
    sep='\n\n',
)

In [None]:
# Re-check the dtypes after the ordinal encoding: clarity, cut and color should
# no longer be reported as object columns.
data.info()

## Relations (graphs, heatmap)

In [None]:
# One scatter plot per feature, each plotted against the target (price).
for column in data.columns.drop('price'):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(data[column], data['price'], color='blue')
    ax.set_title(f'{column} vs price')
    ax.set_xlabel(column)
    ax.set_ylabel('price')
    ax.grid(True)
    fig.tight_layout()
    plt.show()
    print()


In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm', ax=ax)
plt.show()

## From the heatmap, most features show a clear positive or negative correlation with the price. The 'depth' feature, however, has a near-zero correlation with the price, so we drop it to simplify the model. A low linear correlation does not rule out a non-linear relationship, so this is unlikely to hurt accuracy but is not guaranteed to improve it.

# Dropping 'depth' column

In [None]:
# Remove the weakly correlated 'depth' feature. errors='ignore' keeps the cell
# idempotent: re-running it after 'depth' is already gone no longer raises
# KeyError.
data = data.drop(columns='depth', errors='ignore')

# Box plot to detect outliers

In [None]:
# One box plot per column to spot outliers before filtering.
for feature in data.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=data[feature], ax=ax)
    ax.set_title(f'Box Plot for {feature}')
    plt.show()

## Filtering outliers (columns: carat, table, x, y, z)

In [None]:
# Keep only the rows that fall inside every cut-off. The thresholds were chosen
# from the box plots above; all comparisons are strict.
data = data[
    (data['carat'] < 1.9)
    & (data['table'] > 53) & (data['table'] < 61)
    & (data['x'] < 9.2)
    & (data['y'] < 9.2)
    & (data['z'] > 1.2) & (data['z'] < 5.8)
]
data

In [None]:
# Same box plots again, now on the filtered data, to check the effect of the
# outlier removal.
for feature in data.columns:
    plt.figure(figsize=(6, 4))
    ax = sns.boxplot(x=data[feature])
    ax.set_title(f'Box Plot for {feature}')
    plt.show()

# Splitting Features & Output

In [None]:
# Feature matrix: every column except the target.
X = data.drop(columns=['price'])
X

In [None]:
# Target vector: the diamond price.
y = data.loc[:, 'price']
y

# Dividing data (Train & Test)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed makes the shuffle
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=44,
    shuffle=True,
)

# Report the shape of each split.
for message, subset in (
    ('X_train shape is ', X_train),
    ('X_test shape is ', X_test),
    ('y_train shape is ', y_train),
    ('y_test shape is ', y_test),
):
    print(message, subset.shape)

# Training

# #1 Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [None]:
# Pipeline: standardise every feature, then fit the linear model
#     y = Θ0 + x1Θ1 + x2Θ2 + x3Θ3 + x4Θ4 + ...
# scikit-learn's LinearRegression solves this as an ordinary least squares
# problem directly; there is no random weight initialisation and no iterative
# cost-function / theta update loop.
LinearRegressionModel = make_pipeline(
    StandardScaler(),
    LinearRegression(fit_intercept=True, copy_X=True, n_jobs=-1),
)
LinearRegressionModel.fit(X_train, y_train)

# Evaluation

In [None]:
# Predict prices for the held-out test rows with the fitted pipeline.
# NOTE: the name y_pred is reassigned by each later model's evaluation cell.
y_pred = LinearRegressionModel.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Mean squared error of the linear model's predictions on the test set.
MSEValue = mean_squared_error(
    y_true=y_test,
    y_pred=y_pred,
    multioutput='uniform_average',
)
print('Mean Squared Error Value is : ', MSEValue)

In [None]:
# score() on a regressor returns R^2; print it as a percentage for the train
# and test splits.
for label, features, target in (
    ('Linear Regression Train Score is : ', X_train, y_train),
    ('Linear Regression Test Score is : ', X_test, y_test),
):
    print(label, LinearRegressionModel.score(features, target) * 100, "%")

# #2 Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 100 trees with a fixed seed. max_depth=2 caps every tree at two
# levels of splits, which makes each tree very shallow.
RandomForestRegressorModel = RandomForestRegressor(
    n_estimators=100,
    max_depth=2,
    random_state=44,
)
RandomForestRegressorModel.fit(X_train, y_train)

In [None]:
# R^2 on the train and test splits, printed as percentages.
forest_train_r2 = RandomForestRegressorModel.score(X_train, y_train)
forest_test_r2 = RandomForestRegressorModel.score(X_test, y_test)
print('Random Forest Regressor Train Score is : ', forest_train_r2 * 100, "%")
print('Random Forest Regressor Test Score is : ', forest_test_r2 * 100, "%")

In [None]:
# Mean squared error of the random forest on the held-out test set.
y_pred = RandomForestRegressorModel.predict(X_test)
MSEValue = mean_squared_error(
    y_true=y_test,
    y_pred=y_pred,
    multioutput='uniform_average',
)
print('Mean Squared Error Value is : ', MSEValue)

# #3 Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Single regression tree limited to three levels of splits, with a fixed seed.
DecisionTreeRegressorModel = DecisionTreeRegressor(
    max_depth=3,
    random_state=33,
)
DecisionTreeRegressorModel.fit(X_train, y_train)

In [None]:
# Mean squared error of the decision tree on the held-out test set.
y_pred = DecisionTreeRegressorModel.predict(X_test)
MSEValue = mean_squared_error(
    y_true=y_test,
    y_pred=y_pred,
    multioutput='uniform_average',
)
print('Mean Squared Error Value is : ', MSEValue)

In [None]:
# R^2 on the train and test splits, printed as percentages.
for label, features, target in (
    ('DecisionTreeRegressor Train Score is : ', X_train, y_train),
    ('DecisionTreeRegressor Test Score is : ', X_test, y_test),
):
    print(label, DecisionTreeRegressorModel.score(features, target) * 100, "%")

# #4 KNN Regression

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# k-nearest-neighbours regressor.
#   weights:   'uniform' here; 'distance' or a user-defined function also work
#   algorithm: 'auto' here; 'ball_tree', 'kd_tree' or 'brute' also work
# NOTE(review): unlike the linear-regression pipeline, the features are not
# standardised before this distance-based model -- confirm this is intended.
KNeighborsRegressorModel = KNeighborsRegressor(
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
)
KNeighborsRegressorModel.fit(X_train, y_train)

In [None]:
# Mean squared error of the KNN model on the held-out test set.
y_pred = KNeighborsRegressorModel.predict(X_test)
MSEValue = mean_squared_error(
    y_true=y_test,
    y_pred=y_pred,
    multioutput='uniform_average',
)
print('Mean Squared Error Value is : ', MSEValue)

In [None]:
# R^2 on the train and test splits, printed as percentages.
knn_train_r2 = KNeighborsRegressorModel.score(X_train, y_train)
knn_test_r2 = KNeighborsRegressorModel.score(X_test, y_test)
print("KNeighborsRegressorModel Train Score is : ", knn_train_r2 * 100, "%")
print("KNeighborsRegressorModel Test Score is : ", knn_test_r2 * 100, "%")

## Conclusion  
This project focused on predicting diamond prices using various machine learning models. Initial data exploration revealed that most features showed meaningful correlations with the price, except for `depth`, which was removed to potentially improve model performance. Additionally, outliers were filtered to reduce noise and improve accuracy.

We trained and evaluated four models: **K-Nearest Neighbors (KNN), Linear Regression, Decision Tree, and Random Forest.**  
The models ranked as follows based on their test-set $R^2$ score (reported as a percentage):  

1. **K-Nearest Neighbors (KNN)** achieved the highest performance, with a train score of **97.83%** and a test score of **96.78%**, demonstrating strong generalization.  
2. **Linear Regression** performed well, indicating a linear relationship between features and price, with a train score of **90.17%** and a test score of **90.48%**.  
3. **Decision Tree Regressor** showed reasonable accuracy, with a train score of **87.93%** and a test score of **88.19%**.  
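4. **Random Forest Regressor** delivered lower accuracy than expected, with a train score of **81.81%** and a test score of **82.41%**. Because the train score is no higher than the test score, this points to underfitting rather than overfitting: the forest was limited to `max_depth=2`, so allowing deeper trees would likely improve it.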