# 📊 Diamonds Price Prediction using Linear and Lasso Regression

In [None]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import zscore


In [None]:

# Load the diamonds dataset from a fixed absolute path and preview it.
df = pd.read_csv('/content/diamonds.csv')
# 'Unnamed: 0' is the name pandas assigns to a header-less first column
# (presumably a saved row index here -- confirm against the CSV). Drop it
# when present; errors='ignore' keeps the cell from failing with a KeyError
# when the file does not contain that column.
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')
print(df.head())


In [None]:

# Count the missing entries in every column and display the totals.
missing_per_column = df.isna().sum()
print(missing_per_column)


In [None]:

# Label Encoding: replace the three categorical grade columns with integer codes.
# NOTE: the scales do not all run in the same direction -- 'cut' assigns its
# largest code to 'Ideal', whereas 'color' and 'clarity' assign their smallest
# code to 'D' and 'IF' respectively.
cut_mapping = {'Fair':1, 'Good':2, 'Very Good':3, 'Premium':4, 'Ideal':5}
color_mapping = {'J':7, 'I':6, 'H':5, 'G':4, 'F':3, 'E':2, 'D':1}
clarity_mapping = {'I1':8, 'SI2':7, 'SI1':6, 'VS2':5, 'VS1':4, 'VVS2':3, 'VVS1':2, 'IF':1}

for column, mapping in (
    ('cut', cut_mapping),
    ('color', color_mapping),
    ('clarity', clarity_mapping),
):
    # Series.map converts every value that is absent from the mapping into
    # NaN without any warning (this includes already-encoded integers if the
    # cell is executed twice). Detect such values up front and fail with a
    # clear message; values that are already missing are left untouched.
    unknown = set(df[column].dropna().unique()) - set(mapping)
    if unknown:
        raise ValueError(
            f"Unmapped values in column '{column}': {sorted(unknown, key=str)}"
        )
    df[column] = df[column].map(mapping)


In [None]:

# Plot the pairwise correlations of all columns as an annotated heatmap.
plt.figure(figsize=(10, 8))
correlations = df.corr()
sns.heatmap(data=correlations, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()


In [None]:

# Replace carat and price with their log(1 + x) transforms; the raw columns
# are dropped once the transformed ones have been added.
for raw_column in ('carat', 'price'):
    df[f'log_{raw_column}'] = np.log1p(df[raw_column])
df.drop(columns=['carat', 'price'], inplace=True)

# Keep only the rows whose depth AND table z-scores are both below 3 in
# absolute value; everything else is treated as an outlier.
z_scores = df[['depth', 'table']].apply(zscore)
inlier_mask = z_scores.abs().lt(3).all(axis=1)
df_clean = df.loc[inlier_mask]


In [None]:

# The target is the log-transformed price; every other column is a feature.
y = df_clean['log_price']
X = df_clean.drop(columns='log_price')

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [None]:

# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits so no test-set statistics are used for scaling.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:

# Fit a linear regression on the scaled training features against log-price.
linear = LinearRegression().fit(X_train_scaled, y_train)

# Predictions are on the log1p scale; expm1 maps both the predictions and
# the held-out targets back to the original price scale.
y_pred_log_linear = linear.predict(X_test_scaled)
y_true = np.expm1(y_test)
y_pred_linear = np.expm1(y_pred_log_linear)

# Error metrics computed on the back-transformed prices.
linear_r2 = r2_score(y_true, y_pred_linear)
linear_mae = mean_absolute_error(y_true, y_pred_linear)
linear_mse = mean_squared_error(y_true, y_pred_linear)

print(f"Linear Regression -> MSE: {linear_mse:.2f}, MAE: {linear_mae:.2f}, R2: {linear_r2:.4f}")


In [None]:

# Scatter the linear model's predictions against the true prices. The dashed
# red diagonal spans the true-price range and marks where predicted == true.
price_range = [y_true.min(), y_true.max()]
plt.scatter(y_true, y_pred_linear, alpha=0.5)
plt.plot(price_range, price_range, 'r--')
plt.title('Linear Regression: True vs Predicted Prices')
plt.xlabel('True Price')
plt.ylabel('Predicted Price')
plt.show()


In [None]:

# Fit a Lasso model whose regularisation strength is selected by 5-fold
# cross-validation on the scaled training features.
lasso = LassoCV(cv=5, random_state=42).fit(X_train_scaled, y_train)

# Predictions are on the log1p scale; expm1 maps them back to prices.
y_pred_log_lasso = lasso.predict(X_test_scaled)
y_pred_lasso = np.expm1(y_pred_log_lasso)

# Error metrics computed on the back-transformed prices.
lasso_r2 = r2_score(y_true, y_pred_lasso)
lasso_mae = mean_absolute_error(y_true, y_pred_lasso)
lasso_mse = mean_squared_error(y_true, y_pred_lasso)

print(f"Lasso Regression -> MSE: {lasso_mse:.2f}, MAE: {lasso_mae:.2f}, R2: {lasso_r2:.4f}")


In [None]:

# Scatter the Lasso model's predictions against the true prices. The dashed
# red diagonal spans the true-price range and marks where predicted == true.
price_range = [y_true.min(), y_true.max()]
plt.scatter(y_true, y_pred_lasso, alpha=0.5)
plt.plot(price_range, price_range, 'r--')
plt.title('Lasso Regression: True vs Predicted Prices')
plt.xlabel('True Price')
plt.ylabel('Predicted Price')
plt.show()


In [None]:

# Summarise both models' test-set metrics side by side, one row per model.
comparison = pd.DataFrame(
    [
        ['Linear Regression', linear_mse, linear_mae, linear_r2],
        ['Lasso Regression', lasso_mse, lasso_mae, lasso_r2],
    ],
    columns=['Model', 'MSE', 'MAE', 'R2 Score'],
)
print(comparison)
