In [1]:
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [7]:
# Pull the diamonds dataset through seaborn's dataset loader
data = sns.load_dataset('diamonds')

# Work on a fixed 5000-row random sample (seeded so the same rows are drawn each run)
df = data.sample(n=5000, random_state=42)

# Column roles used by the preprocessing step
num_cols = ['carat', 'depth', 'table', 'x', 'y', 'z']
cat_cols = ['cut', 'color', 'clarity']

# Target is the price column; every remaining column is a predictor
y = df['price']
X = df.drop(columns='price')

# Hold out 25% of the sampled rows for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [8]:
# Column-wise preprocessing: numeric columns are standardized and categorical
# columns are one-hot encoded. handle_unknown='ignore' keeps transform from
# raising on a category that was not present when the encoder was fitted.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [10]:
# Seed passed to every estimator below that accepts `random_state`. It is the
# same value the notebook uses for the row sample and the train/test split.
RANDOM_STATE = 42


def _make_pipeline(regressor):
    """Wrap `regressor` in a pipeline that applies the shared preprocessing first.

    Parameters
    ----------
    regressor : estimator
        Unfitted regressor placed in the final ``'regressor'`` step.

    Returns
    -------
    Pipeline
        Two-step pipeline: ``'preprocessing'`` (the module-level
        ``preprocessor``) followed by ``'regressor'``.
    """
    return Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', regressor)
    ])


# Maps a display name to a (pipeline, parameter grid) pair. Grid keys carry the
# ``regressor__`` prefix so they address the pipeline's final step.
models = {
    'LinearRegression': (
        _make_pipeline(LinearRegression()),
        {
            'regressor__fit_intercept': [True, False],
            'regressor__positive': [True, False]
        }
    ),

    'SVR': (
        _make_pipeline(SVR()),
        {
            'regressor__kernel': ['rbf'],
            'regressor__C': [1, 10]
        }
    ),

    'DecisionTreeRegressor': (
        # Seeded: tree fitting permutes features at each split, so an unseeded
        # tree can differ between runs.
        _make_pipeline(DecisionTreeRegressor(random_state=RANDOM_STATE)),
        {
            'regressor__max_depth': [None, 5],
            'regressor__splitter': ['best']
        }
    ),

    'RandomForestRegressor': (
        _make_pipeline(RandomForestRegressor(random_state=RANDOM_STATE)),
        {
            'regressor__n_estimators': [10, 50],
            'regressor__max_depth': [None, 5]
        }
    ),

    'KNeighborsRegressor': (
        _make_pipeline(KNeighborsRegressor()),
        {
            'regressor__n_neighbors': [3, 5],
            'regressor__weights': ['uniform', 'distance']
        }
    ),

    'GradientBoostingRegressor': (
        # Seeded for the same reason as the decision tree: its base learners
        # are trees.
        _make_pipeline(GradientBoostingRegressor(random_state=RANDOM_STATE)),
        {
            'regressor__n_estimators': [10, 50],
            'regressor__learning_rate': [0.1, 0.01]
        }
    ),

    'XGBRegressor': (
        # Seeded explicitly so every tree-based model here is configured alike.
        _make_pipeline(
            XGBRegressor(eval_metric='rmse', verbosity=0, random_state=RANDOM_STATE)
        ),
        {
            'regressor__n_estimators': [10, 50],
            'regressor__learning_rate': [0.1, 0.01]
        }
    )
}


In [13]:
# One (model name, MAE, R²) tuple is collected per model that trains successfully
results = []

for name, spec in models.items():
    pipeline, params = spec
    print(f"\nTraining {name}...")
    try:
        # 5-fold grid search over this model's parameter grid; no `scoring`
        # argument is passed, so GridSearchCV uses its default scorer.
        grid = GridSearchCV(estimator=pipeline, param_grid=params, cv=5)
        grid.fit(X_train, y_train)

        # Predict on the held-out split with the fitted search object
        y_pred = grid.predict(X_test)

        # Held-out error metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        print(f"Best Parameters: {grid.best_params_}")
        print(f"Mean Squared Error: {mse}")
        print(f"Mean Absolute Error: {mae}")
        print(f"R-squared: {r2}")

        record = (name, mae, r2)
        results.append(record)

    except Exception as e:
        # Best effort: report the failure and continue with the next model
        print(f"❌ Error in {name}: {e}")

# Final ranking, lowest MAE first
print("\n🔚 Summary of All Models:")
ranked = sorted(results, key=lambda row: row[1])
for name, mae, r2 in ranked:
    print(f"{name}: MAE = {mae:.2f}, R² = {r2:.4f}")




Training LinearRegression...
Best Parameters: {'regressor__fit_intercept': True, 'regressor__positive': False}
Mean Squared Error: 1830620.5029985497
Mean Absolute Error: 778.4729811038003
R-squared: 0.891558452507746

Training SVR...
Best Parameters: {'regressor__C': 10, 'regressor__kernel': 'rbf'}
Mean Squared Error: 9116962.776728842
Mean Absolute Error: 1443.8111518602773
R-squared: 0.45993309355033696

Training DecisionTreeRegressor...
Best Parameters: {'regressor__max_depth': None, 'regressor__splitter': 'best'}
Mean Squared Error: 1191496.1248
Mean Absolute Error: 521.9264
R-squared: 0.9294186406233875

Training RandomForestRegressor...
Best Parameters: {'regressor__max_depth': None, 'regressor__n_estimators': 50}
Mean Squared Error: 576196.1565200133
Mean Absolute Error: 388.2373573333333
R-squared: 0.9658675281033009

Training KNeighborsRegressor...
Best Parameters: {'regressor__n_neighbors': 5, 'regressor__weights': 'distance'}
Mean Squared Error: 1318199.9781723144
Mean Abs