In [None]:

# !pip install datasets scikit-learn pandas matplotlib seaborn tensorflow joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from datasets import load_dataset

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Plot appearance: seaborn grid theme, then default figure size and font size
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})

print("✓ All libraries imported successfully")

In [None]:
# Load the dataset via `load_dataset` and inspect its structure

ds = load_dataset("ketan0/test_regression_preds")
df = ds['train'].to_pandas()  # the 'train' split, converted to a DataFrame

# Overview: dimensions, dtypes, sample rows, summary statistics, null counts.
# Each heading is printed on its own line, followed by the corresponding value.
print("\nDataset Shape:", df.shape)
print("\nColumn Names and Types:", df.dtypes, sep="\n")
print("\nFirst 10 rows:", df.head(10), sep="\n")
print("\nStatistical Summary:", df.describe(), sep="\n")
print("\nMissing Values:", df.isnull().sum(), sep="\n")

In [None]:

# Derive the feature matrix X and the target vector y from df

column_names = df.columns.tolist()
print("\nColumn names:", column_names)

if len(column_names) != 1:
    # General case: the last column is the target, every column before it is a feature
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    print(f"\n✓ Using {X.shape[1]} feature(s) and 1 target variable")
else:
    # Single-column case: the only column is the target, so the row position
    # (0..len(df)-1) is used as a stand-in feature
    print("\nWarning: Dataset has only one column. Creating index-based feature.")
    print("Note: In a real scenario, you should have multiple proper features.")

    X = pd.DataFrame({'index': np.arange(len(df))})
    y = df.iloc[:, 0]

    print("\n✓ Created features from index")

# Summary of what was selected
print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")
print("\nTarget variable statistics:", y.describe(), sep="\n")

In [None]:
"""
Split data into training, validation, and test sets
- Training: 64% (80% of 80%)
- Validation: 16% (20% of 80%)
- Test: 20%
"""

# Step 1: hold out 20% of all samples as the test set
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: carve the validation set (20%) out of the remaining non-test data
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

# Report the size of each subset and its share of the full dataset
total_samples = len(X)
split_report = (
    ("\nTraining set", X_train),
    ("Validation set", X_val),
    ("Test set", X_test),
)
for label, subset in split_report:
    print(f"{label}: {len(subset)} samples ({len(subset)/total_samples*100:.1f}%)")
print(f"Total: {total_samples} samples")

In [None]:


# Candidate regression pipelines to compare, keyed by display name


def _regression_pipeline(estimator, poly_degree=None):
    """Build a Pipeline: optional polynomial expansion -> standard scaling -> estimator.

    Parameters
    ----------
    estimator : the final regressor, stored under the step name 'model'.
    poly_degree : int or None
        When given, a 'poly' step (PolynomialFeatures without bias column)
        of that degree is placed ahead of the scaler.
    """
    steps = []
    if poly_degree is not None:
        steps.append(('poly', PolynomialFeatures(degree=poly_degree, include_bias=False)))
    steps.append(('scaler', StandardScaler()))
    steps.append(('model', estimator))
    return Pipeline(steps)


models = {
    # Plain and L2-regularised linear models on scaled inputs
    "Linear Regression": _regression_pipeline(LinearRegression()),
    "Ridge (α=0.1)": _regression_pipeline(Ridge(alpha=0.1)),
    "Ridge (α=1.0)": _regression_pipeline(Ridge(alpha=1.0)),
    "Ridge (α=10.0)": _regression_pipeline(Ridge(alpha=10.0)),
    # Same estimators on polynomially expanded inputs
    "Polynomial Deg-2": _regression_pipeline(LinearRegression(), poly_degree=2),
    "Polynomial Deg-3": _regression_pipeline(LinearRegression(), poly_degree=3),
    "Ridge Poly Deg-2": _regression_pipeline(Ridge(alpha=1.0), poly_degree=2),
}

print(f"\nDefined {len(models)} models for comparison")
for name in models:
    print(f"  - {name}")

In [None]:

"""
Train each model and evaluate on both training and validation sets
"""
# Maps model name -> dict holding the fitted pipeline ('Model') and its metrics
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Fit on the training split only; the validation split is used purely for scoring
    model.fit(X_train, y_train)

    # Predictions on both splits so train and validation error can be compared
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    # Compute each MSE once and derive the RMSE from it
    mse_train = mean_squared_error(y_train, y_train_pred)
    mse_val = mean_squared_error(y_val, y_val_pred)

    results[name] = {
        'Model': model,
        'Train MSE': mse_train,
        'Val MSE': mse_val,
        'Train RMSE': np.sqrt(mse_train),
        'Val RMSE': np.sqrt(mse_val),
        'Train MAE': mean_absolute_error(y_train, y_train_pred),
        'Val MAE': mean_absolute_error(y_val, y_val_pred),
        'Train R2': r2_score(y_train, y_train_pred),
        'Val R2': r2_score(y_val, y_val_pred),
    }

    print(f"  ✓ Val MSE: {results[name]['Val MSE']:.4f}, Val R2: {results[name]['Val R2']:.4f}")

# One row per model. Because each entry also holds the fitted pipeline, the
# transposed frame comes out with dtype=object in every column; cast the metric
# columns back to float so sorting, rounding and aggregation behave numerically.
# The 'Model' column is kept as-is.
results_df = pd.DataFrame(results).T
metric_columns = [col for col in results_df.columns if col != 'Model']
results_df = results_df.astype({col: float for col in metric_columns})

print("\n" + "="*80)
print("VALIDATION PERFORMANCE SUMMARY")
print("="*80)
print(results_df[['Val MSE', 'Val RMSE', 'Val MAE', 'Val R2']].sort_values('Val MSE'))

In [None]:

# Visual comparison of the fitted models

# Collect the per-model validation metrics in a fixed model order
model_names = list(results)
val_mse = [results[name]['Val MSE'] for name in model_names]
val_rmse = [results[name]['Val RMSE'] for name in model_names]
val_r2 = [results[name]['Val R2'] for name in model_names]
y_pos = np.arange(len(model_names))

# Plot 1: one panel per validation metric, models listed along the y-axis
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Model Performance Comparison (Validation Set)', fontsize=16, fontweight='bold')

# (values, line colour, x-axis label, panel title) for each panel, left to right
metric_panels = (
    (val_mse, 'steelblue', 'Mean Squared Error (MSE)', 'Validation MSE (Lower is Better)'),
    (val_rmse, 'darkorange', 'Root Mean Squared Error (RMSE)', 'Validation RMSE (Lower is Better)'),
    (val_r2, 'green', 'R² Score', 'Validation R² (Higher is Better)'),
)
for panel_ax, (values, line_color, x_label, panel_title) in zip(axes, metric_panels):
    panel_ax.plot(values, y_pos, marker='o', linewidth=2, markersize=8, color=line_color)
    panel_ax.set_ylabel('Model')
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title(panel_title)
    panel_ax.set_yticks(y_pos)
    panel_ax.set_yticklabels(model_names)
    panel_ax.invert_yaxis()  # first model at the top
    panel_ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

# Plot 2: training vs validation MSE per model (a gap between the two hints at overfitting)
train_mse = [results[name]['Train MSE'] for name in model_names]
x = np.arange(len(model_names))

fig, ax = plt.subplots(figsize=(14, 7))
mse_series = (
    (train_mse, 'o', 'Training MSE', 'skyblue'),
    (val_mse, 's', 'Validation MSE', 'coral'),
)
for series, marker_shape, series_label, line_color in mse_series:
    ax.plot(x, series, marker=marker_shape, linewidth=2, markersize=8,
            label=series_label, color=line_color)

ax.set(
    xlabel='Model',
    ylabel='Mean Squared Error (MSE)',
    title='Training vs Validation MSE - Detecting Overfitting',
)
ax.set_xticks(x)
ax.set_xticklabels(model_names, rotation=45, ha='right')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
"""
Automatically select the best model based on validation MSE
"""
banner = "=" * 80
print("\n" + banner)
print("BEST MODEL SELECTION")
print(banner)

# The winner is the entry with the smallest validation MSE
best_model_name = min(results, key=lambda model_name: results[model_name]['Val MSE'])
best_entry = results[best_model_name]
best_model_pipeline = best_entry['Model']

print(f"\nBest Model: {best_model_name}")
# Report the winner's validation metrics, one per line
for metric_label, metric_key in (("MSE", 'Val MSE'), ("RMSE", 'Val RMSE'), ("R²", 'Val R2')):
    print(f"   Validation {metric_label}: {best_entry[metric_key]:.4f}")

In [None]:

"""
Perform k-fold cross-validation on the best model using training+validation data
"""

cv_folds = 5

# Score the selected pipeline on the combined training+validation data
cv_scores = cross_val_score(
    best_model_pipeline, X_temp, y_temp,
    cv=cv_folds, scoring='neg_mean_squared_error',
)

# The scorer yields negated MSE values; flip the sign to get per-fold MSE
cv_mse = -cv_scores
cv_rmse = np.sqrt(cv_mse)
mean_cv_mse = cv_mse.mean()

print(f"\n{cv_folds}-Fold Cross-Validation Results:")
print(f"   MSE Scores: {cv_mse}")
print(f"   Mean MSE: {mean_cv_mse:.4f} (±{cv_mse.std():.4f})")
print(f"   Mean RMSE: {cv_rmse.mean():.4f} (±{cv_rmse.std():.4f})")

# Per-fold MSE with a dashed reference line at the mean
fold_numbers = np.arange(1, cv_folds+1)
cv_fig, cv_ax = plt.subplots(figsize=(10, 6))
cv_ax.plot(fold_numbers, cv_mse, marker='o', linewidth=2, markersize=10, color='teal')
cv_ax.axhline(y=mean_cv_mse, color='red', linestyle='--', linewidth=2,
              label=f'Mean MSE: {mean_cv_mse:.4f}')
cv_ax.set_xlabel('Fold Number')
cv_ax.set_ylabel('Mean Squared Error (MSE)')
cv_ax.set_title(f'{cv_folds}-Fold Cross-Validation MSE for {best_model_name}')
cv_ax.legend()
cv_ax.grid(True, alpha=0.3)
cv_fig.tight_layout()
plt.show()