In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from scipy.stats import mstats

# Fixed random seed so that runs are reproducible
seed = 8_012_004

# Read the training data from the CSV file on disk
train_df = pd.read_csv(filepath_or_buffer="data/CW1_train.csv")

# Feature Engineering Function
def feature_engineering(df, label_encoders=None):
    """Derive geometric features, clip outliers and integer-encode categoricals.

    NOTE: ``df`` is modified in place and the same object is returned.

    Steps, in order:
      1. Add ``volume``, ``surface_area`` and three aspect-ratio columns
         computed from ``x``, ``y`` and ``z``.
      2. Winsorize ``price``, ``volume`` and ``surface_area`` (whichever are
         present) at 1% in each tail. The clipping thresholds are taken from
         the frame passed in.
      3. Label-encode ``cut``, ``color`` and ``clarity`` (whichever are
         present).
      4. Drop the raw ``x``, ``y`` and ``z`` columns.

    Args:
        df: DataFrame with at least the columns ``x``, ``y`` and ``z``.
        label_encoders: Optional dict mapping a column name to an already
            fitted encoder. A column found in the dict is encoded with that
            encoder's ``transform`` so a second frame gets the same integer
            codes as the first; a column not found is fitted with a new
            ``LabelEncoder`` which is then stored in the dict. When omitted,
            fresh encoders are fitted on every call and discarded.
            NOTE(review): ``transform`` presumably rejects labels that were
            not seen during fitting - confirm against scikit-learn.

    Returns:
        The same ``df`` object, after the modifications above.
    """
    eps = 1e-6  # keeps the ratio denominators away from an exact zero

    # Volume, surface area and pairwise aspect ratios of the x/y/z box
    df['volume'] = df['x'] * df['y'] * df['z']
    df['surface_area'] = 2 * (df['x'] * df['y'] + df['x'] * df['z'] + df['y'] * df['z'])
    df['aspect_ratio_xy'] = df['x'] / (df['y'] + eps)
    df['aspect_ratio_xz'] = df['x'] / (df['z'] + eps)
    df['aspect_ratio_yz'] = df['y'] / (df['z'] + eps)

    # Winsorize outliers. x, y and z are not clipped here: they are dropped
    # below without being read again, so clipping them cannot affect the result.
    for col in ('price', 'volume', 'surface_area'):
        if col in df.columns:
            df[col] = mstats.winsorize(df[col], limits=[0.01, 0.01])

    # Label-encode the categorical columns, reusing a supplied encoder when
    # one exists so that encodings stay consistent between frames.
    encoders = {} if label_encoders is None else label_encoders
    for col in ('cut', 'color', 'clarity'):
        if col not in df.columns:
            continue
        if col in encoders:
            df[col] = encoders[col].transform(df[col])
        else:
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col])
            encoders[col] = encoder

    # The raw dimensions are redundant once the derived features exist
    df.drop(columns=['x', 'y', 'z'], errors='ignore', inplace=True)

    return df

# Apply feature engineering to the training frame
train_df = feature_engineering(train_df)

# Separate the predictors from the target column
X_raw = train_df.drop(columns=['outcome'])
y = train_df['outcome']

# Hold out 20% of the rows BEFORE scaling, so the held-out rows never
# contribute to the scaler's mean / standard deviation (avoids data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=seed
)

# Standardize numerical features using statistics from the training rows only.
# The original index is kept so the scaled frames stay aligned with y.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X_raw.columns,
    index=X_test_raw.index,
)

# Scaled copy of the full feature matrix, kept under the name X in case
# later code expects it; it is scaled with the training-split statistics.
X = pd.DataFrame(scaler.transform(X_raw), columns=X_raw.columns, index=X_raw.index)

# Local import: only the cross-validation step below needs it
from sklearn.pipeline import make_pipeline

# Define models for comparison
models = {
    "RandomForest": RandomForestRegressor(random_state=seed, n_jobs=-1),
    "Ridge": Ridge(random_state=seed),
    "Lasso": Lasso(random_state=seed),
    "ElasticNet": ElasticNet(random_state=seed),
}

# Train and evaluate each model
results = []

for name, model in models.items():
    # Hold-out evaluation on the 20% test split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # 10-fold CV on the unscaled features with the scaler inside the pipeline,
    # so the scaling statistics are re-fitted on each fold's training part.
    cv_estimator = make_pipeline(StandardScaler(), model)
    cv_score = cross_val_score(cv_estimator, X_raw, y, cv=10, scoring='r2').mean()

    results.append({
        "Model": name,
        "R² Score": r2,
        "MAE": mae,
        "Cross-Validation R²": cv_score
    })

# Convert results to a DataFrame, best hold-out R² first
results_df = pd.DataFrame(results).sort_values(by="R² Score", ascending=False)

# Display results; fall back to plain text when the optional display helper
# is not installed, so the computed results are not lost.
try:
    import ace_tools_open as tools
except ImportError:
    print("Model Comparison Results")
    print(results_df.to_string(index=False))
else:
    tools.display_dataframe_to_user(name="Model Comparison Results", dataframe=results_df)


Model Comparison Results


Model,R² Score,MAE,Cross-Validation R²
Loading ITables v2.2.4 from the internet... (need help?),,,
