In [1]:
# Standard-library helpers needed for the credential handling in this cell.
import os
from getpass import getpass

# ClearML server endpoints. These are not secret, so they can live in the notebook.
os.environ["CLEARML_WEB_HOST"] = "https://app.5ccsagap.er.kcl.ac.uk/"
os.environ["CLEARML_API_HOST"] = "https://api.5ccsagap.er.kcl.ac.uk"
os.environ["CLEARML_FILES_HOST"] = "https://files.5ccsagap.er.kcl.ac.uk"

# ClearML credentials must never be written into the notebook (source or output).
# They are taken from the environment the kernel was started with; when a value
# is missing the user is prompted without echoing what is typed.
# Any key pair that has ever been saved inside a notebook should be rotated.
for _credential in ("CLEARML_API_ACCESS_KEY", "CLEARML_API_SECRET_KEY"):
    if not os.environ.get(_credential):
        os.environ[_credential] = getpass(f"{_credential}: ")

# Single seed shared by every stochastic step in this notebook.
seed = 8012004

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from scipy.stats import zscore, mstats


env: CLEARML_WEB_HOST=https://app.5ccsagap.er.kcl.ac.uk/
env: CLEARML_API_HOST=https://api.5ccsagap.er.kcl.ac.uk
env: CLEARML_FILES_HOST=https://files.5ccsagap.er.kcl.ac.uk
env: CLEARML_API_ACCESS_KEY=<redacted>
env: CLEARML_API_SECRET_KEY=<redacted>


In [2]:
# ClearML provides the experiment tracking used by this notebook.
from clearml import Task

# Register this run with the ClearML server. `output_uri=True` is passed through
# unchanged from the original call.
task = Task.init(
    project_name="CW1_Project",
    task_name="RandomForest_SubM",
    output_uri=True,
)

#Data Cleaning and Feature Engineering
def clean_and_engineer(df):
    """Return a cleaned, feature-engineered copy of ``df``.

    Steps, in order:

    1. Derive ``volume``, ``surface_area`` and three aspect ratios from the
       ``x``, ``y`` and ``z`` columns.
    2. Winsorize ``price``, ``x``, ``y``, ``z``, ``volume`` and
       ``surface_area`` (each only if present) at 1% on both tails.
    3. Label-encode ``cut``, ``color`` and ``clarity`` (each only if present).
    4. Drop the raw ``x``, ``y`` and ``z`` columns.
    5. Fill remaining missing values with the per-column median of the
       numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It must contain ``x``, ``y`` and ``z``. It is not
        modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the engineered features.

    Raises
    ------
    KeyError
        If ``x``, ``y`` or ``z`` is missing from ``df``.
    """
    # Work on a copy so the caller's frame stays intact and the cell that
    # calls this function can be re-run safely.
    out = df.copy()

    # Feature engineering: Volume, Surface Area, Aspect Ratios
    out['volume'] = out['x'] * out['y'] * out['z']
    out['surface_area'] = 2 * (out['x'] * out['y'] + out['x'] * out['z'] + out['y'] * out['z'])
    # The small constant keeps the ratios finite when a dimension is 0.
    out['aspect_ratio_xy'] = out['x'] / (out['y'] + 1e-6)
    out['aspect_ratio_xz'] = out['x'] / (out['z'] + 1e-6)
    out['aspect_ratio_yz'] = out['y'] / (out['z'] + 1e-6)

    # Winsorize Outliers (1% & 99% percentiles)
    for col in ['price', 'x', 'y', 'z', 'volume', 'surface_area']:
        if col not in out.columns:
            continue
        values = out[col].to_numpy()
        present = ~pd.isna(values)
        # Nothing to clip in an empty or all-missing column; winsorize would
        # fail on zero observations.
        if not present.any():
            continue
        # Clip only the observed values. NaNs sort last, so leaving them in
        # would let them take part in the upper-tail replacement.
        clipped = values.copy()
        clipped[present] = np.asarray(
            mstats.winsorize(values[present], limits=[0.01, 0.01])
        )
        out[col] = clipped

    # Label Encoding for Categorical Variables
    # NOTE: a fresh encoder is fitted on every call, so the integer codes of
    # two frames are only comparable when both contain the same set of
    # category values.
    for col in ['cut', 'color', 'clarity']:
        if col in out.columns:
            le = LabelEncoder()
            out[col] = le.fit_transform(out[col])

    # Drop original dimension features
    out = out.drop(columns=['x', 'y', 'z'], errors='ignore')

    # Handle Missing Values. `numeric_only=True` keeps this from raising when
    # a non-numeric column is still present.
    out = out.fillna(out.median(numeric_only=True))

    return out

# Load dataset and apply cleaning
train_df = pd.read_csv("data/CW1_train.csv")
train_df = clean_and_engineer(train_df)

X = train_df.drop(columns=['outcome'])
y = train_df['outcome']

# Split into training and held-out rows BEFORE fitting any scaler, so the
# held-out rows never contribute to the preprocessing statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

# Feature Selection (Feature Importance from Baseline Model)
# The baseline scaler is fitted on the training rows only.
baseline_scaler = StandardScaler()
X_train_baseline = pd.DataFrame(
    baseline_scaler.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
baseline_model = RandomForestRegressor(n_estimators=100, random_state=seed, n_jobs=-1)
baseline_model.fit(X_train_baseline, y_train)

# Get feature importance and select top features (importance above 0.01)
importance_df = pd.DataFrame({
    'Feature': X_train_baseline.columns,
    'Importance': baseline_model.feature_importances_,
})
important_features = importance_df.loc[importance_df['Importance'] > 0.01, 'Feature'].tolist()
if not important_features:
    # Fail early with a clear message instead of deep inside scikit-learn.
    raise ValueError("No feature exceeded the 0.01 importance threshold.")

# Keep only important features
X = X[important_features]

# Standardize Features After Feature Selection.
# `scaler` is fitted on the training rows only and is reused below for the
# held-out rows and for the submission data.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw[important_features]),
    columns=important_features,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw[important_features]),
    columns=important_features,
    index=X_test_raw.index,
)
# Scaled version of the full feature matrix, kept so the name stays available
# to any later cell that expects it.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

# RandomizedSearchCV for Hyperparameter Tuning
param_grid = {
    'n_estimators': [300, 500, 700],
    'max_depth': [20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [4, 6, 8],
    'max_features': ['sqrt', None],
    'bootstrap': [True],
    'ccp_alpha': [0.001, 0.005, 0.01],
    'max_samples': [0.7, 0.8, 0.9]
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=seed, n_jobs=-1),
    param_distributions=param_grid,
    n_iter=30,               # Limit to 30 iterations for efficiency
    cv=3,
    scoring='r2',
    n_jobs=-1,
    random_state=seed,
    verbose=2
)

random_search.fit(X_train, y_train)

# Evaluate Model Performance
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Predictions on the held-out rows
y_pred = best_model.predict(X_test)

# Performance Metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Display Results
print(f"Best Hyperparameters: {best_params}")
print(f"Test Set R²: {r2:.4f}")
print(f"Test Set MAE: {mae:.4f}")

# Generate Submission CSV
test_df = pd.read_csv("data/CW1_test.csv")
test_df = clean_and_engineer(test_df)

# Ensure test features match training feature order
X_submission = test_df[important_features]

# Standardize test set using the same (training-fitted) scaler
X_submission = pd.DataFrame(scaler.transform(X_submission), columns=important_features)

# Generate predictions
yhat = best_model.predict(X_submission)

# Create submission DataFrame
submission = pd.DataFrame({'yhat': yhat})
submission.to_csv('CW1_submission.csv', index=False)

print("Submission file 'CW1_submission.csv' generated successfully!")
# Close ClearML task
task.close()

ClearML Task: created new task id=cb3636b8c04a4c31ac7cb6ee399b2119
ClearML results page: https://app.5ccsagap.er.kcl.ac.uk/projects/f78baf0a8c954173b8cae46e34eb26b6/experiments/cb3636b8c04a4c31ac7cb6ee399b2119/output/log
2025-02-20 09:35:28,330 - clearml.Task - INFO - Storing jupyter notebook directly as code
CLEARML-SERVER new package available: UPGRADE to v2.0.0 is recommended!
Release Notes:
### Breaking Changes

MongoDB major version was upgraded from v5.x to 6.x.
Please note that if your current ClearML Server version is smaller than v1.17 (where MongoDB v5.x was first used), you'll need to first upgrade to ClearML Server v1.17.
#### Upgrading to ClearML Server v1.17 from a previous version
- If using docker-compose,  use the following docker-compose files:
  * [docker-compose file](https://github.com/allegroai/clearml-server/blob/2976ce69cc91550a3614996e8a8d8cd799af2efd/upgrade/1_17_to_2_0/docker-compose.yml)
  * [docker-compose file foe Windows](https://github.com/allegroai/clea