In [13]:
!pip install wandb -q

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import wandb
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

%matplotlib inline

In [15]:
# Authenticate with W&B (paste your API key when prompted).
wandb.login()

# Settings for the train/test split, recorded on the run so that later cells
# can read them back through `run.config`.
preprocessing_config = dict(test_size=0.25, random_state=42)

# Start the preprocessing run; the handle returned by wandb.init() is kept
# in `run` and carries the configuration above.
run = wandb.init(
    project="ridge-lasso-regression",
    name="data-preprocessing",
    config=preprocessing_config,
)

In [17]:
# Read the raw CSV; header=1 takes the column names from the file's second line.
df = pd.read_csv('Algerian_forest_fires_dataset_UPDATE.csv', header=1)

# Tag every row with its region: 0 for Bejaia, 1 for Sidi-Bel Abbes.
# .loc slices include both end labels, so row 122 is written twice and ends up as 1.
df.loc[:122, "Region"] = 0
df.loc[122:, "Region"] = 1

# Discard rows with any missing value and renumber. After that, row 122 is the
# repeated column-header row sitting in the middle of the data, so it is removed too.
df = (
    df.dropna()
    .reset_index(drop=True)
    .drop(122)
    .reset_index(drop=True)
)

# Column names carry stray whitespace; normalise them before selecting by name.
df.columns = df.columns.str.strip()

# Cast each column to its numeric type in one pass.
int_columns = ['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Region']
float_columns = ['Rain', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI']
df = df.astype({
    **dict.fromkeys(int_columns, int),
    **dict.fromkeys(float_columns, float),
})

# Binary target encoding: labels containing 'not fire' become 0, everything else 1.
is_not_fire = df['Classes'].str.strip().str.contains('not fire', case=False)
df['Classes'] = np.where(is_not_fire, 0, 1)

# Write the cleaned frame to disk and register it with W&B as a dataset artifact.
cleaned_data_path = "Algerian_forest_fires_cleaned.csv"
df.to_csv(cleaned_data_path, index=False)

artifact = wandb.Artifact("cleaned-algerian-fire-data", type="dataset")
artifact.add_file(cleaned_data_path)
wandb.log_artifact(artifact)

print("Data cleaning and preprocessing complete.")
df.head()

Data cleaning and preprocessing complete.


Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,0,0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,0,0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,0,0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,0,0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,0,0


In [19]:
# Target is 'FWI' (Fire Weather Index); features are every remaining column
# except the date parts.
y = df['FWI']
X = df.drop(columns=['day', 'month', 'year', 'FWI'])

# Read the split settings from the W&B run object once and keep them in
# plain variables so later cells can reuse them.
test_size_val = run.config.test_size
random_state_val = run.config.random_state

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size_val,
    random_state=random_state_val,
)

# Standardise the features: statistics are fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Persist the fitted scaler and register it with W&B as a preprocessor artifact.
scaler_filename = "standard_scaler.pkl"
joblib.dump(scaler, scaler_filename)

scaler_artifact = wandb.Artifact("feature-scaler", type="preprocessor")
scaler_artifact.add_file(scaler_filename)
wandb.log_artifact(scaler_artifact)

print("Data has been split and scaled.")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

Data has been split and scaled.
X_train shape: (182, 11)
X_test shape: (61, 11)


In [21]:
# Candidate regressors, keyed by the display name that is reused for the
# W&B run name, the artifact name and the saved file name.
models = {
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1)
}

for model_name, model in models.items():
    # One W&B run per model. The handle is bound to `model_run` rather than
    # `run`, so the preprocessing run object that the split cell reads its
    # config from is not overwritten when this cell executes.
    # reinit=True permits repeated wandb.init() calls within one kernel.
    with wandb.init(
        project="ridge-lasso-regression",
        name=f"{model_name}-experiment-run",
        reinit=True,
        config={
            "model_type": model_name,
            "alpha": model.alpha,
            # Split settings captured in plain variables by the split cell
            "test_size": test_size_val,
            "random_state": random_state_val
        }
    ) as model_run:
        # Fit on the scaled training split
        model.fit(X_train, y_train)

        # Predict on the held-out split
        y_pred = model.predict(X_test)

        # Evaluate
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        print(f"--- {model_name} Model ---")
        print(f"Mean Squared Error (MSE): {mse:.4f}")
        print(f"R-squared (R2) Score: {r2:.4f}\n")

        # Log metrics on this model's own run object instead of the
        # module-level wandb.log, so the target run is explicit.
        model_run.log({"MSE": mse, "R2": r2})

        # Serialise the trained estimator to disk
        model_filename = f"{model_name.lower()}_model.pkl"
        joblib.dump(model, model_filename)

        # Attach the serialised model to this run as a W&B Artifact
        model_artifact = wandb.Artifact(
            f"{model_name}-model",
            type="model",
            description=f"A trained {model_name} regression model.",
            metadata={"alpha": model.alpha, "r2_score": r2}
        )
        model_artifact.add_file(model_filename)
        model_run.log_artifact(model_artifact)

print("Model training and logging complete.")

--- Ridge Model ---
Mean Squared Error (MSE): 0.5677
R-squared (R2) Score: 0.9872



0,1
MSE,▁
R2,▁

0,1
MSE,0.56773
R2,0.98717


--- Lasso Model ---
Mean Squared Error (MSE): 0.7003
R-squared (R2) Score: 0.9842



0,1
MSE,▁
R2,▁

0,1
MSE,0.70033
R2,0.98418


Model training and logging complete.


In [23]:
# Close any W&B run that is still open in this kernel. The per-model runs are
# opened in `with` blocks, so this is presumably only a safeguard — TODO confirm
# whether it is still needed.
wandb.finish()