In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report

# Load data
# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv("powergrid_material_demand_latest.csv")

# Encode categorical cols
# Every object-dtype column except "Project_ID" is replaced by integer codes
# (values are cast to str before encoding).
# The fitted encoders are kept in `label_encoders` (column name -> encoder) so
# the same mapping can be re-applied to new rows with `.transform(...)` or
# reversed with `.inverse_transform(...)`; without them the codes cannot be
# reproduced once this cell has run.
# NOTE(review): the encoders are fitted on the full dataset, i.e. before the
# train/test split — confirm that is acceptable for the evaluation.
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
label_encoders = {}
for col in categorical_cols:
    if col != "Project_ID":
        label_encoders[col] = LabelEncoder()
        df[col] = label_encoders[col].fit_transform(df[col].astype(str))

# -------------------------
# Feature Engineering
# -------------------------
# Difference between the completion-year and start-year columns.
df['Project_Duration'] = df['Completion_Year'] - df['Start_Year']

# Ratio features, each computed as numerator / (denominator + 1).
# The +1 offset keeps the divisor non-zero when the denominator column is 0.
ratio_features = {
    'MVA_per_Substation': ('Transformation_Capacity_MVA', 'Substations_Count'),
    'Budget_per_MVA': ('Budget_Cr', 'Transformation_Capacity_MVA'),
    'Budget_per_Line': ('Budget_Cr', 'Line_Length_CKM'),
}
for feature_name, (numerator, denominator) in ratio_features.items():
    df[feature_name] = df[numerator] / (df[denominator] + 1)

# -------------------------
# Features & Targets
# -------------------------
# Columns used as model inputs: the raw dataset columns first, then the
# engineered columns.
raw_features = [
    'Location',
    'Geographic_Region',
    'Tower_Count',
    'Substations_Count',
    'Tower_Type',
    'Substation_Type',
    'Tax_Rate',
    'Line_Length_CKM',
    'Transformation_Capacity_MVA',
    'Budget_Cr',
]
engineered_features = [
    'Project_Duration',
    'MVA_per_Substation',
    'Budget_per_MVA',
    'Budget_per_Line',
]
input_features = raw_features + engineered_features

# Target columns for the per-target regression models.
regression_targets = [
    'Steel_Demand_Tons',
    'Conductor_Demand_Km',
    'Insulator_Demand_Nos',
]

# Target column handled separately from the regression targets above.
classification_target = 'Transformer_Demand_Units'

# -------------------------
# Train/Test Split
# -------------------------
# Feature matrix, the multi-column regression targets, and the single
# transformer-units target, all selected from the same frame.
X, y_reg, y_cls = (
    df[columns]
    for columns in (input_features, regression_targets, classification_target)
)

# A single call splits all three objects with the same row partition:
# test_size=0.2 holds out 20% of the rows, random_state=42 fixes the shuffle.
split_options = {'test_size': 0.2, 'random_state': 42}
(
    X_train, X_test,
    y_reg_train, y_reg_test,
    y_cls_train, y_cls_test,
) = train_test_split(X, y_reg, y_cls, **split_options)

# -------------------------
# Regression Models
# -------------------------
# One XGBoost regressor is fitted per entry of `regression_targets`; the fitted
# models are stored in `reg_models`, keyed by target column name.
reg_params = {
    'random_state': 42,
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 6,
}
reg_models = {}
for col in regression_targets:
    print(f"\nTraining Regressor for: {col}")

    # Pick this target's train/test columns by name.
    y_col_train = y_reg_train[col]
    y_col_test = y_reg_test[col]

    model = XGBRegressor(**reg_params)
    model.fit(X_train, y_col_train)
    reg_models[col] = model

    # Score the fitted model on the held-out rows.
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_col_test, y_pred)
    r2 = r2_score(y_col_test, y_pred)
    print(f"MSE: {mse:.2f}")
    print(f"R2: {r2:.4f}")
    print('-' * 30)


# The transformer-units target is fitted as a continuous value with an
# XGBRegressor; its predictions are rounded afterwards so they can also be
# scored as discrete classes.
print(f"\nTraining Regressor for {classification_target}")
reg_cls = XGBRegressor(random_state=42, n_estimators=500, learning_rate=0.05, max_depth=6)
reg_cls.fit(X_train, y_cls_train)

y_cls_pred = reg_cls.predict(X_test)

# Regression metrics
mse = mean_squared_error(y_cls_test, y_cls_pred)
r2 = r2_score(y_cls_test, y_cls_pred)

print(f"MSE: {mse:.2f}")
print(f"R2: {r2:.4f}")

# Discrete-class view: round each prediction to the nearest integer and clamp
# it to the closed range [1, 5] before comparing with the true labels.
# NOTE(review): the 1..5 bounds are hard-coded — confirm they match the values
# that actually occur in the target column.
y_cls_pred_round = np.clip(np.round(y_cls_pred), 1, 5).astype(int)
acc = accuracy_score(y_cls_test, y_cls_pred_round)
# Report the accuracy; computing it without printing left this evaluation
# with no visible result.
print(f"Accuracy (rounded predictions): {acc:.4f}")




Training Regressor for: Steel_Demand_Tons
MSE: 37036183.64
R2: 0.9832
------------------------------

Training Regressor for: Conductor_Demand_Km
MSE: 251.57
R2: 1.0000
------------------------------

Training Regressor for: Insulator_Demand_Nos
MSE: 58445400.00
R2: 0.9973
------------------------------

Training Regressor for Transformer_Demand_Units
MSE: 0.64
R2: 0.5274
