In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder , StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score,make_scorer
from sklearn.impute import SimpleImputer
import joblib

In [2]:
df = pd.read_csv("../data/synthetic_power_sector_projects.csv")
df.head()

Unnamed: 0,Project_ID,Project_Type,Capacity_MW_or_km,Region,Terrain,Planned_Duration_Months,Planned_Cost_Cr,Contractor_Experience_Level,Start_Year,Actual_Duration_Months,Actual_Cost_Cr
0,P1000,Wind Farm,141,North,Plain,53,1419,Low,2010,74,1938
1,P1001,Hydro Plant,416,North-East,Hilly,55,1584,High,2010,54,1415
2,P1002,Solar Plant,504,North-East,Plain,35,196,Medium,2021,46,301
3,P1003,Hydro Plant,477,North,Hilly,26,1221,Medium,2021,32,1442
4,P1004,Hydro Plant,558,North,Plain,43,913,Low,2023,46,940


In [25]:
df.shape

(50, 11)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Project_ID                   50 non-null     object
 1   Project_Type                 50 non-null     object
 2   Capacity_MW_or_km            50 non-null     int64 
 3   Region                       50 non-null     object
 4   Terrain                      50 non-null     object
 5   Planned_Duration_Months      50 non-null     int64 
 6   Planned_Cost_Cr              50 non-null     int64 
 7   Contractor_Experience_Level  50 non-null     object
 8   Start_Year                   50 non-null     int64 
 9   Actual_Duration_Months       50 non-null     int64 
 10  Actual_Cost_Cr               50 non-null     int64 
dtypes: int64(6), object(5)
memory usage: 4.4+ KB


In [4]:
df['Cost_Overrun_Pct'] = (df['Actual_Cost_Cr'] - df['Planned_Cost_Cr']) / df['Planned_Cost_Cr'] * 100
df['Duration_Overrun_Pct'] = (df['Actual_Duration_Months'] - df['Planned_Duration_Months']) / df['Planned_Duration_Months'] * 100

df['Planned_Cost_per_Unit'] = df['Planned_Cost_Cr'] / df['Capacity_MW_or_km']
df['Planned_Duration_per_Unit'] = df['Planned_Duration_Months'] / df['Capacity_MW_or_km']

df['Project_Size_Category'] = pd.qcut(df['Capacity_MW_or_km'], q=3, labels=['Small','Medium','Large'])
df['Budget_Category'] = pd.qcut(df['Planned_Cost_Cr'], q=3, labels=['Low','Medium','High'])
df['Era'] = pd.cut(df['Start_Year'], bins=[2009,2015,2020,2025], labels=['2010-15','2016-20','2021-25'])

df['Type_Terrain'] = df['Project_Type'] + "_" + df['Terrain']
df['Type_Experience'] = df['Project_Type'] + "_" + df['Contractor_Experience_Level']

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   Project_ID                   50 non-null     object  
 1   Project_Type                 50 non-null     object  
 2   Capacity_MW_or_km            50 non-null     int64   
 3   Region                       50 non-null     object  
 4   Terrain                      50 non-null     object  
 5   Planned_Duration_Months      50 non-null     int64   
 6   Planned_Cost_Cr              50 non-null     int64   
 7   Contractor_Experience_Level  50 non-null     object  
 8   Start_Year                   50 non-null     int64   
 9   Actual_Duration_Months       50 non-null     int64   
 10  Actual_Cost_Cr               50 non-null     int64   
 11  Cost_Overrun_Pct             50 non-null     float64 
 12  Duration_Overrun_Pct         50 non-null     float64 
 13  Planned

In [27]:
features = ['Capacity_MW_or_km','Planned_Duration_Months','Planned_Cost_Cr','Start_Year',
            'Planned_Cost_per_Unit','Planned_Duration_per_Unit',
            'Project_Size_Category','Budget_Category','Era',
            'Project_Type','Region','Terrain','Contractor_Experience_Level',
            'Type_Terrain','Type_Experience']

In [28]:
X = df[features]
y = df[['Cost_Overrun_Pct','Duration_Overrun_Pct']]

In [29]:
X.head()

Unnamed: 0,Capacity_MW_or_km,Planned_Duration_Months,Planned_Cost_Cr,Start_Year,Planned_Cost_per_Unit,Planned_Duration_per_Unit,Project_Size_Category,Budget_Category,Era,Project_Type,Region,Terrain,Contractor_Experience_Level,Type_Terrain,Type_Experience
0,141,53,1419,2010,10.06383,0.375887,Small,High,2010-15,Wind Farm,North,Plain,Low,Wind Farm_Plain,Wind Farm_Low
1,416,55,1584,2010,3.807692,0.132212,Small,High,2010-15,Hydro Plant,North-East,Hilly,High,Hydro Plant_Hilly,Hydro Plant_High
2,504,35,196,2021,0.388889,0.069444,Medium,Low,2021-25,Solar Plant,North-East,Plain,Medium,Solar Plant_Plain,Solar Plant_Medium
3,477,26,1221,2021,2.559748,0.054507,Medium,Medium,2021-25,Hydro Plant,North,Hilly,Medium,Hydro Plant_Hilly,Hydro Plant_Medium
4,558,43,913,2023,1.636201,0.077061,Medium,Medium,2021-25,Hydro Plant,North,Plain,Low,Hydro Plant_Plain,Hydro Plant_Low


In [30]:
y.head()

Unnamed: 0,Cost_Overrun_Pct,Duration_Overrun_Pct
0,36.575053,39.622642
1,-10.669192,-1.818182
2,53.571429,31.428571
3,18.099918,23.076923
4,2.957284,6.976744


In [31]:
X.shape

(50, 15)

In [32]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   Capacity_MW_or_km            50 non-null     int64   
 1   Planned_Duration_Months      50 non-null     int64   
 2   Planned_Cost_Cr              50 non-null     int64   
 3   Start_Year                   50 non-null     int64   
 4   Planned_Cost_per_Unit        50 non-null     float64 
 5   Planned_Duration_per_Unit    50 non-null     float64 
 6   Project_Size_Category        50 non-null     category
 7   Budget_Category              50 non-null     category
 8   Era                          50 non-null     category
 9   Project_Type                 50 non-null     object  
 10  Region                       50 non-null     object  
 11  Terrain                      50 non-null     object  
 12  Contractor_Experience_Level  50 non-null     object  
 13  Type_Te

In [33]:
cat_cols = [c for c in X.columns if X[c].dtype == 'object' or str(X[c].dtype)=='category']
num_cols = [c for c in X.columns if c not in cat_cols]
print("categorical columns:\n" , cat_cols)
print("--------------------------------------")
print("Numeric columns:\n" , num_cols)

categorical columns:
 ['Project_Size_Category', 'Budget_Category', 'Era', 'Project_Type', 'Region', 'Terrain', 'Contractor_Experience_Level', 'Type_Terrain', 'Type_Experience']
--------------------------------------
Numeric columns:
 ['Capacity_MW_or_km', 'Planned_Duration_Months', 'Planned_Cost_Cr', 'Start_Year', 'Planned_Cost_per_Unit', 'Planned_Duration_per_Unit']


In [34]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler' , StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer' , SimpleImputer(strategy='constant',fill_value='missing')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    [('num',numeric_transformer,num_cols),
     ('ohe',categorical_transformer,cat_cols)],
    remainder='passthrough')

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [36]:
rf = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)

model = Pipeline([
    ('prep', preprocessor),
    ('rf', MultiOutputRegressor(rf))
])

In [37]:
model.fit(X_train, y_train)

In [38]:
y_pred = model.predict(X_test)

In [39]:
mse_cost = mean_squared_error(y_test['Cost_Overrun_Pct'], y_pred[:,0])
rmse_cost = np.sqrt(mse_cost)
r2_cost = r2_score(y_test['Cost_Overrun_Pct'], y_pred[:,0])
mse_dur = mean_squared_error(y_test['Duration_Overrun_Pct'], y_pred[:,1])
rmse_dur = np.sqrt(mse_dur)
r2_dur = r2_score(y_test['Duration_Overrun_Pct'], y_pred[:,1])

In [40]:
print("Cost Overrun -> RMSE:", round(rmse_cost, 2), ", R^2:", round(r2_cost, 2))
print("Duration Overrun -> RMSE:", round(rmse_dur, 2), ", R^2:", round(r2_dur, 2))

Cost Overrun -> RMSE: 24.92 , R^2: -0.03
Duration Overrun -> RMSE: 22.53 , R^2: -0.4


In [43]:
rmse_scorer = make_scorer(mean_squared_error, squared=False)
r2_scorer = make_scorer(r2_score)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for target in ['Cost_Overrun_Pct', 'Duration_Overrun_Pct']:
    scores_rmse = cross_val_score(
        model, X, y,
        scoring=rmse_scorer, cv=kf, n_jobs=-1
    )
    scores_r2 = cross_val_score(
       model, X, y,
       scoring=r2_scorer, cv=kf, n_jobs=-1
    )

    
    print(f"Target: {target}")
    print("Mean RMSE:", round(scores_rmse.mean(), 2), " | Std:", round(scores_rmse.std(), 2))
    print("Mean R²  :", round(scores_r2.mean(), 2), " | Std:", round(scores_r2.std(), 2))
    print()

Target: Cost_Overrun_Pct
Mean RMSE: nan  | Std: nan
Mean R²  : -0.12  | Std: 0.13

Target: Duration_Overrun_Pct
Mean RMSE: nan  | Std: nan
Mean R²  : -0.12  | Std: 0.13



In [44]:
joblib.dump(model, "multioutput_rf_pipeline.joblib")
print("Model saved as multioutput_rf_pipeline.joblib")

loaded_model = joblib.load("multioutput_rf_pipeline.joblib")
print("Sample Predictions:\n", loaded_model.predict(X_test[:5]))

Model saved as multioutput_rf_pipeline.joblib
Sample Predictions:
 [[13.8765215  16.57478969]
 [18.60398699 10.694637  ]
 [29.78474814  8.66205056]
 [34.12863126 19.47089313]
 [29.94843959  1.22423129]]
