<a href="https://colab.research.google.com/github/SunbirdAI/lamwo-electrification-project/blob/main/notebooks/rank_minigrid_villages_PUE/train_PUE_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Train Productive Use of Energy (PUE) predictor

Use subjectively labelled estimates of how well currently installed minigrids are operating to train a model that predicts a PUE value between 0 and 1.

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

## Load and preprocess training data

In [2]:
# Read the labelled training table from the working directory.
data = pd.read_csv("existing_minigrid_trainingdata.csv")

# Selected features relevant for PUE calculation, grouped by column family.
features = [
    # building columns
    'building_count',
    'permanent_building_count',
    # facility / service columns
    'educational_facilities',
    'health_facilities',
    'social_facilities',
    'services',
    # road columns
    'primary_roads',
    'secondary_roads',
    'tertiary_roads',
    'unclassified_roads',
    # land-cover percentage columns
    'percentage_crop_land',
    'percentage_built_area',
    # solar / wind resource columns
    'mean_pvout_solar_radiation',
    'mean_wind_speed',
]

# Model inputs are the selected columns; the regression target is `winch_prob`.
X = data[features]
y = data["winch_prob"]

In [3]:
# Preview the first five feature rows.
X.head(5)

Unnamed: 0,building_count,permanent_building_count,educational_facilities,health_facilities,social_facilities,services,primary_roads,secondary_roads,tertiary_roads,unclassified_roads,percentage_crop_land,percentage_built_area,mean_pvout_solar_radiation,mean_wind_speed
0,601,171,1,1,0,0,0,0,1,0,15.15,58.55,0.0,0.0
1,198,7,1,0,0,0,0,0,0,0,17.375,8.2975,1595.776978,0.0
2,183,31,1,0,0,0,0,2,0,3,8.89,1.2875,1673.575684,0.0
3,131,20,1,0,0,0,0,1,4,3,17.726667,1.14,1578.909446,0.0
4,290,52,1,0,0,0,0,0,1,1,13.405,2.54,1644.825056,1.476709


In [4]:
# Preview the first five target values.
y.head(5)

Unnamed: 0,winch_prob
0,0.85
1,0.6
2,0.17
3,0.14
4,0.45


Randomly sample 30% of the data for evaluation

In [5]:
# Fix NumPy's global seed so the same evaluation rows are drawn on every run.
np.random.seed(42)

# Sample 30% of the row labels (rounded down) without replacement.
n_eval = int(0.3 * len(X))
eval_indices = np.random.choice(X.index, size=n_eval, replace=False)

# Label-based selection keeps the features and the target on the same rows.
X_eval, y_eval = X.loc[eval_indices], y.loc[eval_indices]

In [6]:
# Number of rows in the evaluation sample.
X_eval.shape[0]

7

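Because the dataset is so limited (~25 records), we train on the whole dataset. Note that the evaluation rows sampled above therefore remain part of the training data, so the scores reported below are in-sample and will overstate how well each model generalizes to unseen data.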

In [7]:
# Train on every record. X_eval / y_eval are subsets of these same objects,
# so the evaluation rows are also seen during fitting.
X_train, y_train = X, y

In [8]:
# Number of rows used for training.
X_train.shape[0]

25

Helper function to evaluate models

In [9]:
def evaluate_model(y_true, y_pred, model_name):
    """Print the mean squared error and R² of ``y_pred`` against ``y_true``.

    Args:
        y_true: Reference target values.
        y_pred: Predicted values, compared against ``y_true``.
        model_name: Label placed at the start of the printed header.

    Returns:
        None. The metrics are only printed, both with four decimals.

    Note:
        The header text always reads "30% Evaluation Set", whatever data is
        actually passed in.
    """
    # Both metrics are computed before anything is printed.
    report = [
        f"\n{model_name} Performance on 30% Evaluation Set:",
        f"Mean Squared Error: {mean_squared_error(y_true, y_pred):.4f}",
        f"R² Score: {r2_score(y_true, y_pred):.4f}",
    ]
    for line in report:
        print(line)

## Train and evaluate with several algorithms

## 1. Linear Regression

In [10]:
# Least-squares fit on the raw features; `fit` returns the estimator itself,
# so construction and fitting are chained.
lin_reg = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the sampled evaluation rows.
y_pred_lin = lin_reg.predict(X_eval)
evaluate_model(y_true=y_eval, y_pred=y_pred_lin, model_name="Linear Regression")


Linear Regression Performance on 30% Evaluation Set:
Mean Squared Error: 0.0143
R² Score: 0.6570


Linear Regression coefficients

In [11]:
print("Linear Regression Coefficients:")

# One "<feature>: <coefficient>" line per input column, in `features` order.
coef_lines = [
    f"{name}: {weight:.4f}"
    for name, weight in zip(features, lin_reg.coef_)
]
for coef_line in coef_lines:
    print(coef_line)

print(f"Intercept: {lin_reg.intercept_:.4f}")

Linear Regression Coefficients:
building_count: 0.0001
permanent_building_count: 0.0008
educational_facilities: -0.0743
health_facilities: 0.0472
social_facilities: -0.0000
services: -0.0000
primary_roads: 0.0000
secondary_roads: -0.0610
tertiary_roads: -0.0279
unclassified_roads: -0.0167
percentage_crop_land: 0.0017
percentage_built_area: 0.0164
mean_pvout_solar_radiation: 0.0004
mean_wind_speed: 0.1129
Intercept: -0.2687


Save Linear Regression model

In [12]:
# Serialize the fitted linear model to disk (default pickle protocol).
with open('lin_reg.pkl', 'wb') as model_file:
    pickle.dump(lin_reg, model_file, protocol=pickle.DEFAULT_PROTOCOL)

Sample PUE formula (Linear Regression)

In [13]:
print("\nLinear PUE Formula:")

# Each term is " + <coef> * <feature>"; negative coefficients keep their own
# minus sign after the "+".
terms = [
    f" + {weight:.4f} * {name}"
    for name, weight in zip(features, lin_reg.coef_)
]
formula = f"PUE = {lin_reg.intercept_:.4f}" + "".join(terms)
print(formula)


Linear PUE Formula:
PUE = -0.2687 + 0.0001 * building_count + 0.0008 * permanent_building_count + -0.0743 * educational_facilities + 0.0472 * health_facilities + -0.0000 * social_facilities + -0.0000 * services + 0.0000 * primary_roads + -0.0610 * secondary_roads + -0.0279 * tertiary_roads + -0.0167 * unclassified_roads + 0.0017 * percentage_crop_land + 0.0164 * percentage_built_area + 0.0004 * mean_pvout_solar_radiation + 0.1129 * mean_wind_speed


## 2. Polynomial Regression (degree 2)

In [14]:
# Degree-2 expansion (squares and pairwise products) without a constant column.
# NOTE(review): with the 14 columns in `features` this yields 119 terms, more
# than the 25 training rows reported above, so a near-perfect fit is expected.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Learn the expansion on the training frame, then apply the same fitted
# transformer to both frames.
X_train_poly = poly.fit(X_train).transform(X_train)
X_eval_poly = poly.transform(X_eval)

# Plain linear regression on the expanded features.
poly_reg = LinearRegression().fit(X_train_poly, y_train)
y_pred_poly = poly_reg.predict(X_eval_poly)

evaluate_model(
    y_true=y_eval,
    y_pred=y_pred_poly,
    model_name="Polynomial Regression (degree 2)",
)


Polynomial Regression (degree 2) Performance on 30% Evaluation Set:
Mean Squared Error: 0.0000
R² Score: 0.9993


Save Polynomial Regression model and PolynomialFeatures object

In [15]:
# The regressor and the fitted feature expander are written to separate files.
for filename, artifact in (('poly_reg.pkl', poly_reg), ('poly_features.pkl', poly)):
    with open(filename, 'wb') as out_file:
        pickle.dump(artifact, out_file)

## 3. Random Forest Regression

In [16]:
# 100 trees capped at depth 5; the fixed random_state makes the forest
# reproducible across runs.
rf_reg = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# Score the fitted forest on the sampled evaluation rows.
y_pred_rf = rf_reg.predict(X_eval)
evaluate_model(y_true=y_eval, y_pred=y_pred_rf, model_name="Random Forest Regression")


Random Forest Regression Performance on 30% Evaluation Set:
Mean Squared Error: 0.0043
R² Score: 0.8970


Feature importance

In [17]:
print("\nRandom Forest Feature Importances:")

# One "<feature>: <importance>" line per input column, in `features` order.
rf_importance_lines = [
    f"{name}: {score:.4f}"
    for name, score in zip(features, rf_reg.feature_importances_)
]
for importance_line in rf_importance_lines:
    print(importance_line)


Random Forest Feature Importances:
building_count: 0.1624
permanent_building_count: 0.3704
educational_facilities: 0.0212
health_facilities: 0.0035
social_facilities: 0.0000
services: 0.0000
primary_roads: 0.0000
secondary_roads: 0.0129
tertiary_roads: 0.0095
unclassified_roads: 0.0288
percentage_crop_land: 0.0410
percentage_built_area: 0.3023
mean_pvout_solar_radiation: 0.0463
mean_wind_speed: 0.0018


Save Random Forest model

In [18]:
# Serialize the fitted random forest to disk (default pickle protocol).
with open('rf_reg.pkl', 'wb') as model_file:
    pickle.dump(rf_reg, model_file, protocol=pickle.DEFAULT_PROTOCOL)

## 4. XGBoost Regression

In [19]:
# Hyperparameters collected in one place so they are easy to scan and tweak.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 3,
    "learning_rate": 0.1,
    "random_state": 42,
    "objective": "reg:squarederror",
}
xgb_reg = xgb.XGBRegressor(**xgb_params)
xgb_reg.fit(X_train, y_train)

# Score the fitted booster on the sampled evaluation rows.
y_pred_xgb = xgb_reg.predict(X_eval)
evaluate_model(y_true=y_eval, y_pred=y_pred_xgb, model_name="XGBoost Regression")


XGBoost Regression Performance on 30% Evaluation Set:
Mean Squared Error: 0.0000
R² Score: 0.9988


Feature importance

In [20]:
print("\nXGBoost Feature Importances:")

# One "<feature>: <importance>" line per input column, in `features` order.
xgb_importance_lines = [
    f"{name}: {score:.4f}"
    for name, score in zip(features, xgb_reg.feature_importances_)
]
for importance_line in xgb_importance_lines:
    print(importance_line)


XGBoost Feature Importances:
building_count: 0.0599
permanent_building_count: 0.4093
educational_facilities: 0.0549
health_facilities: 0.0056
social_facilities: 0.0000
services: 0.0000
primary_roads: 0.0000
secondary_roads: 0.1154
tertiary_roads: 0.0381
unclassified_roads: 0.0170
percentage_crop_land: 0.0484
percentage_built_area: 0.2338
mean_pvout_solar_radiation: 0.0140
mean_wind_speed: 0.0036


Save XGBoost model

In [21]:
# Serialize the fitted XGBoost regressor to disk (default pickle protocol).
# NOTE(review): XGBoost also offers its own save_model / load_model format;
# pickle is kept here so the output file is unchanged. Confirm this suits
# whatever loads xgb_reg.pkl.
with open('xgb_reg.pkl', 'wb') as model_file:
    pickle.dump(xgb_reg, model_file, protocol=pickle.DEFAULT_PROTOCOL)

Zip models

In [22]:
# Bundle every .pkl file in the current working directory into
# ranking_models.zip. The shell glob matches any .pkl present, not only the
# five files written by this notebook.
!zip -r ranking_models.zip *.pkl

  adding: lin_reg.pkl (deflated 24%)
  adding: poly_features.pkl (deflated 36%)
  adding: poly_reg.pkl (deflated 21%)
  adding: rf_reg.pkl (deflated 84%)
  adding: xgb_reg.pkl (deflated 87%)
