In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
data = pd.read_csv('India_Elec_data_(Jan2020-Mar2025).csv')

In [28]:
data.describe()

Unnamed: 0,Max Demand Met,Shortage During Peak,Energy Met,Drawl Schedule,OD(+) / UD(-),Max OD,Energy Shortage
count,62972.0,64006.0,64014.0,64024.0,64024.0,64020.0,64024.0
mean,5914.468621,31.847999,118.034654,53.667034,0.047723,283.421821,1.251784
std,6572.822893,175.700777,135.339533,63.055726,6.855528,301.096262,9.975461
min,36.0,-75.0,0.0,-1415.0,-908.0,-1065.0,-43.8
25%,358.0,0.0,6.6,4.7,-0.8,45.0,0.0
50%,3452.0,0.0,69.1,30.2,-0.1,200.0,0.0
75%,9666.25,0.0,194.4,86.3,0.3,438.0,0.0
max,30675.0,3311.0,685.0,1502.0,66.3,15623.0,1153.0


In [32]:
data["Date"] = pd.to_datetime(data["Date"], errors="coerce")

In [33]:
STATE = "Andhra Pradesh"
df = data[data["State"] == STATE].copy()

In [34]:
df = df.dropna(subset=["Drawl Schedule", "OD(+) / UD(-)"])

In [35]:
df["Actual_Drawl"] = df["Drawl Schedule"] + df["OD(+) / UD(-)"]

In [43]:
df = df.sort_values("Date").reset_index(drop=True)

**Correlation between drawl and actual drawn**

In [37]:
corr = df["Drawl Schedule"].corr(df["Actual_Drawl"])
print("Correlation (S_t, A_t):", corr)

Correlation (S_t, A_t): 0.9972094125053577


Error 

In [38]:
df["Error"] = df["Actual_Drawl"] - df["Drawl Schedule"]

error_stats = {
    "mean_error": df["Error"].mean(),
    "std_error": df["Error"].std(),
    "max_overdraw": df["Error"].max(),
    "max_underdraw": df["Error"].min()
}

print(error_stats)

{'mean_error': np.float64(0.037566702241195356), 'std_error': 1.627138258741535, 'max_overdraw': 21.799999999999997, 'max_underdraw': -8.0}


**Relational Error , Error / Predicted Drawl**

In [39]:
df["Rel_OD"] = df["Error"] / df["Drawl Schedule"]

rel_stats = {
    "mean_rel_od": df["Rel_OD"].mean(),
    "std_rel_od": df["Rel_OD"].std(),
    "max_rel_od": df["Rel_OD"].max(),
    "min_rel_od": df["Rel_OD"].min()
}

print(rel_stats)

{'mean_rel_od': np.float64(-inf), 'std_rel_od': nan, 'max_rel_od': 5.0, 'min_rel_od': np.float64(-inf)}


**Rolling OD -  Mistakes do they stay or move on**

In [44]:
WINDOW = 100  # days

df["OD_roll_mean"] = df["Error"].rolling(WINDOW).mean()
df["OD_roll_std"]  = df["Error"].rolling(WINDOW).std()

In [45]:
for lag in [1, 2, 3, 7]:
    df[f"OD_lag_{lag}"] = df["Error"].shift(lag)
    corr_lag = df[f"OD_lag_{lag}"].corr(df["Actual_Drawl"])
    print(f"Corr(OD_t-{lag}, A_t): {corr_lag}")

Corr(OD_t-1, A_t): 0.2764334845350078
Corr(OD_t-2, A_t): 0.2336037361346498
Corr(OD_t-3, A_t): 0.19137349506160423
Corr(OD_t-7, A_t): 0.11861216228691987


In [46]:
from sklearn.linear_model import LinearRegression

X = df[["Drawl Schedule"]]
y = df["Actual_Drawl"]

model = LinearRegression()
model.fit(X, y)

print("Slope:", model.coef_[0])
print("Intercept:", model.intercept_)
print("R^2:", model.score(X, y))

Slope: 1.0156041826865632
Intercept: -1.0706266980158006
R^2: 0.9944266123892803


**A Summation of this but for all the states and considering the mean**

In [47]:
df = pd.read_csv("India_Elec_data_(Jan2020-Mar2025).csv")

In [48]:
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

# Drop rows needed for drawl logic
df = df.dropna(subset=["Drawl Schedule", "OD(+) / UD(-)"])

# Actual Drawl
df["Actual_Drawl"] = df["Drawl Schedule"] + df["OD(+) / UD(-)"]

# Error (OD)
df["Error"] = df["Actual_Drawl"] - df["Drawl Schedule"]

# Relative OD 
df["Rel_OD"] = np.where(
    df["Drawl Schedule"] != 0,
    df["Error"] / df["Drawl Schedule"],
    np.nan
)


In [49]:
state_summary = (
    df.groupby("State")
      .agg(
          mean_scheduled_drawl=("Drawl Schedule", "mean"),
          mean_actual_drawl=("Actual_Drawl", "mean"),
          mean_error=("Error", "mean"),
          std_error=("Error", "std"),
          mean_rel_od=("Rel_OD", "mean"),
          std_rel_od=("Rel_OD", "std"),
          observations=("Error", "count")
      )
      .reset_index()
)

In [50]:
corr_list = []

for state, sdf in df.groupby("State"):
    if len(sdf) > 30:
        corr = sdf["Drawl Schedule"].corr(sdf["Actual_Drawl"])
        corr_list.append({"State": state, "corr_S_A": corr})

corr_df = pd.DataFrame(corr_list)

# Merge correlation into summary
state_summary = state_summary.merge(corr_df, on="State", how="left")

# Sort by predictability
state_summary = state_summary.sort_values(
    by="corr_S_A", ascending=False
).reset_index(drop=True)

print(state_summary)

                   State  mean_scheduled_drawl  mean_actual_drawl  mean_error  \
0                     HP             15.128368          14.963895   -0.164474   
1              Telangana             96.218220          96.202754   -0.015466   
2                 Punjab             95.984958          94.842055   -1.142903   
3                  NR UP            151.738030         151.068273   -0.669756   
4                     MP            143.355427         141.176291   -2.179136   
5            West Bengal             40.578549          39.756777   -0.821772   
6                     DD              2.462153          -0.106597   -2.568750   
7           SR Karnataka             72.663289          72.278462   -0.384828   
8                  Delhi             79.746928          78.663347   -1.083581   
9                 Kerala             54.513983          54.905350    0.391367   
10               Haryana            114.984799         114.604131   -0.380667   
11               Gujarat    