In [5]:
# sector_performance_forecasting.py
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
import joblib  # for saving/loading model

file_path = "ongoing_projects2_clean.csv"
df = pd.read_csv(file_path).rename(columns=lambda c: c.strip())

# --- Clean and rename columns ---
# Distinguishing substring of each raw header -> canonical column name.
# Order matters: a header that matches several patterns takes the first one.
_COLUMN_PATTERNS = {
    "Original Cost": "Original_Cost_RsCr",
    "Latest Revised Cost": "Latest_Revised_Cost_RsCr",
    "Project Count": "Project_Count",
    "Cumulative": "Cumulative_Expenditure_RsCr",
}

# Build the whole mapping first and rename once, instead of re-creating the
# frame on every match.
rename_map = {}
for c in df.columns:
    for pattern, canonical in _COLUMN_PATTERNS.items():
        if pattern in c:
            rename_map[c] = canonical
            break
df = df.rename(columns=rename_map)

required_cols = list(_COLUMN_PATTERNS.values())

# Fail fast with a readable message: every one of these columns is indexed
# unconditionally further down, so a missing one would otherwise surface
# later as a bare KeyError.
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise KeyError(
        f"{file_path} is missing required column(s) {missing_cols}; "
        f"columns found: {list(df.columns)}"
    )

# Substring matching can send two raw headers to the same canonical name.
# df[col] would then be a DataFrame and pd.to_numeric would raise an obscure
# TypeError, so report the ambiguity explicitly instead.
ambiguous_cols = [col for col in required_cols if (df.columns == col).sum() > 1]
if ambiguous_cols:
    raise ValueError(
        f"More than one column in {file_path} maps to {ambiguous_cols}; "
        f"rename map used: {rename_map}"
    )

# Unparseable entries become NaN (errors="coerce") rather than raising.
for col in required_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# --- Prepare data ---
# Keep only rows whose target is present and strictly positive.
target_col = df["Latest_Revised_Cost_RsCr"]
has_valid_target = target_col.notna() & (target_col > 0)
df = df[has_valid_target].copy()

# Missing feature values are treated as zero.
for feature_name in ("Original_Cost_RsCr", "Project_Count", "Cumulative_Expenditure_RsCr"):
    df[feature_name] = df[feature_name].fillna(0)

# Feature matrix (column order matters to everything downstream) and target.
X = df[["Original_Cost_RsCr", "Project_Count", "Cumulative_Expenditure_RsCr"]].values
y = df["Latest_Revised_Cost_RsCr"].values
y_log = np.log1p(y)  # log-transform to reduce skew

# --- Train/Test split ---
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_log,
    test_size=0.2,
    random_state=42,
)

# --- Model training ---
# fit() returns the estimator itself, so construction and training chain.
model = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)

# --- Evaluate ---
# The model was trained on log1p(target); map predictions and held-out
# targets back to the original scale before scoring.
y_pred_log = model.predict(X_test)
y_test_real = np.expm1(y_test)
y_pred = np.expm1(y_pred_log)

rmse = math.sqrt(mean_squared_error(y_test_real, y_pred))
print(f"✅ RMSE on test set: {rmse:,.2f} Rs Cr")

# --- Feature importance ---
# Names are listed in the same order as the columns of the training matrix,
# so they line up with model.feature_importances_.
feature_names = ["Original_Cost_RsCr", "Project_Count", "Cumulative_Expenditure_RsCr"]
feat_imp = pd.DataFrame(
    {"feature": feature_names, "importance": model.feature_importances_}
)
feat_imp = feat_imp.sort_values("importance", ascending=False)  # most important first
print("\nFeature importances:\n", feat_imp.to_string(index=False))

# --- Save predictions for inspection ---
# Start from the held-out feature matrix, then append actual and predicted
# targets (both already mapped back from the log1p scale).
pred_df = pd.DataFrame(
    X_test,
    columns=["Original", "Project_Count", "Cumulative_Expenditure"],
)
pred_df["Actual_Latest_Revised_Cost"] = y_test_real
pred_df["Predicted_Latest_Revised_Cost"] = y_pred
pred_df.to_csv("forecast_predictions.csv", index=False)
print("\nSaved: forecast_predictions.csv")

# --- Save model for reuse ---
# Persist the fitted estimator so later cells can reload it from disk.
joblib.dump(model, "sector_forecast_model.pkl")
print("Saved model: sector_forecast_model.pkl")

✅ RMSE on test set: 2,839.38 Rs Cr

Feature importances:
                     feature  importance
         Original_Cost_RsCr    0.964358
Cumulative_Expenditure_RsCr    0.033854
              Project_Count    0.001788

Saved: forecast_predictions.csv
Saved model: sector_forecast_model.pkl


In [6]:

# ---------------------------------------------------------
# 🔮 Function to predict Latest Revised Cost for new projects
# ---------------------------------------------------------
def predict_new_project(original_cost, project_count, cumulative_expenditure,
                        model_path="sector_forecast_model.pkl", model=None):
    """
    Predicts the Latest Revised Cost (Rs Cr) for a new project.

    Parameters:
        original_cost (float): Original cost in Rs Cr
        project_count (int): Number of projects in that sector/state
        cumulative_expenditure (float): Cumulative expenditure so far (Rs Cr)
        model_path (str): Path to saved model; only read when `model` is None
        model: Optional already-loaded estimator exposing `predict(X)` that
            returns predictions on the log1p scale. Pass it to avoid
            re-reading the model file on every call.

    Returns:
        float: Predicted Latest Revised Cost (Rs Cr), rounded to 2 decimals
    """
    if model is None:
        # SECURITY: joblib.load unpickles the file, which can execute
        # arbitrary code. Only point model_path at files you trust.
        model = joblib.load(model_path)

    # One row, three features, in the same order the model was trained on.
    # dtype=float makes non-numeric input fail here rather than inside predict.
    X_new = np.array([[original_cost, project_count, cumulative_expenditure]], dtype=float)
    pred_log = model.predict(X_new)
    predicted_cost = np.expm1(pred_log[0])  # reverse log-transform
    # Convert to a built-in float so the return type matches the docstring
    # (np.expm1 yields a NumPy scalar).
    return round(float(predicted_cost), 2)


In [7]:


# ✅ Example usage
if __name__ == "__main__":
    # Illustrative inputs for a single project (costs in Rs Cr).
    sample_inputs = {
        "original_cost": 120.5,
        "project_count": 4,
        "cumulative_expenditure": 80.0,
    }
    sample_pred = predict_new_project(**sample_inputs)
    print(f"\n🔮 Predicted Latest Revised Cost: {sample_pred} Rs Cr")


🔮 Predicted Latest Revised Cost: 154.46 Rs Cr
