**Import all Required Libraries**

In [163]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import matplotlib.pyplot as plt

**Load RAW Data**

In [164]:
df_raw = pd.read_csv("/content/MH Population.csv")
df_raw.columns = df_raw.columns.str.strip()
print("Raw Shape:", df_raw.shape)

Raw Shape: (35, 28)


------
# **Regression Prediction**

In [165]:
reg_targets = ["Population_2021", "Population_2031"]
df_model_reg = df_raw.copy()

**Encode categorical columns for Regression**

In [166]:
cat_cols_reg = df_model_reg.select_dtypes(include="object").columns
cat_cols_reg = [col for col in cat_cols_reg if col not in reg_targets]
le_dict_reg = {}
for col in cat_cols_reg:
    le = LabelEncoder()
    df_model_reg[col] = le.fit_transform(df_model_reg[col])
    le_dict_reg[col] = le

**Split Features and Target**

In [167]:
X_reg = df_model_reg.drop(columns=reg_targets + ["District"])
y_reg = df_model_reg[reg_targets]

**Train-test split**

In [168]:
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
                X_reg, y_reg, test_size=0.2, random_state=42)


**Multi-output Regression**

In [169]:
reg_model = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1))

reg_model.fit(X_train_reg, y_train_reg)

**Metrics Evaluation**

In [170]:
r2_scores = reg_model.score(X_test_reg, y_test_reg)
print("Regression Test R²:", round(r2_scores, 2))

Regression Test R²: 0.94


**Predict all districts**

In [171]:
all_reg_predictions = reg_model.predict(X_reg)
df_reg_predictions = df_raw[["District"]].copy()
df_reg_predictions["Predicted_Population_2021"] = all_reg_predictions[:, 0].astype(int)
df_reg_predictions["Predicted_Population_2031"] = all_reg_predictions[:, 1].astype(int)

-------
# **CLASSIFICATION PREDICTION**

In [172]:
class_targets = [
    "Growth Category",
    "Urban Pressure Category",
    "Migration Zone",
    "Negative_Growth_Flag",
    "Pandemic Impact"]

In [173]:
df_model_clf = df_raw.copy()

In [174]:
df_model_clf.columns = df_model_clf.columns.str.strip()

**Ensure all class Targets Exist**

In [175]:
missing_cols = [col for col in class_targets if col not in df_model_clf.columns]
if missing_cols:
    raise ValueError(f"Missing classification columns: {missing_cols}")

In [176]:
feature_cols_clf = [col for col in df_model_clf.columns if col not in class_targets + reg_targets + ["District"]]

**Encode categorical features**

In [177]:
categorical_features = df_model_clf[feature_cols_clf].select_dtypes(include="object").columns
feature_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    df_model_clf[col] = le.fit_transform(df_model_clf[col])
    feature_encoders[col] = le

**Encode class targets**

In [178]:
target_encoders = {}
for col in class_targets:
    le = LabelEncoder()
    df_model_clf[col] = le.fit_transform(df_model_clf[col])
    target_encoders[col] = le

**Split features and targets**

In [179]:
X_clf = df_model_clf[feature_cols_clf]
y_clf = df_model_clf[class_targets]

**Train-test split**

In [180]:
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42)

**Multi-output classifier**

In [181]:
clf_model = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))

clf_model.fit(X_train_clf, y_train_clf)

In [182]:
from sklearn.metrics import accuracy_score

for i, col in enumerate(class_targets):
    acc = accuracy_score(y_clf[col], all_clf_predictions[:, i])
    print(f"{col} Accuracy: {acc:.2f}")

Growth Category Accuracy: 0.97
Urban Pressure Category Accuracy: 0.97
Migration Zone Accuracy: 0.94
Negative_Growth_Flag Accuracy: 1.00
Pandemic Impact Accuracy: 1.00


**Predict all districts**

In [183]:
all_clf_predictions = clf_model.predict(X_clf)

**Decode predictions**

In [184]:
df_clf_predictions = df_raw[["District"]].copy()
for i, col in enumerate(class_targets):
    df_clf_predictions[col] = target_encoders[col].inverse_transform(all_clf_predictions[:, i])

**MERGE REGRESSION + CLASSIFICATION**

In [185]:
df_final = df_reg_predictions.merge(df_clf_predictions, on="District")

**EXPORT FINAL CSV**

In [186]:
df_final

Unnamed: 0,District,Predicted_Population_2021,Predicted_Population_2031,Growth Category,Urban Pressure Category,Migration Zone,Negative_Growth_Flag,Pandemic Impact
0,Thane,11564305,13735062,Rapid Growth,Very High Density,High In-Migration,0,Mild Impact
1,Pune,10009266,11751463,Rapid Growth,High Density,High In-Migration,0,Mild Impact
2,Mumbai Suburban,9276498,10067167,Low Growth,Extreme Density,Extreme In-Migration,0,Severe Impact
3,Nashik,6217641,6969623,Moderate Growth,Moderate Density,Moderate In-Migration,0,Mild Impact
4,Nagpur,4874406,5241265,Moderate Growth,Moderate Density,Moderate In-Migration,0,Mild Impact
5,Ahmadnagar,4775993,5142766,Moderate Growth,Moderate Density,Moderate In-Migration,0,Mild Impact
6,Solapur,4571633,4886726,Moderate Growth,Moderate Density,Moderate In-Migration,0,Mild Impact
7,Jalgaon,4597111,4990674,Moderate Growth,Moderate Density,Moderate In-Migration,0,Mild Impact
8,Kolhapur,4196191,4446990,Moderate Growth,High Density,Moderate In-Migration,0,Mild Impact
9,Aurangabad,5197938,5620613,Rapid Growth,Moderate Density,High In-Migration,0,Mild Impact


In [187]:
df_final.to_csv("MH_Population_Full_Predictions.csv", index=False)