In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder


In [5]:
df = pd.read_csv("district_governance_risk_index_with_labels.csv")
df.head()


Unnamed: 0,district,total_enrolment,total_biometric,total_population,enrol_norm,bio_norm,demo_norm,governance_risk_score,risk_category
0,100000,218,0.0,1.0,0.004967,0.0,2e-06,0.201986,Low Risk
1,24 Paraganas North,6147,0.0,0.0,0.140683,0.0,0.0,0.256273,Low Risk
2,24 Paraganas South,490,0.0,0.0,0.011193,0.0,0.0,0.204477,Low Risk
3,ANGUL,1,5.0,9.0,0.0,8e-06,2.1e-05,0.199999,Low Risk
4,ANUGUL,13,245.0,140.0,0.000275,0.000404,0.000319,0.200208,Low Risk


In [6]:
X = df[[
    "total_enrolment",
    "total_biometric",
    "total_population"
]]

y = df["risk_category"]


In [7]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Check mapping
list(zip(le.classes_, le.transform(le.classes_)))


[('High Risk', 0), ('Low Risk', 1), ('Medium Risk', 2)]

In [9]:
Low Risk    → 0
Medium Risk → 1
High Risk   → 2


SyntaxError: invalid character '→' (U+2192) (2150583950.py, line 1)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42
)


In [12]:
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

model.fit(X_train, y_train)


In [13]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9695431472081218

Classification Report:
               precision    recall  f1-score   support

           1       0.97      0.99      0.98       178
           2       0.93      0.74      0.82        19

    accuracy                           0.97       197
   macro avg       0.95      0.87      0.90       197
weighted avg       0.97      0.97      0.97       197



In [14]:
confusion_matrix(y_test, y_pred)


array([[177,   1],
       [  5,  14]], dtype=int64)

In [15]:
importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": model.feature_importances_
}).sort_values(by="Importance", ascending=False)

importance


Unnamed: 0,Feature,Importance
0,total_enrolment,0.429553
1,total_biometric,0.343164
2,total_population,0.227283


In [16]:
sample = pd.DataFrame({
    "total_enrolment": [850000],
    "total_biometric": [920000],
    "total_population": [1000000]
})

pred = model.predict(sample)
le.inverse_transform(pred)


array(['High Risk'], dtype=object)

In [17]:
['High Risk']


['High Risk']