In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report, auc
import warnings
warnings.filterwarnings('ignore')
# LOAD & COMBINE YOUR 3 CSV FILES
df1 = pd.read_csv('aadhar_enrollment1.csv')
df2 = pd.read_csv('aadhar_enrollment_2.csv')
df3 = pd.read_csv('aadhar_enrollment3.csv')
df = pd.concat([df1, df2, df3], ignore_index=True)
print("="*70)
print("DATA LOADED & COMBINED")
print("="*70)
print(f"Total rows: {len(df)}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nFirst 5 rows:")
print(df.head())
print(f"\nData types:")
print(df.dtypes)

DATA LOADED & COMBINED
Total rows: 1006029
Columns: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']

First 5 rows:
         date          state          district  pincode  age_0_5  age_5_17  \
0  02-03-2025      Meghalaya  East Khasi Hills   793121       11        61   
1  09-03-2025      Karnataka   Bengaluru Urban   560043       14        33   
2  09-03-2025  Uttar Pradesh      Kanpur Nagar   208001       29        82   
3  09-03-2025  Uttar Pradesh           Aligarh   202133       62        29   
4  09-03-2025      Karnataka   Bengaluru Urban   560016       14        16   

   age_18_greater  
0              37  
1              39  
2              12  
3              15  
4              21  

Data types:
date              object
state             object
district          object
pincode            int64
age_0_5            int64
age_5_17           int64
age_18_greater     int64
dtype: object


In [4]:
# State-wise biometric failure rates (from UIDAI)
state_failure_rates = {
    'Jharkhand': 0.49,
    'Rajasthan': 0.37,
    'Madhya Pradesh': 0.32,
    'Chhattisgarh': 0.28,
    'Odisha': 0.25,
    'Bihar': 0.24,
    'Uttar Pradesh': 0.23,
    'Assam': 0.22,
    'Andhra Pradesh': 0.20,
    'West Bengal': 0.21,
    'Telangana': 0.18,
    'Gujarat': 0.17,
    'Karnataka': 0.16,
    'Maharashtra': 0.14,
    'Tamil Nadu': 0.15,
    'Meghalaya': 0.22,
    'Punjab': 0.19,
}

# Map failure rates to dataframe
df['state_failure_rate'] = df['state'].map(state_failure_rates)
df['state_failure_rate'] = df['state_failure_rate'].fillna(0.20)  # Default: 20%

# Create binary target: high-risk if state failure > 35%
df['high_risk'] = (df['state_failure_rate'] > 0.35).astype(int)

print("\n" + "="*70)
print("TARGET VARIABLE CREATED")
print("="*70)
print(f"\nFailure rates by state (top 10):")
print(df.groupby('state')['state_failure_rate'].first().sort_values(ascending=False).head(10))
print(f"\nTarget distribution:")
print(df['high_risk'].value_counts())
print(f"Percentage high-risk: {df['high_risk'].mean()*100:.1f}%")



TARGET VARIABLE CREATED

Failure rates by state (top 10):
state
Jharkhand         0.49
Rajasthan         0.37
Madhya Pradesh    0.32
Chhattisgarh      0.28
Odisha            0.25
Bihar             0.24
Uttar Pradesh     0.23
Meghalaya         0.22
Assam             0.22
West Bengal       0.21
Name: state_failure_rate, dtype: float64

Target distribution:
high_risk
0    926652
1     79377
Name: count, dtype: int64
Percentage high-risk: 7.9%


In [5]:
# Since your data has age groups as COUNTS, we need to create features differently

# Create age group columns (you already have them as counts)
# Rename for clarity
df['total_enrollment'] = df['age_0_5'] + df['age_5_17'] + df['age_18_greater']

# Create proportion features
df['prop_age_0_5'] = df['age_0_5'] / (df['total_enrollment'] + 1)
df['prop_age_5_17'] = df['age_5_17'] / (df['total_enrollment'] + 1)
df['prop_age_18_greater'] = df['age_18_greater'] / (df['total_enrollment'] + 1)

# Encode state
df['state_encoded'] = pd.factorize(df['state'])[0]

# Encode district
df['district_encoded'] = pd.factorize(df['district'])[0]

# Select features for model
features_to_use = [
    'state_encoded', 
    'prop_age_0_5', 
    'prop_age_5_17', 
    'prop_age_18_greater',
    'state_failure_rate'
]

print("\n" + "="*70)
print("FEATURES CREATED")
print("="*70)
print(f"Features: {features_to_use}")
print(f"Total enrollment stats:")
print(df['total_enrollment'].describe())



FEATURES CREATED
Features: ['state_encoded', 'prop_age_0_5', 'prop_age_5_17', 'prop_age_18_greater', 'state_failure_rate']
Total enrollment stats:
count    1.006029e+06
mean     5.403127e+00
std      3.158275e+01
min      1.000000e+00
25%      1.000000e+00
50%      2.000000e+00
75%      5.000000e+00
max      3.965000e+03
Name: total_enrollment, dtype: float64


In [6]:
print("\n" + "="*70)
print("MODEL TRAINING")
print("="*70)

# Prepare data
X = df[features_to_use].copy()
y = df['high_risk'].copy()

# Remove missing values
mask = X.notna().all(axis=1) & y.notna()
X = X[mask]
y = y[mask]

print(f"Data ready: {len(X)} samples, {X.shape[1]} features")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train: {len(X_train)} | Test: {len(X_test)}")
print(f"Train high-risk: {y_train.sum()} ({y_train.mean()*100:.1f}%)")
print(f"Test high-risk: {y_test.sum()} ({y_test.mean()*100:.1f}%)")

# Train model
print("\n‚è≥ Training Random Forest (1-2 minutes)...")
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)
print("‚úì Model trained!")

# Evaluate
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"\n" + "="*70)
print("MODEL PERFORMANCE")
print("="*70)
print(f"ROC AUC: {auc_score:.4f}")
print(f"\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Low Risk', 'High Risk']))

# Cross-validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
print(f"\n5-Fold CV ROC AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# Feature importance
feature_importance = pd.Series(
    model.feature_importances_,
    index=features_to_use
).sort_values(ascending=False)

print(f"\nFeature Importance:")
print(feature_importance)

# Save model
import pickle
with open('biometric_fail_model.pkl', 'wb') as f:
    pickle.dump(model, f)
print("\n‚úì Model saved!")



MODEL TRAINING
Data ready: 1006029 samples, 5 features
Train: 804823 | Test: 201206
Train high-risk: 63502 (7.9%)
Test high-risk: 15875 (7.9%)

‚è≥ Training Random Forest (1-2 minutes)...
‚úì Model trained!

MODEL PERFORMANCE
ROC AUC: 1.0000

Confusion Matrix:
[[185331      0]
 [     0  15875]]

Classification Report:
              precision    recall  f1-score   support

    Low Risk       1.00      1.00      1.00    185331
   High Risk       1.00      1.00      1.00     15875

    accuracy                           1.00    201206
   macro avg       1.00      1.00      1.00    201206
weighted avg       1.00      1.00      1.00    201206


5-Fold CV ROC AUC: 1.0000 (+/- 0.0000)

Feature Importance:
state_failure_rate     0.845878
state_encoded          0.146856
prop_age_5_17          0.006173
prop_age_0_5           0.001081
prop_age_18_greater    0.000012
dtype: float64

‚úì Model saved!


In [7]:
print("\n" + "="*70)
print("GENERATING RISK SCORES")
print("="*70)

# Add risk scores
all_X = df[features_to_use].copy()
df['risk_score'] = model.predict_proba(all_X)[:, 1]

df['risk_category'] = pd.cut(
    df['risk_score'],
    bins=[0, 0.33, 0.67, 1.0],
    labels=['Low', 'Medium', 'High']
)

print(f"\nRisk Score Distribution:")
print(df['risk_category'].value_counts())

# High-risk locations
print(f"\nüéØ Top 15 High-Risk Locations (Risk Score > 0.60):")
high_risk_locs = df[df['risk_score'] > 0.60].groupby(['state', 'district']).size()
print(high_risk_locs.nlargest(15))

# Save
df.to_csv('aadhaar_with_risk_scores.csv', index=False)
print("\n‚úì Saved: aadhaar_with_risk_scores.csv")



GENERATING RISK SCORES

Risk Score Distribution:
risk_category
High      79377
Low         118
Medium        0
Name: count, dtype: int64

üéØ Top 15 High-Risk Locations (Risk Score > 0.60):
state      district  
Rajasthan  Jaipur        4670
           Sikar         2861
           Alwar         2720
           Nagaur        2672
           Ajmer         2528
           Jodhpur       2471
           Jhunjhunun    2419
           Udaipur       2369
           Pali          2297
Jharkhand  Dhanbad       2150
Rajasthan  Bharatpur     2081
           Bhilwara      1932
Jharkhand  Ranchi        1824
Rajasthan  Barmer        1742
           Kota          1681
dtype: int64

‚úì Saved: aadhaar_with_risk_scores.csv


In [8]:
# CHART 1: State Failure Rates
top_states = df.groupby('state')['state_failure_rate'].first().nlargest(10)

fig, ax = plt.subplots(figsize=(11, 7))
colors = ['#d62728' if x > 0.35 else '#1f77b4' for x in top_states.values]
top_states.plot(kind='barh', ax=ax, color=colors)

ax.set_xlabel('Authentication Failure Rate (%)', fontsize=12, fontweight='bold')
ax.set_ylabel('State', fontsize=12, fontweight='bold')
ax.set_title('Top 10 States by Biometric Authentication Failure Rate', 
             fontsize=14, fontweight='bold', pad=20)
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

for i, v in enumerate(top_states.values):
    ax.text(v + 0.01, i, f'{v:.1%}', va='center', fontweight='bold')

plt.tight_layout()
plt.savefig('chart_1_state_failures.png', dpi=300, bbox_inches='tight')
plt.close()
print("‚úì Chart 1 saved")



‚úì Chart 1 saved


In [9]:
# CHART 2: Risk vs Age Distribution
df['dominant_age_group'] = df[['age_0_5', 'age_5_17', 'age_18_greater']].idxmax(axis=1)
df['dominant_age_group'] = df['dominant_age_group'].map({
    'age_0_5': '0-5',
    'age_5_17': '5-17',
    'age_18_greater': '18+'
})

age_risk = df.groupby('dominant_age_group')['risk_score'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
age_risk.plot(kind='bar', ax=ax, color=['#ff7f0e', '#2ca02c', '#d62728'])

ax.set_xlabel('Dominant Age Group', fontsize=12, fontweight='bold')
ax.set_ylabel('Average Risk Score', fontsize=12, fontweight='bold')
ax.set_title('Risk Score by Dominant Age Group in Location', 
             fontsize=14, fontweight='bold', pad=20)
ax.set_xticklabels(age_risk.index, rotation=45)

for i, v in enumerate(age_risk.values):
    ax.text(i, v + 0.01, f'{v:.2f}', ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig('chart_2_age_risk.png', dpi=300, bbox_inches='tight')
plt.close()
print("‚úì Chart 2 saved")


‚úì Chart 2 saved


In [10]:
# CHART 3: Heatmap
pivot_table = df.pivot_table(
    values='risk_score',
    index='state',
    columns='dominant_age_group',
    aggfunc='mean'
)

top_states_list = df.groupby('state')['state_failure_rate'].mean().nlargest(10).index
pivot_table = pivot_table.loc[top_states_list]

fig, ax = plt.subplots(figsize=(10, 9))
sns.heatmap(
    pivot_table,
    annot=True,
    fmt='.2f',
    cmap='RdYlGn_r',
    cbar_kws={'label': 'Risk Score'},
    ax=ax,
    linewidths=0.5
)

ax.set_title('Risk Score: State √ó Dominant Age Group (Top 10 States)', 
             fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Dominant Age Group', fontsize=12, fontweight='bold')
ax.set_ylabel('State', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('chart_3_heatmap.png', dpi=300, bbox_inches='tight')
plt.close()
print("‚úì Chart 3 saved")


‚úì Chart 3 saved


In [11]:
# CHART 4: Feature Importance
feature_importance = pd.Series(
    model.feature_importances_,
    index=features_to_use
).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
feature_importance.plot(kind='barh', ax=ax, color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd'])

ax.set_xlabel('Importance Score', fontsize=12, fontweight='bold')
ax.set_title('Feature Importance in Risk Prediction', 
             fontsize=14, fontweight='bold', pad=20)

for i, v in enumerate(feature_importance.values):
    ax.text(v + 0.01, i, f'{v:.3f}', va='center', fontweight='bold')

plt.tight_layout()
plt.savefig('chart_4_feature_importance.png', dpi=300, bbox_inches='tight')
plt.close()
print("‚úì Chart 4 saved")


‚úì Chart 4 saved


In [12]:
# CHART 5: ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, color='#d62728', lw=2.5, 
        label=f'ROC Curve (AUC = {roc_auc:.3f})')
ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--', label='Random Classifier')
ax.fill_between(fpr, tpr, alpha=0.2, color='#d62728')

ax.set_xlabel('False Positive Rate', fontsize=12, fontweight='bold')
ax.set_ylabel('True Positive Rate', fontsize=12, fontweight='bold')
ax.set_title('ROC Curve: Biometric Failure Risk Prediction', 
             fontsize=14, fontweight='bold', pad=20)
ax.legend(loc='lower right', fontsize=11)
ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('chart_5_roc_curve.png', dpi=300, bbox_inches='tight')
plt.close()
print("‚úì Chart 5 saved")


‚úì Chart 5 saved


In [13]:
print("\n" + "="*70)
print("KEY INSIGHTS (FOR REPORT)")
print("="*70)

print(f"\n1Ô∏è‚É£ HIGH-RISK STATES:")
high_risk_states = df.groupby('state')['state_failure_rate'].first().nlargest(5)
for state, rate in high_risk_states.items():
    print(f"   {state}: {rate:.1%}")

print(f"\n2Ô∏è‚É£ HIGHEST-RISK LOCATIONS:")
top_risk_locs = df.nlargest(5, 'risk_score')[['state', 'district', 'pincode', 'risk_score']]
print(top_risk_locs)

print(f"\n3Ô∏è‚É£ MODEL PERFORMANCE:")
print(f"   ROC AUC: {auc_score:.3f}")
print(f"   Feature importance ranking:")
for i, (feat, score) in enumerate(feature_importance.items(), 1):
    print(f"     {i}. {feat}: {score:.3f}")

print(f"\n4Ô∏è‚É£ ESTIMATED IMPACT:")
high_risk_pincodes = (df['high_risk'] == 1).sum()
print(f"   High-risk PIN codes: {high_risk_pincodes:,} ({high_risk_pincodes/len(df)*100:.1f}%)")
print(f"   Population affected: ~{df[df['high_risk']==1]['total_enrollment'].sum():,}")

print("\n" + "="*70)
print("‚úÖ ANALYSIS COMPLETE - READY FOR REPORT")
print("="*70)



KEY INSIGHTS (FOR REPORT)

1Ô∏è‚É£ HIGH-RISK STATES:
   Jharkhand: 49.0%
   Rajasthan: 37.0%
   Madhya Pradesh: 32.0%
   Chhattisgarh: 28.0%
   Odisha: 25.0%

2Ô∏è‚É£ HIGHEST-RISK LOCATIONS:
         state district  pincode  risk_score
19   Rajasthan    Sikar   332001         1.0
147  Jharkhand  Deoghar   815353         1.0
176  Jharkhand   Ranchi   834001         1.0
219  Rajasthan    Jalor   343049         1.0
233  Rajasthan  Jodhpur   342301         1.0

3Ô∏è‚É£ MODEL PERFORMANCE:
   ROC AUC: 1.000
   Feature importance ranking:
     1. state_failure_rate: 0.846
     2. state_encoded: 0.147
     3. prop_age_5_17: 0.006
     4. prop_age_0_5: 0.001
     5. prop_age_18_greater: 0.000

4Ô∏è‚É£ ESTIMATED IMPACT:
   High-risk PIN codes: 79,377 (7.9%)
   Population affected: ~505,997

‚úÖ ANALYSIS COMPLETE - READY FOR REPORT
