<a href="https://colab.research.google.com/github/Subhr74/Git-files/blob/master/data_hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1. Install and import basics (Colab / local)
!pip install pandas matplotlib seaborn --quiet

import pandas as pd
import glob
import numpy as np


In [2]:
# 2. Set a folder path where you have saved the 12 CSVs
# Example: "/content/aadhaar_data" in Colab or "D:/uidai" on your PC
DATA_DIR = "/content/aadhaar_data"   # <-- CHANGE THIS PATH

# 3. Create full file patterns
bio_pattern  = DATA_DIR + "/api_data_aadhar_biometric_*.csv"
demo_pattern = DATA_DIR + "/api_data_aadhar_demographic_*.csv"
enr_pattern  = DATA_DIR + "/api_data_aadhar_enrolment_*.csv"


In [5]:
# 2. Set a folder path where you have saved the 12 CSVs
# Example: "/content/aadhaar_data" in Colab or "D:/uidai" on your PC
DATA_DIR = "/content/data_hackathon"   # <-- CHANGE THIS PATH

# 3. Create full file patterns
bio_pattern  = DATA_DIR + "/api_data_aadhar_biometric_*.csv"
demo_pattern = DATA_DIR + "/api_data_aadhar_demographic_*.csv"
enr_pattern  = DATA_DIR + "/api_data_aadhar_enrolment_*.csv"

# 4. Read and concatenate biometric CSVs
bio_files = glob.glob(bio_pattern)
print("Biometric files:", bio_files)

df_bio = pd.concat(
    (pd.read_csv(f) for f in bio_files),
    ignore_index=True
)
df_bio.head()

Biometric files: ['/content/data_hackathon/api_data_aadhar_biometric_1000000_1500000.csv', '/content/data_hackathon/api_data_aadhar_biometric_0_500000.csv', '/content/data_hackathon/api_data_aadhar_biometric_500000_1000000.csv', '/content/data_hackathon/api_data_aadhar_biometric_1500000_1861108.csv']


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,07-11-2025,Haryana,Yamuna Nagar,135002,4,6
1,07-11-2025,Haryana,Yamunanagar,135001,1,2
2,07-11-2025,Himachal Pradesh,Bilaspur,174004,2,2
3,07-11-2025,Himachal Pradesh,Bilaspur,174005,1,0
4,07-11-2025,Himachal Pradesh,Bilaspur,174013,3,1


In [6]:
# Biometric: columns = date,state,district,pincode,bio_age_5_17,bio_age_17_
df_bio["date"] = pd.to_datetime(df_bio["date"], format="%d-%m-%Y")
df_bio["month"] = df_bio["date"].dt.to_period("M").dt.to_timestamp()

df_bio["bio_age_5_17"] = pd.to_numeric(df_bio["bio_age_5_17"], errors="coerce").fillna(0)
df_bio["bio_age_17_"]  = pd.to_numeric(df_bio["bio_age_17_"],  errors="coerce").fillna(0)
df_bio["bio_total"]    = df_bio["bio_age_5_17"] + df_bio["bio_age_17_"]

bio_pin_month = (df_bio
    .groupby(["state", "district", "pincode", "month"], as_index=False)
    .agg({
        "bio_age_5_17": "sum",
        "bio_age_17_":  "sum",
        "bio_total":    "sum"
    }))

bio_dist_month = (df_bio
    .groupby(["state", "district", "month"], as_index=False)
    .agg({
        "bio_age_5_17": "sum",
        "bio_age_17_":  "sum",
        "bio_total":    "sum"
    }))

bio_dist_month.head()


Unnamed: 0,state,district,month,bio_age_5_17,bio_age_17_,bio_total
0,Andaman & Nicobar Islands,Andamans,2025-03-01,16,193,209
1,Andaman & Nicobar Islands,Andamans,2025-04-01,17,167,184
2,Andaman & Nicobar Islands,Andamans,2025-05-01,22,158,180
3,Andaman & Nicobar Islands,Andamans,2025-06-01,11,129,140
4,Andaman & Nicobar Islands,Andamans,2025-07-01,20,239,259


In [10]:
# 1) READ DEMOGRAPHIC DATA
demo_files = glob.glob(demo_pattern)
print("Demographic files:", demo_files)

df_demo = pd.concat(
    (pd.read_csv(f) for f in demo_files),
    ignore_index=True
)

print(df_demo.columns)      # check columns
df_demo.head()


Demographic files: ['/content/data_hackathon/api_data_aadhar_demographic_1500000_2000000.csv', '/content/data_hackathon/api_data_aadhar_demographic_1000000_1500000.csv', '/content/data_hackathon/api_data_aadhar_demographic_2000000_2071700.csv', '/content/data_hackathon/api_data_aadhar_demographic_500000_1000000.csv', '/content/data_hackathon/api_data_aadhar_demographic_0_500000.csv']
Index(['date', 'state', 'district', 'pincode', 'demo_age_5_17',
       'demo_age_17_'],
      dtype='object')


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,18-11-2025,Karnataka,Hasan,573118,0,2
1,18-11-2025,Karnataka,Hasan,573124,1,3
2,18-11-2025,Karnataka,Hasan,573150,0,2
3,18-11-2025,Karnataka,Hassan,573113,0,1
4,18-11-2025,Karnataka,Hassan,573120,1,4


In [11]:
# 2) CLEAN + AGGREGATE DEMOGRAPHIC DATA

# If last two columns are the age groups, rename them if needed
if "demo_age_5_17" not in df_demo.columns:
    cols = list(df_demo.columns)
    df_demo = df_demo.rename(columns={
        cols[-2]: "demo_age_5_17",
        cols[-1]: "demo_age_17_"
    })

# Parse date and create month column
df_demo["date"] = pd.to_datetime(df_demo["date"], format="%d-%m-%Y")
df_demo["month"] = df_demo["date"].dt.to_period("M").dt.to_timestamp()

# Numeric conversion and totals
df_demo["demo_age_5_17"] = pd.to_numeric(df_demo["demo_age_5_17"], errors="coerce").fillna(0)
df_demo["demo_age_17_"]  = pd.to_numeric(df_demo["demo_age_17_"],  errors="coerce").fillna(0)
df_demo["demo_total"]    = df_demo["demo_age_5_17"] + df_demo["demo_age_17_"]

# Aggregate to pincode-month
demo_pin_month = (df_demo
    .groupby(["state", "district", "pincode", "month"], as_index=False)
    .agg({
        "demo_age_5_17": "sum",
        "demo_age_17_":  "sum",
        "demo_total":    "sum"
    }))

# Aggregate to district-month
demo_dist_month = (df_demo
    .groupby(["state", "district", "month"], as_index=False)
    .agg({
        "demo_age_5_17": "sum",
        "demo_age_17_":  "sum",
        "demo_total":    "sum"
    }))

demo_dist_month.head()


Unnamed: 0,state,district,month,demo_age_5_17,demo_age_17_,demo_total
0,100000,100000,2025-12-01,0,2,2
1,Andaman & Nicobar Islands,Andamans,2025-09-01,3,159,162
2,Andaman & Nicobar Islands,Andamans,2025-10-01,2,73,75
3,Andaman & Nicobar Islands,Andamans,2025-11-01,0,212,212
4,Andaman & Nicobar Islands,Andamans,2025-12-01,2,299,301


In [12]:
# 1) READ ENROLMENT DATA
enr_files = glob.glob(enr_pattern)
print("Enrolment files:", enr_files)

df_enr = pd.concat(
    (pd.read_csv(f) for f in enr_files),
    ignore_index=True
)

print(df_enr.columns)
df_enr.head()


Enrolment files: ['/content/data_hackathon/api_data_aadhar_enrolment_1000000_1006029.csv', '/content/data_hackathon/api_data_aadhar_enrolment_0_500000.csv', '/content/data_hackathon/api_data_aadhar_enrolment_500000_1000000.csv']
Index(['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17',
       'age_18_greater'],
      dtype='object')


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,31-12-2025,Karnataka,Bidar,585330,2,3,0
1,31-12-2025,Karnataka,Bidar,585402,6,0,0
2,31-12-2025,Karnataka,Bidar,585413,1,0,0
3,31-12-2025,Karnataka,Bidar,585418,1,2,0
4,31-12-2025,Karnataka,Bidar,585421,4,3,0


In [13]:
# 2) CLEAN + AGGREGATE ENROLMENT DATA

df_enr["date"] = pd.to_datetime(df_enr["date"], format="%d-%m-%Y")
df_enr["month"] = df_enr["date"].dt.to_period("M").dt.to_timestamp()

df_enr["age_0_5"]        = pd.to_numeric(df_enr["age_0_5"],        errors="coerce").fillna(0)
df_enr["age_5_17"]       = pd.to_numeric(df_enr["age_5_17"],       errors="coerce").fillna(0)
df_enr["age_18_greater"] = pd.to_numeric(df_enr["age_18_greater"], errors="coerce").fillna(0)
df_enr["enr_total"]      = df_enr["age_0_5"] + df_enr["age_5_17"] + df_enr["age_18_greater"]

enr_pin_month = (df_enr
    .groupby(["state", "district", "pincode", "month"], as_index=False)
    .agg({
        "age_0_5":        "sum",
        "age_5_17":       "sum",
        "age_18_greater": "sum",
        "enr_total":      "sum"
    }))

enr_dist_month = (df_enr
    .groupby(["state", "district", "month"], as_index=False)
    .agg({
        "age_0_5":        "sum",
        "age_5_17":       "sum",
        "age_18_greater": "sum",
        "enr_total":      "sum"
    }))

enr_dist_month.head()


Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,enr_total
0,100000,100000,2025-09-01,0,0,12,12
1,100000,100000,2025-10-01,0,1,0,1
2,100000,100000,2025-11-01,0,0,11,11
3,100000,100000,2025-12-01,0,0,194,194
4,Andaman & Nicobar Islands,Andamans,2025-09-01,23,4,0,27


In [16]:
import numpy as np

def add_z_scores(df, group_cols, value_col, window=6):
    """
    For each group, compute rolling mean/std and z-score for value_col.
    """
    df = df.sort_values(group_cols + ["month"])
    df[value_col] = df[value_col].astype(float)

    def _per_group(g):
        g = g.sort_values("month")
        roll_mean = g[value_col].rolling(window=window, min_periods=3).mean()
        roll_std  = g[value_col].rolling(window=window, min_periods=3).std()
        g[value_col + "_exp"] = roll_mean
        g[value_col + "_std"] = roll_std.replace(0, np.nan)
        g[value_col + "_z"]   = (g[value_col] - g[value_col + "_exp"]) / g[value_col + "_std"]
        return g

    return df.groupby(group_cols, group_keys=False).apply(_per_group)


In [17]:
dist_month_z = dist_month.copy()

for col in ["enr_total", "bio_total", "demo_total"]:
    dist_month_z = add_z_scores(dist_month_z, ["state", "district"], col, window=6)

dist_month_z.head()


  return df.groupby(group_cols, group_keys=False).apply(_per_group)
  return df.groupby(group_cols, group_keys=False).apply(_per_group)
  return df.groupby(group_cols, group_keys=False).apply(_per_group)


Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,enr_total,bio_age_5_17,bio_age_17_,bio_total,...,demo_total,enr_total_exp,enr_total_std,enr_total_z,bio_total_exp,bio_total_std,bio_total_z,demo_total_exp,demo_total_std,demo_total_z
0,100000,100000,2025-09-01,0.0,0.0,12.0,12.0,0.0,0.0,0.0,...,0.0,,,,,,,,,
1,100000,100000,2025-10-01,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,,,,,,,,,
2,100000,100000,2025-11-01,0.0,0.0,11.0,11.0,0.0,0.0,0.0,...,0.0,8.0,6.082763,0.493197,0.0,,,0.0,,
3,100000,100000,2025-12-01,0.0,0.0,194.0,194.0,0.0,0.0,0.0,...,2.0,54.5,93.132522,1.497866,0.0,,,0.5,1.0,1.5
4,Andaman & Nicobar Islands,Andamans,2025-03-01,0.0,0.0,0.0,0.0,16.0,193.0,209.0,...,0.0,,,,,,,,,


In [18]:
# Ratios: biometric/enrolment and demographic/enrolment
dist_month_z["ratio_bio_enr"]  = dist_month_z["bio_total"]  / dist_month_z["enr_total"].replace(0, np.nan)
dist_month_z["ratio_demo_enr"] = dist_month_z["demo_total"] / dist_month_z["enr_total"].replace(0, np.nan)

for col in ["ratio_bio_enr", "ratio_demo_enr"]:
    dist_month_z = add_z_scores(dist_month_z, ["state", "district"], col, window=6)

dist_month_z.head()


  return df.groupby(group_cols, group_keys=False).apply(_per_group)
  return df.groupby(group_cols, group_keys=False).apply(_per_group)


Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,enr_total,bio_age_5_17,bio_age_17_,bio_total,...,demo_total_std,demo_total_z,ratio_bio_enr,ratio_demo_enr,ratio_bio_enr_exp,ratio_bio_enr_std,ratio_bio_enr_z,ratio_demo_enr_exp,ratio_demo_enr_std,ratio_demo_enr_z
0,100000,100000,2025-09-01,0.0,0.0,12.0,12.0,0.0,0.0,0.0,...,,,0.0,0.0,,,,,,
1,100000,100000,2025-10-01,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,,,0.0,0.0,,,,,,
2,100000,100000,2025-11-01,0.0,0.0,11.0,11.0,0.0,0.0,0.0,...,,,0.0,0.0,0.0,,,0.0,,
3,100000,100000,2025-12-01,0.0,0.0,194.0,194.0,0.0,0.0,0.0,...,1.0,1.5,0.0,0.010309,0.0,,,0.002577,0.005155,1.5
4,Andaman & Nicobar Islands,Andamans,2025-03-01,0.0,0.0,0.0,0.0,16.0,193.0,209.0,...,,,,,,,,,,


In [19]:
def row_risk(r):
    z_enr  = np.nan_to_num(r["enr_total_z"])
    z_bio  = np.nan_to_num(r["bio_total_z"])
    z_demo = np.nan_to_num(r["demo_total_z"])
    z_rbe  = np.nan_to_num(r["ratio_bio_enr_z"])
    z_rde  = np.nan_to_num(r["ratio_demo_enr_z"])
    return (
        1.0 * abs(z_enr) +
        1.5 * abs(z_bio) +
        1.0 * abs(z_demo) +
        2.0 * abs(z_rbe) +
        1.5 * abs(z_rde)
    )

dist_month_z["risk_score"] = dist_month_z.apply(row_risk, axis=1)


In [20]:
threshold = dist_month_z["risk_score"].quantile(0.99)

high_risk = (dist_month_z
    .loc[dist_month_z["risk_score"] >= threshold]
    .sort_values("risk_score", ascending=False))

high_risk[[
    "state", "district", "month",
    "enr_total", "bio_total", "demo_total",
    "enr_total_z", "bio_total_z", "demo_total_z",
    "ratio_bio_enr", "ratio_bio_enr_z",
    "ratio_demo_enr", "ratio_demo_enr_z",
    "risk_score"
]].head(50)


Unnamed: 0,state,district,month,enr_total,bio_total,demo_total,enr_total_z,bio_total_z,demo_total_z,ratio_bio_enr,ratio_bio_enr_z,ratio_demo_enr,ratio_demo_enr_z,risk_score
762,Assam,Dima Hasao,2025-12-01,3.0,10.0,135.0,-0.666601,1.469085,1.769832,3.333333,1.917649,45.0,1.995047,11.467928
5085,Nagaland,Dimapur,2025-12-01,54.0,3656.0,1819.0,-1.150476,1.51492,1.025493,67.703704,2.019097,33.685185,1.982278,11.45996
4699,Maharashtra,Satara,2025-09-01,1321.0,39301.0,7908.0,1.941525,1.928511,0.977498,29.750946,-1.424693,5.986374,1.788854,11.344456
4934,Meghalaya,South West Garo Hills,2025-10-01,206.0,854.0,874.0,-1.08688,1.739454,0.835199,4.145631,2.031655,4.242718,1.65667,11.079577
843,Assam,Karimganj,2025-12-01,1523.0,10118.0,13432.0,-0.527146,1.909417,1.439872,6.643467,1.782395,8.819435,1.688447,10.928604
7439,Uttar Pradesh,Bahraich,2025-12-01,2245.0,25919.0,38566.0,-0.970751,1.352005,1.443558,11.545212,1.847318,17.178619,1.82557,10.875307
4961,Meghalaya,West Jaintia Hills,2025-10-01,377.0,1032.0,629.0,-1.497475,1.527435,0.564567,2.737401,2.036746,1.668435,1.505144,10.684403
1331,Bihar,Siwan,2025-09-01,4297.0,9304.0,22042.0,2.00742,-1.665822,1.001586,2.165232,-1.452256,5.129625,1.5,10.662251
1980,Gujarat,Banas Kantha,2025-12-01,27.0,173.0,185.0,-0.561587,1.355524,1.429296,6.407407,1.891084,6.851852,1.882868,10.630639
5159,Nagaland,Peren,2025-12-01,8.0,322.0,310.0,-1.338023,-1.116276,0.540127,40.25,2.017132,38.75,2.00964,10.601288


In [21]:
OUTPUT_DIR = DATA_DIR

dist_month_z.to_csv(OUTPUT_DIR + "/aadhaar_district_month_with_risk.csv", index=False)
high_risk.to_csv(OUTPUT_DIR + "/aadhaar_district_month_high_risk.csv", index=False)
