In [0]:
# Load the cleaned biometric CSV from the workspace volume.
# The first row is used as the header and Spark infers the column types.
df_bio = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/Volumes/workspace/default/clean_dataset/aadhar_biometric_full_data_cleaned.csv")
)


In [0]:
# Render the loaded biometric DataFrame as a table for a quick visual check.
df_bio.display()

date,bio_age_5_17,bio_age_17_,state,pincode,district,latitude,longitude
2025-03-01,280,577,Haryana,123029,Mahendragarh,28.25814,76.139164
2025-03-01,144,369,Bihar,852121,Madhepura,25.883496,86.600625
2025-03-01,643,1091,Jammu and Kashmir,185101,Poonch,1,1
2025-03-01,256,980,Bihar,802158,Bhojpur,25.526374,84.319455
2025-03-01,271,815,Tamil Nadu,625514,Madurai,9.943498,77.97672
2025-03-01,155,529,Maharashtra,416702,Ratnagiri,16.62214,73.54305
2025-03-01,75,143,Gujarat,388130,Anand,22.46,72.88
2025-03-01,192,298,Gujarat,382421,Gandhinagar,23.19,72.61
2025-03-01,122,214,Odisha,759025,Dhenkanal,20.6771616,85.4927399
2025-03-01,67,85,Gujarat,396055,Valsad,20.55507,73.01101


In [0]:
# Number of distinct values in the "district" column of the biometric data.
(
    df_bio
    .select("district")
    .distinct()
    .count()
)

789

In [0]:
# Number of distinct values in the "state" column of the biometric data.
(
    df_bio
    .select("state")
    .distinct()
    .count()
)

38

In [0]:
from pyspark.sql.functions import countDistinct

# List every district name that appears with more than one distinct state
# value, most-conflicted first. Up to 1000 rows are printed untruncated.
(
    df_bio
    .groupBy("district")
    .agg(countDistinct("state").alias("state_count"))
    .where("state_count > 1")
    .sort("state_count", ascending=False)
    .show(1000, truncate=False)
)


+------------------------+-----------+
|district                |state_count|
+------------------------+-----------+
|Valsad                  |3          |
|Daman                   |3          |
|Dadra And Nagar Haveli  |3          |
|East Godavari           |3          |
|Karaikal                |2          |
|Cuddalore               |2          |
|Nagarkurnool            |2          |
|Narayanpet              |2          |
|Karimnagar              |2          |
|Hamirpur                |2          |
|Yadadri Bhuvanagiri     |2          |
|Kamareddy               |2          |
|Haridwar                |2          |
|Pondicherry             |2          |
|Shamator                |2          |
|Thiruvallur             |2          |
|Mancherial              |2          |
|Jogulamba Gadwal        |2          |
|Kargil                  |2          |
|Mahe                    |2          |
|Chittoor                |2          |
|Warangal Urban          |2          |
|Pratapgarh              

In [0]:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number, count

# Count records per (district, state) pair.
df_ds_count = (
    df_bio.groupBy("district", "state")
          .count()
)

# Rank the states seen for each district by record count (largest first).
# The state name is a secondary sort key so that ties are broken the same way
# on every run; row_number() over a non-unique ordering is otherwise free to
# pick either tied state, which would make df_clean non-reproducible.
w = (
    Window.partitionBy("district")
          .orderBy(col("count").desc(), col("state").asc())
)

# Keep only the top-ranked ("majority") state for each district.
df_major_state = (
    df_ds_count
    .withColumn("rn", row_number().over(w))
    .filter(col("rn") == 1)
    .select("district", col("state").alias("final_state"))
)

# Replace each row's original state with its district's majority state.
# NOTE(review): the vote is keyed on district name alone, so districts that
# share a name across different states (the multi-state check output lists
# e.g. Hamirpur and Pratapgarh) are all assigned a single state — confirm this
# is intended.
df_clean = (
    df_bio.drop("state")
          .join(df_major_state, on="district", how="left")
          .withColumnRenamed("final_state", "state")
)


In [0]:
# Re-run the multi-state check on df_clean; an empty table means every
# district name now carries exactly one state value.
(
    df_clean
    .groupBy("district")
    .agg(countDistinct("state").alias("state_count"))
    .where("state_count > 1")
    .show()
)


+--------+-----------+
|district|state_count|
+--------+-----------+
+--------+-----------+



In [0]:
from pyspark.sql.functions import when

# Hard-coded state overrides for specific districts; every other row keeps
# the state it already has.
state_override = (
    when(
        col("district").isin("Daman", "Diu", "Dadra And Nagar Haveli"),
        "Dadra and Nagar Haveli and Daman and Diu",
    )
    .when(col("district").isin("Leh", "Kargil"), "Ladakh")
    .otherwise(col("state"))
)

df_clean = df_clean.withColumn("state", state_override)


In [0]:
# Total bio_age_5_17 per (district, state), largest total first.
df_age = (
    df_clean
    .groupBy("district", "state")
    .sum("bio_age_5_17")
    .sort("sum(bio_age_5_17)", ascending=False)
)



In [0]:
# Show the per-district totals computed into df_age.
df_age.display()

district,state,sum(bio_age_5_17)
Pune,Maharashtra,277807
Kurnool,Andhra Pradesh,246097
East Godavari,Andhra Pradesh,212831
Nashik,Maharashtra,208859
Visakhapatanam,Andhra Pradesh,205606
Sitapur,Uttar Pradesh,196826
Anantapur,Andhra Pradesh,192690
Guntur,Andhra Pradesh,192129
Thane,Maharashtra,188322
Mumbai Suburban,Maharashtra,187955


In [0]:
# Write df_age as CSV with a header row, replacing any previous output.
# coalesce(1) reduces the data to a single partition before writing.
(
    df_age
    .coalesce(1)
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save("/Volumes/workspace/default/clean_dataset/district_max_biometric_age_5_17")
)

In [0]:
# Print df_clean's column names and types.
df_clean.printSchema()

root
 |-- district: string (nullable = true)
 |-- date: date (nullable = true)
 |-- bio_age_5_17: integer (nullable = true)
 |-- bio_age_17_: integer (nullable = true)
 |-- pincode: integer (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- state: string (nullable = true)



In [0]:
# Total bio_age_17_ per (district, state), largest total first.
df_age = (
    df_clean
    .groupBy("district", "state")
    .sum("bio_age_17_")
    .sort("sum(bio_age_17_)", ascending=False)
)

# Write the totals as CSV with a header row, replacing any previous output.
# coalesce(1) reduces the data to a single partition before writing.
(
    df_age
    .coalesce(1)
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save("/Volumes/workspace/default/clean_dataset/district_max_biometric_age_17")
)

# Clean Dataset: Enrolment

In [0]:
# Load the cleaned enrolment CSV from the workspace volume.
# The first row is used as the header and Spark infers the column types.
df_enrol = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/Volumes/workspace/default/clean_dataset/aadhar_enrolment_full_data_cleaned.csv")
)


In [0]:
# Render the loaded enrolment DataFrame as a table for a quick visual check.
df_enrol.display()

date,pincode,age_0_5,age_5_17,age_18_greater,district,state,latitude,longitude
2025-03-02,793121,11,61,37,East Khasi Hills,Meghalaya,25.4419064,91.6690494
2025-03-09,560043,14,33,39,Bengaluru Urban,Karnataka,13.0137778,77.6523889
2025-03-09,208001,29,82,12,Kanpur Nagar,Uttar Pradesh,26.4695434,80.3260767
2025-03-09,202133,62,29,15,Aligarh,Uttar Pradesh,27.9516768,78.4680902
2025-03-09,560016,14,16,21,Bengaluru Urban,Karnataka,12.9853889,77.6771389
2025-03-09,843331,20,49,12,Sitamarhi,Bihar,17.3737,78.4898
2025-03-09,843330,23,24,42,Sitamarhi,Bihar,26.8429,85.58491
2025-03-09,271865,26,60,14,Bahraich,Uttar Pradesh,27.865943,81.496752
2025-03-09,283204,28,26,10,Firozabad,Uttar Pradesh,24.885596,78.885796
2025-03-09,845418,30,48,10,Purbi Champaran,Bihar,26.6912,85.1686


In [0]:
from pyspark.sql.functions import countDistinct

# List every district name in the enrolment data that appears with more than
# one distinct state value, most-conflicted first (up to 1000 rows, untruncated).
(
    df_enrol
    .groupBy("district")
    .agg(countDistinct("state").alias("state_count"))
    .where("state_count > 1")
    .sort("state_count", ascending=False)
    .show(1000, truncate=False)
)


+-----------+-----------+
|district   |state_count|
+-----------+-----------+
|Hamirpur   |2          |
|Pondicherry|2          |
|Kargil     |2          |
|Pratapgarh |2          |
|Aurangabad |2          |
|Bilaspur   |2          |
|Balrampur  |2          |
|Baleshwar  |2          |
+-----------+-----------+



In [0]:
from pyspark.sql.functions import concat_ws, lower, regexp_replace

# Add "district_key": state and district joined with "_", lower-cased, with
# every run of whitespace replaced by a single "_". Because the state is part
# of the key, equal district names under different states get different keys.
enrol_state_district = lower(concat_ws("_", "state", "district"))

df_final = df_enrol.withColumn(
    "district_key",
    regexp_replace(enrol_state_district, r"\s+", "_"),
)


In [0]:
from pyspark.sql.functions import when

# Map the district value "Baleshwar" to "Balasore"; all other districts are
# left unchanged.
df_final = df_final.withColumn(
    "district",
    when(col("district") == "Baleshwar", "Balasore")
    .otherwise(col("district"))
)

# district_key was derived from the district value *before* this rename, so
# the renamed rows would otherwise keep a "..._baleshwar" key that no longer
# matches their district column. Rebuild the key with the same recipe
# (state + "_" + district, lower-cased, whitespace runs -> "_") to keep the
# two columns consistent in the exported files.
df_final = df_final.withColumn(
    "district_key",
    regexp_replace(
        lower(concat_ws("_", "state", "district")),
        r"\s+",
        "_"
    )
)


In [0]:
# Print df_final's column names and types (district_key is now present).
df_final.printSchema()

root
 |-- date: date (nullable = true)
 |-- pincode: integer (nullable = true)
 |-- age_0_5: integer (nullable = true)
 |-- age_5_17: integer (nullable = true)
 |-- age_18_greater: integer (nullable = true)
 |-- district: string (nullable = true)
 |-- state: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- district_key: string (nullable = false)



In [0]:
# Total age_0_5 per (district, state), largest total first.
df_age = (
    df_final
    .groupBy("district", "state")
    .sum("age_0_5")
    .sort("sum(age_0_5)", ascending=False)
)

# Write the totals as CSV with a header row, replacing any previous output.
# coalesce(1) reduces the data to a single partition before writing.
(
    df_age
    .coalesce(1)
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save("/Volumes/workspace/default/clean_dataset/district_max_enroll_age_0_5")
)

In [0]:
# Print df_final's column names and types again before the next aggregations.
df_final.printSchema()

root
 |-- date: date (nullable = true)
 |-- pincode: integer (nullable = true)
 |-- age_0_5: integer (nullable = true)
 |-- age_5_17: integer (nullable = true)
 |-- age_18_greater: integer (nullable = true)
 |-- district: string (nullable = true)
 |-- state: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- district_key: string (nullable = false)



In [0]:
# Total age_5_17 per (district, state), largest total first.
df_age = (
    df_final
    .groupBy("district", "state")
    .sum("age_5_17")
    .sort("sum(age_5_17)", ascending=False)
)

# Write the totals as CSV with a header row, replacing any previous output.
# coalesce(1) reduces the data to a single partition before writing.
# NOTE(review): the directory name says "elroll" while the age_0_5 export uses
# "enroll" — path left unchanged here; confirm before renaming since
# downstream readers may depend on it.
(
    df_age
    .coalesce(1)
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save("/Volumes/workspace/default/clean_dataset/district_max_elroll_age_5_17")
)

In [0]:
# Total age_18_greater per (district, state), largest total first.
df_age = (
    df_final
    .groupBy("district", "state")
    .sum("age_18_greater")
    .sort("sum(age_18_greater)", ascending=False)
)

# Write the totals as CSV with a header row, replacing any previous output.
# coalesce(1) reduces the data to a single partition before writing.
# NOTE(review): the directory name says "elroll" and "age_17" although the
# aggregated column is age_18_greater — path left unchanged here; confirm
# before renaming since downstream readers may depend on it.
(
    df_age
    .coalesce(1)
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save("/Volumes/workspace/default/clean_dataset/district_max_elroll_age_17")
)

In [0]:
# Export the full row-level df_final (including district_key) as CSV with a
# header row, replacing any previous output. coalesce(1) reduces the data to a
# single partition before writing.
(
    df_final
    .coalesce(1)
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save("/Volumes/workspace/default/clean_dataset/district_max_enrollement")
)


# Clean Dataset: Demographic

In [0]:
# Load the cleaned demographic CSV from the workspace volume.
# The first row is used as the header and Spark infers the column types.
df_demo = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/Volumes/workspace/default/clean_dataset/aadhar_demographic_full_data_cleaned.csv")
)

In [0]:
# Render the loaded demographic DataFrame as a table for a quick visual check.
df_demo.display()

date,pincode,demo_age_5_17,demo_age_17_,district,state,latitude,longitude
2025-03-01,273213,49,529,Gorakhpur,Uttar Pradesh,26.499422,83.420327
2025-03-01,517132,22,375,Chittoor,Andhra Pradesh,13.0503,79.1707
2025-03-01,360006,65,765,Rajkot,Gujarat,22.3569496,70.7377804
2025-03-01,532484,24,314,Srikakulam,Andhra Pradesh,18.3523785,83.8586956
2025-03-01,313801,45,785,Udaipur,Rajasthan,24.25,73.7
2025-03-01,332028,28,285,Sikar,Rajasthan,27.5726158,74.8682569
2025-03-01,572201,88,332,Tumakuru,Karnataka,12.9829466,76.8628964
2025-03-01,273211,61,836,Gorakhpur,Uttar Pradesh,26.443438,83.471631
2025-03-01,518313,83,986,Kurnool,Andhra Pradesh,0,0
2025-03-01,721148,13,281,Medinipur West,West Bengal,24.0497,87.5756


In [0]:
from pyspark.sql.functions import countDistinct

# List every district name in the demographic data that appears with more than
# one distinct state value, most-conflicted first (up to 1000 rows, untruncated).
(
    df_demo
    .groupBy("district")
    .agg(countDistinct("state").alias("state_count"))
    .where("state_count > 1")
    .sort("state_count", ascending=False)
    .show(1000, truncate=False)
)


+-----------+-----------+
|district   |state_count|
+-----------+-----------+
|Hamirpur   |2          |
|Pondicherry|2          |
|Kargil     |2          |
|Pratapgarh |2          |
|Aurangabad |2          |
|Bilaspur   |2          |
|Balrampur  |2          |
|Baleshwar  |2          |
+-----------+-----------+



In [0]:
from pyspark.sql.functions import concat_ws, lower, regexp_replace

# Add "district_key": state and district joined with "_", lower-cased, with
# every run of whitespace replaced by a single "_".
# Note: this rebinds df_final, which the enrolment section also used; from
# here on it refers to the demographic data.
# NOTE(review): the enrolment section maps district "Baleshwar" to "Balasore";
# no such mapping is applied to the demographic data — confirm whether the
# two exports are meant to agree on district names.
demo_state_district = lower(concat_ws("_", "state", "district"))

df_final = df_demo.withColumn(
    "district_key",
    regexp_replace(demo_state_district, r"\s+", "_"),
)


In [0]:
# Total demo_age_17_ per (district, state), largest total first.
df_age = (
    df_final
    .groupBy("district", "state")
    .sum("demo_age_17_")
    .sort("sum(demo_age_17_)", ascending=False)
)

# Write the totals as CSV with a header row, replacing any previous output.
# coalesce(1) reduces the data to a single partition before writing.
(
    df_age
    .coalesce(1)
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save("/Volumes/workspace/default/clean_dataset/district_max_demo_age_17")
)

In [0]:
# Show the per-district demo_age_17_ totals that were just written out.
df_age.display()

district,state,sum(demo_age_17_)
24 Paraganas South,West Bengal,302522
Pune,Maharashtra,302080
Thane,Maharashtra,260395
Murshidabad,West Bengal,242463
Surat,Gujarat,229724
Bengaluru Urban,Karnataka,222904
24 Paraganas North,West Bengal,207243
Mumbai Suburban,Maharashtra,199513
Solapur,Maharashtra,194083
Nashik,Maharashtra,178445


In [0]:
# Total demo_age_5_17 per (district, state), largest total first.
df_age = (
    df_final
    .groupBy("district", "state")
    .sum("demo_age_5_17")
    .sort("sum(demo_age_5_17)", ascending=False)
)

# Write the totals as CSV with a header row, replacing any previous output.
# coalesce(1) reduces the data to a single partition before writing.
(
    df_age
    .coalesce(1)
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save("/Volumes/workspace/default/clean_dataset/district_max_demo_age_5_17")
)

In [0]:
# Export the full row-level df_final (demographic data plus district_key) as
# CSV with a header row, replacing any previous output. coalesce(1) reduces
# the data to a single partition before writing.
(
    df_final
    .coalesce(1)
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save("/Volumes/workspace/default/clean_dataset/district_max_demographic")
)
