In [4]:
import pandas as pd
# Load datasets
df1 = pd.read_csv('api_data_aadhar_demographic_0_500000.csv')
df2 = pd.read_csv('api_data_aadhar_demographic_500000_1000000.csv')
df3 = pd.read_csv('api_data_aadhar_demographic_1000000_1500000.csv')
df4 = pd.read_csv('api_data_aadhar_demographic_1500000_2000000.csv')
df5 = pd.read_csv('api_data_aadhar_demographic_2000000_2071700.csv')
# Concatenate rows
final_df = pd.concat([df1, df2, df3], ignore_index=True)

# Convert date column
final_df['date'] = pd.to_datetime(final_df['date'], format="%d-%m-%Y")

# Basic data quality checks
final_df.drop_duplicates(inplace=True)
final_df.fillna(0, inplace=True)

# Save final dataset
final_df.to_csv('aadhaar_demographic_final.csv', index=False)

States names availble in column

In [None]:
import pandas as pd

df = pd.read_csv("aadhaar_demographic_final.csv")

# Get unique states (sorted)
states = sorted(df['state'].dropna().unique())

# Print states
print("Total number of unique states:", len(states))
for state in states:
    print(state)

Total number of unique states: 63
100000
Andaman & Nicobar Islands
Andaman and Nicobar Islands
Andhra Pradesh
Arunachal Pradesh
Assam
BALANAGAR
Bihar
Chandigarh
Chhatisgarh
Chhattisgarh
Dadra & Nagar Haveli
Dadra and Nagar Haveli
Dadra and Nagar Haveli and Daman and Diu
Daman & Diu
Daman and Diu
Darbhanga
Delhi
Goa
Gujarat
Haryana
Himachal Pradesh
Jaipur
Jammu & Kashmir
Jammu and Kashmir
Jharkhand
Karnataka
Kerala
Ladakh
Lakshadweep
Madanapalle
Madhya Pradesh
Maharashtra
Manipur
Meghalaya
Mizoram
Nagaland
ODISHA
Odisha
Orissa
Pondicherry
Puducherry
Punjab
Puttenahalli
Rajasthan
Sikkim
Tamil Nadu
Telangana
Tripura
Uttar Pradesh
Uttarakhand
Uttaranchal
WEST BENGAL
WESTBENGAL
West  Bengal
West Bangal
West Bengal
West Bengli
West bengal
Westbengal
andhra pradesh
odisha
west Bengal


Basic Analysis

In [2]:
import pandas as pd


# LOAD DATASET

# Load the final Aadhaar demographic dataset for analysis
df = pd.read_csv("aadhaar_demographic_final.csv")

print("Dataset loaded successfully")



# BASIC DATASET OVERVIEW

# Understand dataset size, structure, and column data types
print("\n--- BASIC DATASET OVERVIEW ---")
print("Total rows:", df.shape[0])
print("Total columns:", df.shape[1])

print("\nColumns:")
print(df.columns.tolist())

print("\nData types:")
print(df.dtypes)



# MISSING VALUES & DUPLICATES

# Identify data quality issues such as missing values and duplicate rows
print("\n--- MISSING VALUES & DUPLICATES ---")
print("Missing values per column:")
print(df.isnull().sum())

total_duplicates = df.duplicated().sum()
print("Total duplicate rows:", total_duplicates)



# STATE & DISTRICT DISTRIBUTION

# Analyze geographical coverage of the dataset
print("\n--- STATE & DISTRICT DISTRIBUTION ---")
print("Total unique states:", df["state"].nunique())
print("Total unique districts:", df["district"].nunique())

districts_per_state = df.groupby("state")["district"].nunique()
print("\nDistricts per state:")
print(districts_per_state)



# RECORD COUNT ANALYSIS

# Identify states and districts with the highest number of records
print("\n--- RECORD COUNT ANALYSIS ---")

state_counts = df["state"].value_counts()
print("\nRecords per state:")
print(state_counts)

district_counts = df["district"].value_counts().head(10)
print("\nTop 10 districts by record count:")
print(district_counts)



# DATE RANGE ANALYSIS

# Validate temporal coverage and detect invalid dates
print("\n--- DATE RANGE ANALYSIS ---")
df["date"] = pd.to_datetime(df["date"], errors="coerce")

print("Date range:")
print("From:", df["date"].min())
print("To:", df["date"].max())



# OUTLIER DETECTION (IQR METHOD)

# Detect unusually high or low demographic values
print("\n--- OUTLIER DETECTION (IQR METHOD) ---")

for col in ['demo_age_5_17', 'demo_age_17_']:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 3 * IQR
    upper_bound = Q3 + 3 * IQR

    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    print(f"{col} outliers: {len(outliers)} ({len(outliers)/len(df)*100:.2f}%)")



# ZERO-VALUE VALIDATION

# Identify records where demographic values might be missing or unreported
print("\n--- ZERO-VALUE VALIDATION ---")

zero_both = df[
    (df['demo_age_5_17'] == 0) &
    (df['demo_age_17_'] == 0)
]
print(f"Rows with both age groups = 0: {len(zero_both)}")



# PINCODE VALIDATION

# Ensure pincodes fall within valid Indian postal code range
print("\n--- PINCODE VALIDATION ---")

invalid_pincodes = df[
    (df['pincode'] < 100000) |
    (df['pincode'] > 999999)
]

print(f"Invalid pincodes: {len(invalid_pincodes)}")

if len(invalid_pincodes) > 0:
    print("\nSample invalid pincodes:")
    print(invalid_pincodes[['state', 'district', 'pincode']].head())


print("\n--- PINCODE–STATE CONFLICT ANALYSIS ---")

# Count how many unique states each pincode appears in
pincode_state_conflict = (
    df.groupby("pincode")["state"]
    .nunique()
    .reset_index(name="state_count")
)

# Filter pincodes mapped to more than one state
conflicting_pincodes = pincode_state_conflict[
    pincode_state_conflict["state_count"] > 1
]

print(f"Total pincodes linked to multiple states: {len(conflicting_pincodes)}")

# Show top conflicting pincodes
print("\nSample conflicting pincodes:")
print(conflicting_pincodes.head(10))


Dataset loaded successfully

--- BASIC DATASET OVERVIEW ---
Total rows: 1291855
Total columns: 6

Columns:
['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

Data types:
date             object
state            object
district         object
pincode           int64
demo_age_5_17     int64
demo_age_17_      int64
dtype: object

--- MISSING VALUES & DUPLICATES ---
Missing values per column:
date             0
state            0
district         0
pincode          0
demo_age_5_17    0
demo_age_17_     0
dtype: int64
Total duplicate rows: 0

--- STATE & DISTRICT DISTRIBUTION ---
Total unique states: 63
Total unique districts: 975

Districts per state:
state
100000                          1
Andaman & Nicobar Islands       3
Andaman and Nicobar Islands     3
Andhra Pradesh                 45
Arunachal Pradesh              25
                               ..
West bengal                     3
Westbengal                      2
andhra pradesh                  2
odisha  

In [3]:
df.head(2)

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,2025-03-01,Uttar Pradesh,Gorakhpur,273213,49,529
1,2025-03-01,Andhra Pradesh,Chittoor,517132,22,375
