In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

print("Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

Libraries imported successfully!
Pandas version: 2.3.3
NumPy version: 2.4.1


## 1. Load All CSV Files from DemographicData Folder

In [2]:
# Define the data directory
data_dir = '../DemographicData/'

# Get all CSV files
csv_files = glob.glob(os.path.join(data_dir, '*.csv'))

print(f"Total CSV files found: {len(csv_files)}")
print(f"\nFirst 10 files:")
for i, file in enumerate(csv_files[:10], 1):
    print(f"{i}. {os.path.basename(file)}")

Total CSV files found: 70

First 10 files:
1. 19eac040-0b94-49fa-b239-4f2fd8677d53_7dcde79a721ffc4ecdc7f68f366ac414.csv
2. 19eac040-0b94-49fa-b239-4f2fd8677d53_fa802048c13528c797de0afaa774fdaa.csv
3. 19eac040-0b94-49fa-b239-4f2fd8677d53_d388120b43582663f7c85b0203be0efc.csv
4. 19eac040-0b94-49fa-b239-4f2fd8677d53_edc273ee70045c7cb71721466146e1a6.csv
5. 19eac040-0b94-49fa-b239-4f2fd8677d53_b46440737d913ff5eb923a6883d0fb61.csv
6. 19eac040-0b94-49fa-b239-4f2fd8677d53_4945fb4ff61ff6989d8eca62fec5a07f.csv
7. 19eac040-0b94-49fa-b239-4f2fd8677d53_abb2d7db85abababaee62af8e41db969.csv
8. 19eac040-0b94-49fa-b239-4f2fd8677d53_0f9f61219a5c46f7274f6dc93b3bd6dc.csv
9. 19eac040-0b94-49fa-b239-4f2fd8677d53_c4d016a8fe651670a733711f02ea143a.csv
10. 19eac040-0b94-49fa-b239-4f2fd8677d53_ce9f410f62313125d0fc55ab4ba972af.csv


In [3]:
# Load all CSV files and combine them
print("Loading all CSV files...")
print("This may take a few moments...\n")

df_list = []
failed_files = []

for i, file in enumerate(csv_files, 1):
    try:
        df_temp = pd.read_csv(file, low_memory=False)
        df_list.append(df_temp)
        if i % 10 == 0:
            print(f"Loaded {i}/{len(csv_files)} files...")
    except Exception as e:
        failed_files.append((file, str(e)))
        print(f"Failed to load {os.path.basename(file)}: {e}")

# Combine all dataframes
df_raw = pd.concat(df_list, ignore_index=True)

print(f"\n✓ Successfully loaded {len(df_list)} files")
print(f"✗ Failed to load {len(failed_files)} files")
print(f"\nTotal records loaded: {len(df_raw):,}")

Loading all CSV files...
This may take a few moments...



Loaded 10/70 files...


Loaded 20/70 files...


Loaded 30/70 files...
Loaded 40/70 files...


Loaded 50/70 files...


Loaded 60/70 files...


Loaded 70/70 files...

✓ Successfully loaded 70 files
✗ Failed to load 0 files

Total records loaded: 2,375,882


## 2. Verify Data Structure

In [4]:
# Display basic information
print("Dataset Shape:")
print(f"Rows: {df_raw.shape[0]:,}")
print(f"Columns: {df_raw.shape[1]}")
print("\nColumn Names:")
print(df_raw.columns.tolist())
print("\nData Types:")
print(df_raw.dtypes)
print("\nFirst 10 rows:")
df_raw.head(10)

Dataset Shape:
Rows: 2,375,882
Columns: 6

Column Names:
['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

Data Types:
date             object
state            object
district         object
pincode           int64
demo_age_5_17     int64
demo_age_17_      int64
dtype: object

First 10 rows:


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,26-12-2025,Jaipur,Near meera hospital,302016,0,1
1,22-12-2025,Jaipur,Near meera hospital,302016,0,1
2,13-12-2025,Nagpur,Near Uday nagar NIT garden,440024,0,1
3,22-12-2025,Nagpur,Near Uday nagar NIT garden,440024,0,1
4,14-12-2025,Nagpur,Near Uday nagar NIT garden,440024,0,1
5,03-12-2025,Nagpur,Near Uday nagar NIT garden,440024,0,1
6,01-09-2025,Orissa,Balangir,767061,0,1
7,01-09-2025,Orissa,Baleshwar,756003,1,0
8,01-09-2025,Orissa,Baleshwar,756036,0,4
9,01-09-2025,Orissa,Baleshwar,756046,1,1


In [5]:
# Check for expected columns
expected_columns = ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']
actual_columns = df_raw.columns.tolist()

print("Column Verification:")
print("=" * 50)
for col in expected_columns:
    if col in actual_columns:
        print(f"✓ {col} - Present")
    else:
        print(f"✗ {col} - Missing")

# Check for extra columns
extra_cols = set(actual_columns) - set(expected_columns)
if extra_cols:
    print(f"\nExtra columns found: {list(extra_cols)}")

Column Verification:
✓ date - Present
✓ state - Present
✓ district - Present
✓ pincode - Present
✓ demo_age_5_17 - Present
✓ demo_age_17_ - Present


In [6]:
# Basic statistics
print("Basic Statistics:")
print("=" * 50)
df_raw.describe(include='all')

Basic Statistics:


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
count,2375882,2375882,2375882,2375882.0,2375882.0,2375882.0
unique,106,70,989,,,
top,16-12-2025,Tamil Nadu,North 24 Parganas,,,
freq,28117,232752,14365,,,
mean,,,,526405.85,1.9,19.11
std,,,,196969.26,12.99,110.2
min,,,,100000.0,0.0,0.0
25%,,,,395017.0,0.0,2.0
50%,,,,524316.0,0.0,5.0
75%,,,,691520.0,2.0,14.0


In [7]:
# Check for missing values
print("Missing Values Analysis:")
print("=" * 50)
missing_counts = df_raw.isnull().sum()
missing_percent = (df_raw.isnull().sum() / len(df_raw)) * 100

missing_df = pd.DataFrame({
    'Column': df_raw.columns,
    'Missing Count': missing_counts,
    'Missing %': missing_percent
})
missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing Count', ascending=False)

if len(missing_df) > 0:
    print(missing_df)
else:
    print("✓ No missing values found!")

Missing Values Analysis:


✓ No missing values found!


## 3. Date Standardization

In [8]:
# Create a copy for preprocessing
df = df_raw.copy()

# Check current date format
print("Sample date values before standardization:")
print(df['date'].head(10))
print(f"\nDate data type: {df['date'].dtype}")

Sample date values before standardization:
0    26-12-2025
1    22-12-2025
2    13-12-2025
3    22-12-2025
4    14-12-2025
5    03-12-2025
6    01-09-2025
7    01-09-2025
8    01-09-2025
9    01-09-2025
Name: date, dtype: object

Date data type: object


In [9]:
# Convert date to datetime format
print("Converting dates to standardized format...")

# Try different date formats
def parse_date(date_str):
    try:
        # Try DD-MM-YYYY format first
        return pd.to_datetime(date_str, format='%d-%m-%Y', errors='coerce')
    except:
        try:
            # Try automatic parsing
            return pd.to_datetime(date_str, errors='coerce')
        except:
            return pd.NaT

df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y', errors='coerce')

# Check for date parsing failures
date_nulls = df['date'].isnull().sum()
print(f"\nDate parsing results:")
print(f"Successfully parsed: {len(df) - date_nulls:,} ({((len(df) - date_nulls)/len(df)*100):.2f}%)")
print(f"Failed to parse: {date_nulls:,} ({(date_nulls/len(df)*100):.2f}%)")

# Display sample parsed dates
print("\nSample standardized dates:")
print(df['date'].head(10))

Converting dates to standardized format...



Date parsing results:
Successfully parsed: 2,375,882 (100.00%)
Failed to parse: 0 (0.00%)

Sample standardized dates:
0   2025-12-26
1   2025-12-22
2   2025-12-13
3   2025-12-22
4   2025-12-14
5   2025-12-03
6   2025-09-01
7   2025-09-01
8   2025-09-01
9   2025-09-01
Name: date, dtype: datetime64[ns]


In [10]:
# Extract additional date features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['year_month'] = df['date'].dt.to_period('M')
df['day'] = df['date'].dt.day
df['quarter'] = df['date'].dt.quarter

print("Date features extracted:")
print(f"\nDate range: {df['date'].min()} to {df['date'].max()}")
print(f"\nUnique years: {sorted(df['year'].dropna().unique())}")
print(f"Unique months: {sorted(df['month'].dropna().unique())}")
print(f"\nSample with date features:")
df[['date', 'year', 'month', 'quarter', 'year_month']].head()

Date features extracted:

Date range: 2025-03-01 00:00:00 to 2026-01-03 00:00:00

Unique years: [np.int32(2025), np.int32(2026)]
Unique months: [np.int32(1), np.int32(3), np.int32(4), np.int32(5), np.int32(6), np.int32(7), np.int32(9), np.int32(10), np.int32(11), np.int32(12)]

Sample with date features:


Unnamed: 0,date,year,month,quarter,year_month
0,2025-12-26,2025,12,4,2025-12
1,2025-12-22,2025,12,4,2025-12
2,2025-12-13,2025,12,4,2025-12
3,2025-12-22,2025,12,4,2025-12
4,2025-12-14,2025,12,4,2025-12


## 4. District-Pincode Consistency Checks

In [11]:
# Standardize text columns
print("Standardizing state and district names...")

# Convert to title case and strip whitespace
df['state'] = df['state'].astype(str).str.strip().str.title()
df['district'] = df['district'].astype(str).str.strip().str.title()

# Convert pincode to string and handle invalid values
df['pincode'] = df['pincode'].astype(str).str.strip()

print("✓ Text standardization complete")

Standardizing state and district names...


✓ Text standardization complete


In [12]:
# Unique values check
print("Geographic Data Summary:")
print("=" * 50)
print(f"Unique States: {df['state'].nunique()}")
print(f"Unique Districts: {df['district'].nunique()}")
print(f"Unique Pincodes: {df['pincode'].nunique()}")

print("\nTop 10 States by record count:")
print(df['state'].value_counts().head(10))

Geographic Data Summary:


Unique States: 63


Unique Districts: 967


Unique Pincodes: 19772

Top 10 States by record count:
state
Tamil Nadu        232752
Andhra Pradesh    231487
Uttar Pradesh     196621
West Bengal       190517
Maharashtra       183245
Karnataka         181401
Kerala            121814
Gujarat           110091
Odisha            106845
Rajasthan         105878
Name: count, dtype: int64


In [13]:
# Create district-pincode mapping for consistency check
district_pincode_map = df.groupby('district')['pincode'].apply(lambda x: set(x)).to_dict()

print(f"Total district-pincode mappings: {len(district_pincode_map)}")
print("\nSample district-pincode relationships:")
for i, (district, pincodes) in enumerate(list(district_pincode_map.items())[:5]):
    print(f"{i+1}. {district}: {len(pincodes)} unique pincodes")
    print(f"   Sample pincodes: {list(pincodes)[:5]}")

Total district-pincode mappings: 967

Sample district-pincode relationships:
1. 100000: 1 unique pincodes
   Sample pincodes: ['100000']
2. 561203: 1 unique pincodes
   Sample pincodes: ['561203']
3. 5Th Cross: 1 unique pincodes
   Sample pincodes: ['560078']
4. Adilabad: 49 unique pincodes
   Sample pincodes: ['504272', '504301', '504251', '504307', '504104']
5. Agar Malwa: 6 unique pincodes
   Sample pincodes: ['465550', '465445', '465230', '465441', '465447']


In [14]:
# Check for invalid pincode values
print("Pincode Validation:")
print("=" * 50)

# Valid Indian pincodes are 6-digit numbers
df['pincode_valid'] = df['pincode'].str.match(r'^\d{6}$')

valid_pincodes = df['pincode_valid'].sum()
invalid_pincodes = len(df) - valid_pincodes

print(f"Valid pincodes (6 digits): {valid_pincodes:,} ({(valid_pincodes/len(df)*100):.2f}%)")
print(f"Invalid pincodes: {invalid_pincodes:,} ({(invalid_pincodes/len(df)*100):.2f}%)")

if invalid_pincodes > 0:
    print("\nSample invalid pincodes:")
    print(df[~df['pincode_valid']]['pincode'].value_counts().head(10))

Pincode Validation:


Valid pincodes (6 digits): 2,375,882 (100.00%)
Invalid pincodes: 0 (0.00%)


## 5. Missing Value Handling

In [15]:
# Comprehensive missing value analysis
print("Detailed Missing Value Analysis:")
print("=" * 50)

for col in df.columns:
    null_count = df[col].isnull().sum()
    null_percent = (null_count / len(df)) * 100
    
    if null_count > 0:
        print(f"\n{col}:")
        print(f"  Missing: {null_count:,} ({null_percent:.2f}%)")
        
        # Show sample of rows with missing values
        if null_count <= 10:
            print(f"  Sample rows: {df[df[col].isnull()].index.tolist()[:5]}")

Detailed Missing Value Analysis:


In [16]:
# Handle missing values
print("Handling Missing Values:")
print("=" * 50)

# Remove rows with missing dates (critical field)
rows_before = len(df)
df = df[df['date'].notna()]
rows_removed_date = rows_before - len(df)
print(f"Removed {rows_removed_date:,} rows with missing dates")

# Fill missing demographic values with 0 (reasonable assumption for update counts)
df['demo_age_5_17'] = df['demo_age_5_17'].fillna(0)
df['demo_age_17_'] = df['demo_age_17_'].fillna(0)
print("Filled missing demographic values with 0")

# Handle missing geographic data
df['state'] = df['state'].fillna('Unknown')
df['district'] = df['district'].fillna('Unknown')
df['pincode'] = df['pincode'].fillna('000000')
print("Filled missing geographic data with placeholders")

print(f"\n✓ Final dataset size: {len(df):,} rows")

Handling Missing Values:


Removed 0 rows with missing dates
Filled missing demographic values with 0


Filled missing geographic data with placeholders

✓ Final dataset size: 2,375,882 rows


## 6. Data Type Conversion & Validation

In [17]:
# Convert demographic columns to numeric
print("Converting demographic columns to numeric...")

df['demo_age_5_17'] = pd.to_numeric(df['demo_age_5_17'], errors='coerce').fillna(0).astype(int)
df['demo_age_17_'] = pd.to_numeric(df['demo_age_17_'], errors='coerce').fillna(0).astype(int)

print("✓ Demographic columns converted to integers")

# Display data types
print("\nFinal Data Types:")
print(df.dtypes)

Converting demographic columns to numeric...
✓ Demographic columns converted to integers

Final Data Types:
date             datetime64[ns]
state                    object
district                 object
pincode                  object
demo_age_5_17             int64
demo_age_17_              int64
year                      int32
month                     int32
year_month            period[M]
day                       int32
quarter                   int32
pincode_valid              bool
dtype: object


In [18]:
# Check for negative values (data quality check)
print("Data Quality Checks:")
print("=" * 50)

negative_child = (df['demo_age_5_17'] < 0).sum()
negative_adult = (df['demo_age_17_'] < 0).sum()

print(f"Negative values in demo_age_5_17: {negative_child}")
print(f"Negative values in demo_age_17_: {negative_adult}")

# Remove negative values if any
if negative_child > 0 or negative_adult > 0:
    df = df[(df['demo_age_5_17'] >= 0) & (df['demo_age_17_'] >= 0)]
    print(f"✓ Removed rows with negative values")

print(f"\nFinal clean dataset: {len(df):,} rows")

Data Quality Checks:
Negative values in demo_age_5_17: 0
Negative values in demo_age_17_: 0

Final clean dataset: 2,375,882 rows


## 7. Monthly Aggregation

In [19]:
# Create monthly aggregated dataset
print("Creating monthly aggregated dataset...")

# Aggregate by year_month, state, district, pincode
df_monthly = df.groupby(['year_month', 'state', 'district', 'pincode']).agg({
    'demo_age_5_17': 'sum',
    'demo_age_17_': 'sum',
    'date': 'count'  # Count of records (transactions)
}).reset_index()

df_monthly.rename(columns={'date': 'record_count'}, inplace=True)

print(f"✓ Monthly aggregation complete")
print(f"Original records: {len(df):,}")
print(f"Monthly aggregated records: {len(df_monthly):,}")
print(f"\nSample monthly data:")
df_monthly.head(10)

Creating monthly aggregated dataset...


✓ Monthly aggregation complete
Original records: 2,375,882
Monthly aggregated records: 159,507

Sample monthly data:


Unnamed: 0,year_month,state,district,pincode,demo_age_5_17,demo_age_17_,record_count
0,2025-03,Andaman And Nicobar Islands,Nicobar,744301,16,180,1
1,2025-03,Andaman And Nicobar Islands,North And Middle Andaman,744202,10,201,1
2,2025-03,Andaman And Nicobar Islands,South Andaman,744101,13,76,1
3,2025-03,Andaman And Nicobar Islands,South Andaman,744102,14,64,1
4,2025-03,Andaman And Nicobar Islands,South Andaman,744103,25,161,1
5,2025-03,Andaman And Nicobar Islands,South Andaman,744105,26,171,1
6,2025-03,Andaman And Nicobar Islands,South Andaman,744106,10,42,1
7,2025-03,Andhra Pradesh,Adilabad,504001,30,700,2
8,2025-03,Andhra Pradesh,Adilabad,504101,20,102,2
9,2025-03,Andhra Pradesh,Adilabad,504102,32,142,2


In [20]:
# Create district-level monthly aggregation (for analysis)
df_monthly_district = df.groupby(['year_month', 'state', 'district']).agg({
    'demo_age_5_17': 'sum',
    'demo_age_17_': 'sum',
    'date': 'count',
    'pincode': 'nunique'  # Number of unique pincodes
}).reset_index()

df_monthly_district.rename(columns={
    'date': 'record_count',
    'pincode': 'pincode_count'
}, inplace=True)

print(f"District-level monthly records: {len(df_monthly_district):,}")
print(f"\nSample district monthly data:")
df_monthly_district.head(10)

District-level monthly records: 7,062

Sample district monthly data:


Unnamed: 0,year_month,state,district,demo_age_5_17,demo_age_17_,record_count,pincode_count
0,2025-03,Andaman And Nicobar Islands,Nicobar,16,180,1,1
1,2025-03,Andaman And Nicobar Islands,North And Middle Andaman,10,201,1,1
2,2025-03,Andaman And Nicobar Islands,South Andaman,88,514,5,5
3,2025-03,Andhra Pradesh,Adilabad,457,5481,27,20
4,2025-03,Andhra Pradesh,Alluri Sitharama Raju,408,3291,12,9
5,2025-03,Andhra Pradesh,Anakapalli,272,2061,16,13
6,2025-03,Andhra Pradesh,Anantapur,868,16988,45,38
7,2025-03,Andhra Pradesh,Ananthapur,2143,7027,82,62
8,2025-03,Andhra Pradesh,Ananthapuramu,2001,6327,49,39
9,2025-03,Andhra Pradesh,Annamayya,426,2465,14,11


In [21]:
# Create state-level monthly aggregation
df_monthly_state = df.groupby(['year_month', 'state']).agg({
    'demo_age_5_17': 'sum',
    'demo_age_17_': 'sum',
    'date': 'count',
    'district': 'nunique',
    'pincode': 'nunique'
}).reset_index()

df_monthly_state.rename(columns={
    'date': 'record_count',
    'district': 'district_count',
    'pincode': 'pincode_count'
}, inplace=True)

print(f"State-level monthly records: {len(df_monthly_state):,}")
print(f"\nSample state monthly data:")
df_monthly_state.head(10)

State-level monthly records: 385

Sample state monthly data:


Unnamed: 0,year_month,state,demo_age_5_17,demo_age_17_,record_count,district_count,pincode_count
0,2025-03,Andaman And Nicobar Islands,114,895,7,3,7
1,2025-03,Andhra Pradesh,42308,405702,1431,41,927
2,2025-03,Arunachal Pradesh,860,7342,37,20,32
3,2025-03,Assam,15221,167680,439,32,254
4,2025-03,Bihar,83913,870097,921,38,835
5,2025-03,Chandigarh,1172,7921,19,1,19
6,2025-03,Chhattisgarh,15535,243895,294,34,203
7,2025-03,Dadra And Nagar Haveli,49,593,1,1,1
8,2025-03,Dadra And Nagar Haveli And Daman And Diu,77,969,4,3,4
9,2025-03,Daman And Diu,18,186,1,1,1


## 8. Create Derived Features

In [22]:
# Add derived features to monthly datasets
print("Creating derived features...")

# For pincode-level data
df_monthly['total_updates'] = df_monthly['demo_age_5_17'] + df_monthly['demo_age_17_']
df_monthly['child_update_share'] = (df_monthly['demo_age_5_17'] / 
                                     (df_monthly['total_updates'] + 0.0001)) * 100  # Avoid division by zero

# For district-level data
df_monthly_district['total_updates'] = df_monthly_district['demo_age_5_17'] + df_monthly_district['demo_age_17_']
df_monthly_district['child_update_share'] = (df_monthly_district['demo_age_5_17'] / 
                                              (df_monthly_district['total_updates'] + 0.0001)) * 100

# For state-level data
df_monthly_state['total_updates'] = df_monthly_state['demo_age_5_17'] + df_monthly_state['demo_age_17_']
df_monthly_state['child_update_share'] = (df_monthly_state['demo_age_5_17'] / 
                                           (df_monthly_state['total_updates'] + 0.0001)) * 100

print("✓ Derived features created:")
print("  - total_updates")
print("  - child_update_share")

print("\nSample with derived features:")
df_monthly_district[['year_month', 'state', 'district', 'demo_age_5_17', 
                     'demo_age_17_', 'total_updates', 'child_update_share']].head()

Creating derived features...
✓ Derived features created:
  - total_updates
  - child_update_share

Sample with derived features:


Unnamed: 0,year_month,state,district,demo_age_5_17,demo_age_17_,total_updates,child_update_share
0,2025-03,Andaman And Nicobar Islands,Nicobar,16,180,196,8.16
1,2025-03,Andaman And Nicobar Islands,North And Middle Andaman,10,201,211,4.74
2,2025-03,Andaman And Nicobar Islands,South Andaman,88,514,602,14.62
3,2025-03,Andhra Pradesh,Adilabad,457,5481,5938,7.7
4,2025-03,Andhra Pradesh,Alluri Sitharama Raju,408,3291,3699,11.03


## 9. Final Data Summary & Save

In [23]:
# Final summary statistics
print("=" * 70)
print("FINAL DATA PREPARATION SUMMARY")
print("=" * 70)

print("\n1. RAW DATASET (Daily Records):")
print(f"   Total Records: {len(df):,}")
print(f"   Date Range: {df['date'].min()} to {df['date'].max()}")
print(f"   Unique States: {df['state'].nunique()}")
print(f"   Unique Districts: {df['district'].nunique()}")
print(f"   Unique Pincodes: {df['pincode'].nunique()}")

print("\n2. MONTHLY AGGREGATED DATASETS:")
print(f"   Pincode-level: {len(df_monthly):,} records")
print(f"   District-level: {len(df_monthly_district):,} records")
print(f"   State-level: {len(df_monthly_state):,} records")

print("\n3. DEMOGRAPHIC STATISTICS:")
print(f"   Total child updates (5-17): {df['demo_age_5_17'].sum():,}")
print(f"   Total adult updates (17+): {df['demo_age_17_'].sum():,}")
print(f"   Total updates: {(df['demo_age_5_17'].sum() + df['demo_age_17_'].sum()):,}")

overall_child_share = (df['demo_age_5_17'].sum() / 
                       (df['demo_age_5_17'].sum() + df['demo_age_17_'].sum()) * 100)
print(f"   Overall child update share: {overall_child_share:.2f}%")

print("\n4. DATA QUALITY:")
print(f"   Rows with valid dates: {(~df['date'].isnull()).sum():,}")
print(f"   Rows with valid pincodes: {df['pincode_valid'].sum():,}")
print(f"   Missing values: {df.isnull().sum().sum()}")

FINAL DATA PREPARATION SUMMARY

1. RAW DATASET (Daily Records):
   Total Records: 2,375,882
   Date Range: 2025-03-01 00:00:00 to 2026-01-03 00:00:00
   Unique States: 63


   Unique Districts: 967
   Unique Pincodes: 19772

2. MONTHLY AGGREGATED DATASETS:
   Pincode-level: 159,507 records
   District-level: 7,062 records
   State-level: 385 records

3. DEMOGRAPHIC STATISTICS:
   Total child updates (5-17): 4,525,272
   Total adult updates (17+): 45,407,009
   Total updates: 49,932,281
   Overall child update share: 9.06%

4. DATA QUALITY:
   Rows with valid dates: 2,375,882
   Rows with valid pincodes: 2,375,882


   Missing values: 0


In [24]:
# Save processed datasets
print("\nSaving processed datasets...")

output_dir = '../outputs/'
os.makedirs(output_dir, exist_ok=True)

# Save as CSV
df.to_csv(f'{output_dir}df_clean_daily.csv', index=False)
df_monthly.to_csv(f'{output_dir}df_monthly_pincode.csv', index=False)
df_monthly_district.to_csv(f'{output_dir}df_monthly_district.csv', index=False)
df_monthly_state.to_csv(f'{output_dir}df_monthly_state.csv', index=False)

print("✓ Saved CSV files:")
print(f"  - df_clean_daily.csv ({len(df):,} rows)")
print(f"  - df_monthly_pincode.csv ({len(df_monthly):,} rows)")
print(f"  - df_monthly_district.csv ({len(df_monthly_district):,} rows)")
print(f"  - df_monthly_state.csv ({len(df_monthly_state):,} rows)")

# Also save as pickle for faster loading
df.to_pickle(f'{output_dir}df_clean_daily.pkl')
df_monthly.to_pickle(f'{output_dir}df_monthly_pincode.pkl')
df_monthly_district.to_pickle(f'{output_dir}df_monthly_district.pkl')
df_monthly_state.to_pickle(f'{output_dir}df_monthly_state.pkl')

print("\n✓ Saved pickle files for faster loading")


Saving processed datasets...


✓ Saved CSV files:
  - df_clean_daily.csv (2,375,882 rows)
  - df_monthly_pincode.csv (159,507 rows)
  - df_monthly_district.csv (7,062 rows)
  - df_monthly_state.csv (385 rows)



✓ Saved pickle files for faster loading


In [25]:
# Display sample of final cleaned data
print("\nSample of cleaned daily data:")
display(df[['date', 'year_month', 'state', 'district', 'pincode', 
            'demo_age_5_17', 'demo_age_17_']].head(10))

print("\nSample of monthly district data:")
display(df_monthly_district[['year_month', 'state', 'district', 
                             'demo_age_5_17', 'demo_age_17_', 
                             'total_updates', 'child_update_share']].head(10))


Sample of cleaned daily data:


Unnamed: 0,date,year_month,state,district,pincode,demo_age_5_17,demo_age_17_
0,2025-12-26,2025-12,Jaipur,Near Meera Hospital,302016,0,1
1,2025-12-22,2025-12,Jaipur,Near Meera Hospital,302016,0,1
2,2025-12-13,2025-12,Nagpur,Near Uday Nagar Nit Garden,440024,0,1
3,2025-12-22,2025-12,Nagpur,Near Uday Nagar Nit Garden,440024,0,1
4,2025-12-14,2025-12,Nagpur,Near Uday Nagar Nit Garden,440024,0,1
5,2025-12-03,2025-12,Nagpur,Near Uday Nagar Nit Garden,440024,0,1
6,2025-09-01,2025-09,Orissa,Balangir,767061,0,1
7,2025-09-01,2025-09,Orissa,Baleshwar,756003,1,0
8,2025-09-01,2025-09,Orissa,Baleshwar,756036,0,4
9,2025-09-01,2025-09,Orissa,Baleshwar,756046,1,1



Sample of monthly district data:


Unnamed: 0,year_month,state,district,demo_age_5_17,demo_age_17_,total_updates,child_update_share
0,2025-03,Andaman And Nicobar Islands,Nicobar,16,180,196,8.16
1,2025-03,Andaman And Nicobar Islands,North And Middle Andaman,10,201,211,4.74
2,2025-03,Andaman And Nicobar Islands,South Andaman,88,514,602,14.62
3,2025-03,Andhra Pradesh,Adilabad,457,5481,5938,7.7
4,2025-03,Andhra Pradesh,Alluri Sitharama Raju,408,3291,3699,11.03
5,2025-03,Andhra Pradesh,Anakapalli,272,2061,2333,11.66
6,2025-03,Andhra Pradesh,Anantapur,868,16988,17856,4.86
7,2025-03,Andhra Pradesh,Ananthapur,2143,7027,9170,23.37
8,2025-03,Andhra Pradesh,Ananthapuramu,2001,6327,8328,24.03
9,2025-03,Andhra Pradesh,Annamayya,426,2465,2891,14.74


## ✅ Data Preparation Complete!

### Summary of Accomplishments:

1. ✓ **Loaded all CSV files** from DemographicData folder
2. ✓ **Verified data structure** - confirmed all expected columns present
3. ✓ **Date standardization** - converted to datetime format and extracted features
4. ✓ **District-pincode consistency** - standardized text, validated pincodes
5. ✓ **Missing value handling** - filled/removed as appropriate
6. ✓ **Monthly aggregation** - created pincode, district, and state-level datasets
7. ✓ **Derived features** - calculated total_updates and child_update_share

### Next Steps:
- Proceed to univariate analysis
- Begin exploratory visualization
- Calculate baseline statistics