In [1]:
# Cell 1: Imports
import pandas as pd
import numpy as np
import warnings
import os
import joblib  # For saving scalers and encoders
import json    # For saving model metadata

from sklearn.preprocessing import StandardScaler, LabelEncoder

# Suppress warnings
warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None

print("Setup complete. Libraries loaded.")

Setup complete. Libraries loaded.


In [2]:
# Cell 2: Constants and File Paths (Corrected)
# --- File Paths ---
RAW_DATA_FILE = 'cleaned_aadhaar_dataset.csv'
ARTIFACTS_DIR = 'artifacts' # This is the folder where all outputs will be saved

# --- Preprocessing Parameters ---
K_DISTRICT = 300  # Keep the Top 300 most common districts
K_PINCODE = 500   # Keep the Top 500 most common pincodes
OTHER_TOKEN = '<OTHER>'

# --- Feature Definitions (Corrected) ---
# We split categories into two types:
STRING_CATEGORICAL_FEATURES = ['state', 'district_topK', 'pincode_topK']
INTEGER_CATEGORICAL_FEATURES = ['month', 'day_of_week']
NUMERICAL_FEATURES = [
    'age_0_5', 'age_5_17', 'age_18_greater',
    'child_ratio', 'adult_ratio', 'dependent_ratio',
    'total_enrollments', 'z_score_state',
    'z_score_rolling', 'enrollment_volatility'
]

# --- Target Definitions ---
TARGET_TASK1 = 'is_anomaly'
TARGET_TASK2 = 'target_7d'
TARGET_TASK3 = 'high_inequality'

# Ensure the artifacts directory exists
os.makedirs(ARTIFACTS_DIR, exist_ok=True)
print(f"Artifacts will be saved to: {ARTIFACTS_DIR}/")
print(f"Top-K for District: {K_DISTRICT}")
print(f"Top-K for Pincode: {K_PINCODE}")

Artifacts will be saved to: artifacts/
Top-K for District: 300
Top-K for Pincode: 500


In [3]:
# Cell 3: Load Raw Data
print(f"Loading raw data: {RAW_DATA_FILE}...")
try:
    df = pd.read_csv('/home/vulcan/Abhay/Projects/ADA/Dataset/cleaned_aadhaar_dataset.csv')
except FileNotFoundError:
    print(f"Error: '{RAW_DATA_FILE}' not found.")
    raise

df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['state', 'district', 'pincode', 'date']).reset_index(drop=True)
print(f"✓ Data loaded successfully: {df.shape}")

Loading raw data: cleaned_aadhaar_dataset.csv...
✓ Data loaded successfully: (219091, 7)


In [4]:
# Cell 4: Feature Engineering (Prerequisites + Model Features)
print("Running feature engineering...")

# --- 1. Prerequisites for Targets ---
df['total_enrollments'] = df['age_0_5'] + df['age_5_17'] + df['age_18_greater']

group_cols = ['state', 'district', 'pincode']
df['total_enrollments_rolling_mean_7d'] = df.groupby(group_cols)['total_enrollments'].transform(
    lambda x: x.rolling(7, min_periods=1).mean()
)
df['total_enrollments_rolling_std_7d'] = df.groupby(group_cols)['total_enrollments'].transform(
    lambda x: x.rolling(7, min_periods=1).std()
)
state_stats = df.groupby(['state', 'date']).agg({'total_enrollments': ['mean', 'std']}).reset_index()
state_stats.columns = ['state', 'date', 'state_mean', 'state_std']
df = df.merge(state_stats, on=['state', 'date'], how='left')
df['z_score_state'] = (df['total_enrollments'] - df['state_mean']) / (df['state_std'] + 1e-10)
df['z_score_rolling'] = (df['total_enrollments'] - df['total_enrollments_rolling_mean_7d']) / (df['total_enrollments_rolling_std_7d'] + 1e-10)
df['enrollment_volatility'] = df['total_enrollments_rolling_std_7d'] / (df['total_enrollments_rolling_mean_7d'] + 1e-10)

# --- 2. Features for the TabTransformer Model ---
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.dayofweek
df['child_ratio'] = df['age_0_5'] / (df['total_enrollments'] + 1e-10)
df['adult_ratio'] = df['age_18_greater'] / (df['total_enrollments'] + 1e-10)
df['dependent_ratio'] = (df['age_0_5'] + df['age_5_17']) / (df['age_18_greater'] + 1e-10)

# --- 3. Sanitize ---
df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(0) # Fill NaNs from rolling/lags
print("✓ All prerequisite and model features created.")

Running feature engineering...
✓ All prerequisite and model features created.


In [5]:
# Cell 5: Target Variable Creation
print("Creating target variables (leak-free)...")

# Compute train/test cutoff BEFORE creating targets
TRAIN_TEST_CUTOFF = df['date'].quantile(0.8)
train_mask = df['date'] <= TRAIN_TEST_CUTOFF
print(f"  Train/Test cutoff date: {TRAIN_TEST_CUTOFF.date()}")

# === Task 1: Anomaly Detection ===
volatility_threshold = df.loc[train_mask, 'enrollment_volatility'].quantile(0.95)
df[TARGET_TASK1] = (
    (abs(df['z_score_rolling']) > 2) |
    (abs(df['z_score_state']) > 2.5) |
    (df['enrollment_volatility'] > volatility_threshold)
).astype(int)

# === Task 2: 7-Day Forecasting ===
df[TARGET_TASK2] = df.groupby(['state', 'district', 'pincode'])['total_enrollments'].shift(-7)

# === Task 3: Spatial Inequality ===
threshold_inequality = df.loc[train_mask, 'z_score_state'].quantile(0.90)
df[TARGET_TASK3] = (df['z_score_state'] > threshold_inequality).astype(int)
print("✓ Target variables are now in the DataFrame.")

Creating target variables (leak-free)...
  Train/Test cutoff date: 2025-09-25
✓ Target variables are now in the DataFrame.


In [6]:
# Cell 6: Train/Test Split
print("Splitting data into train_df and test_df...")
train_df = df[train_mask].copy()
test_df = df[~train_mask].copy()
print(f"  Train set shape: {train_df.shape}")
print(f"  Test set shape:  {test_df.shape}")

Splitting data into train_df and test_df...
  Train set shape: (179423, 23)
  Test set shape:  (39668, 23)


In [7]:
# Cell 7: Cardinality Reduction (The "Top-K" Fix)
print("Applying Top-K cardinality reduction...")
def apply_top_k(train_series, test_series, k, other_token=OTHER_TOKEN):
    top_k_values = train_series.value_counts().index[:k].tolist()
    train_mapped = train_series.apply(lambda x: x if x in top_k_values else other_token)
    test_mapped = test_series.apply(lambda x: x if x in top_k_values else other_token)
    return train_mapped, test_mapped

train_df['district_topK'], test_df['district_topK'] = apply_top_k(
    train_df['district'], test_df['district'], K_DISTRICT
)
train_df['pincode_topK'], test_df['pincode_topK'] = apply_top_k(
    train_df['pincode'], test_df['pincode'], K_PINCODE
)
print("✓ Cardinality reduction complete.")

Applying Top-K cardinality reduction...
✓ Cardinality reduction complete.


In [8]:
# Cell 8: Preprocessing (Label Encoders & Standard Scalers) (Corrected for JSON)
print("Fitting and applying encoders and scalers...")

# We will save all fitted objects
scalers = {}
encoders = {}
cardinalities = {} # Metadata for the model architecture

# --- 1. Numerical Features ---
print("  Fitting StandardScaler...")
scaler = StandardScaler()

# Fit ONLY on the training data
train_df[NUMERICAL_FEATURES] = scaler.fit_transform(train_df[NUMERICAL_FEATURES])

# Transform the test data using the *same* scaler
test_df[NUMERICAL_FEATURES] = scaler.transform(test_df[NUMERICAL_FEATURES])

# Save the fitted scaler
scalers['numerical'] = scaler
joblib.dump(scaler, os.path.join(ARTIFACTS_DIR, 'scaler.joblib'))
print("  ✓ Numerical features scaled. Scaler saved.")


# --- 2. String Categorical Features ---
print("  Fitting LabelEncoders for string features...")
for col in STRING_CATEGORICAL_FEATURES:
    encoder = LabelEncoder()
    
    unique_vals = train_df[col].unique().tolist()
    if OTHER_TOKEN not in unique_vals:
        unique_vals.append(OTHER_TOKEN)
        
    encoder.fit(unique_vals)
    
    # Transform both train and test
    train_df[col] = encoder.transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])
    
    # Save the encoder and its cardinality (number of unique values)
    encoders[col] = encoder
    cardinalities[col] = len(encoder.classes_) # len() returns a standard int
    
joblib.dump(encoders, os.path.join(ARTIFACTS_DIR, 'encoders.joblib'))
print(f"  ✓ String categorical features encoded. Encoders saved.")

# --- 3. Integer Categorical Features (FIXED) ---
print("  Calculating cardinalities for integer features...")
for col in INTEGER_CATEGORICAL_FEATURES:
    max_val = df[col].max() 
    
    # --- THE FIX: Cast to standard Python int ---
    cardinalities[col] = int(max_val) + 1 
    # ---
    
# Save the cardinalities as JSON for our PyTorch model
with open(os.path.join(ARTIFACTS_DIR, 'cardinalities.json'), 'w') as f:
    json.dump(cardinalities, f, indent=4)
    
print(f"  ✓ Cardinalities saved to JSON: {cardinalities}")
print("✓ Preprocessing complete.")

Fitting and applying encoders and scalers...
  Fitting StandardScaler...
  ✓ Numerical features scaled. Scaler saved.
  Fitting LabelEncoders for string features...
  ✓ String categorical features encoded. Encoders saved.
  Calculating cardinalities for integer features...
  ✓ Cardinalities saved to JSON: {'state': 37, 'district_topK': 301, 'pincode_topK': 501, 'month': 11, 'day_of_week': 7}
✓ Preprocessing complete.


In [9]:
# Cell 9: Save Final Processed Data (Corrected)
print("Saving final processed dataframes...")

# Define the final columns needed for the model and for targeting
FINAL_COLS_TO_SAVE = (
    STRING_CATEGORICAL_FEATURES +
    INTEGER_CATEGORICAL_FEATURES +
    NUMERICAL_FEATURES +
    [TARGET_TASK1, TARGET_TASK2, TARGET_TASK3]
)

# Save as Parquet for efficiency and type preservation
train_df[FINAL_COLS_TO_SAVE].to_parquet(
    os.path.join(ARTIFACTS_DIR, 'train_processed.parquet'), 
    index=False
)
test_df[FINAL_COLS_TO_SAVE].to_parquet(
    os.path.join(ARTIFACTS_DIR, 'test_processed.parquet'), 
    index=False
)

print(f"✓ 'train_processed.parquet' and 'test_processed.parquet' saved to '{ARTIFACTS_DIR}'")
print("\nData preparation for TabTransformer is complete!")
print("You can now run the 'Advanced Training' notebook.")

Saving final processed dataframes...
✓ 'train_processed.parquet' and 'test_processed.parquet' saved to 'artifacts'

Data preparation for TabTransformer is complete!
You can now run the 'Advanced Training' notebook.
