In [None]:
# Import all the required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Default figure size (width, height) applied to every plot created below.
plt.rc('figure', figsize=(10, 6))

In [None]:
# Load every CSV part and stack them into one frame.

file_paths = ["1st.csv", "2nd.csv", "3rd.csv"]

# One DataFrame per file, in the same order as file_paths.
df_list = list(map(pd.read_csv, file_paths))

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(df_list, ignore_index=True)


In [None]:
# (rows, columns) of the combined frame.
df.shape

(189671, 7)

In [None]:
# Schema summary: df.info() prints to stdout and returns None, so it can run
# anywhere in the cell.
df.info()

# Keep the preview as the cell's last expression. A bare df.head() placed above
# another statement is evaluated and then discarded, so it was never shown.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189671 entries, 0 to 189670
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   date            189671 non-null  object 
 1   state           189671 non-null  object 
 2   district        189671 non-null  object 
 3   pincode         189669 non-null  float64
 4   age_0_5         189669 non-null  float64
 5   age_5_17        189669 non-null  float64
 6   age_18_greater  189669 non-null  float64
dtypes: float64(4), object(3)
memory usage: 10.1+ MB


In [None]:
# List the column names of each loaded file (numbered from 1) so the schemas
# can be compared by eye.
for i, frame in enumerate(df_list, start=1):
    column_names = frame.columns.tolist()
    print(f"File {i} columns:", column_names)


File 1 columns: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']
File 2 columns: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']
File 3 columns: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']


In [None]:
# Convert the 'date' column to datetimes. dayfirst=True makes pandas read
# ambiguous dates as day-before-month.
parsed_dates = pd.to_datetime(df['date'], dayfirst=True)
df['date'] = parsed_dates

In [None]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0,0
date,0
state,0
district,0
pincode,2
age_0_5,2
age_5_17,2
age_18_greater,2


In [None]:
# Replace every missing value in the frame with 0, then confirm that no nulls
# remain in any column.
df = df.fillna(0)
df.isna().sum()

Unnamed: 0,0
date,0
state,0
district,0
pincode,0
age_0_5,0
age_5_17,0
age_18_greater,0


In [None]:
# Total enrolments per row = sum of the three age-band counts.
df['total_enrolments'] = (
    df['age_0_5']
    .add(df['age_5_17'])
    .add(df['age_18_greater'])
)


In [None]:
# Column dtypes after date parsing and the added total_enrolments column.
df.dtypes

Unnamed: 0,0
date,datetime64[ns]
state,object
district,object
pincode,float64
age_0_5,float64
age_5_17,float64
age_18_greater,float64
total_enrolments,float64


In [None]:
# Adult enrolments require biometric capture, hence higher operational weight

# Per-age-band weight applied to each count column.
load_weights = {
    'age_0_5': 1.2,
    'age_5_17': 1.0,
    'age_18_greater': 1.5,
}

# Weighted terms are built and added in the same column order as the mapping.
weighted_terms = [df[column] * weight for column, weight in load_weights.items()]
df['age_weighted_load'] = weighted_terms[0] + weighted_terms[1] + weighted_terms[2]

In [None]:
# Feature table: one row per (state, district, month-end) holding the summed
# enrolments and the summed weighted load.
month_end = pd.Grouper(key='date', freq='ME')
value_columns = ['total_enrolments', 'age_weighted_load']

monthly = (
    df.groupby(['state', 'district', month_end])[value_columns]
    .sum()
    .reset_index()
)


In [None]:
# Feature: relative change in total enrolments versus the previous row of the
# same (state, district) group.
#
# pct_change() is NaN on a group's first row and +/-inf when the previous total
# is 0. Both cases are mapped to 0. The infinities must not survive because the
# IsolationForest fit later in this notebook rejects non-finite input with a
# ValueError.
monthly['growth_rate'] = (
    monthly
    .groupby(['state', 'district'])['total_enrolments']
    .pct_change()
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)


In [None]:
# Feature: standard deviation of total enrolments within each
# (state, district) group, broadcast back onto every row of that group.
enrolments_by_district = monthly.groupby(['state', 'district'])['total_enrolments']
monthly['volatility'] = enrolments_by_district.transform('std')


In [None]:
# Anomaly detection with an Isolation Forest on enrolment volume and growth.

from sklearn.ensemble import IsolationForest

anomaly_inputs = monthly[['total_enrolments', 'growth_rate']]

# contamination=0.04 sets the expected share of outliers; random_state fixes
# the seed so repeated runs give the same labels.
iso = IsolationForest(contamination=0.04, random_state=42)

# fit_predict labels outliers as -1 and inliers as 1.
raw_labels = pd.Series(iso.fit_predict(anomaly_inputs), index=monthly.index)

# Recode to a 0/1 flag: 1 = anomaly, 0 = normal.
monthly['anomaly'] = raw_labels.map({-1: 1, 1: 0})


In [None]:
# Write the engineered features to a CSV file for further processing.
monthly.to_csv("assi_features.csv", index=False)

# Preview the exported columns. This has to be the cell's last expression:
# placed above to_csv(), the result was evaluated and then discarded.
monthly[['state', 'district', 'date',
         'total_enrolments',
         'age_weighted_load',
         'growth_rate',
         'volatility',
         'anomaly']].head()


In [None]:
# Reload the exported feature table from disk.
data = pd.read_csv("assi_features.csv")

# Preview the frame that was just loaded. The cell previously displayed
# df.head(), which is the raw row-level frame, not the features read here.
data.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,total_enrolments,age_weighted_load
0,2025-03-02,Meghalaya,East Khasi Hills,793121.0,11.0,61.0,37.0,109.0,129.7
1,2025-03-09,Karnataka,Bengaluru Urban,560043.0,14.0,33.0,39.0,86.0,108.3
2,2025-03-09,Uttar Pradesh,Kanpur Nagar,208001.0,29.0,82.0,12.0,123.0,134.8
3,2025-03-09,Uttar Pradesh,Aligarh,202133.0,62.0,29.0,15.0,106.0,125.9
4,2025-03-09,Karnataka,Bengaluru Urban,560016.0,14.0,16.0,21.0,51.0,64.3
