In [14]:
import pandas as pd

# Load data
df = pd.read_csv('../data/raw/dashboard_ready_data.csv')

# Handle missing values (e.g., fill zeros or drop if needed; your data has many zeros which are valid)
df.fillna(0, inplace=True)  # Zeros indicate no reports

# Standardize names (title case)
df['state_name'] = df['state_name'].str.title()
df['district_name'] = df['district_name'].str.title()

# Remove duplicates (based on unique keys)
df.drop_duplicates(subset=['year', 'state_name', 'district_name', 'registration_circles'], inplace=True)

# Create new features (assuming no date column; if adding from new data, parse dates)
# Example: is_minor (age <18), is_female
df['total_minors'] = df[['male_below_5_years', 'male_5_to_14_years', 'male_14_to_18_years',
                         'female_below_5_years', 'female_5_to_14_years', 'female_14_to_18_years']].sum(axis=1)
df['is_high_minor'] = df['total_minors'] > df['total_missing'] * 0.5  # Flag if >50% minors

df['total_females'] = df.filter(like='female_').sum(axis=1)
df['is_high_female'] = df['total_females'] > df['total_missing'] * 0.5

# If adding dates from new data: df['missing_date'] = pd.to_datetime(df['missing_date'])
# df['days_since_missing'] = (pd.Timestamp.now() - df['missing_date']).dt.days
# df['season'] = df['missing_date'].dt.month % 12 // 3 + 1  # 1=Winter, 2=Spring, etc.

# Save cleaned
df.to_csv('../data/cleaned/cleaned_missing_data.csv', index=False)

In [16]:
df.head()


Unnamed: 0,id,year,state_name,state_code,district_name,district_code,registration_circles,male_below_5_years,male_5_to_14_years,male_14_to_18_years,...,trangender_14_to_18_years,trangender_18_to_30_years,trangender_30_to_45_years,trangender_45_to_60_years,transgender_60_years_and_above,total_missing,total_minors,is_high_minor,total_females,is_high_female
0,0,2017,Andhra Pradesh,28,Anantapur,502,Anantapur,1,26.0,34.0,...,0.0,0.0,0.0,0.0,0.0,795.0,297.0,False,581.0,True
1,1,2017,Andhra Pradesh,28,Chittoor,503,Chittoor,0,15.0,12.0,...,0.0,0.0,0.0,0.0,0.0,320.0,121.0,False,233.0,True
2,2,2017,Andhra Pradesh,28,Y.S.R.,504,Cuddapah,6,18.0,11.0,...,1.0,2.0,0.0,0.0,0.0,330.0,95.0,False,209.0,True
3,3,2017,Andhra Pradesh,28,East Godavari,505,East Godavari,0,0.0,49.0,...,0.0,0.0,0.0,0.0,0.0,668.0,167.0,False,397.0,True
4,4,2017,Andhra Pradesh,28,Anantapur,502,Guntakal Railway,0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,12.0,1.0,False,3.0,False
