# Notebook with Code to Produce the Plots in Issues [3](https://messenger.substack.com/p/thinking-in-public-the-gender-polyconflict) and [4](https://messenger.substack.com/p/thinking-in-public-the-gender-polyconflict-6bf)

## Some Background Info:
All of the plots come from one Census dataset: the B-01 dataset. Below is some information about this dataset.

### 1. Official description of the B-01 files
Main workers, marginal workers, non-workers and those marginal workers, non-workers seeking/available for work classified by age and sex.

### 2. `Population` Definition
```
Population = Main Workers + Marginal Workers (Worked for < 3 months)
    + Marginal Workers (Worked for 3-6 months) + Non-workers
```

### 3. Data Granularity:
- Caste: Available
- District: Available
- Gender: Available
- Age: Available

In [None]:
!pip install --quiet gspread google-auth google-api-python-client plotly \
  pandas==2.2.2 google-auth==2.38.0 openpyxl xlrd geopandas folium ipywidgets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.6 MB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m1.4/1.6 MB[0m [31m20.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m21.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Import apis
from google.colab import auth
from google.auth import default
from googleapiclient.discovery import build
import gspread
from google.colab import drive
from googleapiclient.http import MediaIoBaseDownload, MediaFileUpload
import pandas as pd
import io
import geopandas as gpd
from IPython.display import clear_output, display, HTML, Markdown
import ipywidgets as widgets
from ipywidgets import interactive
import folium
from folium import Choropleth
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
from contextlib import contextmanager
from datetime import datetime
import os
import itertools
import time
import re
from shapely.geometry import Point
from pyproj import Transformer

# Part 1: Organising the data

## Util Functions

In [None]:
# Look up a Drive file's ID by exact name inside a folder (handles pagination)
def get_file_id(folder_id, file_name):
  """Return the ID of the first file in a Drive folder with exactly this name.

  Uses the module-level `drive_service` client, which must be built before
  this function is called.

  Args:
    folder_id: ID of the Drive folder to search in.
    file_name: Exact name of the file to look for.

  Returns:
    The `id` field of the first file whose name equals `file_name`.

  Raises:
    FileNotFoundError: If no page of results contains a matching file.
  """
  # The query wraps the name in single quotes, so a backslash or single quote
  # inside the name must be escaped; otherwise the query string is malformed.
  escaped_name = file_name.replace("\\", "\\\\").replace("'", "\\'")
  query = (f"'{folder_id}' in parents and name = '{escaped_name}' "
           "and trashed=false")
  page_token = None
  while True:
    response = drive_service.files().list(
      q=query,
      fields="nextPageToken, files(id, name)",
      pageSize=100,  # Fetch up to 100 files per page
      pageToken=page_token
    ).execute()

    # Re-check the name locally and return the first exact match
    for entry in response.get('files', []):
      if entry["name"] == file_name:
        return entry["id"]

    # Keep paging until the API stops handing back a continuation token
    page_token = response.get('nextPageToken')
    if not page_token:
      break

  raise FileNotFoundError(
    f"No file named '{file_name}' found in folder {folder_id}")

In [None]:
# Load the first worksheet of a Google Sheet into a Pandas DataFrame
def read_gsheet_from_drive(file_id):
  """Return the first worksheet of the given Google Sheet as a DataFrame.

  Uses the module-level `gc` gspread client.

  Args:
    file_id: Key (Drive file ID) of the spreadsheet to open.

  Returns:
    A DataFrame built from the worksheet's records.
  """
  spreadsheet = gc.open_by_key(file_id)
  first_worksheet = spreadsheet.sheet1  # Data is in the first sheet
  return pd.DataFrame(first_worksheet.get_all_records())

In [None]:
# Context manager that silences Pandas' SettingWithCopyWarning
@contextmanager
def suppress_settingwithcopy_warning():
    """Ignore `SettingWithCopyWarning` for the duration of a `with` block.

    The warning filters in force before the block are restored on exit.
    """
    ignored_category = pd.errors.SettingWithCopyWarning
    with warnings.catch_warnings():
        warnings.simplefilter(action="ignore", category=ignored_category)
        yield

## Read the data in

In [None]:
# Where the cleaned B-01 sheets live on Drive
folder_id = (
  "1N7hefMlzIodrufTz7xHGM1boKv0Wvozq"  # replace with the folder containing cleaned B-01 data
)
file_name_total = "B-01_caste_total"
file_name_sc = "B-01_caste_sc"
file_name_st = "B-01_caste_st"

# Authenticate once, then build the Drive and Sheets clients from the same
# credentials; the helper functions above use `drive_service` and `gc`
auth.authenticate_user()
creds, _ = default()
drive_service = build('drive', 'v3', credentials=creds)
drive.mount('/content/drive')
gc = gspread.authorize(creds)

# Resolve each sheet by name and load it as a pandas data frame
df_total, df_sc, df_st = [
  read_gsheet_from_drive(get_file_id(folder_id, sheet_name))
  for sheet_name in (file_name_total, file_name_sc, file_name_st)
]

# Show each data frame under its heading
for heading, frame in (
  ("Aggregated Data:", df_total),
  ("SC Data:", df_sc),
  ("ST Data:", df_st),
):
  print(heading)
  display(frame)

Mounted at /content/drive
Aggregated Data:


Unnamed: 0,Table Name,State Code,District Code,Area Name,Total/Rural/Urban,Age-Group,Population: Total,Population: Males,Population: Females,Main Workers: Total,...,Marginal workers - Worked for 3 to 6 months: Females,Marginal workers - Seeking/available for work: Total,Marginal workers - Seeking/available for work: Males,Marginal workers - Seeking/available for work: Females,Non-workers - All: Total,Non-workers - All: Males,Non-workers - All: Females,Non-workers - Seeking/availabe for work: Total,Non-workers - Seeking/availabe for work: Males,Non-workers - Seeking/availabe for work: Females
0,B0101,19,0,State - WEST BENGAL (19),Total,5 to 9,8247809,4216763,4031046,45657,...,23831,25860,13766,12094,8140029,4158175,3981854,180228,93244,86984
1,B0101,19,0,State - WEST BENGAL (19),Total,10 to 14,9156523,4677506,4479017,188618,...,75041,140624,85811,54813,8714211,4395931,4318280,728959,385668,343291
2,B0101,19,0,State - WEST BENGAL (19),Total,15 to 19,9058031,4702325,4355706,1196710,...,272472,775592,521418,254174,6833111,3071115,3761996,3040410,1450284,1590126
3,B0101,19,0,State - WEST BENGAL (19),Total,20 to 24,8758322,4422630,4335692,2584570,...,434285,1114561,724752,389809,4732044,1383712,3348332,2795414,997620,1797794
4,B0101,19,0,State - WEST BENGAL (19),Total,25 to 29,7997909,4044904,3953005,3341148,...,438469,975024,594353,380671,3360340,512821,2847519,1833468,388728,1444740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32360,B0101,35,640,District - South Andaman (03),Urban,70 to 79,1766,982,784,204,...,6,2,1,1,1531,778,753,24,11,13
32361,B0101,35,640,District - South Andaman (03),Urban,80+,592,274,318,85,...,2,4,3,1,495,200,295,8,2,6
32362,B0101,35,640,District - South Andaman (03),Urban,Age not stated,188,94,94,57,...,2,7,6,1,122,43,79,17,4,13
32363,B0101,35,640,District - South Andaman (03),Urban,15 to 59,100409,54145,46264,50923,...,1170,2728,1864,864,45482,10354,35128,15187,4641,10546


SC Data:


Unnamed: 0,Table Name,State Code,District Code,Area Name,Total/Rural/Urban,Age-Group,Population: Total,Population: Males,Population: Females,Main Workers: Total,...,Marginal workers - Worked for 3 to 6 months: Females,Marginal workers - Seeking/available for work: Total,Marginal workers - Seeking/available for work: Males,Marginal workers - Seeking/available for work: Females,Non-workers - All: Total,Non-workers - All: Males,Non-workers - All: Females,Non-workers - Seeking/availabe for work: Total,Non-workers - Seeking/availabe for work: Males,Non-workers - Seeking/availabe for work: Females
0,B0201SC,19,0,State - WEST BENGAL (19),Total,5 to 14,4227264,2159561,2067703,51405,...,22967,40173,23646,16527,4100601,2082623,2017978,226592,118050,108542
1,B0201SC,19,0,State - WEST BENGAL (19),Total,15 to 34,7889186,4061438,3827748,2553919,...,443655,1025451,629442,396009,3979288,1175589,2803699,2058761,682766,1375995
2,B0201SC,19,0,State - WEST BENGAL (19),Total,35 to 59,5920736,3080517,2840219,3018932,...,392303,622861,339093,283768,1911343,123892,1787451,611754,44521,567233
3,B0201SC,19,0,State - WEST BENGAL (19),Total,60+,1636703,789016,847687,428201,...,41455,58458,37799,20659,1054811,331870,722941,72827,21786,51041
4,B0201SC,19,0,State - WEST BENGAL (19),Total,Age not stated,15829,8456,7373,3801,...,671,1092,666,426,10033,4271,5762,1248,468,780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13404,B0201SC,28,554,District - Chittoor (23),Urban,15 to 34,55182,26449,28733,17967,...,1085,1628,1032,596,33907,11539,22368,6378,2937,3441
13405,B0201SC,28,554,District - Chittoor (23),Urban,35 to 59,37187,18853,18334,21777,...,1049,1247,797,450,12234,1207,11027,873,173,700
13406,B0201SC,28,554,District - Chittoor (23),Urban,60+,8786,4047,4739,2111,...,148,127,79,48,6190,2255,3935,157,59,98
13407,B0201SC,28,554,District - Chittoor (23),Urban,Age not stated,1430,689,741,399,...,27,23,17,6,927,358,569,41,16,25


ST Data:


Unnamed: 0,Table Name,State Code,District Code,Area Name,Total/Rural/Urban,Age-Group,Population: Total,Population: Males,Population: Females,Main Workers: Total,...,Marginal workers - Worked for 3 to 6 months: Females,Marginal workers - Seeking/available for work: Total,Marginal workers - Seeking/available for work: Males,Marginal workers - Seeking/available for work: Females,Non-workers - All: Total,Non-workers - All: Males,Non-workers - All: Females,Non-workers - Seeking/availabe for work: Total,Non-workers - Seeking/availabe for work: Males,Non-workers - Seeking/availabe for work: Females
0,B0301ST,19,0,State - WEST BENGAL (19),Total,5 to 14,1132837,576065,556772,16148,...,10058,17319,9050,8269,1084872,550513,534359,57014,29126,27888
1,B0301ST,19,0,State - WEST BENGAL (19),Total,15 to 34,1916773,958823,957950,632671,...,216488,423583,217632,205951,724865,268374,456491,363654,141234,222420
2,B0301ST,19,0,State - WEST BENGAL (19),Total,35 to 59,1396927,698503,698424,718215,...,176841,267515,125946,141569,271462,39058,232404,89265,14653,74612
3,B0301ST,19,0,State - WEST BENGAL (19),Total,60+,368959,173237,195722,83771,...,18046,24994,14274,10720,226920,79360,147560,15629,5481,10148
4,B0301ST,19,0,State - WEST BENGAL (19),Total,Age not stated,4338,2293,2045,966,...,330,442,240,202,2549,1231,1318,237,94,143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12880,B0301ST,35,640,District - South Andaman (03),Urban,15 to 34,793,350,443,185,...,2,8,6,2,598,212,386,351,134,217
12881,B0301ST,35,640,District - South Andaman (03),Urban,35 to 59,500,281,219,325,...,0,1,1,0,172,18,154,33,7,26
12882,B0301ST,35,640,District - South Andaman (03),Urban,60+,18,10,8,3,...,0,1,1,0,14,6,8,1,1,0
12883,B0301ST,35,640,District - South Andaman (03),Urban,Age not stated,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Merging the Total, SC and ST datasets
### Considerations
We need to merge the data over the following dimensions:
```
State Code | District Code | Area Name | Total/Rural/Urban | Age-Group
```

However, cursory inspection of the data shows us that the `Age-Group` columns in the three datasets differ. The SC and ST datasets have fewer `Age-Group` levels than the Total dataset.

We could proceed to perform the merge in one of two ways:

1. Condense the Total dataset to have the same `Age-Group` levels as the other two, the "lowest common denominator" way, or
2. Expand the latter two to have the same `Age-Group` levels as the former.

Way 1 would result in loss of age group granularity (our ability to do fine-grained analysis by age groups) on the Total dataset. Way 2 would require us to make assumptions about age distributions in the SC and ST communities.

### Decision
At this exploratory stage, we choose to go by way 1, as we can still retain the total dataset for fine-grained analysis (without merging, of course). If we eventually decide to work without caste variables, or without fine-grained age-group analysis, way 1 would serve us well.

In [None]:
# Count how many rows the Total dataset has for each Age-Group level
df_total['Age-Group'].value_counts()

Unnamed: 0_level_0,count
Age-Group,Unnamed: 1_level_1
5 to 9,2025
10 to 14,2025
15 to 19,2025
20 to 24,2025
25 to 29,2025
30 to 34,2025
35 to 39,2025
40 to 49,2025
50 to 59,2025
60 to 69,2025


In [None]:
# Count how many rows the SC dataset has for each Age-Group level
df_sc['Age-Group'].value_counts()

Unnamed: 0_level_0,count
Age-Group,Unnamed: 1_level_1
5 to 14,1920
15 to 34,1920
35 to 59,1920
60+,1920
Age not stated,1920
15 to 59,1920
Total,1889


In [None]:
# Count how many rows the ST dataset has for each Age-Group level
df_st['Age-Group'].value_counts()

Unnamed: 0_level_0,count
Age-Group,Unnamed: 1_level_1
5 to 14,1845
15 to 34,1845
35 to 59,1845
60+,1845
Age not stated,1845
15 to 59,1845
Total,1815


### Lowest Common Denominator Age-Groups
The LCD age groups are:

```
"5 to 14", "15 to 34", "35 to 59", "60+", "Age not stated", "15 to 59", and "Total"
```
Of these, the total dataset already has the following groups:

```
"60+", "Age not stated", "15 to 59", and "Total"
```

This means that we must aggregate the following age groups as shown below in the total dataset:

In [None]:
from IPython.display import Markdown

# Coarse age group -> the fine-grained Total-dataset groups that make it up
age_group_mapping = {
    "5 to 14": ["5 to 9", "10 to 14"],
    "15 to 34": ["15 to 19", "20 to 24", "25 to 29", "30 to 34"],
    "35 to 59": ["35 to 39", "40 to 49", "50 to 59"]
}

# Assemble the Markdown table line by line: header first, then one row per
# aggregated group
table_lines = [
    "| Groups to Aggregate | Aggregated Group |\n",
    "|---------------------|------------------|\n",
]
for aggregated_group, age_ranges in age_group_mapping.items():
    table_lines.append(f"| {str(age_ranges)} | \"{aggregated_group}\" |\n")
table_md = "".join(table_lines)

# Render the table in the notebook output
display(Markdown(table_md))


| Groups to Aggregate | Aggregated Group |
|---------------------|------------------|
| ['5 to 9', '10 to 14'] | "5 to 14" |
| ['15 to 19', '20 to 24', '25 to 29', '30 to 34'] | "15 to 34" |
| ['35 to 39', '40 to 49', '50 to 59'] | "35 to 59" |


In [None]:
# Split the columns by position: the first six are the non-numeric grouping
# columns, everything after them is a numeric column to be summed
grouping_columns = df_total.columns[:6].tolist()
aggregation_columns = df_total.columns[6:].tolist()

# Invert the mapping so every fine-grained age group points at the coarse
# group it should be folded into
fine_to_coarse = {}
for coarse_group, fine_groups in age_group_mapping.items():
    for fine_group in fine_groups:
        fine_to_coarse[fine_group] = coarse_group

# Relabel a copy, so df_total itself keeps its fine-grained age groups
df_total_condensed = df_total.copy()
relabelled_age_group = df_total_condensed["Age-Group"].replace(fine_to_coarse)
df_total_condensed["Age-Group"] = relabelled_age_group

# Rows that now share the same grouping values are summed into one row
grouped = df_total_condensed.groupby(grouping_columns, as_index=False)
df_total_condensed = grouped.sum()

We must also remove the age groups not in the SC and ST datasets, viz. `60 to 69`, `70 to 79` and `80+`; those age groups are aggregated as `60+` in the SC and ST datasets, and the `60+` aggregation was already available in the original Total dataset.

In [None]:
# Drop the fine-grained 60-and-over bands, keeping every other age group
redundant_age_groups = ['60 to 69', '70 to 79', '80+']
is_redundant = df_total_condensed['Age-Group'].isin(redundant_age_groups)
df_total_condensed = df_total_condensed[~is_redundant]

In [None]:
# Count rows per Age-Group level in the condensed Total dataset
df_total_condensed['Age-Group'].value_counts()

Unnamed: 0_level_0,count
Age-Group,Unnamed: 1_level_1
15 to 34,2025
15 to 59,2025
35 to 59,2025
5 to 14,2025
60+,2025
Age not stated,2025
Total,1990


In [None]:
# Display the condensed Total dataset
df_total_condensed

Unnamed: 0,Table Name,State Code,District Code,Area Name,Total/Rural/Urban,Age-Group,Population: Total,Population: Males,Population: Females,Main Workers: Total,...,Marginal workers - Worked for 3 to 6 months: Females,Marginal workers - Seeking/available for work: Total,Marginal workers - Seeking/available for work: Males,Marginal workers - Seeking/available for work: Females,Non-workers - All: Total,Non-workers - All: Males,Non-workers - All: Females,Non-workers - Seeking/availabe for work: Total,Non-workers - Seeking/availabe for work: Males,Non-workers - Seeking/availabe for work: Females
0,B0101,1,0,State - JAMMU & KASHMIR (01),Rural,15 to 34,3128174,1618996,1509178,715652,...,224835,409065,260355,148710,1697272,635600,1061672,540622,235378,305244
1,B0101,1,0,State - JAMMU & KASHMIR (01),Rural,15 to 59,5123667,2663733,2459934,1514800,...,399158,649836,425894,223942,2361769,701647,1660122,649856,252982,396874
2,B0101,1,0,State - JAMMU & KASHMIR (01),Rural,35 to 59,1995493,1044737,950756,799148,...,174323,240771,165539,75232,664497,66047,598450,109234,17604,91630
3,B0101,1,0,State - JAMMU & KASHMIR (01),Rural,5 to 14,2206292,1160505,1045787,18316,...,20118,22153,12002,10151,2113937,1112108,1001829,70082,36642,33440
5,B0101,1,0,State - JAMMU & KASHMIR (01),Rural,60+,651969,342489,309480,135058,...,27471,36602,28624,7978,395749,142960,252789,17751,6922,10829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20207,B0101,35,640,District - South Andaman (03),Urban,35 to 59,44162,24216,19946,28537,...,401,672,460,212,14345,838,13507,2239,215,2024
20208,B0101,35,640,District - South Andaman (03),Urban,5 to 14,23048,11853,11195,363,...,94,75,35,40,22476,11529,10947,1026,531,495
20210,B0101,35,640,District - South Andaman (03),Urban,60+,7199,3999,3200,1325,...,31,26,21,5,5732,2734,2998,124,64,60
20213,B0101,35,640,District - South Andaman (03),Urban,Age not stated,188,94,94,57,...,2,7,6,1,122,43,79,17,4,13


## Merging the Total, SC and ST datasets - Continued
### Further Considerations
Now that age-groups have been normalised across the three datasets, we need to ensure that we can, in fact, merge the datasets meaningfully.

If none of the three datasets had missing data along any of the dimensions we wish to merge on, viz. `"State Code", "District Code", "Area Name", "Total/Rural/Urban", "Age-Group"`, we could perform the merge without issue. However, that might not be the case.

So we proceed to check for any missing data.


In [None]:
# Columns that together identify one record in each of the three datasets
merge_keys = ["State Code", "District Code", "Area Name",
              "Total/Rural/Urban", "Age-Group"]

# Outer-join just the key columns of Total and SC. The indicator column says
# which side(s) each key combination was found on; rename it so the second
# join below can add its own "_merge" column.
merge_total_sc = pd.merge(
    df_total_condensed[merge_keys],
    df_sc[merge_keys],
    on=merge_keys, how="outer", indicator=True,
).rename(columns={"_merge": "_merge_total_sc"})

# Outer-join that combined Total/SC key set with the ST keys
merge_total_sc_st = pd.merge(
    merge_total_sc,
    df_st[merge_keys],
    on=merge_keys, how="outer", indicator=True,
)

# A key combination is a mismatch unless both joins reported "both"
matched_in_first_join = merge_total_sc_st["_merge_total_sc"] == "both"
matched_in_second_join = merge_total_sc_st["_merge"] == "both"
mismatch_cases = merge_total_sc_st.loc[
    ~(matched_in_first_join & matched_in_second_join)]
mismatch_cases

Unnamed: 0,State Code,District Code,Area Name,Total/Rural/Urban,Age-Group,_merge_total_sc,_merge
754,3,0,State - PUNJAB (03),Rural,15 to 34,both,left_only
755,3,0,State - PUNJAB (03),Rural,15 to 59,both,left_only
756,3,0,State - PUNJAB (03),Rural,35 to 59,both,left_only
757,3,0,State - PUNJAB (03),Rural,5 to 14,both,left_only
758,3,0,State - PUNJAB (03),Rural,60+,both,left_only
...,...,...,...,...,...,...,...
14135,35,640,District - South Andaman (03),Urban,35 to 59,left_only,both
14136,35,640,District - South Andaman (03),Urban,5 to 14,left_only,both
14137,35,640,District - South Andaman (03),Urban,60+,left_only,both
14138,35,640,District - South Andaman (03),Urban,Age not stated,left_only,both


The `_merge_total_sc` column in a row indicates the presence of a corresponding data record in the Total and SC datasets:  
- `both` means the record exists in both datasets.  
- `left_only` means it exists only in the Total dataset.  
- `right_only` means it exists only in the SC dataset.  

Similarly, `_merge` indicates the presence of a corresponding record in the Total or SC datasets and the ST dataset:  
- `both` means the record exists in at least one of the Total or SC datasets and in the ST dataset.  
- `left_only` means it exists only in the Total or SC datasets, but not in the ST dataset.  
- `right_only` means it exists only in the ST dataset.

Reviewing the table displayed, we can conclude that there are, in fact, missing data which we may need to be concerned with.

However, we know from the [Metadata](https://docs.google.com/spreadsheets/d/1E3GAnfUaiAhIlU-F2v-9EGlXFymngRmQ-K1pMsawX9s/edit?gid=682402169#gid=682402169) file that certain states, e.g., Punjab, do not have any ST data (perhaps the state has no ST), and certain others, e.g., Nagaland, do not have any SC data (perhaps the state has no SC).

So we proceed to count the number of mismatches by state.

In [None]:
# Count mismatching rows for each (state code, join-status) combination
status_columns = ["State Code", "_merge_total_sc", "_merge"]
mismatches_by_state = (
  mismatch_cases[status_columns].value_counts().reset_index())

# Rows with District Code 0 supply the state code -> area name lookup
has_district_code_zero = mismatch_cases["District Code"] == 0
state_code_names = mismatch_cases.loc[
  has_district_code_zero, ["State Code", "Area Name"]].drop_duplicates()

# Attach the area name to each count and show only the readable columns
display_columns = ["Area Name", "_merge_total_sc", "_merge", "count"]
mismatches_by_state.merge(
  state_code_names, on="State Code", how="left")[display_columns]

Unnamed: 0,Area Name,_merge_total_sc,_merge,count
0,State - HARYANA (06),both,left_only,461
1,State - PUNJAB (03),both,left_only,440
2,State - ARUNACHAL PRADESH (12),left_only,both,356
3,State - NAGALAND (13),left_only,both,251
4,State - NCT OF DELHI (07),both,left_only,209
5,State - PUDUCHERRY (34),both,left_only,104
6,State - ANDAMAN & NICOBAR ISLANDS (35),left_only,both,83
7,State - CHANDIGARH (04),both,left_only,41
8,State - LAKSHADWEEP (31),left_only,both,41


These results are exactly as per the [Metadata](https://docs.google.com/spreadsheets/d/1E3GAnfUaiAhIlU-F2v-9EGlXFymngRmQ-K1pMsawX9s/edit?gid=682402169#gid=682402169) file. So the only missing values pertain to states where either SC or ST castes do not exist as per the census dataset. We can therefore impute missing data therein with zeroes.

## Merging the Total, SC and ST datasets - The Actual Merge Operation

In [None]:
# Remove column "Table Name", which serves no function in the merge.
# drop(..., errors="ignore") is used instead of `del` so that this cell can be
# re-run: once the column is gone, `del` would raise KeyError on a second run.
df_total_condensed = df_total_condensed.drop(
    columns="Table Name", errors="ignore")
df_sc = df_sc.drop(columns="Table Name", errors="ignore")
df_st = df_st.drop(columns="Table Name", errors="ignore")

# Rename attribute columns (the numeric ones - e.g., population) to have suffix
# "_Caste_ST" in the ST dataset up front. After the first merge below the
# Total/SC columns already carry their suffixes, so they no longer overlap
# with the ST column names and the second merge would not add a suffix itself.
df_st_copy = df_st.rename(
    columns={col: col + "_Caste_ST"
             for col in df_st.columns if col not in merge_keys})

# Outer-join Total with SC, then with ST, on the shared keys. Key combinations
# missing from a dataset get 0 for that dataset's columns.
merged_df = (
    df_total_condensed
    .merge(df_sc, on=merge_keys, how="outer",
           suffixes=("_Caste_Total", "_Caste_SC"))
    .merge(df_st_copy, on=merge_keys, how="outer")
    .fillna(0)
)
merged_df

Unnamed: 0,State Code,District Code,Area Name,Total/Rural/Urban,Age-Group,Population: Total_Caste_Total,Population: Males_Caste_Total,Population: Females_Caste_Total,Main Workers: Total_Caste_Total,Main Workers: Males_Caste_Total,...,Marginal workers - Worked for 3 to 6 months: Females_Caste_ST,Marginal workers - Seeking/available for work: Total_Caste_ST,Marginal workers - Seeking/available for work: Males_Caste_ST,Marginal workers - Seeking/available for work: Females_Caste_ST,Non-workers - All: Total_Caste_ST,Non-workers - All: Males_Caste_ST,Non-workers - All: Females_Caste_ST,Non-workers - Seeking/availabe for work: Total_Caste_ST,Non-workers - Seeking/availabe for work: Males_Caste_ST,Non-workers - Seeking/availabe for work: Females_Caste_ST
0,1,0,State - JAMMU & KASHMIR (01),Rural,15 to 34,3128174,1618996,1509178,715652,611663,...,43849.0,72530.0,48178.0,24352.0,210071.0,76648.0,133423.0,57456.0,23930.0,33526.0
1,1,0,State - JAMMU & KASHMIR (01),Rural,15 to 59,5123667,2663733,2459934,1514800,1318638,...,75777.0,119560.0,82174.0,37386.0,287518.0,84185.0,203333.0,71426.0,25911.0,45515.0
2,1,0,State - JAMMU & KASHMIR (01),Rural,35 to 59,1995493,1044737,950756,799148,706975,...,31928.0,47030.0,33996.0,13034.0,77447.0,7537.0,69910.0,13970.0,1981.0,11989.0
3,1,0,State - JAMMU & KASHMIR (01),Rural,5 to 14,2206292,1160505,1045787,18316,11459,...,5343.0,5547.0,3069.0,2478.0,363596.0,188778.0,174818.0,12348.0,6374.0,5974.0
4,1,0,State - JAMMU & KASHMIR (01),Rural,60+,651969,342489,309480,135058,121646,...,6013.0,8541.0,6883.0,1658.0,46178.0,17522.0,28656.0,2460.0,892.0,1568.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14135,35,640,District - South Andaman (03),Urban,35 to 59,44162,24216,19946,28537,22550,...,0.0,1.0,1.0,0.0,172.0,18.0,154.0,33.0,7.0,26.0
14136,35,640,District - South Andaman (03),Urban,5 to 14,23048,11853,11195,363,210,...,1.0,1.0,1.0,0.0,357.0,171.0,186.0,31.0,17.0,14.0
14137,35,640,District - South Andaman (03),Urban,60+,7199,3999,3200,1325,1156,...,0.0,1.0,1.0,0.0,14.0,6.0,8.0,1.0,1.0,0.0
14138,35,640,District - South Andaman (03),Urban,Age not stated,188,94,94,57,45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Enrich & format the merged dataset
**Enrichment**: It would be useful to "enrich" the merged dataset to have data corresponding to the "General" caste (i.e., those who do not belong to the SC and ST groups).

**Formatting**: For convenience, we then transform the enriched, merged dataset as a "[longer-form](https://www.statology.org/long-vs-wide-data/)" table, with `Gender` and `Caste` as separate dimensions.

**Further Enrichment**: Explicitly have a `State Name` column.

In [None]:
# Everything after the five merge-key columns is an attribute column
attribute_columns = merged_df.columns[5:].tolist()

# For every "<attribute>_Caste_Total" column, add a matching
# "<attribute>_Caste_General" column = Total - SC - ST
total_suffix = "_Caste_Total"
for total_column in attribute_columns:
  if not total_column.endswith(total_suffix):
    continue
  stem = total_column[:len(total_column) - len(total_suffix)]
  remainder_after_sc = merged_df[total_column] - merged_df[stem + "_Caste_SC"]
  merged_df[stem + "_Caste_General"] = (
    remainder_after_sc - merged_df[stem + "_Caste_ST"])
merged_df

Unnamed: 0,State Code,District Code,Area Name,Total/Rural/Urban,Age-Group,Population: Total_Caste_Total,Population: Males_Caste_Total,Population: Females_Caste_Total,Main Workers: Total_Caste_Total,Main Workers: Males_Caste_Total,...,Marginal workers - Worked for 3 to 6 months: Females_Caste_General,Marginal workers - Seeking/available for work: Total_Caste_General,Marginal workers - Seeking/available for work: Males_Caste_General,Marginal workers - Seeking/available for work: Females_Caste_General,Non-workers - All: Total_Caste_General,Non-workers - All: Males_Caste_General,Non-workers - All: Females_Caste_General,Non-workers - Seeking/availabe for work: Total_Caste_General,Non-workers - Seeking/availabe for work: Males_Caste_General,Non-workers - Seeking/availabe for work: Females_Caste_General
0,1,0,State - JAMMU & KASHMIR (01),Rural,15 to 34,3128174,1618996,1509178,715652,611663,...,164228.0,308474.0,192857.0,115617.0,1335049.0,507578.0,827471.0,442198.0,193975.0,248223.0
1,1,0,State - JAMMU & KASHMIR (01),Rural,15 to 59,5123667,2663733,2459934,1514800,1318638,...,294176.0,487956.0,314160.0,173796.0,1856192.0,560084.0,1296108.0,530361.0,208341.0,322020.0
2,1,0,State - JAMMU & KASHMIR (01),Rural,35 to 59,1995493,1044737,950756,799148,706975,...,129948.0,179482.0,121303.0,58179.0,521143.0,52506.0,468637.0,88163.0,14366.0,73797.0
3,1,0,State - JAMMU & KASHMIR (01),Rural,5 to 14,2206292,1160505,1045787,18316,11459,...,13583.0,15659.0,8429.0,7230.0,1591410.0,838612.0,752798.0,54376.0,28481.0,25895.0
4,1,0,State - JAMMU & KASHMIR (01),Rural,60+,651969,342489,309480,135058,121646,...,19319.0,26208.0,20330.0,5878.0,313463.0,113190.0,200273.0,14215.0,5591.0,8624.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14135,35,640,District - South Andaman (03),Urban,35 to 59,44162,24216,19946,28537,22550,...,401.0,671.0,459.0,212.0,14173.0,820.0,13353.0,2206.0,208.0,1998.0
14136,35,640,District - South Andaman (03),Urban,5 to 14,23048,11853,11195,363,210,...,93.0,74.0,34.0,40.0,22119.0,11358.0,10761.0,995.0,514.0,481.0
14137,35,640,District - South Andaman (03),Urban,60+,7199,3999,3200,1325,1156,...,31.0,25.0,20.0,5.0,5718.0,2728.0,2990.0,123.0,63.0,60.0
14138,35,640,District - South Andaman (03),Urban,Age not stated,188,94,94,57,45,...,2.0,7.0,6.0,1.0,122.0,43.0,79.0,17.0,4.0,13.0


In [None]:
# Reshape the wide merged table into a longer-format dataframe with one row
# per (merge keys, Sex, Caste) and one column per attribute.

# Step 1: unpivot every attribute column into (Variable, Value) pairs
df_long = pd.melt(
    merged_df, id_vars=merge_keys, var_name="Variable", value_name="Value")

# Step 2: split names such as "Population: Males_Caste_SC" into the
# attribute, the sex and the caste
attribute_and_rest = df_long["Variable"].str.split(": ", n=1, expand=True)
df_long[["Attribute", "Sex_Caste"]] = attribute_and_rest
sex_and_caste = df_long["Sex_Caste"].str.split("_Caste_", n=1, expand=True)
df_long[["Sex", "Caste"]] = sex_and_caste

# Step 3: pivot so that each attribute becomes its own column again.
# NOTE(review): pivot_table aggregates with its default function; this
# presumes each index combination holds a single value per attribute - confirm.
long_index = merge_keys + ["Sex", "Caste"]
merged_df_long = pd.pivot_table(
    df_long, index=long_index, columns="Attribute", values="Value")
merged_df_long = merged_df_long.reset_index()
merged_df_long.columns.name = None  # remove 'Attribute' as column index name
merged_df_long

Unnamed: 0,State Code,District Code,Area Name,Total/Rural/Urban,Age-Group,Sex,Caste,Main Workers,Marginal workers - Seeking/available for work,Marginal workers - Worked for 3 to 6 months,Marginal workers - Worked for less than 3 months,Non-workers - All,Non-workers - Seeking/availabe for work,Population
0,1,0,State - JAMMU & KASHMIR (01),Rural,15 to 34,Females,General,75318.0,115617.0,164228.0,90456.0,827471.0,248223.0,1157473.0
1,1,0,State - JAMMU & KASHMIR (01),Rural,15 to 34,Females,SC,9506.0,8741.0,16758.0,5960.0,100778.0,23495.0,133002.0
2,1,0,State - JAMMU & KASHMIR (01),Rural,15 to 34,Females,ST,19165.0,24352.0,43849.0,22266.0,133423.0,33526.0,218703.0
3,1,0,State - JAMMU & KASHMIR (01),Rural,15 to 34,Females,Total,103989.0,148710.0,224835.0,118682.0,1061672.0,305244.0,1509178.0
4,1,0,State - JAMMU & KASHMIR (01),Rural,15 to 34,Males,General,473945.0,192857.0,201801.0,68398.0,507578.0,193975.0,1251722.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169675,35,640,District - South Andaman (03),Urban,Total,Males,Total,42519.0,1926.0,2520.0,392.0,29749.0,5240.0,75180.0
169676,35,640,District - South Andaman (03),Urban,Total,Total,General,52150.0,2825.0,3802.0,546.0,82444.0,15938.0,138942.0
169677,35,640,District - South Andaman (03),Urban,Total,Total,SC,0.0,0.0,0.0,0.0,0.0,0.0,0.0
169678,35,640,District - South Andaman (03),Urban,Total,Total,ST,518.0,11.0,15.0,1.0,1271.0,416.0,1805.0


In [None]:
# Build a State Code -> State Name lookup from the rows with District Code 0,
# whose Area Name has the form "State - <NAME> (<code>)"
has_district_code_zero = merged_df_long["District Code"] == 0
states = (
  merged_df_long.loc[has_district_code_zero, ["State Code", "Area Name"]]
  .drop_duplicates()
  .reset_index(drop=True)
)
extracted_names = states["Area Name"].str.extract(
    r"State - (.+) \(\d{1,2}\)")
states["State Name"] = extracted_names[0].str.strip()

# Attach the state name to every row via its state code
merged_df_long = pd.merge(
  merged_df_long, states[["State Code", "State Name"]],
  on="State Code", how="left")

# Move "State Name" so it sits directly after the first column
cols = merged_df_long.columns.tolist()
trailing_columns = [name for name in cols[1:] if name != "State Name"]
merged_df_long = merged_df_long[[cols[0], "State Name"] + trailing_columns]

# column spelling correction
spelling_fix = {
    "Non-workers - Seeking/availabe for work":
    "Non-workers - Seeking/available for work"}
merged_df_long = merged_df_long.rename(columns=spelling_fix)

## Setting up Maps for Data Exploration
To enable map-based plotting, we download a repository of map files from [Github](https://github.com/datameet/maps). We then proceed to use a map corresponding to the Indian government's 2011 political map of India. This is important because in 2011, Telangana did not exist as a separate state.

We then want to see how to cross-reference the Census data with the map file. So we examine the map file (specifically, the shapefile).

In [None]:
# Download repository of map files to enable map-based plotting
!git clone https://github.com/datameet/maps.git
shapefile_path = \
  "maps/Survey-of-India-Index-Maps/Boundaries/India-Districts-2011Census.shp"
india_districts_gdf = gpd.read_file(shapefile_path)
india_districts_gdf.head()

Cloning into 'maps'...
remote: Enumerating objects: 1179, done.[K
remote: Counting objects: 100% (88/88), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 1179 (delta 50), reused 81 (delta 48), pack-reused 1091 (from 1)[K
Receiving objects: 100% (1179/1179), 171.65 MiB | 18.16 MiB/s, done.
Resolving deltas: 100% (347/347), done.
Updating files: 100% (725/725), done.


Unnamed: 0,DISTRICT,ST_NM,ST_CEN_CD,DT_CEN_CD,censuscode,geometry
0,Adilabad,Andhra Pradesh,28,1,532,"POLYGON ((78.84972 19.7601, 78.85102 19.75945,..."
1,Agra,Uttar Pradesh,9,15,146,"POLYGON ((78.19803 27.4028, 78.19804 27.40278,..."
2,Ahmadabad,Gujarat,24,7,474,"MULTIPOLYGON (((72.03456 23.50527, 72.03337 23..."
3,Ahmadnagar,Maharashtra,27,26,522,"POLYGON ((74.67333 19.9467, 74.67393 19.93509,..."
4,Aizawl,Mizoram,15,3,283,"POLYGON ((92.98749 24.40453, 92.99107 24.40236..."


In [None]:
# Normalise misspelt state / UT names in the map data
corrections = dict([
    ("Andaman & Nicobar Island", "Andaman & Nicobar Islands"),
    ("Arunanchal Pradesh", "Arunachal Pradesh"),
    ("Dadara & Nagar Havelli", "Dadra and Nagar Haveli"),
])
india_districts_gdf["ST_NM"] = india_districts_gdf["ST_NM"].replace(
    to_replace=corrections)

In [None]:
# Aggregate District level map to get State level map.
# Districts are dissolved (their geometries merged) per state name, and
# aggfunc="first" keeps the ST_CEN_CD of the first district row of each state.
# NOTE(review): this runs before ST_CEN_CD is recoded further down the
# notebook, so a state that carries more than one code keeps whichever
# code comes first in the district rows; confirm that is the intended one.
india_states_gdf = india_districts_gdf[["ST_NM", "ST_CEN_CD", "geometry"]
                        ].dissolve(by="ST_NM", aggfunc="first", as_index=False)
india_states_gdf

Unnamed: 0,ST_NM,geometry,ST_CEN_CD
0,Andaman & Nicobar Islands,"MULTIPOLYGON (((92.52174 10.89667, 92.52197 10...",35
1,Andhra Pradesh,"MULTIPOLYGON (((79.99121 13.48456, 79.98596 13...",28
2,Arunachal Pradesh,"POLYGON ((94.1975 27.49423, 94.19353 27.4931, ...",12
3,Assam,"MULTIPOLYGON (((92.41618 24.25141, 92.40657 24...",18
4,Bihar,"MULTIPOLYGON (((84.49734 24.27946, 84.49901 24...",10
5,Chandigarh,"POLYGON ((76.84147 30.75996, 76.83599 30.73623...",4
6,Chhattisgarh,"POLYGON ((81.96877 18.67453, 81.95977 18.66779...",22
7,Dadra and Nagar Haveli,"POLYGON ((73.20657 20.12216, 73.20797 20.1065,...",26
8,Daman & Diu,"MULTIPOLYGON (((72.80668 20.38423, 72.81109 20...",25
9,Goa,"MULTIPOLYGON (((74.1135 14.75203, 74.11099 14....",30


In [None]:
# Work on a copy so the full-resolution state map stays available, then
# thin out the boundary vertices (tolerance is given in degrees)
india_states_gdf_simplified = india_states_gdf.copy()
simplified_shapes = india_states_gdf_simplified["geometry"].simplify(
    tolerance=0.005, preserve_topology=True)
india_states_gdf_simplified["geometry"] = simplified_shapes

# Write the lighter state level map to disk as GeoJSON
india_states_gdf_simplified.to_file("state_map.geojson", driver="GeoJSON")

In the `india_districts_gdf`, the district level map dataset, it appears that the columns `ST_CEN_CD`	and `DT_CEN_CD` correspond to state and district codes. We therefore want to validate that the `State Code` in the census dataset maps correctly with the `ST_CEN_CD` in the maps dataset, and similarly, `District Code` and `DT_CEN_CD`.

We begin with the first, and examine state codes and corresponding state names in the maps dataset.

One of the things we want to ensure is that there is a 1:1 mapping between the state codes and names in this dataset, as is the case in the census dataset (which we verified while cleaning that dataset).

In [None]:
# Unique (state code, state name) pairs present in the map data
map_states = (
    india_districts_gdf.loc[:, ['ST_CEN_CD', 'ST_NM']]
    .drop_duplicates()
    .reset_index(drop=True)
)
# Number of codes attached to each state name; more than 1 means ambiguity
map_states_counts = map_states.groupby(by='ST_NM')['ST_CEN_CD'].count()
map_states_counts.loc[map_states_counts.gt(1)]

Unnamed: 0_level_0,ST_CEN_CD
ST_NM,Unnamed: 1_level_1
Jammu & Kashmir,2


In [None]:
# Number of state names attached to each state code; more than 1 means
# the code is ambiguous
map_state_codes_counts = map_states.groupby(by='ST_CEN_CD')['ST_NM'].count()
map_state_codes_counts.loc[map_state_codes_counts.gt(1)]

Unnamed: 0_level_0,ST_NM
ST_CEN_CD,Unnamed: 1_level_1


In [None]:
# Show every state code recorded against Jammu & Kashmir
map_states.loc[map_states['ST_NM'].eq("Jammu & Kashmir")]

Unnamed: 0,ST_CEN_CD,ST_NM
11,1,Jammu & Kashmir
27,99,Jammu & Kashmir


We can see above that there are two state codes for Jammu and Kashmir in the maps dataset. We want a 1-1 mapping, however, and specifically, we want the state code of Jammu and Kashmir to be what it is in the census dataset, viz. 1, for proper correspondence. We impute the map data accordingly.

In [None]:
# Recode the stray state code 99 to 1 (the Census code for Jammu & Kashmir)
india_districts_gdf.loc[
    india_districts_gdf['ST_CEN_CD'].eq(99), 'ST_CEN_CD'] = 1

# Rebuild the (code, name) pairs and confirm no state has several codes left
map_states = (
    india_districts_gdf.loc[:, ['ST_CEN_CD', 'ST_NM']]
    .drop_duplicates()
    .reset_index(drop=True)
)
map_states_counts = map_states.groupby(by='ST_NM')['ST_CEN_CD'].count()
map_states_counts.loc[map_states_counts.gt(1)]

Unnamed: 0_level_0,ST_CEN_CD
ST_NM,Unnamed: 1_level_1


Now we proceed to check if there is proper correspondence between the state codes in the census and maps dataset. To do this, we cross-reference the state codes of both datasets and check if the corresponding state names match.

Since the results are small (35 records), we can visually inspect that there is in fact a 100% match.

In [None]:
# State-level rows of the Census data are the ones with District Code 0
census_states = (
    merged_df_long.loc[
        merged_df_long['District Code'].eq(0), ['State Code', 'Area Name']]
    .drop_duplicates()
    .reset_index(drop=True)
)
# Line up Census and map state names on the state code
pd.merge(census_states, map_states,
         left_on='State Code', right_on='ST_CEN_CD')

Unnamed: 0,State Code,Area Name,ST_CEN_CD,ST_NM
0,1,State - JAMMU & KASHMIR (01),1,Jammu & Kashmir
1,2,State - HIMACHAL PRADESH (02),2,Himachal Pradesh
2,3,State - PUNJAB (03),3,Punjab
3,4,State - CHANDIGARH (04),4,Chandigarh
4,5,State - UTTARAKHAND (05),5,Uttarakhand
5,6,State - HARYANA (06),6,Haryana
6,7,State - NCT OF DELHI (07),7,NCT of Delhi
7,8,State - RAJASTHAN (08),8,Rajasthan
8,9,State - UTTAR PRADESH (09),9,Uttar Pradesh
9,10,State - BIHAR (10),10,Bihar


We now proceed to check whether the census and maps datasets agree at a district level as well. We essentially replicate with District Codes what we do above with State Codes, but with the additional need to look at the State corresponding to a given district, because district names, unlike state names, are not necessarily unique.

In [None]:
# Unique (district code, district name, state code) rows in the map data
map_districts = (
    india_districts_gdf.loc[:, ['DT_CEN_CD', 'DISTRICT', 'ST_CEN_CD']]
    .drop_duplicates()
    .reset_index(drop=True)
)
# District codes per (state, district name); more than 1 means ambiguity
map_districts_counts = map_districts.groupby(
    by=['ST_CEN_CD', 'DISTRICT'])['DT_CEN_CD'].count()
map_districts_counts.loc[map_districts_counts.gt(1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,DT_CEN_CD
ST_CEN_CD,DISTRICT,Unnamed: 2_level_1


In [None]:
# District names per (state, district code); more than 1 means ambiguity
map_district_codes_counts = map_districts.groupby(
    by=['ST_CEN_CD', 'DT_CEN_CD'])['DISTRICT'].count()
map_district_codes_counts.loc[map_district_codes_counts.gt(1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,DISTRICT
ST_CEN_CD,DT_CEN_CD,Unnamed: 2_level_1


Thankfully, there are no issues with 1:1 mapping of district codes and districts within any state in the Maps dataset.

Our next step is to check whether we can find a 1:1 correspondence between the Census and Maps datasets. So we first explore them along key dimensions.

In [None]:
# District-level rows of the Census data are the ones with a non-zero
# District Code
census_districts = (
    merged_df_long.loc[
        merged_df_long['District Code'].ne(0),
        ['District Code', 'Area Name', 'State Code']]
    .drop_duplicates()
    .reset_index(drop=True)
)
census_districts

Unnamed: 0,District Code,Area Name,State Code
0,1,District - Kupwara (01),1
1,2,District - Badgam (02),1
2,3,District - Leh(Ladakh) (03),1
3,4,District - Kargil (04),1
4,5,District - Punch (05),1
...,...,...,...
635,636,District - Mahe (03),34
636,637,District - Karaikal (04),34
637,638,District - Nicobars (01),35
638,639,District - North & Middle Andaman (02),35


In [None]:
map_districts

Unnamed: 0,DT_CEN_CD,DISTRICT,ST_CEN_CD
0,1,Adilabad,28
1,15,Agra,9
2,7,Ahmadabad,24
3,26,Ahmadnagar,27
4,3,Aizawl,15
...,...,...,...
636,26,Tapi,24
637,1,Nicobar,35
638,3,South Andaman,35
639,2,North & Middle Andaman,35


Cursory exploration makes it clear that in the Census dataset, District Code is unique irrespective of the corresponding state. In the Maps dataset, however, District Code is unique only within a given state. We therefore have to impute district codes in either the Maps dataset or the Census dataset to match the other.

However, we can also see that in the Census dataset, Area Names for districts seem to have a pattern of the kind "District - <District Name\> (<State-level District Code\>)". Where State-level District Code is a unique code within a given state but not across states. For the state (UT) of Andaman and Nicobar Islands, we find that this State-level District Code in the Census dataset matches 100% with the Maps dataset. So, let us try to use the State-level District Code for our mapping, and see what the results look like.

In [None]:
# The two characters just before the closing ")" of the Area Name hold the
# district code that is unique within the state
census_districts["State-Level District Code"] = (
    census_districts["Area Name"].str.slice(-3, -1).astype(int))
census_districts

Unnamed: 0,District Code,Area Name,State Code,State-Level District Code
0,1,District - Kupwara (01),1,1
1,2,District - Badgam (02),1,2
2,3,District - Leh(Ladakh) (03),1,3
3,4,District - Kargil (04),1,4
4,5,District - Punch (05),1,5
...,...,...,...,...
635,636,District - Mahe (03),34,3
636,637,District - Karaikal (04),34,4
637,638,District - Nicobars (01),35,1
638,639,District - North & Middle Andaman (02),35,2


There is also 1 more row in the Maps dataset than in the Census dataset. The code below, however, identifies it as a dummy row in the former dataset, so this need not worry us.

In [None]:
# Map districts without a Census counterpart: left-join on the
# (state code, state-level district code) pair and keep the unmatched rows
pd.merge(
    map_districts,
    census_districts,
    how="left",
    left_on=["ST_CEN_CD", "DT_CEN_CD"],
    right_on=["State Code", "State-Level District Code"],
    indicator=True,
).loc[lambda joined: joined["_merge"] == "left_only"].drop(columns=["_merge"])

Unnamed: 0,DT_CEN_CD,DISTRICT,ST_CEN_CD,District Code,Area Name,State Code,State-Level District Code
135,99,Data Not Available,1,,,,


We now proceed to check if there is proper correspondence between the district codes in the census and maps datasets. As done for state codes, we cross reference the two datasets based on district codes (and state codes, since district names / codes may not be unique across states), only this time, if everything goes well, the results will be so large we can not manually inspect the data.

In [None]:
# Pair every Census district with its map district via the
# (state code, state-level district code) key
census_map_districts_merged = pd.merge(
    census_districts.drop(columns=["District Code"]),
    map_districts,
    left_on=['State Code', 'State-Level District Code'],
    right_on=['ST_CEN_CD', 'DT_CEN_CD'],
)
census_map_districts_merged

Unnamed: 0,Area Name,State Code,State-Level District Code,DT_CEN_CD,DISTRICT,ST_CEN_CD
0,District - Kupwara (01),1,1,1,Kupwara,1
1,District - Badgam (02),1,2,2,Badgam,1
2,District - Leh(Ladakh) (03),1,3,3,Leh (ladakh),1
3,District - Kargil (04),1,4,4,Kargil,1
4,District - Punch (05),1,5,5,Punch,1
...,...,...,...,...,...,...
635,District - Mahe (03),34,3,3,Mahe,34
636,District - Karaikal (04),34,4,4,Karaikal,34
637,District - Nicobars (01),35,1,1,Nicobar,35
638,District - North & Middle Andaman (02),35,2,2,North & Middle Andaman,35


Because there are 640 rows in the merged dataset, we know that there is a 1:1 correspondence now between the Maps and Census datasets. The final thing really to do is to ensure that this correspondence is actually good. For this, we simply need to match the District names in the two datasets. From below, we can see that even when district names don't match verbatim, the mismatch is due to innocuous reasons like spelling, case or format mismatch. We are good to use the Maps dataset now!

In [None]:
# Extract District Name from the Census Area Name,
# e.g., extract "South Andaman" from "District - South Andaman (03)".
# The name is captured lazily and any whitespace before the trailing "(NN)"
# is skipped, so names with a stray line break before the code
# (e.g. "District - Almora\n (09)") are extracted as well; "." does not
# match a newline, which previously left such names empty.
census_map_districts_merged["Census District"] = (
    census_map_districts_merged["Area Name"]
    .str.extract(r"District - (.+?)\s*\(\d{1,2}\)\s*$")[0]
    .str.strip()
)

# Display problem cases, where Census and Map District names do not match
census_map_districts_merged[census_map_districts_merged[
    "Census District"] != census_map_districts_merged["DISTRICT"]
                            ].reset_index(drop = True)

Unnamed: 0,Area Name,State Code,State-Level District Code,DT_CEN_CD,DISTRICT,ST_CEN_CD,Census District
0,District - Leh(Ladakh) (03),1,3,3,Leh (ladakh),1,Leh(Ladakh)
1,District - Almora\n (09),5,9,9,Almora,5,
2,District - Siddharthnagar (53),9,53,53,Siddharth Nagar,9,Siddharthnagar
3,District - Mahrajganj (56),9,56,56,Maharajganj,9,Mahrajganj
4,District - Sant Ravidas Nagar (Bhadohi) (67),9,67,67,Sant Ravi Das Nagar(bhadohi),9,Sant Ravidas Nagar (Bhadohi)
5,District - Kanshiram Nagar (71),9,71,71,Kansiram Nagar,9,Kanshiram Nagar
6,District - Saran (17),10,17,17,Saran (chhapra),10,Saran
7,District - Kaimur (Bhabua) (31),10,31,31,Kaimur (bhabua),10,Kaimur (Bhabua)
8,District - North District (01),11,1,1,North,11,North District
9,District - West District (02),11,2,2,West,11,West District


In [None]:
# Read the two-digit code inside the trailing "(NN)" of every Area Name
# (for state-level rows this is the state's own code)
merged_df_long["State-Level District Code"] = (
    merged_df_long["Area Name"].str.slice(-3, -1).astype(int))

# Move the new (last) column to sit right after the first column
cols = list(merged_df_long.columns)
merged_df_long = merged_df_long[[cols[0], cols[-1], *cols[1:-1]]]
merged_df_long

Unnamed: 0,State Code,State-Level District Code,State Name,District Code,Area Name,Total/Rural/Urban,Age-Group,Sex,Caste,Main Workers,Marginal workers - Seeking/available for work,Marginal workers - Worked for 3 to 6 months,Marginal workers - Worked for less than 3 months,Non-workers - All,Non-workers - Seeking/available for work,Population
0,1,1,JAMMU & KASHMIR,0,State - JAMMU & KASHMIR (01),Rural,15 to 34,Females,General,75318.0,115617.0,164228.0,90456.0,827471.0,248223.0,1157473.0
1,1,1,JAMMU & KASHMIR,0,State - JAMMU & KASHMIR (01),Rural,15 to 34,Females,SC,9506.0,8741.0,16758.0,5960.0,100778.0,23495.0,133002.0
2,1,1,JAMMU & KASHMIR,0,State - JAMMU & KASHMIR (01),Rural,15 to 34,Females,ST,19165.0,24352.0,43849.0,22266.0,133423.0,33526.0,218703.0
3,1,1,JAMMU & KASHMIR,0,State - JAMMU & KASHMIR (01),Rural,15 to 34,Females,Total,103989.0,148710.0,224835.0,118682.0,1061672.0,305244.0,1509178.0
4,1,1,JAMMU & KASHMIR,0,State - JAMMU & KASHMIR (01),Rural,15 to 34,Males,General,473945.0,192857.0,201801.0,68398.0,507578.0,193975.0,1251722.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169675,35,3,ANDAMAN & NICOBAR ISLANDS,640,District - South Andaman (03),Urban,Total,Males,Total,42519.0,1926.0,2520.0,392.0,29749.0,5240.0,75180.0
169676,35,3,ANDAMAN & NICOBAR ISLANDS,640,District - South Andaman (03),Urban,Total,Total,General,52150.0,2825.0,3802.0,546.0,82444.0,15938.0,138942.0
169677,35,3,ANDAMAN & NICOBAR ISLANDS,640,District - South Andaman (03),Urban,Total,Total,SC,0.0,0.0,0.0,0.0,0.0,0.0,0.0
169678,35,3,ANDAMAN & NICOBAR ISLANDS,640,District - South Andaman (03),Urban,Total,Total,ST,518.0,11.0,15.0,1.0,1271.0,416.0,1805.0


# Part 2: State Level Data Exploration

In [None]:
# Keep only the state-level rollup rows (District Code 0) and drop the two
# district-code columns, which carry no information at this level
state_level_data = merged_df_long.loc[
    merged_df_long["District Code"].eq(0)
].drop(columns=["State-Level District Code", "District Code"],
       errors="ignore")
state_level_data

Unnamed: 0,State Code,State Name,Area Name,Total/Rural/Urban,Age-Group,Sex,Caste,Main Workers,Marginal workers - Seeking/available for work,Marginal workers - Worked for 3 to 6 months,Marginal workers - Worked for less than 3 months,Non-workers - All,Non-workers - Seeking/available for work,Population
0,1,JAMMU & KASHMIR,State - JAMMU & KASHMIR (01),Rural,15 to 34,Females,General,75318.0,115617.0,164228.0,90456.0,827471.0,248223.0,1157473.0
1,1,JAMMU & KASHMIR,State - JAMMU & KASHMIR (01),Rural,15 to 34,Females,SC,9506.0,8741.0,16758.0,5960.0,100778.0,23495.0,133002.0
2,1,JAMMU & KASHMIR,State - JAMMU & KASHMIR (01),Rural,15 to 34,Females,ST,19165.0,24352.0,43849.0,22266.0,133423.0,33526.0,218703.0
3,1,JAMMU & KASHMIR,State - JAMMU & KASHMIR (01),Rural,15 to 34,Females,Total,103989.0,148710.0,224835.0,118682.0,1061672.0,305244.0,1509178.0
4,1,JAMMU & KASHMIR,State - JAMMU & KASHMIR (01),Rural,15 to 34,Males,General,473945.0,192857.0,201801.0,68398.0,507578.0,193975.0,1251722.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168919,35,ANDAMAN & NICOBAR ISLANDS,State - ANDAMAN & NICOBAR ISLANDS (35),Urban,Total,Males,Total,43208.0,1999.0,2576.0,435.0,30365.0,5342.0,76584.0
168920,35,ANDAMAN & NICOBAR ISLANDS,State - ANDAMAN & NICOBAR ISLANDS (35),Urban,Total,Total,General,53022.0,2933.0,3888.0,618.0,84145.0,16235.0,141673.0
168921,35,ANDAMAN & NICOBAR ISLANDS,State - ANDAMAN & NICOBAR ISLANDS (35),Urban,Total,Total,SC,0.0,0.0,0.0,0.0,0.0,0.0,0.0
168922,35,ANDAMAN & NICOBAR ISLANDS,State - ANDAMAN & NICOBAR ISLANDS (35),Urban,Total,Total,ST,522.0,11.0,15.0,1.0,1277.0,419.0,1815.0


In [None]:
# Folder that saved maps are written to; the map functions create it when
# it does not exist. NOTE(review): the path looks like a Colab Google Drive
# mount, so Drive presumably has to be mounted first; confirm.
map_output_folder = "/content/drive/MyDrive/Socratus/Gender/Data Exploration/Stage I/B-01_Saved_Maps"



## Part 2.1: Explore the general (un)employment picture

In [None]:
# Filter dropdowns, one per dimension of the state-level data, each
# pre-selected to a default value
tru_filter = widgets.Dropdown(
  description="Area Type",
  value="Total",
  options=state_level_data["Total/Rural/Urban"].unique(),
)
age_filter = widgets.Dropdown(
  description="Age",
  value="15 to 59",
  options=state_level_data["Age-Group"].unique(),
)
sex_filter = widgets.Dropdown(
  description="Sex",
  value="Total",
  options=state_level_data["Sex"].unique(),
)
caste_filter = widgets.Dropdown(
  description="Caste",
  value="Total",
  options=state_level_data["Caste"].unique(),
)

# Facts that can be mapped: raw counts first, derived proportions last
fact_dropdown = widgets.Dropdown(
  description="Fact",
  value='Population',
  options=[
    'Population',
    'Workers (Main and Marginal)',
    'Main Workers',
    'Marginal workers - Seeking/available for work',
    'Marginal workers - Worked for 3 to 6 months',
    'Marginal workers - Worked for less than 3 months',
    'Non-workers - All',
    'Non-workers - Seeking/available for work',
    'Proportion_Workers',
    'Proportion_Main_Workers',
    'Proportion_Marginal_Workers',
  ],
)

# Action buttons: display the map inline, save it as HTML, export the data
show_map_button_display = widgets.Button(description="Show Map")
show_map_button_save = widgets.Button(description="Create and Save Map")
export_data_button = widgets.Button(description="Export Data")

# Output areas: one for save/export messages, one for the map itself
save_messages_output = widgets.Output()
map_output = widgets.Output()

# User Interface: filters on top, then the fact selector and the buttons
ui = widgets.VBox(children=[
    tru_filter, age_filter, sex_filter, caste_filter,
    fact_dropdown,
    show_map_button_save, show_map_button_display,
    export_data_button,
])

In [None]:
def update_state_level_map(button=None, save_map=True, export_data=False):
  """Build the state-level map for the selected fact, or export its data.

  Reads the current values of the filter dropdowns (`tru_filter`,
  `age_filter`, `sex_filter`, `caste_filter`) and of `fact_dropdown`,
  filters `state_level_data` accordingly, derives the selected fact when it
  is not a raw column, and joins the result onto `india_districts_gdf` by
  state code.

  Args:
    button: The widget that triggered the callback; not used.
    save_map: When True, save the map as an HTML file in
      `map_output_folder`; when False, display it inside `map_output`.
      Ignored when `export_data` is True.
    export_data: When True, write the state names and fact values to
      `data_export.csv` instead of building a map.
  """
  # Reset the map area through its own widget, as the by-sex map does
  map_output.clear_output(wait=True)
  if not save_map:
    save_messages_output.clear_output(wait=True)

  # Show progress while rendering
  if not export_data:
    with map_output:
      display(widgets.HTML("<b>Rendering map... please wait ⏳</b>"))

  # Filter state_level_data based on selected filters
  filters_applied = [tru_filter.value, age_filter.value,
                     sex_filter.value, caste_filter.value]
  df = state_level_data[
      (state_level_data["Total/Rural/Urban"] == tru_filter.value) &
      (state_level_data["Age-Group"] == age_filter.value) &
      (state_level_data["Sex"] == sex_filter.value) &
      (state_level_data["Caste"] == caste_filter.value)
  ].copy()
  if df.empty:
    # Report inside the widget so the message replaces the progress note
    with map_output:
      clear_output(wait=True)
      print("No data for selected filters.")
    return

  # Add in new fact(s) - if required
  selected_fact = fact_dropdown.value
  if selected_fact in ("Workers (Main and Marginal)", "Proportion_Workers"):
    # Workers are everyone in the population who is not a non-worker
    df["Workers (Main and Marginal)"] = \
      df["Population"] - df["Non-workers - All"]
  if selected_fact == "Proportion_Workers":
    df["Proportion_Workers"] = \
      df["Workers (Main and Marginal)"] / df["Population"]
  elif selected_fact == "Proportion_Main_Workers":
    df["Proportion_Main_Workers"] = df["Main Workers"] / df["Population"]
  elif selected_fact == "Proportion_Marginal_Workers":
    df["Proportion_Marginal_Workers"] = (
        df["Marginal workers - Worked for 3 to 6 months"] +
        df["Marginal workers - Worked for less than 3 months"]
    ) / df["Population"]

  # Retain only State Code, State Name and Selected Fact
  data = df[["State Code", selected_fact, "State Name"]]

  # Merge with Maps dataset (every district polygon gets its state's value)
  gdf_merged = india_districts_gdf.merge(
    data, left_on="ST_CEN_CD", right_on="State Code", how="left"
  )

  # Ensure fact values and state name are in GeoJSON properties
  gdf_merged["tooltip_text"] = (
      gdf_merged["State Name"] + "<br>" +
      f"{selected_fact}: " + gdf_merged[selected_fact].round(3).astype(str)
  )

  # Export CSV if requested; no map is built in that case
  if export_data:
    gdf_merged[["ST_NM", selected_fact]].drop_duplicates().to_csv(
        "data_export.csv", index=False)
    with save_messages_output:
      display(widgets.HTML(
          "<b>Data Exported to <code>data_export.csv</code> ✅</b>"))
    return

  # Create empty map
  folium_map = folium.Map(
      tiles="cartodbpositron",
      width="600px",
      height="650px",
      embed=True)

  # Add choropleth
  Choropleth(
    geo_data=gdf_merged.to_json(),
    name="choropleth",
    data=gdf_merged,
    columns=["ST_CEN_CD", selected_fact],
    key_on="feature.properties.ST_CEN_CD",
    fill_color="YlGnBu",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name=selected_fact
  ).add_to(folium_map)

  # Add tooltip via a transparent layer on top of the choropleth
  folium.GeoJson(
    gdf_merged,
    name="State Labels",
    style_function=lambda x: {"fillOpacity": 0, "color": "transparent"},
    tooltip=folium.GeoJsonTooltip(
      fields=["tooltip_text"],
      aliases=[""],
      labels=False,
      sticky=True
    )
  ).add_to(folium_map)

  # Auto-fit to GeoDataFrame bounds; total_bounds is (minx, miny, maxx, maxy)
  # while fit_bounds expects [lat, lon] corner pairs
  bounds = gdf_merged.geometry.total_bounds
  folium_map.fit_bounds([[bounds[1], bounds[0]], [bounds[3], bounds[2]]])

  # File name: fact and filters joined by "_", with separators normalised
  parts = [selected_fact, "by State", *filters_applied]
  title = "_".join(
      p.replace(" ", "_").replace("/", "_").replace("-", "_").replace(":", "")
      for p in parts)

  # On-screen title, wrapped in a single <h3> element
  filters_line = " | ".join(filters_applied)
  html_title = (
      "<h3 style='font-family:Arial'>"
      f"{selected_fact} by State<br>{filters_line}</h3>")

  # Save or Display Map
  if save_map:
    # Save map to HTML file
    os.makedirs(map_output_folder, exist_ok=True)  # ensure the folder exists
    folium_map.save(f"{map_output_folder}/{title}.html")
    map_output.clear_output()  # the progress note is no longer accurate
    with save_messages_output:
      display(widgets.HTML(f"<b>Map saved as: <code>{title}.html</code> ✅</b>"))
  else: # Display map within Colab
    with map_output:
      clear_output(wait=True)
      display(HTML(html_title)) # Display the title
      display(folium_map) # display map

In [None]:
# Connect the buttons to the update function. on_click passes the clicked
# button as the first argument, so the lambdas forward it and only override
# the mode flags.
show_map_button_save.on_click(update_state_level_map)  # save_map=True by default
show_map_button_display.on_click(
    lambda b: update_state_level_map(b, save_map=False))
export_data_button.on_click(
    lambda b: update_state_level_map(b, export_data=True))

# Display UI, followed by the map area and the save/export message area
display(ui)
display(map_output)
display(save_messages_output)

VBox(children=(Dropdown(description='Area Type', index=1, options=('Rural', 'Total', 'Urban'), value='Total'),…

Output()

Output()

## Part 2.2: Explore the sex-nuance within the general (un)employment picture

In [None]:
# Filter dropdowns for the by-sex view (no Sex filter: both sexes are
# always needed), each pre-selected to a default value
tru_filter_by_sex = widgets.Dropdown(
  description="Area Type",
  value="Total",
  options=merged_df_long["Total/Rural/Urban"].unique(),
)
age_filter_by_sex = widgets.Dropdown(
  description="Age",
  value="15 to 59",
  options=merged_df_long["Age-Group"].unique(),
)
caste_filter_by_sex = widgets.Dropdown(
  description="Caste",
  value="Total",
  options=merged_df_long["Caste"].unique(),
)

# Facts that can be mapped: sex ratios first, then sex disparities
fact_dropdown_by_sex = widgets.Dropdown(
  description="Fact",
  value='Sex Ratio in Population',
  options=[
    'Sex Ratio in Population',
    'Sex Ratio amongst Workers (Main and Marginal)',
    'Sex Ratio amongst Main Workers',
    'Sex Ratio amongst Marginal workers - Seeking/available for work',
    'Sex Ratio amongst Marginal workers - Worked for 3 to 6 months',
    'Sex Ratio amongst Marginal workers - Worked for less than 3 months',
    'Sex Ratio amongst Non-workers - All',
    'Sex Ratio amongst Non-workers - Seeking/available for work',
    'Sex Disparity amongst Workers (Main and Marginal)',
    'Sex Disparity amongst Main Workers',
    'Sex Disparity amongst Marginal workers - Seeking/available for work',
    'Sex Disparity amongst Marginal workers - Worked for 3 to 6 months',
    'Sex Disparity amongst Marginal workers - Worked for less than 3 months',
    'Sex Disparity amongst Non-workers - All',
    'Sex Disparity amongst Non-workers - Seeking/available for work',
  ],
)

# Action buttons: display the map inline, save it as HTML, export the data
show_map_button_display_by_sex = widgets.Button(description="Show Map")
show_map_button_save_by_sex = widgets.Button(description="Create and Save Map")
export_data_button_by_sex = widgets.Button(description="Export Data")

# Output areas: one for save/export messages, one for the map itself
save_messages_output_by_sex = widgets.Output()
map_output_by_sex = widgets.Output()

# User Interface: filters on top, then the fact selector and the buttons
ui_by_sex = widgets.VBox(children=[
    tru_filter_by_sex, age_filter_by_sex, caste_filter_by_sex,
    fact_dropdown_by_sex,
    show_map_button_display_by_sex, show_map_button_save_by_sex,
    export_data_button_by_sex,
])

In [None]:
def update_state_level_ratios_map(button=None, save_map=True,
                                  export_data=False):
  """Build the state-level sex ratio / disparity map, or export its data.

  Reads the current values of the by-sex filter dropdowns and of
  `fact_dropdown_by_sex`, filters `state_level_data` to the Males and
  Females rows, and computes per state:

  * "Sex Ratio ...": the Males count divided by the Females count of the
    chosen group;
  * "Sex Disparity amongst ...": that group's sex ratio minus the sex
    ratio in the population.

  Args:
    button: The widget that triggered the callback; not used.
    save_map: When True, save the map as an HTML file in
      `map_output_folder`; when False, display it inside
      `map_output_by_sex`. Ignored when `export_data` is True.
    export_data: When True, write the state names and fact values to
      `data_export.csv` instead of building a map.

  Raises:
    ValueError: If the selected fact is neither a "Sex Ratio" nor a
      "Sex Disparity" fact.
  """
  # Clear output
  map_output_by_sex.clear_output(wait=True)
  if not save_map:
    save_messages_output_by_sex.clear_output(wait=True)

  # Show progress while rendering
  if not export_data:
    with map_output_by_sex:
      display(widgets.HTML("<b>Rendering map... please wait ⏳</b>"))

  # Collect selected filters for title
  filters_applied = [
    tru_filter_by_sex.value, age_filter_by_sex.value, caste_filter_by_sex.value
  ]

  # Filter state_level_data based on selected filters; the "Total" sex rows
  # are dropped because the facts compare Males with Females
  df = state_level_data[
    (state_level_data["Total/Rural/Urban"] == tru_filter_by_sex.value) &
    (state_level_data["Age-Group"] == age_filter_by_sex.value) &
    (state_level_data["Caste"] == caste_filter_by_sex.value) &
    (state_level_data["Sex"] != "Total")
  ]
  # Both sexes must be present to form a ratio
  if not {"Males", "Females"}.issubset(df["Sex"].unique()):
    # Report inside the widget so the message replaces the progress note
    with map_output_by_sex:
      clear_output(wait=True)
      print("No data for selected filters.")
    return

  # Work out which group the selected fact refers to
  selected_fact = fact_dropdown_by_sex.value
  ratio_core_group = re.match(r"Sex Ratio (?:in|amongst) (.+)", selected_fact)
  disp_core_group = re.match(r"Sex Disparity amongst (.+)", selected_fact)
  fact_match = ratio_core_group or disp_core_group
  if fact_match is None:
    raise ValueError(f"Unsupported fact: {selected_fact!r}")
  group = fact_match.group(1)

  # Columns that make up the group; "Workers (Main and Marginal)" is not a
  # raw column and is assembled from its three components
  if group == "Workers (Main and Marginal)":
    components = [
      "Main Workers",
      "Marginal workers - Worked for 3 to 6 months",
      "Marginal workers - Worked for less than 3 months",
    ]
  else:
    components = [group]
  # Disparity facts additionally need the population as the baseline
  metrics = components if ratio_core_group else ["Population", *components]

  # Pivot facts by Sex, flattening the columns to "<metric>: <sex>"
  index_cols = ["State Code", "State Name"]
  pivoted = df[index_cols + ["Sex"] + metrics].pivot(
      index=index_cols, columns="Sex", values=metrics).fillna(0)
  pivoted.columns = [f"{col}: {sex}" for col, sex in pivoted.columns]
  pivoted = pivoted.reset_index()

  def _sex_ratio(columns):
    # Males total divided by Females total over the given metric columns
    males = sum(pivoted[f"{col}: Males"] for col in columns)
    females = sum(pivoted[f"{col}: Females"] for col in columns)
    return males / females

  # Add in new fact(s)
  ratio_name = ("Sex Ratio in Population" if group == "Population"
                else f"Sex Ratio amongst {group}")
  pivoted[ratio_name] = _sex_ratio(components)
  if disp_core_group:
    pivoted["Sex Ratio in Population"] = _sex_ratio(["Population"])
    pivoted[f"Sex Disparity amongst {group}"] = \
      pivoted[ratio_name] - pivoted["Sex Ratio in Population"]

  # Retain only State Code, State Name and Selected Fact
  data = pivoted[["State Code", selected_fact, "State Name"]]

  # Merge with Maps dataset (every district polygon gets its state's value)
  gdf_merged = india_districts_gdf.merge(
    data, left_on="ST_CEN_CD", right_on="State Code", how="left"
  )

  # Ensure fact values and state name are in GeoJSON properties
  gdf_merged["tooltip_text"] = (
      gdf_merged["State Name"] + "<br>" +
      f"{selected_fact}: " + gdf_merged[selected_fact].round(3).astype(str)
  )

  # Export CSV if requested; no map is built in that case
  if export_data:
    gdf_merged[["ST_NM", selected_fact]].drop_duplicates().to_csv(
        "data_export.csv", index=False)
    with save_messages_output_by_sex:
      display(widgets.HTML(
          "<b>Data Exported to <code>data_export.csv</code> ✅</b>"))
    return

  # Create empty map
  folium_map = folium.Map(
    tiles="cartodbpositron",
    width="600px",
    height="650px"
  )

  # Add choropleth
  Choropleth(
    geo_data=gdf_merged.to_json(),
    name="choropleth",
    data=gdf_merged,
    columns=["ST_CEN_CD", selected_fact],
    key_on="feature.properties.ST_CEN_CD",
    fill_color="YlGnBu",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name=selected_fact
  ).add_to(folium_map)

  # Add tooltip via a transparent layer on top of the choropleth
  folium.GeoJson(
    gdf_merged,
    name="State Labels",
    style_function=lambda x: {"fillOpacity": 0, "color": "transparent"},
    tooltip=folium.GeoJsonTooltip(
      fields=["tooltip_text"],
      aliases=[""],
      labels=False,
      sticky=True
    )
  ).add_to(folium_map)

  # Auto-fit to GeoDataFrame bounds; total_bounds is (minx, miny, maxx, maxy)
  # while fit_bounds expects [lat, lon] corner pairs
  bounds = gdf_merged.geometry.total_bounds
  folium_map.fit_bounds([[bounds[1], bounds[0]], [bounds[3], bounds[2]]])

  # Setup title: the file name normalises separators, the on-screen title
  # joins the same parts with " | "
  parts = [selected_fact, "by State", *filters_applied]
  title = "_".join(
      p.replace(" ", "_").replace("/", "_").replace("-", "_").replace(":", "")
      for p in parts)
  html_title = " | ".join(parts)

  if save_map:
    # Save map to HTML file
    os.makedirs(map_output_folder, exist_ok=True)  # ensure the folder exists
    folium_map.save(f"{map_output_folder}/{title}.html")
    map_output_by_sex.clear_output()  # the progress note is no longer accurate
    with save_messages_output_by_sex:
      display(widgets.HTML(
        f"<b>Map saved as: <code>{title}.html</code> ✅</b>"))
  else:  # Display map within Colab
    with map_output_by_sex:
      clear_output(wait=True)
      display(HTML(f"<h3>{html_title}</h3>"))  # First display the title
      display(folium_map)  # Then, display the folium map

In [None]:
# Connect the buttons to the update function. on_click passes the clicked
# button as the first argument, so the lambdas forward it and only override
# the mode flags (the save button relies on the default save_map=True).
show_map_button_save_by_sex.on_click(update_state_level_ratios_map)
show_map_button_display_by_sex.on_click(
    lambda b: update_state_level_ratios_map(b, save_map=False))
export_data_button_by_sex.on_click(
    lambda b: update_state_level_ratios_map(b, export_data=True))
# Display UI, followed by the map area and the save/export message area
display(ui_by_sex)
display(map_output_by_sex)
display(save_messages_output_by_sex)

VBox(children=(Dropdown(description='Area Type', index=1, options=('Rural', 'Total', 'Urban'), value='Total'),…

Output()

Output()

## Analysis
Looking at the [Sex Ratio map](https://drive.google.com/file/d/1XwdFfvuCFuDr7qIkEGQ4mI3-d2zV_I1a/view?usp=sharing) for the age group 15-59 years (across castes and area types), there appears to be very little variation in the metric across the country. This prompted us to review the distribution of the metric across states.

In [None]:
# Male and female population per state for the filters currently selected
# in the by-sex dropdowns
df = state_level_data.loc[
    state_level_data["Total/Rural/Urban"].eq(tru_filter_by_sex.value)
    & state_level_data["Age-Group"].eq(age_filter_by_sex.value)
    & state_level_data["Caste"].eq(caste_filter_by_sex.value)
    & state_level_data["Sex"].ne("Total"),
    ["State Name", "Sex", "Population"],
].reset_index(drop=True)

In [None]:
# One row per state with the male and female population side by side
pivot_df = (
    df.pivot(index='State Name', columns='Sex', values='Population')
    .reset_index()
    .rename_axis(None, axis="columns")
    .rename(columns={'Males': 'Male_Pop', 'Females': 'Female_Pop'})
)
pivot_df['Sex Ratio'] = pivot_df['Male_Pop'].div(pivot_df['Female_Pop'])
pivot_df['Total_Pop'] = pivot_df['Male_Pop'].add(pivot_df['Female_Pop'])

# States ordered by sex ratio, with each state's share of the combined
# population and the running total of that share
sorted_df = pivot_df.sort_values(by='Sex Ratio').reset_index(drop=True)
sorted_df['Pop %'] = sorted_df['Total_Pop'].div(
    sorted_df['Total_Pop'].sum()).mul(100)
sorted_df['Cumulative Pop %'] = sorted_df['Pop %'].cumsum()

In [None]:
# Histogram + KDE of the per-state sex ratio (one group, rug plot disabled)
dist_fig = ff.create_distplot(
    hist_data=[pivot_df['Sex Ratio']],
    group_labels=['Sex Ratio'],
    bin_size=0.05,
    curve_type='kde',
    show_rug=False,
)

In [None]:
# Single-panel figure with a secondary y-axis for the cumulative line
pareto_fig = make_subplots(
    rows=1,
    cols=1,
    specs=[[{"secondary_y": True}]],
    subplot_titles=["Sex Ratio vs Cumulative Population (Pareto Chart)"],
)

# Bars (primary axis): sex ratio per state; hover also shows total population
sex_ratio_bars = go.Bar(
    name='Sex Ratio',
    x=sorted_df['State Name'],
    y=sorted_df['Sex Ratio'],
    customdata=sorted_df[['Total_Pop']],
    hovertemplate='<b>%{x}</b><br>Sex Ratio: %{y:.2f}<br>Total Pop: %{customdata[0]:,}',
    marker_color='steelblue',
)
pareto_fig.add_trace(sex_ratio_bars, secondary_y=False)

# Line (secondary axis): running share of population in the same state order
cumulative_line = go.Scatter(
    name='Cumulative Population %',
    x=sorted_df['State Name'],
    y=sorted_df['Cumulative Pop %'],
    mode='lines+markers',
    marker={'size': 6},
    line={'color': 'firebrick', 'width': 3, 'dash': 'dot'},
    hovertemplate='Cumulative Pop %: %{y:.1f}%',
)
pareto_fig.add_trace(cumulative_line, secondary_y=True)

_ = pareto_fig  # end on an assignment so the cell does not auto-display

In [None]:
# --- Distribution plot styling ---
dist_fig.update_layout(
    height=400,
    title={
        'text': 'Distribution of Sex Ratio Across States',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 16},
    },
    legend={'x': 0.01, 'y': 0.99, 'bgcolor': 'white', 'borderwidth': 1},
    margin={'t': 50, 'l': 50, 'r': 50, 'b': 50},
    yaxis_title='Density',
)

# The first two traces are restyled as the histogram and the KDE curve
histogram_trace, kde_trace = dist_fig.data[0], dist_fig.data[1]
histogram_trace.marker.color = 'lightblue'
histogram_trace.name = 'Normalised Histogram'
kde_trace.name = 'KDE'
kde_trace.line.color = 'royalblue'
kde_trace.showlegend = True
dist_fig.update_xaxes(range=[0.85, 2])

# --- Pareto chart styling ---
pareto_fig.update_layout(
    height=700,
    bargap=0.2,
    hovermode='x unified',
    legend={'x': 0.01, 'y': 0.99, 'bgcolor': 'white', 'borderwidth': 1},
    margin={'t': 80, 'l': 80, 'r': 80, 'b': 80},
)

pareto_fig.update_yaxes(
    title_text='Sex Ratio', range=[0.9, 2], secondary_y=False)
pareto_fig.update_yaxes(
    title_text='Cumulative Population %', range=[0, 101], secondary_y=True)
pareto_fig.update_xaxes(
    title_text='States (Sorted by Sex Ratio)', tickangle=45)

# Render both figures
dist_fig.show()
pareto_fig.show()


What this demonstrates is that two states (union territories, rather), `Daman & Diu` and `Dadra & Nagar Haveli`, which together account for less than 0.1% of India's population, are heavily skewing the Sex Ratio distribution!

Below we remove these outliers and replot:

In [None]:
# Drop the two outlier union territories from the state-level table
outlier_states = ["DAMAN & DIU", "DADRA & NAGAR HAVELI"]
state_level_data = state_level_data[
  ~state_level_data["State Name"].isin(outlier_states)
].reset_index(drop=True)

In [None]:
# Same selection as before, re-run on the table with the outliers removed:
# chosen area type, age group and caste, sex-specific rows only
selection_mask = (
    (state_level_data["Total/Rural/Urban"] == tru_filter_by_sex.value)
    & (state_level_data["Age-Group"] == age_filter_by_sex.value)
    & (state_level_data["Caste"] == caste_filter_by_sex.value)
    & (state_level_data["Sex"] != "Total")
)
df = state_level_data.loc[
    selection_mask, ["State Name", "Sex", "Population"]
].reset_index(drop=True)

In [None]:
# One row per state, with the male and female populations side by side
pivot_df = df.pivot(index='State Name', columns='Sex', values='Population')
pivot_df = pivot_df.reset_index()
pivot_df.columns.name = None  # drop the leftover 'Sex' label on the columns
pivot_df = pivot_df.rename(
    columns={'Males': 'Male_Pop', 'Females': 'Female_Pop'})

# Sex ratio is males per female; total population is the sum of both sexes
pivot_df['Sex Ratio'] = pivot_df['Male_Pop'].div(pivot_df['Female_Pop'])
pivot_df['Total_Pop'] = pivot_df['Male_Pop'].add(pivot_df['Female_Pop'])

# Order states by sex ratio and accumulate their share of the population
sorted_df = pivot_df.sort_values(by='Sex Ratio').reset_index(drop=True)
population_total = sorted_df['Total_Pop'].sum()
sorted_df['Pop %'] = sorted_df['Total_Pop'] / population_total * 100
sorted_df['Cumulative Pop %'] = sorted_df['Pop %'].cumsum()

In [None]:
# Histogram + KDE of the per-state sex ratio (one group, rug plot disabled)
dist_fig = ff.create_distplot(
    hist_data=[pivot_df['Sex Ratio']],
    group_labels=['Sex Ratio'],
    bin_size=0.05,
    curve_type='kde',
    show_rug=False,
)

In [None]:
# Single-panel figure with a secondary y-axis for the cumulative line
pareto_fig = make_subplots(
    rows=1,
    cols=1,
    specs=[[{"secondary_y": True}]],
    subplot_titles=["Sex Ratio vs Cumulative Population (Pareto Chart)"],
)

# Bars (primary axis): sex ratio per state; hover also shows total population
sex_ratio_bars = go.Bar(
    name='Sex Ratio',
    x=sorted_df['State Name'],
    y=sorted_df['Sex Ratio'],
    customdata=sorted_df[['Total_Pop']],
    hovertemplate='<b>%{x}</b><br>Sex Ratio: %{y:.2f}<br>Total Pop: %{customdata[0]:,}',
    marker_color='steelblue',
)
pareto_fig.add_trace(sex_ratio_bars, secondary_y=False)

# Line (secondary axis): running share of population in the same state order
cumulative_line = go.Scatter(
    name='Cumulative Population %',
    x=sorted_df['State Name'],
    y=sorted_df['Cumulative Pop %'],
    mode='lines+markers',
    marker={'size': 6},
    line={'color': 'firebrick', 'width': 3, 'dash': 'dot'},
    hovertemplate='Cumulative Pop %: %{y:.1f}%',
)
pareto_fig.add_trace(cumulative_line, secondary_y=True)

_ = pareto_fig  # end on an assignment so the cell does not auto-display

In [None]:
# --- Distribution plot styling ---
dist_fig.update_layout(
    height=400,
    title={
        'text': 'Distribution of Sex Ratio Across States',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 16},
    },
    legend={'x': 0.01, 'y': 0.99, 'bgcolor': 'white', 'borderwidth': 1},
    margin={'t': 50, 'l': 50, 'r': 50, 'b': 50},
    yaxis_title='Density',
)

# The first two traces are restyled as the histogram and the KDE curve
histogram_trace, kde_trace = dist_fig.data[0], dist_fig.data[1]
histogram_trace.marker.color = 'lightblue'
histogram_trace.name = 'Normalised Histogram'
kde_trace.name = 'KDE'
kde_trace.line.color = 'royalblue'
kde_trace.showlegend = True
dist_fig.update_xaxes(range=[0.9, 1.25])  # narrower window than the first pass

# --- Pareto chart styling ---
pareto_fig.update_layout(
    height=700,
    bargap=0.2,
    hovermode='x unified',
    legend={'x': 0.01, 'y': 0.99, 'bgcolor': 'white', 'borderwidth': 1},
    margin={'t': 80, 'l': 80, 'r': 80, 'b': 80},
)

pareto_fig.update_yaxes(
    title_text='Sex Ratio', range=[0.89, 1.26], secondary_y=False)
pareto_fig.update_yaxes(
    title_text='Cumulative Population %', range=[0, 101], secondary_y=True)
pareto_fig.update_xaxes(
    title_text='States (Sorted by Sex Ratio)', tickangle=45)

# Render both figures
dist_fig.show()
pareto_fig.show()

In [None]:
# Folder that the map-saving callbacks write their HTML files into
map_output_folder = (
    "/content/drive/MyDrive/Socratus/Gender/Data Exploration/Stage I/"
    "B-01_Saved_Maps_Final"
)

## Part 2.3: Intersectional Charts

In [None]:
# Intersectional slice of the state-level table: 15-59 age group only, with
# every "Total" aggregate (area type, caste, sex) excluded so that only the
# individual Rural/Urban x caste x sex rows remain.
# .copy() makes this an independent frame: new columns are assigned to it
# further down, and assigning into a .loc slice triggers pandas'
# SettingWithCopyWarning.
state_level_is_data = state_level_data.loc[
    (state_level_data["Total/Rural/Urban"] != "Total") &
    (state_level_data["Age-Group"] == "15 to 59") &
    (state_level_data["Caste"] != "Total") &
    (state_level_data["Sex"] != "Total"),
    ["Total/Rural/Urban", "Caste", "Sex", "Main Workers",
     "Marginal workers - Worked for 3 to 6 months",
     "Marginal workers - Worked for less than 3 months", "Population"]].copy()
state_level_is_data

Unnamed: 0,Total/Rural/Urban,Caste,Sex,Main Workers,Marginal workers - Worked for 3 to 6 months,Marginal workers - Worked for less than 3 months,Population
12,Rural,General,Females,142451.0,294176.0,157796.0,1890531.0
13,Rural,SC,Females,18333.0,29205.0,9917.0,218136.0
14,Rural,ST,Females,35378.0,75777.0,36779.0,351267.0
16,Rural,General,Males,1033400.0,358462.0,109348.0,2061294.0
17,Rural,SC,Males,126134.0,40277.0,10966.0,234755.0
...,...,...,...,...,...,...,...
7849,Urban,SC,Females,0.0,0.0,0.0,0.0
7850,Urban,ST,Females,121.0,2.0,1.0,667.0
7852,Urban,General,Males,41355.0,2355.0,407.0,54451.0
7853,Urban,SC,Males,0.0,0.0,0.0,0.0


In [None]:
# Collapse SC and ST into a single 'sc-st' group; anything else is 'general'
sc_st_labels = ['sc', 'st']
state_level_is_data['Caste Group'] = state_level_is_data['Caste'].apply(
    lambda caste: 'sc-st' if caste.lower() in sc_st_labels else 'general')

# Total workers = main workers + both marginal-worker categories
main_workers = state_level_is_data['Main Workers']
marginal_3_to_6 = state_level_is_data[
    'Marginal workers - Worked for 3 to 6 months']
marginal_under_3 = state_level_is_data[
    'Marginal workers - Worked for less than 3 months']
state_level_is_data['Total Workers'] = (
    main_workers + marginal_3_to_6 + marginal_under_3)

# Sum population and workers for every area type x caste group x sex cell
group_keys = ['Total/Rural/Urban', 'Caste Group', 'Sex']
state_level_is_data_grouped = (
    state_level_is_data
    .groupby(group_keys, as_index=False)[['Population', 'Total Workers']]
    .sum()
)

# Spread Sex across the columns so male and female values sit side by side
state_level_is_data_pivoted = state_level_is_data_grouped.pivot_table(
    values=['Population', 'Total Workers'],
    index=['Total/Rural/Urban', 'Caste Group'],
    columns='Sex',
).reset_index()

# Join the two column levels into lowercase names such as 'population_males'
# (index columns have an empty second level, hence their trailing underscore)
state_level_is_data_pivoted.columns = [
    '_'.join(levels).lower()
    for levels in state_level_is_data_pivoted.columns
]
state_level_is_data_pivoted

Unnamed: 0,total/rural/urban_,caste group_,population_females,population_males,total workers_females,total workers_males
0,Rural,general,166579466.0,175676770.0,67599532.0,138462144.0
1,Rural,sc-st,69396672.0,72140187.0,39588165.0,59293575.0
2,Urban,general,100268000.0,108282342.0,20339155.0,82066550.0
3,Urban,sc-st,18202461.0,19127944.0,5030065.0,14306500.0


In [None]:
# Male-to-female ratios in the population and amongst workers, plus their gap.
# `ratios` is only a shorter alias for the same DataFrame object.
ratios = state_level_is_data_pivoted
ratios['sex_ratio_population'] = (
    ratios['population_males'] / ratios['population_females'])
ratios['sex_ratio_workers'] = (
    ratios['total workers_males'] / ratios['total workers_females'])
# Disparity: how far the worker ratio sits above the population ratio
ratios['sex_disparity'] = (
    ratios['sex_ratio_workers'] - ratios['sex_ratio_population'])

state_level_is_data_pivoted

Unnamed: 0,total/rural/urban_,caste group_,population_females,population_males,total workers_females,total workers_males,sex_ratio_population,sex_ratio_workers,sex_disparity
0,Rural,general,166579466.0,175676770.0,67599532.0,138462144.0,1.054612,2.048271,0.993658
1,Rural,sc-st,69396672.0,72140187.0,39588165.0,59293575.0,1.039534,1.49776,0.458226
2,Urban,general,100268000.0,108282342.0,20339155.0,82066550.0,1.079929,4.034905,2.954975
3,Urban,sc-st,18202461.0,19127944.0,5030065.0,14306500.0,1.050844,2.844198,1.793354


In [None]:
# 2x2 grid of independent panels: rows are caste groups (General, SC-ST),
# columns are area types (Urban, Rural)
fig = make_subplots(
    rows=2,
    cols=2,
    vertical_spacing=0.125,
    subplot_titles=[
        "General – Urban", "General – Rural",
        "SC-ST – Urban", "SC-ST – Rural"
    ],
)

# (caste group, area type) -> (row, col) of the panel it is drawn in
positions = {
    ('general', 'Urban'): (1, 1),
    ('general', 'Rural'): (1, 2),
    ('sc-st', 'Urban'): (2, 1),
    ('sc-st', 'Rural'): (2, 2)
}

# The three steps shown in every waterfall, left to right
waterfall_steps = [
    "M/F Sex Ratio<br>in Population",
    "M/F Disparity",
    "M/F Sex Ratio<br>amongst Workers"
]

# One waterfall per panel: population ratio + disparity = worker ratio
for (caste, area), (row, col) in positions.items():
    in_panel = (
        (state_level_is_data_pivoted['total/rural/urban_'] == area)
        & (state_level_is_data_pivoted['caste group_'] == caste)
    )
    subset = state_level_is_data_pivoted[in_panel]

    # Skip panels for which the table has no matching row
    if subset.empty:
        continue

    val = subset.iloc[0]
    base, diff, final = (
        round(val[key], 2)
        for key in ("sex_ratio_population", "sex_disparity",
                    "sex_ratio_workers")
    )

    panel_trace = go.Waterfall(
        x=waterfall_steps,
        y=[base, diff, final],
        measure=["absolute", "relative", "total"],
        text=[f"{base:.2f}", f"{diff:+.2f}", f"{final:.2f}"],
        textposition="inside",
        textfont={"color": "black", "size": 12},
        connector={"line": {"color": "gray"}},
    )
    fig.add_trace(panel_trace, row=row, col=col)

# Left-aligned title with a smaller subtitle line underneath
figure_title = (
    "<b>National Male to Female Sex Ratios and Disparities by Caste and Rurality, 15–59 yrs, 2011</b><br>"
    "<sup>Workers include both Main and Marginal categories; all figures are at the National level</sup>"
)
fig.update_layout(
    title=dict(text=figure_title, x=0.05, xanchor="left"),
    title_font_size=20,
    title_font_family="Arial",
    showlegend=False,
    height=800,
    width=1000,
    margin={"t": 120},  # extra headroom for the two-line title
)

# Same y-range in every panel so the bars are directly comparable
fig.update_yaxes(range=[0, 4.25])

# Keep the x tick labels horizontal
fig.update_xaxes(tickangle=0, tickfont={"size": 11}, automargin=True)
fig.show()

# Part 3: District Level Data Exploration within Tamil Nadu

In [None]:
# Restrict the all-India district geometries to Tamil Nadu, keeping only the
# district name, the district code and the shape
is_tamil_nadu = india_districts_gdf["ST_NM"] == "Tamil Nadu"
tn_districts_gdf = india_districts_gdf.loc[
    is_tamil_nadu, ["DISTRICT", "DT_CEN_CD", "geometry"]
]
tn_districts_gdf

Unnamed: 0,DISTRICT,DT_CEN_CD,geometry
25,Ariyalur,15,"POLYGON ((79.32232 11.41967, 79.32473 11.41745..."
110,Chennai,2,"POLYGON ((80.29769 13.12548, 80.29798 13.11914..."
122,Coimbatore,31,"POLYGON ((77.11075 11.38995, 77.11018 11.38097..."
123,Cuddalore,16,"MULTIPOLYGON (((79.69546 11.88492, 79.69546 11..."
147,Dharmapuri,29,"POLYGON ((78.63535 12.17519, 78.64941 12.16151..."
156,Dindigul,11,"POLYGON ((78.19782 10.54686, 78.198 10.54667, ..."
174,Erode,9,"POLYGON ((77.4998 11.11443, 77.4998 11.12131, ..."
271,Kancheepuram,3,"POLYGON ((80.09392 13.13565, 80.10326 13.1331,..."
276,Kanniyakumari,28,"POLYGON ((77.5911 8.1418, 77.58924 8.14134, 77..."
287,Karur,12,"POLYGON ((78.16847 11.01091, 78.16847 11.01091..."


In [None]:
# Write the Tamil Nadu district geometries next to the notebook as GeoJSON
tn_geojson_path = "tn_districts_map.geojson"
tn_districts_gdf.to_file(tn_geojson_path, driver="GeoJSON")

## Part 3.1: Explore the general (un)employment picture

In [None]:
# Tamil Nadu rows with a non-zero district code
# (NOTE(review): district code 0 presumably marks the state-level row — confirm)
is_tn_district_row = (
  (merged_df_long["State Name"] == "TAMIL NADU")
  & (merged_df_long["District Code"] != 0)
)
# The state identifiers and the national district code are dropped here
tn_districts_data = merged_df_long.loc[is_tn_district_row].drop(
  columns=["State Code", "State Name", "District Code"])
tn_districts_data

Unnamed: 0,State-Level District Code,Area Name,Total/Rural/Urban,Age-Group,Sex,Caste,Main Workers,Marginal workers - Seeking/available for work,Marginal workers - Worked for 3 to 6 months,Marginal workers - Worked for less than 3 months,Non-workers - All,Non-workers - Seeking/available for work,Population
159372,1,District - Thiruvallur (01),Rural,15 to 34,Females,General,30963.0,6863.0,15205.0,2997.0,95064.0,18842.0,144229.0
159373,1,District - Thiruvallur (01),Rural,15 to 34,Females,SC,21596.0,5263.0,11501.0,2340.0,50971.0,12099.0,86408.0
159374,1,District - Thiruvallur (01),Rural,15 to 34,Females,ST,2362.0,571.0,993.0,328.0,2703.0,592.0,6386.0
159375,1,District - Thiruvallur (01),Rural,15 to 34,Females,Total,54921.0,12697.0,27699.0,5665.0,148738.0,31533.0,237023.0
159376,1,District - Thiruvallur (01),Rural,15 to 34,Males,General,81757.0,9892.0,16800.0,2644.0,47082.0,14497.0,148283.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
167431,32,District - Tiruppur (32),Urban,Total,Males,Total,471309.0,5450.0,22355.0,3373.0,269813.0,11289.0,766850.0
167432,32,District - Tiruppur (32),Urban,Total,Total,General,593965.0,8420.0,34278.0,5625.0,726452.0,32857.0,1360320.0
167433,32,District - Tiruppur (32),Urban,Total,Total,SC,73244.0,2014.0,7838.0,1176.0,75725.0,4857.0,157983.0
167434,32,District - Tiruppur (32),Urban,Total,Total,ST,1333.0,8.0,110.0,10.0,1355.0,30.0,2808.0


In [None]:
# --- Filter dropdowns (options come from the values in state_level_data) ---
tru_filter_tn = widgets.Dropdown(
  description="Area Type",
  options=state_level_data["Total/Rural/Urban"].unique(),
  value="Total",
)

age_filter_tn = widgets.Dropdown(
  description="Age",
  options=state_level_data["Age-Group"].unique(),
  value="15 to 59",
)

sex_filter_tn = widgets.Dropdown(
  description="Sex",
  options=state_level_data["Sex"].unique(),
  value="Total",
)

caste_filter_tn = widgets.Dropdown(
  description="Caste",
  options=state_level_data["Caste"].unique(),
  value="Total",
)

# --- Fact to plot: raw census columns plus a few derived measures ---
fact_options_tn = [
  'Population',
  'Workers (Main and Marginal)',
  'Main Workers',
  'Marginal workers - Seeking/available for work',
  'Marginal workers - Worked for 3 to 6 months',
  'Marginal workers - Worked for less than 3 months',
  'Non-workers - All',
  'Non-workers - Seeking/available for work',
  'Proportion_Workers',
  'Proportion_Main_Workers',
  'Proportion_Marginal_Workers'
]
fact_dropdown_tn = widgets.Dropdown(
  description="Fact",
  options=fact_options_tn,
  value='Population',
)

# --- Action buttons ---
show_map_button_display_tn = widgets.Button(description="Show Map")
show_map_button_save_tn = widgets.Button(description="Create and Save Map")
export_data_button_tn = widgets.Button(description="Export Data")

# --- Output areas: one for status messages, one for the map itself ---
save_messages_output_tn = widgets.Output()
map_output_tn = widgets.Output()

# --- Control panel: filters first, then the fact selector, then the buttons ---
ui_tn = widgets.VBox([
    tru_filter_tn,
    age_filter_tn,
    sex_filter_tn,
    caste_filter_tn,
    fact_dropdown_tn,
    show_map_button_save_tn,
    show_map_button_display_tn,
    export_data_button_tn,
])

In [None]:
def update_tn_map(button=None, save_map=True, export_data=False):
  """Build the Tamil Nadu district choropleth for the selected filters.

  Reads the current values of the ``*_tn`` dropdowns, filters
  ``tn_districts_data``, derives the selected fact when it is not a raw
  column, merges it onto ``tn_districts_gdf`` and then either exports the
  values to CSV, saves the map as HTML, or displays it inline.

  Args:
    button: The clicked widget, as passed by ``on_click``; not used.
    save_map: When True the map is written into ``map_output_folder``; when
      False it is displayed inside ``map_output_tn``.
    export_data: When True no map is built; the district values are written
      to ``data_export.csv`` instead.
  """
  # Reset the map area (the status messages are only reset when showing the
  # map inline). The clear targets the map's own output widget.
  map_output_tn.clear_output(wait=True)
  if not save_map:
    save_messages_output_tn.clear_output(wait=True)

  # Show progress while rendering
  if not export_data:
    with map_output_tn:
      display(widgets.HTML("<b>Rendering map... please wait ⏳</b>"))

  # Filter district level data based on selected filters
  filters_applied = [tru_filter_tn.value, age_filter_tn.value,
                     sex_filter_tn.value, caste_filter_tn.value]
  df = tn_districts_data[
      (tn_districts_data["Total/Rural/Urban"] == tru_filter_tn.value) &
      (tn_districts_data["Age-Group"] == age_filter_tn.value) &
      (tn_districts_data["Sex"] == sex_filter_tn.value) &
      (tn_districts_data["Caste"] == caste_filter_tn.value)
  ].copy()
  if df.empty:
    # Report inside the map area so the progress placeholder is replaced
    with map_output_tn:
      clear_output(wait=True)
      print("No data for selected filters.")
    return

  # Add in new fact(s) - if required
  selected_fact = fact_dropdown_tn.value
  if selected_fact in ("Workers (Main and Marginal)", "Proportion_Workers"):
    # Workers are everyone in the population who is not a non-worker
    df["Workers (Main and Marginal)"] = \
      df["Population"] - df["Non-workers - All"]
  if selected_fact == "Proportion_Workers":
    df["Proportion_Workers"] = \
      df["Workers (Main and Marginal)"] / df["Population"]
  elif selected_fact == "Proportion_Main_Workers":
    df["Proportion_Main_Workers"] = df["Main Workers"] / df["Population"]
  elif selected_fact == "Proportion_Marginal_Workers":
    df["Proportion_Marginal_Workers"] = (
        df["Marginal workers - Worked for 3 to 6 months"] +
        df["Marginal workers - Worked for less than 3 months"]
    ) / df["Population"]

  # Retain only District Code, Area Name and Selected Fact
  data = df[["State-Level District Code", selected_fact, "Area Name"]]

  # Merge with Maps dataset
  gdf_merged = tn_districts_gdf.merge(
    data, left_on="DT_CEN_CD", right_on="State-Level District Code", how="inner"
  )

  # Ensure fact values and district name are in GeoJSON properties
  gdf_merged["tooltip_text"] = (
      gdf_merged["Area Name"] + "<br>" +
      f"{selected_fact}: " + gdf_merged[selected_fact].round(3).astype(str)
  )

  # Export CSV if requested
  if export_data:
    gdf_merged[["DISTRICT", selected_fact]].drop_duplicates().to_csv(
        "data_export.csv", index=False)
    with save_messages_output_tn:
      display(widgets.HTML(
          "<b>Data Exported to <code>data_export.csv</code> ✅</b>"))
    return

  # Create empty map
  folium_map = folium.Map(
      tiles="cartodbpositron",
      width="600px",
      height="650px",
      embed=True)

  # Add choropleth
  Choropleth(
    geo_data=gdf_merged.to_json(),
    name="choropleth",
    data=gdf_merged,
    columns=["DT_CEN_CD", selected_fact],
    key_on="feature.properties.DT_CEN_CD",
    fill_color="YlGnBu",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name=selected_fact
  ).add_to(folium_map)

  # Add tooltip (invisible overlay that only carries the hover text)
  folium.GeoJson(
    gdf_merged,
    name="District Labels",
    style_function=lambda x: {"fillOpacity": 0, "color": "transparent"},
    tooltip=folium.GeoJsonTooltip(
      fields=["tooltip_text"],
      aliases=[""],
      labels=False,
      sticky=True
    )
  ).add_to(folium_map)

  # Auto-fit to GeoDataFrame bounds; total_bounds is (minx, miny, maxx, maxy)
  # while fit_bounds takes [[south, west], [north, east]]
  bounds = gdf_merged.geometry.total_bounds
  folium_map.fit_bounds([[bounds[1], bounds[0]], [bounds[3], bounds[2]]])

  # File name: spaces, slashes and hyphens become underscores, colons go
  parts = [selected_fact, "by District", *filters_applied]
  filename_table = str.maketrans({" ": "_", "/": "_", "-": "_", ":": None})
  title = "_".join(p.translate(filename_table) for p in parts)

  # Save or Display Map
  if save_map:
    # Save map to HTML file
    output_dir = map_output_folder
    os.makedirs(output_dir, exist_ok=True)  # ensure the folder exists
    folium_map.save(f"{output_dir}/{title}.html")
    # Nothing is shown inline in this mode, so drop the progress placeholder
    map_output_tn.clear_output()
    with save_messages_output_tn:
      display(widgets.HTML(f"<b>Map saved as: <code>{title}.html</code> ✅</b>"))
  else:  # Display map within Colab
    # Build the heading once; it is already a complete <h3> element
    html_title = (
        f"<h3 style='font-family:Arial'>{selected_fact} by District<br>"
        + " | ".join(filters_applied) + "</h3>"
    )
    with map_output_tn:
      clear_output(wait=True)
      display(HTML(html_title))  # Display the title
      display(folium_map)  # display map

In [None]:
# Hook each button up to the shared callback, varying only the mode flags
show_map_button_save_tn.on_click(update_tn_map)  # relies on save_map=True default
show_map_button_display_tn.on_click(
    lambda clicked: update_tn_map(clicked, save_map=False))
export_data_button_tn.on_click(
    lambda clicked: update_tn_map(clicked, export_data=True))

# Render the controls first, then the map area, then the status messages
for _panel in (ui_tn, map_output_tn, save_messages_output_tn):
    display(_panel)

VBox(children=(Dropdown(description='Area Type', index=1, options=('Rural', 'Total', 'Urban'), value='Total'),…

Output()

Output()

## Part 3.2: Explore the sex-nuance within the general (un)employment picture

In [None]:
# --- Filter dropdowns (options come from the values in merged_df_long) ---
tru_filter_by_sex_tn = widgets.Dropdown(
  description="Area Type",
  options=merged_df_long["Total/Rural/Urban"].unique(),
  value="Total",
)

age_filter_by_sex_tn = widgets.Dropdown(
  description="Age",
  options=merged_df_long["Age-Group"].unique(),
  value="15 to 59",
)

caste_filter_by_sex_tn = widgets.Dropdown(
  description="Caste",
  options=merged_df_long["Caste"].unique(),
  value="Total",
)

# --- Fact to plot: a ratio and a disparity for each worker category ---
worker_groups_by_sex_tn = [
  'Workers (Main and Marginal)',
  'Main Workers',
  'Marginal workers - Seeking/available for work',
  'Marginal workers - Worked for 3 to 6 months',
  'Marginal workers - Worked for less than 3 months',
  'Non-workers - All',
  'Non-workers - Seeking/available for work',
]
fact_dropdown_by_sex_tn = widgets.Dropdown(
  description="Fact",
  options=[
    'Sex Ratio in Population',
    *[f'Sex Ratio amongst {grp}' for grp in worker_groups_by_sex_tn],
    *[f'Sex Disparity amongst {grp}' for grp in worker_groups_by_sex_tn],
  ],
  value='Sex Ratio in Population',
)

# --- Action buttons ---
show_map_button_display_by_sex_tn = widgets.Button(description="Show Map")
show_map_button_save_by_sex_tn = widgets.Button(
  description="Create and Save Map")
export_data_button_by_sex_tn = widgets.Button(description="Export Data")

# --- Output areas: one for status messages, one for the map itself ---
save_messages_output_by_sex_tn = widgets.Output()
map_output_by_sex_tn = widgets.Output()

# --- Control panel: filters first, then the fact selector, then the buttons ---
ui_by_sex_tn = widgets.VBox([
    tru_filter_by_sex_tn,
    age_filter_by_sex_tn,
    caste_filter_by_sex_tn,
    fact_dropdown_by_sex_tn,
    show_map_button_display_by_sex_tn,
    show_map_button_save_by_sex_tn,
    export_data_button_by_sex_tn,
])

In [None]:
def update_tn_ratios_map(button=None, save_map=True, export_data=False):
  """Build the Tamil Nadu district map of a male-to-female ratio or disparity.

  Reads the ``*_by_sex_tn`` dropdowns, filters ``tn_districts_data`` to the
  sex-specific rows, pivots the needed measures by sex and computes the
  selected fact. A "Sex Ratio" fact is males divided by females within the
  chosen group; a "Sex Disparity" fact is that ratio minus the sex ratio in
  the population. The result is merged onto ``tn_districts_gdf`` and then
  exported to CSV, saved as an HTML map, or displayed inline.

  Args:
    button: The clicked widget, as passed by ``on_click``; not used.
    save_map: When True the map is written into ``map_output_folder``; when
      False it is displayed inside ``map_output_by_sex_tn``.
    export_data: When True no map is built; the district values are written
      to ``data_export.csv`` instead.
  """
  # Clear output
  map_output_by_sex_tn.clear_output(wait=True)
  if not save_map:
    save_messages_output_by_sex_tn.clear_output(wait=True)

  # Show progress while rendering
  if not export_data:
    with map_output_by_sex_tn:
      display(widgets.HTML("<b>Rendering map... please wait ⏳</b>"))

  # Filter district level based on selected filters; both sexes are needed,
  # so only the "Total" sex rows are dropped
  df = tn_districts_data[
    (tn_districts_data["Total/Rural/Urban"] == tru_filter_by_sex_tn.value) &
    (tn_districts_data["Age-Group"] == age_filter_by_sex_tn.value) &
    (tn_districts_data["Caste"] == caste_filter_by_sex_tn.value)
  ]
  df = df[df["Sex"] != "Total"]
  if df.empty or df[df["Sex"] == "Females"].empty:
    # Report inside the map area so the progress placeholder is replaced
    with map_output_by_sex_tn:
      clear_output(wait=True)
      print("No data for selected filters.")
    return

  # Collect selected filters for title
  filters_applied = [
    tru_filter_by_sex_tn.value, age_filter_by_sex_tn.value,
    caste_filter_by_sex_tn.value
  ]

  # Work out which group the fact refers to and which columns it needs
  selected_fact = fact_dropdown_by_sex_tn.value
  ratio_core_group = re.match(r"Sex Ratio (?:in|amongst) (.+)", selected_fact)
  disp_core_group = re.match(r"Sex Disparity amongst (.+)", selected_fact)
  group = (ratio_core_group or disp_core_group).group(1)
  worker_metrics = [
    "Main Workers",
    "Marginal workers - Worked for 3 to 6 months",
    "Marginal workers - Worked for less than 3 months",
  ]
  # "Workers (Main and Marginal)" is not a column; it is built from three
  group_metrics = (
    worker_metrics if group == "Workers (Main and Marginal)" else [group]
  )
  # A disparity additionally needs the population to compare against
  metrics = group_metrics if ratio_core_group else ["Population", *group_metrics]
  index_cols = ["State-Level District Code", "Area Name"]
  filtered_df = df[index_cols + ["Sex"] + metrics]

  # Pivot facts by Sex; columns become "<metric>: <sex>"
  pivoted = filtered_df.pivot(index=index_cols, columns="Sex", values=metrics
                              ).fillna(0)
  pivoted.columns = [f"{col}: {sex}" for col, sex in pivoted.columns]
  pivoted = pivoted.reset_index()

  # Add in new fact(s) if required
  male_col = group + ": Males"
  female_col = group + ": Females"
  if group == 'Population':
    pivoted["Sex Ratio in Population"] = pivoted[male_col] / pivoted[female_col]
  elif group == "Workers (Main and Marginal)":
    male_sum = sum(pivoted[f"{metric}: Males"] for metric in worker_metrics)
    female_sum = sum(pivoted[f"{metric}: Females"] for metric in worker_metrics)
    pivoted[f"Sex Ratio amongst {group}"] = male_sum / female_sum
  else:
    pivoted[f"Sex Ratio amongst {group}"] = \
      pivoted[male_col] / pivoted[female_col]
  if disp_core_group:
    if "Sex Ratio in Population" not in pivoted.columns:
      pivoted["Sex Ratio in Population"] = (
        pivoted["Population: Males"] / pivoted["Population: Females"]
      )
    pivoted[f"Sex Disparity amongst {group}"] = \
      pivoted[f"Sex Ratio amongst {group}"] - pivoted["Sex Ratio in Population"]

  # A district with zero females in the group divides by zero and yields an
  # infinite ratio; treat it as missing so only finite values reach the
  # colour scale of the choropleth
  pivoted[selected_fact] = pivoted[selected_fact].replace(
    [float("inf"), float("-inf")], float("nan"))

  # Retain only District Code, Area Name and Selected Fact
  data = pivoted[["State-Level District Code", selected_fact, "Area Name"]]

  # Merge with Maps dataset
  gdf_merged = tn_districts_gdf.merge(
    data, left_on="DT_CEN_CD", right_on="State-Level District Code", how="inner"
  )

  # Ensure fact values and district name are in GeoJSON properties
  gdf_merged["tooltip_text"] = (
      gdf_merged["Area Name"] + "<br>" +
      f"{selected_fact}: " + gdf_merged[selected_fact].round(3).astype(str)
  )

  # Export CSV if requested
  if export_data:
    gdf_merged[["DISTRICT", selected_fact]].drop_duplicates().to_csv(
        "data_export.csv", index=False)
    with save_messages_output_by_sex_tn:
      display(widgets.HTML(
          "<b>Data Exported to <code>data_export.csv</code> ✅</b>"))
    return

  # Create empty map
  folium_map = folium.Map(
    tiles="cartodbpositron",
    width="600px",
    height="650px"
  )

  # Add choropleth
  Choropleth(
    geo_data=gdf_merged.to_json(),
    name="choropleth",
    data=gdf_merged,
    columns=["DT_CEN_CD", selected_fact],
    key_on="feature.properties.DT_CEN_CD",
    fill_color="YlGnBu",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name=selected_fact
  ).add_to(folium_map)

  # Add tooltip (invisible overlay that only carries the hover text)
  folium.GeoJson(
    gdf_merged,
    name="District Labels",
    style_function=lambda x: {"fillOpacity": 0, "color": "transparent"},
    tooltip=folium.GeoJsonTooltip(
      fields=["tooltip_text"],
      aliases=[""],
      labels=False,
      sticky=True
    )
  ).add_to(folium_map)

  # Auto-fit to GeoDataFrame bounds; total_bounds is (minx, miny, maxx, maxy)
  # while fit_bounds takes [[south, west], [north, east]]
  bounds = gdf_merged.geometry.total_bounds
  folium_map.fit_bounds([[bounds[1], bounds[0]], [bounds[3], bounds[2]]])

  # Setup title; the file name swaps spaces, slashes and hyphens for
  # underscores and drops colons
  parts = [selected_fact, "by District", *filters_applied]
  filename_table = str.maketrans({" ": "_", "/": "_", "-": "_", ":": None})
  title = "_".join(p.translate(filename_table) for p in parts)
  html_title = " | ".join(parts)

  if save_map:
    # Save map to HTML file
    output_dir = map_output_folder
    os.makedirs(output_dir, exist_ok=True)  # ensure the folder exists
    folium_map.save(f"{output_dir}/{title}.html")
    # Nothing is shown inline in this mode, so drop the progress placeholder
    map_output_by_sex_tn.clear_output()
    with save_messages_output_by_sex_tn:
      display(widgets.HTML(
        f"<b>Map saved as: <code>{title}.html</code> ✅</b>"))
  else:  # Display map within Colab
    with map_output_by_sex_tn:
      clear_output(wait=True)
      display(HTML(f"<h3>{html_title}</h3>"))  # First display the title
      display(folium_map)  # Then, display the folium map

In [None]:
# Hook each button up to the shared callback, varying only the mode flags
show_map_button_save_by_sex_tn.on_click(update_tn_ratios_map)
show_map_button_display_by_sex_tn.on_click(
    lambda clicked: update_tn_ratios_map(clicked, save_map=False))
export_data_button_by_sex_tn.on_click(
    lambda clicked: update_tn_ratios_map(clicked, export_data=True))

# Render the controls first, then the map area, then the status messages
for _panel in (ui_by_sex_tn, map_output_by_sex_tn,
               save_messages_output_by_sex_tn):
    display(_panel)

VBox(children=(Dropdown(description='Area Type', index=1, options=('Rural', 'Total', 'Urban'), value='Total'),…

Output()

Output()

## Part 3.3: Intersectional Charts

In [None]:
# Intersectional slice of the Tamil Nadu district table: 15-59 age group
# only, with every "Total" aggregate (area type, caste, sex) excluded so that
# only the individual Rural/Urban x caste x sex rows remain.
# .copy() makes this an independent frame: new columns are assigned to it
# further down, and assigning into a .loc slice triggers pandas'
# SettingWithCopyWarning.
tn_is_data = tn_districts_data.loc[
    (tn_districts_data["Total/Rural/Urban"] != "Total") &
    (tn_districts_data["Age-Group"] == "15 to 59") &
    (tn_districts_data["Caste"] != "Total") &
    (tn_districts_data["Sex"] != "Total"),
    ["Total/Rural/Urban", "Caste", "Sex", "Main Workers",
     "Marginal workers - Worked for 3 to 6 months",
     "Marginal workers - Worked for less than 3 months", "Population"]].copy()
tn_is_data

Unnamed: 0,Total/Rural/Urban,Caste,Sex,Main Workers,Marginal workers - Worked for 3 to 6 months,Marginal workers - Worked for less than 3 months,Population
159384,Rural,General,Females,70681.0,33027.0,7377.0,268636.0
159385,Rural,SC,Females,47305.0,24535.0,5809.0,152357.0
159386,Rural,ST,Females,4571.0,1925.0,610.0,10984.0
159388,Rural,General,Males,178383.0,35125.0,5628.0,271783.0
159389,Rural,SC,Males,92319.0,24898.0,4764.0,150480.0
...,...,...,...,...,...,...,...
167365,Urban,SC,Females,25474.0,3440.0,530.0,54635.0
167366,Urban,ST,Females,516.0,43.0,10.0,942.0
167368,Urban,General,Males,391205.0,15703.0,2384.0,471838.0
167369,Urban,SC,Males,42363.0,3397.0,513.0,53220.0


In [None]:
def _tn_is_caste_group(caste_label):
    """Return 'sc-st' for SC/ST labels (case-insensitive), else 'general'."""
    if caste_label.lower() in ['sc', 'st']:
        return 'sc-st'
    return 'general'


# Bucket each caste label into one of two groups
tn_is_data['Caste Group'] = tn_is_data['Caste'].map(_tn_is_caste_group)

# Total workers = main workers + both marginal-worker categories
tn_is_data['Total Workers'] = (
    tn_is_data['Main Workers']
    .add(tn_is_data['Marginal workers - Worked for 3 to 6 months'])
    .add(tn_is_data['Marginal workers - Worked for less than 3 months'])
)

# Sum the counts within every Area + Caste Group + Sex combination
tn_is_group_keys = ['Total/Rural/Urban', 'Caste Group', 'Sex']
tn_is_count_cols = ['Population', 'Total Workers']
tn_is_data_grouped = (
    tn_is_data
    .groupby(tn_is_group_keys, as_index=False)[tn_is_count_cols]
    .sum()
)

# Spread Sex across the columns so each Area + Caste Group is a single row
tn_is_data_pivoted = tn_is_data_grouped.pivot_table(
    values=tn_is_count_cols,
    index=['Total/Rural/Urban', 'Caste Group'],
    columns='Sex',
)
tn_is_data_pivoted = tn_is_data_pivoted.reset_index()

# Collapse the two-level column labels into lowercase "<top>_<sex>" names.
# The index columns have an empty second level, so their flattened names
# end in a trailing underscore (e.g. "caste group_").
tn_is_data_pivoted.columns = [
    f"{top}_{sub}".lower() for top, sub in tn_is_data_pivoted.columns
]
tn_is_data_pivoted

Unnamed: 0,total/rural/urban_,caste group_,population_females,population_males,total workers_females,total workers_males
0,Rural,general,8782756.0,8745012.0,4671357.0,7093720.0
1,Rural,sc-st,3311361.0,3249908.0,2102540.0,2660782.0
2,Urban,general,10042123.0,9984690.0,2799862.0,7961328.0
3,Urban,sc-st,1739506.0,1707608.0,661803.0,1346689.0


In [None]:
# Male-to-female ratio in the population of each Area + Caste Group row
tn_is_data_pivoted['sex_ratio_population'] = (
    tn_is_data_pivoted['population_males']
    .div(tn_is_data_pivoted['population_females'])
)

# Male-to-female ratio amongst workers (main + marginal)
tn_is_data_pivoted['sex_ratio_workers'] = (
    tn_is_data_pivoted['total workers_males']
    .div(tn_is_data_pivoted['total workers_females'])
)

# Disparity = worker sex ratio minus population sex ratio
tn_is_data_pivoted['sex_disparity'] = (
    tn_is_data_pivoted['sex_ratio_workers']
    .sub(tn_is_data_pivoted['sex_ratio_population'])
)

tn_is_data_pivoted

Unnamed: 0,total/rural/urban_,caste group_,population_females,population_males,total workers_females,total workers_males,sex_ratio_population,sex_ratio_workers,sex_disparity
0,Rural,general,8782756.0,8745012.0,4671357.0,7093720.0,0.995702,1.518557,0.522854
1,Rural,sc-st,3311361.0,3249908.0,2102540.0,2660782.0,0.981442,1.265508,0.284067
2,Urban,general,10042123.0,9984690.0,2799862.0,7961328.0,0.994281,2.843472,1.849191
3,Urban,sc-st,1739506.0,1707608.0,661803.0,1346689.0,0.981663,2.034879,1.053216


In [None]:
# 2x2 grid of waterfall charts, one panel per caste group x area.
# Each panel reads left to right as:
#   M/F sex ratio in population + M/F disparity = M/F sex ratio amongst workers
# Subplot layout without shared axes, reordered: [General, SC-ST] x [Urban, Rural]
fig = make_subplots(
    rows=2, cols=2,
    vertical_spacing=0.125,  # reduce from default (~0.15)
    subplot_titles=[
        "General – Urban", "General – Rural",
        "SC-ST – Urban", "SC-ST – Rural"
    ]
)

# Plot order: (caste, area) → (row, col); must match subplot_titles above
positions = {
    ('general', 'Urban'): (1, 1),
    ('general', 'Rural'): (1, 2),
    ('sc-st', 'Urban'): (2, 1),
    ('sc-st', 'Rural'): (2, 2)
}

# One waterfall trace per panel
for (caste, area), (row, col) in positions.items():
    subset = tn_is_data_pivoted[
        (tn_is_data_pivoted['total/rural/urban_'] == area) &
        (tn_is_data_pivoted['caste group_'] == caste)
    ]

    # No data for this combination: leave the panel empty
    if subset.empty:
        continue

    val = subset.iloc[0]
    base = val["sex_ratio_population"]
    diff = val["sex_disparity"]
    final = val["sex_ratio_workers"]

    # Plot the unrounded values and round only in the labels. The "total"
    # bar is computed by plotly as base + diff, so feeding it separately
    # rounded inputs can draw a total that is 0.01 off its own label
    # (e.g. 0.98 + 0.28 = 1.26 under a label of 1.27).
    fig.add_trace(go.Waterfall(
        measure=["absolute", "relative", "total"],
        x=[
            "M/F Sex Ratio<br>in Population",
            "M/F Disparity",
            "M/F Sex Ratio<br>amongst Workers"
        ],
        y=[base, diff, final],
        text=[f"{base:.2f}", f"{diff:+.2f}", f"{final:.2f}"],
        textposition="inside",
        textfont=dict(color="black", size=12),
        connector={"line": {"color": "gray"}}
    ), row=row, col=col)

# Update layout with left-aligned title + subtitle
fig.update_layout(
    title={
        "text":
          "<b>Tamil Nadu Male to Female Sex Ratios and Disparities by Caste and Rurality, 15–59 yrs, 2011</b><br>"
          "<sup>Workers include both Main and Marginal categories; all figures are at the State level</sup>",
        "x": 0.05,  # left-aligned
        "xanchor": "left"
    },
    title_font_size=20,
    title_font_family="Arial",
    showlegend=False,
    height=800,
    width=1000,
    margin=dict(t=120)  # increase top margin
)

# Same y-range on every panel so bar heights are comparable across panels;
# hover values are shown with two decimals to match the bar labels
fig.update_yaxes(range=[0, 3], hoverformat=".2f")

# Horizontal x-axis labels with wrapping
fig.update_xaxes(tickangle=0, tickfont=dict(size=11), automargin=True)
fig.show()