# Installation

In [1]:
#%pip install pandas

# Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os

# Loading the data file

## Extraction

In [5]:
# Relative location of the Stata file loaded in the next cell.
# Built with os.path.join so the path resolves on Windows, Linux and macOS
# (hard-coded back-slashes are only treated as separators on Windows).
file_path_1 = os.path.join(".", "PLFS 2023-24", "Extracted_files", "HHV1_PERV1_merged.dta")

In [20]:
# Read the Stata file once and keep the untouched frame as `original_df`;
# every later cell works on (and adds columns to) the independent copy `df`.
original_df = pd.read_stata(filepath_or_buffer = file_path_1)
df = original_df.copy(deep = True)

## Converting data types

In [21]:
# Cast the columns that are compared numerically later on (status codes in
# pvar32 / pvar43 / usual_status_code and the age bounds applied to pvar20).
df[["pvar32", "pvar43", "usual_status_code", "pvar20"]] = (
    df[["pvar32", "pvar43", "usual_status_code", "pvar20"]]
    .apply(pd.to_numeric, axis = 0)
)

# Analysis

## LFPR, WPR, PU and UR

In [9]:
def status_population (state = None, sector = None, gender = None, min_age = None, max_age = None):
    """Print usual-status labour-force indicators for a slice of the survey.

    Reads the notebook-level frame ``df``. For the selected slice it displays
    the unweighted sample sizes and prints the weighted population counts
    together with the labour force participation rate (LFPR), worker
    population ratio (WPR), proportion unemployed (PU) and unemployment
    rate (UR). Nothing is returned.

    Parameters
    ----------
    state, sector, gender : optional
        Value to keep in ``pvar6`` / ``pvar5`` / ``pvar19`` respectively.
        ``None`` (default) applies no filter on that column.
    min_age, max_age : optional
        Inclusive bounds applied to ``pvar20``; ``None`` leaves that side open.

    Raises
    ------
    ValueError
        If ``state``, ``sector`` or ``gender`` is given but does not occur in
        the corresponding column of ``df``.

    Notes
    -----
    ``usual_status_code`` below 81 is counted as employed, 81 and 82 as
    unemployed. Weighted totals ignore rows with a missing value in
    ``usual_status_code``, ``pvar5``, ``pvar6``, ``pvar19`` or ``pvar20``
    (this is what the former ``groupby`` on those keys did implicitly);
    the displayed sample sizes count every filtered row.
    """
    def _percent(numerator, denominator):
        # An empty slice has a zero denominator: report NaN instead of
        # triggering a numpy divide-by-zero warning.
        return numerator*100/denominator if denominator else float("nan")

    # Boolean indexing returns a new frame and nothing below writes to it, so
    # the global frame does not have to be copied on every call.
    df_filtered = df

    #Validation
    valid_sectors = df["pvar5"].unique()
    valid_states = df["pvar6"].unique()
    valid_genders = df["pvar19"].unique()

    #Filtering for state (when no filter is given the name only serves as a label in the printout)
    if state is None:
        state = "India"
    elif state in valid_states:
        df_filtered = df_filtered[df_filtered["pvar6"] == state]
    else:
        raise ValueError(f"Invalid state: {state}. Select from {valid_states}")

    #Filtering for sector
    if sector is None:
        sector = "rural and urban"
    elif sector in valid_sectors:
        df_filtered = df_filtered[df_filtered["pvar5"] == sector]
    else:
        raise ValueError(f"Invalid sector: {sector}. Select from {valid_sectors}")

    #Filtering for gender
    if gender is None:
        gender = "male, female and transgender"
    elif gender in valid_genders:
        df_filtered = df_filtered[df_filtered["pvar19"] == gender]
    else:
        raise ValueError(f"Invalid gender: {gender}. Select from {valid_genders}")

    #Filtering for min and max age
    if min_age is not None:
        df_filtered = df_filtered.loc[df_filtered["pvar20"] >= min_age]
    if max_age is not None:
        df_filtered = df_filtered.loc[df_filtered["pvar20"] <= max_age]

    is_employed = df_filtered["usual_status_code"] < 81
    is_unemployed = df_filtered["usual_status_code"].isin([81, 82])

    #Calculating status wise population for Usual Status
    # Rows with a missing key are left out of the weighted totals, exactly as
    # the earlier groupby (default dropna=True) on these columns did.
    key_columns = ['usual_status_code', 'pvar5', 'pvar6', 'pvar19', 'pvar20']
    has_all_keys = df_filtered[key_columns].notna().all(axis=1)
    weights = df_filtered["weights"]
    total_population = weights[has_all_keys].sum()
    employed = weights[has_all_keys & is_employed].sum()
    unemployed = weights[has_all_keys & is_unemployed].sum()

    #Sample Size (unweighted row counts of the filtered data)
    sample_size = pd.DataFrame({
        'category': ['employed', 'unemployed', 'total population'],
        'sample size': [is_employed.sum(),
                        is_unemployed.sum(),
                        df_filtered.shape[0]]
    })
    print("Sample size of the filtered dataset:")
    display(sample_size)

    print(f"total population = {total_population}")
    print(f"employed = {employed}")
    print(f"unemployed = {unemployed}")

    # Shared description of the slice used in every printed indicator
    slice_label = f"[{state}, {sector}, {gender}, age {min_age} - {max_age}]"

    #Labor force participation rate
    lfpr = _percent(employed+unemployed, total_population)
    print(f"\nLabour force participation rate for {slice_label} = {lfpr:.2f}%")

    #Worker population ratio
    wpr = _percent(employed, total_population)
    print(f"Worker population ratio for {slice_label} = {wpr:.2f}%")

    #Proportion unemployed
    pu = _percent(unemployed, total_population)
    print(f"Proportion unemployed for {slice_label} = {pu:.2f}%")

    #Unemployment rate (share of the labour force, not of the population)
    ur = _percent(unemployed, employed+unemployed)
    print(f"Unemployment rate for {slice_label} = {ur:.2f}%")


In [10]:
# Usual-status indicators for Odisha, keeping rows with pvar20 (age) >= 15;
# sector, gender and the upper age bound are left unfiltered.
status_population(state = "Odisha", min_age=15)

Sample size of the filtered dataset:


Unnamed: 0,category,sample size
0,employed,7469
1,unemployed,292
2,total population,12333


total population = 30048784
employed = 18906007
unemployed = 604935

Labour force participation rate for [Odisha, rural and urban, male, female and transgender, age 15 - None] = 64.93%
Worker population ratio for [Odisha, rural and urban, male, female and transgender, age 15 - None] = 62.92%
Proportion unemployed for [Odisha, rural and urban, male, female and transgender, age 15 - None] = 2.01%
Unemployment rate for [Odisha, rural and urban, male, female and transgender, age 15 - None] = 3.10%


## Industry category wise employment

In [27]:
#Creating a new column with 2 digit NIC codes based on usual status

#Assigning NIC code based on usual status:
#   principal status code (pvar32) >= 81 and a subsidiary status code (pvar43) present -> industry from pvar44
#   otherwise                                                                          -> industry from pvar33
#Only the first two characters of the code are kept and converted to a number.
#The column is written once, and pd.to_numeric is applied to the whole Series
#instead of element by element.
df["usual_status_ind"] = pd.to_numeric(
    pd.Series(
        np.where((df['pvar32'] >= 81) & (df['pvar43'].notna()),
                 df['pvar44'],
                 df['pvar33']),
        index = df.index
    ).astype(str).str[:2]
)

In [28]:
#Categorising NIC codes into industry categories

#Inclusive (first, last) 2 digit NIC bounds, listed in the same order as `choices`
conditions = [
    df["usual_status_ind"].between(first_code, last_code)
    for first_code, last_code in (
        (1, 3),
        (5, 9),
        (10, 33),
        (35, 39),
        (41, 43),
        (45, 47),
        (49, 53),
        (55, 56),
        (58, 99),
    )
]

#Category label for each range above
choices = [
    "Agriculture",
    "Mining and quarrying",
    "Manufacturing",
    "Electricity and water supply",
    "Construction",
    "Trade",
    "Transport",
    "Accomodation and food services",
    "Other sevices",
]

#Rows whose code matches no range (including missing codes) get an empty string
df["usual_status_ind_category"] = np.select(conditions, choices, default = "")

In [29]:
def ind_category (state = None, sector = None, gender = None, min_age = None, max_age = None):
    """Display the weighted distribution of persons across industry categories.

    Reads the notebook-level frame ``df`` and groups the selected slice by
    ``usual_status_ind_category``. Rows whose category is the empty string are
    excluded before shares are computed. The resulting table (weighted
    population, row count and percentage share, sorted by share) is displayed;
    nothing is returned.

    Parameters
    ----------
    state, sector, gender : optional
        Value to keep in ``pvar6`` / ``pvar5`` / ``pvar19`` respectively.
        ``None`` (default) applies no filter on that column.
    min_age, max_age : optional
        Inclusive bounds applied to ``pvar20``; ``None`` leaves that side open.

    Raises
    ------
    ValueError
        If ``state``, ``sector`` or ``gender`` is given but does not occur in
        the corresponding column of ``df``.
    """
    #Filtering dataframe
    # Boolean indexing returns new frames and the input is never modified, so
    # no up-front copy of the global frame is needed.
    df_filtered = df

    ##Validation
    # Each value is checked against the column it filters. (The state used to
    # be compared with the list of sectors, so a sector name passed as `state`
    # was silently ignored instead of being rejected.)
    for column, label, value in (("pvar6", "state", state),
                                 ("pvar5", "sector", sector),
                                 ("pvar19", "gender", gender)):
        if value is None:
            continue
        valid_values = df[column].unique()
        if value not in valid_values:
            raise ValueError(f"Invalid {label}: {value}. \n Select from {valid_values}")
        df_filtered = df_filtered[df_filtered[column] == value]

    #Filtering for min and max age
    if min_age is not None:
        df_filtered = df_filtered.loc[df_filtered["pvar20"] >= min_age]
    if max_age is not None:
        df_filtered = df_filtered.loc[df_filtered["pvar20"] <= max_age]

    #Calculating industry classification wise weights
    df_ind_category = df_filtered.groupby("usual_status_ind_category").agg(
                        population_size = ("weights", "sum"),
                        sample_size = ("weights", "size")
                    ).reset_index()
    # Drop the empty-string category; .copy() so the column added below is
    # written to an independent frame.
    df_ind_category = df_ind_category.loc[df_ind_category["usual_status_ind_category"] != ""].copy()

    #Calculating share
    total_weight = df_ind_category["population_size"].sum()
    df_ind_category ["popn_percentage"] = (df_ind_category["population_size"]*(100/total_weight)).round(1)
    df_ind_category = df_ind_category.sort_values(by = "popn_percentage", ascending = False).reset_index(drop = True)

    print("Population based percentage distribution: ")
    display(df_ind_category)

In [None]:
# No filters: industry-category distribution over every row of df.
ind_category()

Population based percentage distribution: 


Unnamed: 0,usual_status_ind_category,population_size,sample_size,popn_percentage
0,Agriculture,242244757,72133,46.1
1,Other sevices,69532973,30139,13.2
2,Construction,62969497,20917,12.0
3,Manufacturing,60157900,21854,11.4
4,Trade,53799265,22631,10.2
5,Transport,22487351,9067,4.3
6,Accomodation and food services,10502817,4440,2.0
7,Electricity and water supply,2860535,1322,0.5
8,Mining and quarrying,1223326,567,0.2


: 

# Additional statistics

In [11]:
#Weighted population and row count per 2 digit NIC code, ages 15 to 60 (inclusive)
# NOTE(review): the later cells in this section use 59 as the upper age bound — confirm 60 is intended here.
trial = (
    df[df["pvar20"].between(15, 60)]                          # Filtering for age
    .groupby(by = 'usual_status_ind', dropna = False)         # Grouping by 2 digit NIC, missing codes kept as a group
    .agg(
        population_size = ("weights", "sum"),                 # Aggregating weights (sum)
        sample_size = ("weights", "size")                     # Aggregating sample size
    )
    .reset_index()
)
trial

Unnamed: 0,usual_status_ind,population_size,sample_size
0,1.0,211296947,62470
1,2.0,2487124,824
2,3.0,1537522,611
3,5.0,373608,183
4,6.0,42734,26
...,...,...,...
82,94.0,1251133,544
83,95.0,2331669,897
84,96.0,4414857,1796
85,97.0,6540931,2676


In [12]:
#Weighted population and row count per 2 digit NIC code, all ages
trial = (
    df.groupby(by = 'usual_status_ind', dropna = False)       # Grouping by 2 digit NIC, missing codes kept as a group
    .agg(
        population_size = ("weights", "sum"),                 # Aggregating weights (sum)
        sample_size = ("weights", "size")                     # Aggregating sample size
    )
    .reset_index()
)
trial

Unnamed: 0,usual_status_ind,population_size,sample_size
0,1.0,237873713,70591
1,2.0,2673201,877
2,3.0,1697843,665
3,5.0,374745,184
4,6.0,42734,26
...,...,...,...
82,94.0,1556950,653
83,95.0,2464087,948
84,96.0,4752675,1923
85,97.0,6890164,2838


In [13]:
#Aggregating based on Vocational training and status of employment, ages 15 to 59 (inclusive)
trial = (
    df[df["pvar20"].between(15, 59)]                                  # Filtering for age
    .groupby(by = ['pvar26', 'usual_status_code'], dropna = False)    # Missing values kept as their own group
    .agg(
        population_size = ("weights", "sum"),                         # Aggregating weights (sum)
        sample_size = ("weights", "size")                             # Aggregating sample size
    )
    .reset_index()
)
trial

Unnamed: 0,pvar26,usual_status_code,population_size,sample_size
0,1,11.0,5328024,2253
1,1,12.0,762885,293
2,1,21.0,1984046,714
3,1,31.0,11822832,5544
4,1,41.0,49421,25
...,...,...,...,...
73,6,92.0,107414361,37874
74,6,93.0,32892547,10490
75,6,94.0,4964629,1864
76,6,95.0,5379543,1858


In [14]:
#Aggregating based on duration of training, ages 15 to 59 (inclusive)
trial = (
    df[df["pvar20"].between(15, 59)]                          # Filtering for age
    .groupby(by = ['pvar29'], dropna = False)                 # Missing durations kept as their own group
    .agg(
        population_size = ("weights", "sum"),                 # Aggregating weights (sum)
        sample_size = ("weights", "size")                     # Aggregating sample size
    )
    .reset_index()
)
trial

Unnamed: 0,pvar29,population_size,sample_size
0,,734537577,259596
1,1.0,2926674,1355
2,2.0,10688220,3694
3,3.0,6897532,3273
4,4.0,4076532,2276
5,5.0,1853947,735
6,6.0,4770640,1694


## Unemployment

### Efforts made to find employment

In [15]:
def efforts_unemployment (state = None, sector = None, gender = None, min_age = None, max_age = None):
    """Display how persons with usual status code 81 looked for employment.

    Reads the notebook-level frame ``df``, keeps rows with
    ``usual_status_code == 81`` and groups the selected slice by ``pvar56``.
    The table (weighted population, row count, percentage share; sorted by
    share, codes replaced by labels) is displayed; nothing is returned.

    Parameters
    ----------
    state, sector, gender : optional
        Value to keep in ``pvar6`` / ``pvar5`` / ``pvar19`` respectively.
        ``None`` (default) applies no filter on that column.
    min_age, max_age : optional
        Inclusive bounds applied to ``pvar20``; ``None`` leaves that side open.

    Raises
    ------
    ValueError
        If ``state``, ``sector`` or ``gender`` is given but does not occur in
        the corresponding column of ``df``. A value that exists in ``df`` but
        has no status-81 rows yields an empty table rather than an error.
    """
    #Filtering dataframe (usual status code 81 only)
    df_filtered = df.loc[df["usual_status_code"] == 81]

    ##Validation
    # Each value is checked against the column it filters, using the full
    # frame. (The state used to be compared with the list of sectors, and the
    # valid values were taken from the status-81 subset only.)
    for column, label, value in (("pvar6", "state", state),
                                 ("pvar5", "sector", sector),
                                 ("pvar19", "gender", gender)):
        if value is None:
            continue
        valid_values = df[column].unique()
        if value not in valid_values:
            raise ValueError(f"Invalid {label}: {value}. \n Select from {valid_values}")
        df_filtered = df_filtered[df_filtered[column] == value]

    #Filtering for min and max age
    if min_age is not None:
        df_filtered = df_filtered.loc[df_filtered["pvar20"] >= min_age]
    if max_age is not None:
        df_filtered = df_filtered.loc[df_filtered["pvar20"] <= max_age]

    #Efforts made to find employment
    df_reasons = df_filtered.groupby("pvar56").agg(
                population_size = ("weights", "sum"),
                sample_size = ("weights", "size")
                ).reset_index()
    total_sum = df_reasons["population_size"].sum()
    df_reasons["popn_percentage"] = df_reasons["population_size"]*100/total_sum
    df_reasons = df_reasons.sort_values(by = "popn_percentage", ascending = False)

    #Labelling categories
    labels_dict = {
    "1": "Applied to employers/check work sites",
    "2": "Registered with employment exchange",
    "3": "Registered with private employment center",
    "4": "Sought financial help for business",
    "5": "Sought help from family/friends",
    "6": "Applied for permit/license for business",
    "7": "Others"
    }
    df_reasons["pvar56"] = df_reasons["pvar56"].replace(labels_dict)

    print("Effors for employment:")
    display(df_reasons)

In [16]:
# Job-search efforts among status-81 rows in Punjab (no sector, gender or age filter).
efforts_unemployment(state = "Punjab")

Effors for employment:


Unnamed: 0,pvar56,population_size,sample_size,popn_percentage
0,Applied to employers/check work sites,324194,135,52.233835
4,Sought help from family/friends,185984,91,29.965569
5,Others,75790,29,12.211214
2,Registered with private employment center,21676,14,3.492417
1,Registered with employment exchange,9454,3,1.52322
3,Sought financial help for business,3561,2,0.573745


### Duration of unemployment

In [17]:
def duration_unemployment (state = None, sector = None, gender = None, min_age = None, max_age = None):
    """Display the distribution of unemployment duration for usual status code 81.

    Reads the notebook-level frame ``df``, keeps rows with
    ``usual_status_code == 81`` and groups the selected slice by ``pvar57``.
    The table (weighted population, row count, percentage share; sorted by
    share, codes replaced by labels) is displayed; nothing is returned.

    Parameters
    ----------
    state, sector, gender : optional
        Value to keep in ``pvar6`` / ``pvar5`` / ``pvar19`` respectively.
        ``None`` (default) applies no filter on that column.
    min_age, max_age : optional
        Inclusive bounds applied to ``pvar20``; ``None`` leaves that side open.

    Raises
    ------
    ValueError
        If ``state``, ``sector`` or ``gender`` is given but does not occur in
        the corresponding column of ``df``. A value that exists in ``df`` but
        has no status-81 rows yields an empty table rather than an error.
    """
    #Filtering dataframe (usual status code 81 only)
    df_filtered = df.loc[df["usual_status_code"] == 81]

    ##Validation
    # Each value is checked against the column it filters, using the full
    # frame. (The state used to be compared with the list of sectors, and the
    # valid values were taken from the status-81 subset only.)
    for column, label, value in (("pvar6", "state", state),
                                 ("pvar5", "sector", sector),
                                 ("pvar19", "gender", gender)):
        if value is None:
            continue
        valid_values = df[column].unique()
        if value not in valid_values:
            raise ValueError(f"Invalid {label}: {value}. \n Select from {valid_values}")
        df_filtered = df_filtered[df_filtered[column] == value]

    #Filtering for min and max age
    if min_age is not None:
        df_filtered = df_filtered.loc[df_filtered["pvar20"] >= min_age]
    if max_age is not None:
        df_filtered = df_filtered.loc[df_filtered["pvar20"] <= max_age]

    #Duration of unemployment
    df_duration = df_filtered.groupby("pvar57").agg(
        population_size = ("weights", "sum"),
        sample_size = ("weights", "size")
    ).reset_index()
    total_sum = df_duration["population_size"].sum()
    df_duration["popn_percentage"] = df_duration["population_size"]*100/total_sum
    df_duration = df_duration.sort_values(by="popn_percentage", ascending = False)

    #Labelling categories
    labels_dict = {
    "1": "≤ 6 months",
    "2": "> 6 months, ≤ 1 year",
    "3": "> 1 year, ≤ 2 years",
    "4": "> 2 years, ≤ 3 years",
    "5": "> 3 years"
    }
    df_duration["pvar57"] = df_duration["pvar57"].replace(labels_dict)

    print("Duration of unemployment:")
    display(df_duration)

In [18]:
# Unemployment duration among status-81 rows in Punjab with pvar20 (age) <= 29.
duration_unemployment(state = "Punjab", max_age = 29)

Duration of unemployment:


Unnamed: 0,pvar57,population_size,sample_size,popn_percentage
1,"> 6 months, ≤ 1 year",162689,68,29.298447
2,"> 1 year, ≤ 2 years",154282,64,27.784441
3,"> 2 years, ≤ 3 years",95212,37,17.146603
0,≤ 6 months,83479,36,15.033623
4,> 3 years,59620,36,10.736887


### Reasons for not working

In [19]:
def reason_not_working (state = None, sector = None, gender = None, min_age = None, max_age = None):
    """Display the distribution of reasons for not working (``pvar59``).

    Reads the notebook-level frame ``df`` and keeps rows whose
    ``usual_status_code`` lies in 81..97 (inclusive) and whose ``pvar58``
    equals ``"1"``. The selected slice is grouped by ``pvar59``; the table
    (weighted population, row count, percentage share rounded to 2 decimals;
    sorted by share, codes replaced by labels) is displayed. Nothing is
    returned.

    Parameters
    ----------
    state, sector, gender : optional
        Value to keep in ``pvar6`` / ``pvar5`` / ``pvar19`` respectively.
        ``None`` (default) applies no filter on that column.
    min_age, max_age : optional
        Inclusive bounds applied to ``pvar20``; ``None`` leaves that side open.

    Raises
    ------
    ValueError
        If ``state``, ``sector`` or ``gender`` is given but does not occur in
        the corresponding column of ``df``. A value that exists in ``df`` but
        has no rows in the base subset yields an empty table, not an error.
    """
    #Filtering dataframe
    # NOTE(review): the meaning of pvar58 == "1" is not visible in this notebook — confirm against the survey codebook.
    df_filtered = df.loc[(df["usual_status_code"] >= 81) & (df["usual_status_code"] <= 97) & (df["pvar58"] == "1")]

    ##Validation
    # Each value is checked against the column it filters, using the full
    # frame. (The state used to be compared with the list of sectors, and the
    # valid values were taken from the base subset only.)
    for column, label, value in (("pvar6", "state", state),
                                 ("pvar5", "sector", sector),
                                 ("pvar19", "gender", gender)):
        if value is None:
            continue
        valid_values = df[column].unique()
        if value not in valid_values:
            raise ValueError(f"Invalid {label}: {value}. \n Select from {valid_values}")
        df_filtered = df_filtered[df_filtered[column] == value]

    #Filtering for min and max age
    if min_age is not None:
        df_filtered = df_filtered.loc[df_filtered["pvar20"] >= min_age]
    if max_age is not None:
        df_filtered = df_filtered.loc[df_filtered["pvar20"] <= max_age]

    #Reasons for not working
    df_reason = df_filtered.groupby("pvar59").agg(
        population_size = ("weights", "sum"),
        sample_size = ("weights", "size")
    ).reset_index()
    df_reason [["population_size", "sample_size"]] = df_reason[["population_size", "sample_size"]].astype(float)
    total_sum = df_reason["population_size"].sum()
    df_reason["percentage"] = (df_reason["population_size"]*100/total_sum).round(2)
    df_reason = df_reason.sort_values(by = "percentage", ascending = False)

    #Labelling categories
    labels_dict = {
    "01": "Retrenchment/lay-off without pay",
    "02": "End of contract/quit",
    "03": "Not operating the unit",
    "04": "Lack of work in the area",
    "05": "Retirement",
    "06": "Child care",
    "07": "Household responsibilities (not child care)",
    "08": "Health-related reasons",
    "10": "No financial need",
    "19": "Others"
    }
    df_reason["pvar59"] = df_reason["pvar59"].replace(labels_dict)

    print("Reasons for not working:")
    display(df_reason)

In [20]:
# Reasons for not working in Punjab (no sector, gender or age filter).
reason_not_working(state = "Punjab")

Reasons for not working:


Unnamed: 0,pvar59,population_size,sample_size,percentage
7,Health-related reasons,648508.0,314.0,43.86
4,Retirement,349017.0,180.0,23.6
6,Household responsibilities (not child care),135512.0,71.0,9.16
1,End of contract/quit,128039.0,70.0,8.66
9,Others,62657.0,31.0,4.24
5,Child care,58442.0,30.0,3.95
3,Lack of work in the area,37655.0,25.0,2.55
0,Retrenchment/lay-off without pay,24442.0,12.0,1.65
8,No financial need,17571.0,6.0,1.19
2,Not operating the unit,16757.0,11.0,1.13


### Main reason for being in Principal activity status (91 to 97) 

In [21]:
def reason_ps_not_working (state = None, sector = None, gender = None, min_age = None, max_age = None):
    """Display the main reason (``pvar60``) for being in usual status 91 to 97.

    Reads the notebook-level frame ``df`` and keeps rows whose
    ``usual_status_code`` lies in 91..97 (inclusive). The selected slice is
    grouped by ``pvar60``; the table (weighted population, row count,
    percentage share rounded to 2 decimals; sorted by share, codes replaced
    by labels) is displayed. Nothing is returned.

    Parameters
    ----------
    state, sector, gender : optional
        Value to keep in ``pvar6`` / ``pvar5`` / ``pvar19`` respectively.
        ``None`` (default) applies no filter on that column.
    min_age, max_age : optional
        Inclusive bounds applied to ``pvar20``; ``None`` leaves that side open.

    Raises
    ------
    ValueError
        If ``state``, ``sector`` or ``gender`` is given but does not occur in
        the corresponding column of ``df``. A value that exists in ``df`` but
        has no rows in the base subset yields an empty table, not an error.
    """
    #Filtering dataframe (usual status codes 91 to 97)
    df_filtered = df.loc[(df["usual_status_code"] >= 91) & (df["usual_status_code"] <= 97)]

    ##Validation
    # Each value is checked against the column it filters, using the full
    # frame. (The state used to be compared with the list of sectors, and the
    # valid values were taken from the base subset only.)
    for column, label, value in (("pvar6", "state", state),
                                 ("pvar5", "sector", sector),
                                 ("pvar19", "gender", gender)):
        if value is None:
            continue
        valid_values = df[column].unique()
        if value not in valid_values:
            raise ValueError(f"Invalid {label}: {value}. \n Select from {valid_values}")
        df_filtered = df_filtered[df_filtered[column] == value]

    #Filtering for min and max age
    if min_age is not None:
        df_filtered = df_filtered.loc[df_filtered["pvar20"] >= min_age]
    if max_age is not None:
        df_filtered = df_filtered.loc[df_filtered["pvar20"] <= max_age]

    #Main reason for being in status 91 to 97
    df_reason = df_filtered.groupby("pvar60").agg(
        population_size = ("weights", "sum"),
        sample_size = ("weights", "size")
    ).reset_index()
    df_reason [["population_size", "sample_size"]] = df_reason[["population_size", "sample_size"]].astype(float)
    total_sum = df_reason["population_size"].sum()
    df_reason["percentage"] = (df_reason["population_size"]*100/total_sum).round(2)
    df_reason = df_reason.sort_values(by="percentage", ascending = False)

    #Labelling categories
    labels_dict = {
    "1": "Lack of training/qualification/age",
    "2": "No work at convenient location",
    "3": "Health/age-related reason",
    "4": "Wants to continue study",
    "5": "Social reasons",
    "6": "Financially well-off",
    "7": "Child care/personal commitments",
    "9": "Others"
    }
    df_reason["pvar60"] = df_reason["pvar60"].replace(labels_dict)

    print("Reasons for being in status code 91-97:")
    display(df_reason)

In [22]:
# List the distinct values of pvar19, i.e. the strings accepted by the `gender` argument.
df["pvar19"].unique()

array(['male', 'female', 'transgender'], dtype=object)

In [23]:
# Reasons for being in status 91-97: rows with pvar19 == "male", pvar20 (age) from 15 to 59.
reason_ps_not_working(gender="male", min_age=15, max_age=59)

Reasons for being in status code 91-97:


Unnamed: 0,pvar60,population_size,sample_size,percentage
3,Wants to continue study,54289148.0,20698.0,86.17
2,Health/age-related reason,5658657.0,2043.0,8.98
7,Others,1729648.0,631.0,2.75
6,Child care/personal commitments,596168.0,231.0,0.95
0,Lack of training/qualification/age,360318.0,108.0,0.57
5,Financially well-off,201598.0,62.0,0.32
1,No work at convenient location,102576.0,36.0,0.16
4,Social reasons,66607.0,21.0,0.11


In [24]:
# Reasons for being in status 91-97: rows with pvar19 == "female", pvar20 (age) from 15 to 59.
reason_ps_not_working(gender="female", min_age=15, max_age=59)

Reasons for being in status code 91-97:


Unnamed: 0,pvar60,population_size,sample_size,percentage
6,Child care/personal commitments,135597595.0,47201.0,64.54
3,Wants to continue study,47789209.0,18126.0,22.75
4,Social reasons,8187304.0,2526.0,3.9
2,Health/age-related reason,7220506.0,2567.0,3.44
7,Others,6780318.0,2731.0,3.23
5,Financially well-off,1905141.0,639.0,0.91
1,No work at convenient location,1431980.0,500.0,0.68
0,Lack of training/qualification/age,1175832.0,429.0,0.56


## Share of informal employment

In [25]:
#Creating a new column for nature of contract based on usual status
#   principal status code (pvar32) >= 81 and a subsidiary status code (pvar43) present -> pvar49
#   otherwise                                                                          -> pvar39
#Computed column-wise with np.where (same rule as usual_status_ind) instead of a
#Python-level lambda evaluated once per row.
df["usual_status_contract"] = np.where((df["pvar32"] >= 81) & (df["pvar43"].notna()),
                                       df["pvar49"],
                                       df["pvar39"])


  df["usual_status_contract"] = df.apply(lambda row: row["pvar49"] if ((row["pvar32"]>=81) & pd.notna(row["pvar43"])) else row["pvar39"], axis=1 )


In [26]:
#Creating a new column for access to social security based on usual status
#   principal status code (pvar32) >= 81 and a subsidiary status code (pvar43) present -> pvar51
#   otherwise                                                                          -> pvar41
#Computed column-wise with np.where (same rule as usual_status_ind) instead of a
#Python-level lambda evaluated once per row.
df["usual_status_social_security"] = np.where((df["pvar32"] >= 81) & (df["pvar43"].notna()),
                                              df["pvar51"],
                                              df["pvar41"])


  df["usual_status_social_security"] = df.apply(lambda row: row["pvar51"] if ((row["pvar32"]>=81) & pd.notna(row["pvar43"])) else row["pvar41"], axis=1 )


In [27]:
def share_informality (state = None, sector = None, gender = None, min_age = None, max_age = None):
    """Return the weighted share of informal vs. formal employment.

    Reads the notebook-level frame ``df`` and keeps rows with
    ``usual_status_code < 81`` (employed). A row is flagged as informally
    employed when at least one of the following holds:

    * ``usual_status_contract == "1"`` (no written contract),
    * ``usual_status_social_security == "8"`` (no social security benefit),
    * ``usual_status_code`` is 11, 12 or 21 (worked in household enterprise).

    Parameters
    ----------
    state, sector, gender : optional
        Value to keep in ``pvar6`` / ``pvar5`` / ``pvar19`` respectively.
        ``None`` (default) applies no filter on that column.
    min_age, max_age : optional
        Inclusive bounds applied to ``pvar20``; ``None`` leaves that side open.

    Returns
    -------
    pandas.DataFrame
        One row per group ("Informally employed" / "Formally employed") with
        ``population_size`` (sum of weights), ``sample_size`` (row count) and
        ``percentage`` (share of weighted population, 2 decimals), sorted by
        ``percentage`` in descending order.

    Raises
    ------
    ValueError
        If ``state``, ``sector`` or ``gender`` is given but does not occur in
        the corresponding column of ``df``.
    """
    #Filtering dataframe for employed people
    df_filtered = df.loc[(df["usual_status_code"]<81)]

    ##Validation
    # Each value is checked against the column it filters, using the full
    # frame. (The state used to be compared with the list of sectors, and the
    # valid values were taken from the employed subset only.)
    for column, label, value in (("pvar6", "state", state),
                                 ("pvar5", "sector", sector),
                                 ("pvar19", "gender", gender)):
        if value is None:
            continue
        valid_values = df[column].unique()
        if value not in valid_values:
            raise ValueError(f"Invalid {label}: {value}. \n Select from {valid_values}")
        df_filtered = df_filtered[df_filtered[column] == value]

    #Filtering for min and max age
    if min_age is not None:
        df_filtered = df_filtered.loc[df_filtered["pvar20"] >= min_age]
    if max_age is not None:
        df_filtered = df_filtered.loc[df_filtered["pvar20"] <= max_age]

    #Flagging informal employment (column-wise instead of one lambda call per row)
    is_informal = (
        (df_filtered["usual_status_contract"] == "1")              #No written contract OR
        | (df_filtered["usual_status_social_security"] == "8")     #No social security benefit OR
        | (df_filtered["usual_status_code"].isin([11, 12, 21]))    #Worked in HH enterprise.
    )
    # Grouping by the flag Series avoids adding a column to (and copying) the filtered frame.
    informal_dummy = is_informal.astype("int64").rename("informal_dummy")

    df_informal = df_filtered.groupby(informal_dummy).agg(
        population_size = ("weights", "sum"),
        sample_size = ("weights", "size")
    ).reset_index()
    df_informal [["population_size", "sample_size"]] = df_informal[["population_size", "sample_size"]].astype(float)
    total_sum = df_informal["population_size"].sum()
    df_informal["percentage"] = (df_informal["population_size"]*100/total_sum).round(2)
    df_informal = df_informal.sort_values(by="percentage", ascending = False)

    #Labelling categories
    labels_dict = {
    1: "Informally employed",
    0: "Formally employed",
    }
    df_informal["informal_dummy"] = df_informal["informal_dummy"].replace(labels_dict)

    print("Share informality")
    return (df_informal.reset_index(drop=True))

In [28]:
# Informal vs. formal employment shares for rows with pvar5 == "urban" (no state, gender or age filter).
share_informality (sector="urban")

Share informality


Unnamed: 0,informal_dummy,population_size,sample_size,percentage
0,Informally employed,111030968.0,55220.0,81.45
1,Formally employed,25280309.0,12895.0,18.55
