# Importing Libraries

In [6]:
import pandas as pd
import numpy as np
import os
import pathlib

In [22]:
# Check relative paths
#pathlib.Path(r"..\..\..\Dta raw files\level_14.dta").resolve()

# Loading data

In [18]:
# Build the raw-data paths with pathlib so the separators are right on every OS
# (the hard-coded backslashes only resolved on Windows). Values remain plain strings.
raw_data_dir = pathlib.Path("..", "..", "..", "Dta raw files")
level_14_path = str(raw_data_dir / "level_14.dta")
level_01_path = str(raw_data_dir / "level_01.dta")
level_15_path = str(raw_data_dir / "level_15.dta")

In [19]:
# Load the level 14 Stata file into a DataFrame
df_level_14 = pd.read_stata(level_14_path)

In [5]:
# Preview the first rows of the level 14 table
df_level_14.head()

Unnamed: 0,index,hhid,questionnaire_num,level,section,item_code,value,multiplier
0,0,31000301,C,14,11.3,439,1399,96498
1,1,31000301,F,14,6.2,219,37,96498
2,2,31000301,F,14,6.5,199,200,96498
3,3,31000301,C,14,8.1,349,347,96498
4,4,31000301,D,14,14.1,629,1650,96498


In [6]:
# Reading level 15 for HH size: one row per household, one hh_size column per questionnaire
df_level_15 = (
    pd.read_stata(level_15_path)
      .pivot(index = "hhid", columns = "questionnaire_num", values = 'hh_size')
      [['C','D','F']]  # keep only the C, D and F questionnaire household sizes
)
df_level_15.head(10)

questionnaire_num,C,D,F
hhid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
31000301,1.0,1.0,1.0
31000302,5.0,5.0,5.0
31000303,5.0,5.0,5.0
31000304,2.0,2.0,2.0
31000305,1.0,1.0,1.0
31000306,1.0,1.0,1.0
31000307,3.0,3.0,3.0
31000308,5.0,5.0,5.0
31000309,4.0,4.0,4.0
31000310,3.0,3.0,3.0


## Commodity mapping

In [7]:
# One row per commodity group: (key, description, item codes, recall period in days)
_commodity_groups = [
    ('food_w',        'Food items with recorded weekly expenditure',
     [169, 219, 239, 249, 199, 189, 269, 279, 289, 299], 7),
    ('food_m',        'Food items with recorded monthly expenditure',
     [129, 139, 159, 179], 30),
    ('consumables_w', 'Consumable items with recorded weekly expenditure',
     [309, 319, 329], 7),
    ('consumables_m', 'Consumable items with recorded monthly expenditure',
     [349, 459, 479, 429, 519, 499, 439, 529], 30),
    ('consumables_y', 'Consumable items with recorded yearly expenditure',
     [409, 419, 899], 365),
    ('durables_y',    'Durable items with recorded yearly expenditure',
     [379, 399, 389, 629, 609, 99, 619, 599, 579, 559, 569, 639, 649], 365),
]

# key -> {'description', 'item_list', 'duration'}, in the order listed above
commodity_mapping = {
    group_key: {'description': group_description, 'item_list': group_items, 'duration': group_days}
    for group_key, group_description, group_items, group_days in _commodity_groups
}

## Scaling function

In [8]:
#Scaling function to convert expenditure recorded for the CSQ/DGQ household size to the FDQ household size

def scaling_column (hh_series, old_hsize_series, new_hsize_series):
    """
    Rescales values from one household size to another.

    Args:
        hh_series: Values to rescale.
        old_hsize_series: Household size the values are divided by.
        new_hsize_series: Household size the per-member values are multiplied by.

    Return:
        hh_series / old_hsize_series * new_hsize_series, matched along the index.
    """
    return (
        hh_series
        .div(old_hsize_series, axis = 'index')   # value per member under the old size
        .mul(new_hsize_series, axis = 'index')   # scaled up to the new size
    )

## Pivoting and scaling dataframe

In [None]:
# One row per household, one column per item code
df_pivot = df_level_14.pivot(index = 'hhid', columns="item_code", values = 'value')
print(f"# of households = {df_pivot.shape[0]}")

# Item codes grouped by the questionnaire they were recorded in
ic_food = commodity_mapping['food_w']['item_list'] + commodity_mapping['food_m']['item_list'] #List of FDQ items
ic_consumables = (commodity_mapping['consumables_w']['item_list']
                  + commodity_mapping['consumables_m']['item_list']
                  + commodity_mapping['consumables_y']['item_list']) #List of CSQ items
ic_durables = list(commodity_mapping['durables_y']['item_list']) #List of DGQ items

# Rescale every item column from its own questionnaire's household size to the FDQ ('F') size.
# FDQ items go through the same call with 'F' as both the old and the new size.
fdq_hh_size = df_level_15['F']
for source_questionnaire, item_codes in (('F', ic_food), ('C', ic_consumables), ('D', ic_durables)):
    source_hh_size = df_level_15[source_questionnaire]
    df_pivot[item_codes] = df_pivot[item_codes].apply(
        lambda col: scaling_column(hh_series = col,
                                   old_hsize_series = source_hh_size,
                                   new_hsize_series = fdq_hh_size))

#Display
df_pivot.head()

# of households = 261746


item_code,99,129,139,159,169,179,189,199,219,239,...,559,569,579,599,609,619,629,639,649,899
hhid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31000301,,135.0,,54.0,,4.0,40.0,200.0,37.0,20.0,...,600.0,,240.0,400.0,,,1650.0,,,
31000302,,1267.0,,218.0,225.0,66.0,110.0,226.0,165.0,60.0,...,500.0,1500.0,,,6000.0,,2000.0,,700.0,
31000303,,1586.0,,234.0,350.0,80.0,120.0,760.0,224.0,85.0,...,1000.0,,1100.0,750.0,8500.0,,600.0,,1200.0,
31000304,,800.0,,102.0,175.0,58.0,85.0,210.0,116.0,100.0,...,600.0,700.0,350.0,2000.0,,,650.0,,2500.0,780.0
31000305,,181.0,,31.0,50.0,14.0,34.0,24.0,19.0,20.0,...,500.0,,300.0,950.0,,,400.0,2500.0,,785.0


## Method: Aggregate expenditure -- Food, Consumables and Durables

In [10]:
#Method to merge weekly/monthly/yearly expenditure into total monthly expenditure for the category 

def expenditure (hh_series, commodity_mapping = commodity_mapping):
    """
    Converts period totals (weekly/monthly/yearly) into one total monthly expenditure.

    Args:
        hh_series: Series whose index labels are keys of commodity_mapping and whose values are
                   the expenditure recorded over that key's 'duration' (in days).
        commodity_mapping: Mapping of key -> dict holding a 'duration' entry.

    Return:
        total_exp: Sum over the keys of (value / duration) * 30, each term rounded to 2 decimals.

    Raises:
        KeyError: If any index label of hh_series is not a key of commodity_mapping.
    """
    unknown_keys = [key for key in hh_series.index if key not in commodity_mapping]
    if unknown_keys:
        # This case used to print a message and then crash on `return total_exp`
        # with an unrelated UnboundLocalError; fail with the real cause instead.
        raise KeyError(f"List not in commodity mapping: {unknown_keys}")

    durations = hh_series.index.map(lambda x: commodity_mapping[x]['duration']) #Duration (days) for each label
    item_monthly = ((hh_series/durations)*30).round(2) #Daily rate scaled to a 30-day month
    total_exp = item_monthly.sum()
    return total_exp

## Total expenditure

In [None]:
#Generating total expenditure for each key in commodity mapping
#(vectorised row sum; like the previous row-wise apply it skips missing items)
for key, commodity_group in commodity_mapping.items():
    df_pivot[key] = df_pivot[commodity_group['item_list']].sum(axis = 1)

In [12]:
#Generating total monthly expenditures for each of the categories: food, consumables, and durables
monthly_total_inputs = {
    'food_total': ['food_w', 'food_m'],
    'consumables_total': ['consumables_w', 'consumables_m', 'consumables_y'],
    'durables_total': ['durables_y'], #one-element list: expenditure() is applied row-wise on a dataframe
}
for total_column, period_columns in monthly_total_inputs.items():
    df_pivot[total_column] = df_pivot[period_columns].apply(expenditure, axis = 1)

In [13]:
#Total monthly expenditure of HHs: sum of the food, consumables and durables monthly totals
df_pivot['total expenditure'] = df_pivot['food_total'] + df_pivot['consumables_total'] + df_pivot['durables_total']

In [14]:
# Preview the item columns together with the newly added total columns
df_pivot.head()

item_code,99,129,139,159,169,179,189,199,219,239,...,food_w,food_m,consumables_w,consumables_m,consumables_y,durables_y,food_total,consumables_total,durables_total,total expenditure
hhid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31000301,,135.0,,54.0,,4.0,40.0,200.0,37.0,20.0,...,1986.0,193.0,800.0,6227.0,0.0,16007.0,8704.43,9655.57,1315.64,19675.64
31000302,,1267.0,,218.0,225.0,66.0,110.0,226.0,165.0,60.0,...,2679.0,1551.0,1006.0,15688.0,49750.0,38650.0,13032.43,24088.47,3176.71,40297.61
31000303,,1586.0,,234.0,350.0,80.0,120.0,760.0,224.0,85.0,...,2649.0,1900.0,0.0,19216.0,27050.0,41950.0,13252.86,21439.29,3447.95,38140.1
31000304,,800.0,,102.0,175.0,58.0,85.0,210.0,116.0,100.0,...,1093.0,960.0,0.0,5496.0,780.0,22397.0,5644.29,5560.11,1840.85,13045.25
31000305,,181.0,,31.0,50.0,14.0,34.0,24.0,19.0,20.0,...,242.0,226.0,0.0,4354.0,785.0,13190.0,1263.14,4418.52,1084.11,6765.77


## Calculating MPCE + Assigning separate deciles to rural and urban sector

In [15]:
#Reading level 1 for sector (Rural vs Urban) and multiplier, indexed by hhid in ascending order
level_01_columns = ['hhid', 'sector', 'multiplier']
df_level_01 = (
    pd.read_stata(level_01_path)[level_01_columns]
      .sort_values(by = 'hhid')
      .set_index('hhid')
)
df_level_01.head()

Unnamed: 0_level_0,sector,multiplier
hhid,Unnamed: 1_level_1,Unnamed: 2_level_1
31000301,2,96498
31000302,2,96498
31000303,2,96498
31000304,2,96498
31000305,2,96498


In [16]:
## Merging the expenditure totals from df_pivot with the 'F' household size from df_level_15
## (inner join on the hhid index, so only households present in both tables are kept)
df_mpce = pd.merge(df_pivot[['food_total', 'consumables_total', 'durables_total', 'total expenditure']], df_level_15['F'], 
                   how = 'inner', 
                   #indicator = True,
                   left_index = True,
                   right_index = True)

## Merging df_mpce with the sector and multiplier columns of df_level_01 (inner join on the hhid index)
df_mpce = pd.merge(df_mpce, df_level_01,
                     how = 'inner', 
                    #indicator = True,
                    left_index = True,
                    right_index = True)

## Renaming the 'F' household-size column to hh_size
df_mpce = df_mpce.rename(columns = {"F": "hh_size"})

## Expanding each row hh_size number of times, so the cumulative weights below count one row per member
## NOTE(review): hh_size shows as float in the level 15 preview; confirm no household has a missing size here
df_mpce = df_mpce.loc[df_mpce.index.repeat(df_mpce['hh_size'])]

## MPCE for each household: total expenditure divided by household size
df_mpce['mpce'] = round(df_mpce['total expenditure']/df_mpce['hh_size'],2) #rounded off to two digits


# Calculating rural-urban deciles
## Sorting by sector and MPCE so the cumulative sum below runs from lowest to highest MPCE within each sector
df_mpce = df_mpce.sort_values(by = ['sector','mpce'])

## Rescaling the multipliers so that they sum to 100 within each sector
df_mpce["normal_multiplier"] = df_mpce.groupby('sector')['multiplier'].transform(lambda x: x*(100/x.sum())) #groupby requires use of transform (and not apply)

## Running total of the rescaled multipliers within each sector (reaches about 100 at the highest MPCE)
df_mpce['cum_normal_multiplier'] = df_mpce.groupby('sector')['normal_multiplier'].cumsum()

## Decile classes (labels 1 to 10) based on cumulative normalised multiplier values
## NOTE(review): with an integer `bins`, pd.cut builds ten equal-width intervals from the min/max of the
## whole column (both sectors pooled), so the edges are only approximately 10, 20, ..., 100 -- confirm intended
df_mpce['decile'] = pd.cut(df_mpce['cum_normal_multiplier'],
                             bins = 10,
                             labels = range(1,11),
                             include_lowest=False
                             )

##Collapsing df_mpce back to one row per household, keeping the first value of each column within the household
df_mpce = df_mpce.groupby("hhid").first()

In [17]:
# Preview the household-level MPCE table with its decile assignment
df_mpce.head()

Unnamed: 0_level_0,food_total,consumables_total,durables_total,total expenditure,hh_size,sector,multiplier,mpce,normal_multiplier,cum_normal_multiplier,decile
hhid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
31000301,8704.43,9655.57,1315.64,19675.64,1.0,2,96498,19675.64,0.00028,98.070289,10
31000302,13032.43,24088.47,3176.71,40297.61,5.0,2,96498,8059.52,0.00028,77.872489,8
31000303,13252.86,21439.29,3447.95,38140.1,5.0,2,96498,7628.02,0.00028,74.870871,8
31000304,5644.29,5560.11,1840.85,13045.25,2.0,2,96498,6522.62,0.00028,65.18541,7
31000305,1263.14,4418.52,1084.11,6765.77,1.0,2,96498,6765.77,0.00028,67.463181,7


In [18]:
#Method to calculate weighted expenditure for any list of categories and any expenditure column
def wt_mpce_exp (table,categories: list, expenditure_column: str):
    """
    Calculates category wise expenditure weighted by multiplier

    Args:
        table: Dataframe containing 'multiplier', 'hh_size', the expenditure column, and the categories.
               It is not modified; the calculation runs on a copy.
        categories: list of categories (column or index-level names) over which to calculate
                    weighted per capita expenditure.
        expenditure_column: Name of the expenditure column.

    Return:
        agg_df: Aggregate dataframe with one row per category combination and the columns
                mult_exp (sum of weight * expenditure), mult_hh_size (sum of weight * hh_size) and
                wt_expenditure (mult_exp / mult_hh_size, rounded to 2 decimals),
                where weight = round(multiplier / 100).

    Raises:
        KeyError: If a required column or a category is missing from the table.
    """
    # Fail loudly on bad input: a blanket try/except that only printed the error made the
    # function return None silently, hiding the problem from the caller.
    missing_columns = [col for col in ('multiplier', 'hh_size', expenditure_column)
                       if col not in table.columns]
    category_names = [categories] if isinstance(categories, str) else list(categories)
    missing_columns += [cat for cat in category_names
                        if cat not in table.columns and cat not in table.index.names]
    if missing_columns:
        raise KeyError(f"Columns missing from table: {missing_columns}")

    dataframe = table.copy()
    weight = (dataframe['multiplier']/100).round(0)                     #Multiplier scaled down by 100 and rounded
    dataframe['exp_mult'] = weight * dataframe[expenditure_column]      #Multiplying expenditure col with multiplier
    dataframe['hh_size_mult'] = weight * dataframe['hh_size']           #Multiplying hh_size col with multiplier
    agg_df = dataframe.groupby(by = categories, as_index = False, observed = False).agg(
        mult_exp = ('exp_mult','sum'),                                  #Category wise sum-product of multiplier and expenditure
        mult_hh_size = ('hh_size_mult', 'sum')                          #Category wise sum-product of multiplier and hh_size
    )
    agg_df['wt_expenditure'] = round(agg_df['mult_exp']/agg_df['mult_hh_size'],2)   #Weighted per capita expenditure for each category
    return agg_df

In [19]:
# Weighted per capita total expenditure for each sector and decile
wt_mpce_exp(df_mpce, ['sector', 'decile'], 'total expenditure')

Unnamed: 0,sector,decile,mult_exp,mult_hh_size,wt_expenditure
0,1,1,137202400000.0,86987797.0,1577.26
1,1,2,183681100000.0,86981366.0,2111.73
2,1,3,213415700000.0,86979422.0,2453.63
3,1,4,240813700000.0,86988491.0,2768.34
4,1,5,269098500000.0,86976609.0,3093.92
5,1,6,300520100000.0,86987044.0,3454.77
6,1,7,338097300000.0,86979146.0,3887.11
7,1,8,387794900000.0,86981502.0,4458.36
8,1,9,466031000000.0,87010538.0,5356.03
9,1,10,745243400000.0,86953659.0,8570.58


# Saving

In [20]:
# Destination folder for the exported MPCE table
base_export_path = r"G:\.shortcut-targets-by-id\1NprIdwv7vnADEhIOU_jU6YsVBoFvikqW\Coal Research\HCES 2022-23\Python implementation\Codes\Population based MPCE"

try:
    # Pickle the household-level MPCE table into the export folder
    export_file = base_export_path + r"\MPCE_1Jan24.pkl"
    df_mpce.to_pickle(export_file)
except Exception as save_error:
    # A failed save is reported by printing the exception; nothing is re-raised
    print(save_error)