# Importing libraries

In [1]:
import pandas as pd
import os
import numpy as np

# Reading pickle files

In [2]:
# Collect every .pkl file in the current working directory, skipping hh_quintile.pkl.
# os.listdir() returns names in arbitrary order, so the list is sorted: the later
# drop_duplicates(subset='item_code') keeps the first occurrence, and without a fixed
# order the surviving row for a repeated item code could change between runs/machines.
pkl_files = sorted(
    name for name in os.listdir()
    if name.endswith('.pkl') and name != "hh_quintile.pkl"
)

In [3]:
# Dictionary containing level wise data frames, keyed by 'df_<file name without extension>'
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling; only load
# .pkl files that come from a trusted source.
df_dict = {}
for file in pkl_files:
    # Strip only the extension. file.split(".")[0] cut at the FIRST dot, so names such as
    # 'a.1.pkl' and 'a.2.pkl' collapsed to the same key and one frame silently replaced the other.
    df_name = 'df_' + os.path.splitext(file)[0]
    df = pd.read_pickle(file)
    df = df.transpose()                                         # Former column labels become the row index
    df = df.rename_axis(index = 'item_code', columns = None)    # Names the index 'item_code' and clears the columns' axis name
    df = df.reset_index()                                       # Makes item_code a df column and resets index
    df_dict[df_name] = df

In [4]:
# Vertically stack all loaded data frames and keep a single row per item code.
# Errors are deliberately NOT caught here: the earlier try/except only printed the message,
# which left final_df undefined and made every following cell fail with an unrelated NameError.
if not df_dict:
    raise ValueError("df_dict is empty: no data frames were loaded, so there is nothing to concatenate")

# One concat call over all frames instead of growing final_df inside a loop
# (the loop re-copied every accumulated row on each pass and started from an empty frame).
final_df = pd.concat(list(df_dict.values()))

# drop_duplicates keeps the first occurrence, so an item code present in several frames
# takes its values from the earliest frame in df_dict.
final_df = final_df.drop_duplicates(subset = 'item_code')

In [5]:
# Sanity check of the combined frame: rich display of the first rows,
# followed by its (rows, columns) shape.
display(final_df.head())
print(final_df.shape)

Unnamed: 0,item_code,1,2,3,4,5
0,260,106.44,125.36,148.45,170.06,211.13
1,202,723.6,1035.53,1266.5,1485.34,1914.3
2,219,6935.13,9821.9,11450.11,13007.86,16395.59
3,189,4619.29,6763.21,7676.86,8396.67,10051.68
4,253,478.37,687.28,801.78,937.34,1139.82


(444, 6)


# Item codes

In [6]:
# All item codes, grouped into named lists.
# Codes are written as strings (several start with '0'); the lists built from them
# are matched against final_df['item_code'] with np.isin further down.

## Level 5 (ic_51 ... ic_68 are concatenated into ic_lvl5_list further down)
ic_51 = ['061', '062', '070', '101', '102', '103', '105', '106', '107', '108', '110', '111', '112', '114', '001', '002', '122', '129', '139', \
         '055', '064', '068','116', '058', '056', '115', '121', '057', '063', '066', '117', '120', '059', '118', '060', '067'] 
ic_52 = ['140', '143', '144', '145', '141', '142', '146', '148', '150', '158', '152', '071', '072', '159']
ic_53 = ['073', '074', '178', '170', '171', '172', '173', '174', '175', '179']
ic_61 = ['160', '162', '164', '163', '165', '166', '003', '004', '005', '092', '169', '007', '167', '006']
ic_62 = ['201', '200', '202', '206', '203', '208', '207', '211', '213', '212', '210', '215', '214', '205', '204', '216', '217', '219']
ic_63 = ['220', '236', '224', '231', '237', '225', '226', '230', '228', '222', '232', '093', '239', '233', '223', '235', '234', '221', '238', '227']
ic_64 = ['240', '241', '243', '242', '245', '246', '094', '249', '247', '008', '010']
ic_65 = ['190', '195', '191', '192', '193', '194', '196', '199']
ic_66 = ['181', '184', '182', '183', '188', '095', '075', '189', '185', '180']
ic_67 = ['256', '254', '252', '253', '251', '250', '258', '255', '257', '260', '263', '261', '269']
ic_68 = ['270', '271', '272', '273', '274', '011', '275', '276', '278', '279']

## Level 6 (ic_71 and ic_72 are concatenated into ic_lvl6_list further down)
ic_71 = [ "076","280", "281", "282", "283", "284", "289"]
ic_72 = ["012", "161", "290", "013", "291", "292", "293", "294", "295", "113", "014", "015", "296", "299"]

## Level 8 (ic_81 is the only list at this level; ic_lvl8_list is bound to it further down)
ic_81 = ["332", "338", "331", "334", "335", "096", "349", '346', '333', '336', '340', '344', '341', '345', '342', '337', '343']

## Level 9 (ic_91 ... ic_115 below are all concatenated into ic_lvl9_list further down)
ic_91 = ["450", "016", "451", "453", "017", "018", "452", "020", "021", "456", "022", "454", "455", "457", "459"]   
ic_92 = ["466", "467", "465", "471", "464", "468", "470", "023", "460", "462", "461", "463", "472", "479"]
ic_101 = ["405", "406", "400", "401", "404", "408", "409"]
ic_102 = ["410", "412", "411", "413", "414", "419"]
ic_103 = ['420', '422', '421', '423', '424', '429']
ic_111 = ["024", "504", "025", "506", "505", "500", "508", "501", "510", "026", "511", "503", "512", "513", "514", "519"]
ic_112 = ["488", "487", "496", "540", "483", "480", "484", "482", "481", "027", "485", "486", "492", "495", "494", "493", "490", "497", "499"]
ic_113 = ["437", "430", "402", "435", "433", "028", "438", "439"]
ic_114 = ["520", "521", "522", "523", "529", "539"]
ic_115 = ["899"] # Kept separate from ic_114 because it's for 365 days

## Level 10 (ic_121, ic_122 and ic_123 are concatenated into ic_lvl10_list further down)
ic_121 = ['300', '301', '302', '309']
ic_122 = ['310', '311', '316', '312', '314', '315', '313', '317', '319']
ic_123 = ['322', '324', '323', '321', '320', '325', '329']

## Level 12 (ic_131, ic_132 and ic_133 are concatenated into ic_lvl12_list further down)
ic_131 = ["351", "352", "353", "030", "363", "031", "364", "354", "032", "358", "360", "356", "357", "033", "361", "362", "365", "034", "368", "355", "366", "367", "370", "350", "371", "373", "372", "374", "375", "379"]
ic_132 = ["393", "390", "391", "392", "394", "395", "399"]
ic_133 = ["380", "381", "383", "382", "385", "384", "386", "389"]

## Level 13 (ic_1401 ... ic_1410 are concatenated into ic_lvl13_list further down)
ic_1401 = ["440", "441", "442", "623", "624", "622", "620", "554", "035", "621", "625", "629"]
ic_1402 = ["601", "602", "600", "603", "604", "609"]
ic_1403 = ["036", "037", "432", "097", "099"]
ic_1404 = ["040", "041", "610", "042", "611", "619"]
ic_1405 = ["580", "043", "581", "044", "045", "588", "585", "582", "046", "586", "590", "047", "048", "587", "591", "584", "583", "592", "599"]
ic_1406 = ["570", "571", "572", "050", "573", "579"]
ic_1407 = ["550", "551", "552", "553", "555", "556", "557", "559"]
ic_1408 = ["560", "561", "562", "563", "564", "569"]
ic_1409 = ["632", "630", "631", "633", "639"]
ic_1410 = ["640", "641", "642", "643", "649"]

In [7]:
# One flat list of item codes per level, built by joining that level's group lists in order
ic_lvl5_list = [
    *ic_51, *ic_52, *ic_53,
    *ic_61, *ic_62, *ic_63, *ic_64, *ic_65, *ic_66, *ic_67, *ic_68,
]
ic_lvl6_list = [*ic_71, *ic_72]
ic_lvl8_list = ic_81    # single group at this level: same list object, not a copy
ic_lvl9_list = [
    *ic_91, *ic_92,
    *ic_101, *ic_102, *ic_103,
    *ic_111, *ic_112, *ic_113, *ic_114, *ic_115,
]
ic_lvl10_list = [*ic_121, *ic_122, *ic_123]
ic_lvl12_list = [*ic_131, *ic_132, *ic_133]
ic_lvl13_list = [
    *ic_1401, *ic_1402, *ic_1403, *ic_1404, *ic_1405,
    *ic_1406, *ic_1407, *ic_1408, *ic_1409, *ic_1410,
]

In [8]:
# Item codes per questionnaire/category, each made of the level lists that belong to it
fdq_list = [*ic_lvl5_list, *ic_lvl6_list]
csq_list = [*ic_lvl8_list, *ic_lvl9_list, *ic_lvl10_list]
dgq_list = [*ic_lvl12_list, *ic_lvl13_list]

In [10]:
# Items to be avoided while summing to avoid double counting (sub totals + others).
# One list per level; they are merged into sub_total_list further down and the matching
# rows are filtered out of final_df before aggregating.
# NOTE(review): '065' is listed in ic_lvl5_st_list but does not appear in any of the ic_* lists
# defined in this notebook - confirm whether it is missing from the level 5 item lists.
ic_lvl5_st_list  = ['129','159', '179', '169', '219', '239', '249', '199','189','269', '279','092' ,'093', '094','095', '055', '056', '057', '058', '059', '060', '115', '116', '117', '118', '121', '120', '063', '064', '065', '066', '067', '068']
ic_lvl6_st_list = ['289', '299']
ic_lvl8_st_list = ['349', '096']
ic_lvl9_st_list = ['459', '479', '409', '419', '429', '519', '499', '439', '529', '539'] # Item '539' is imputed rent. It is not considered while calculating total expenditure
ic_lvl10_st_list = ['309', '319', '329']
ic_lvl12_st_list = ["379", "399", "389"]
ic_lvl13_st_list =  ["629", "609", "099", "619", "599", "579", "559", "569", "639", "649"]

In [11]:
# Label every row with the questionnaire its item code belongs to (fdq / csq / dgq).
# A single mapping drives both the membership masks and the labels, so they cannot drift apart.
# Rows whose item code is in none of the lists get an empty-string category.
category_codes = {'fdq': fdq_list, 'csq': csq_list, 'dgq': dgq_list}
choice_list = list(category_codes)
condition_list = [np.isin(final_df['item_code'], codes) for codes in category_codes.values()]
final_df['category'] = np.select(condition_list, choice_list, default = "")

In [12]:
# Every sub-total / "others" item code across all levels, to be excluded before summing
sub_total_list = [
    *ic_lvl5_st_list,
    *ic_lvl6_st_list,
    *ic_lvl8_st_list,
    *ic_lvl9_st_list,
    *ic_lvl10_st_list,
    *ic_lvl12_st_list,
    *ic_lvl13_st_list,
]

In [13]:
# Drop the sub-total / "others" rows so they are not counted twice when summing,
# then renumber the remaining rows from 0
is_sub_total = final_df['item_code'].isin(sub_total_list)
final_df = final_df.loc[~is_sub_total].reset_index(drop=True)

In [14]:
# First rows after the sub-total rows were removed and the category column was added
final_df.head()

Unnamed: 0,item_code,1,2,3,4,5,category
0,260,106.44,125.36,148.45,170.06,211.13,fdq
1,202,723.6,1035.53,1266.5,1485.34,1914.3,fdq
2,253,478.37,687.28,801.78,937.34,1139.82,fdq
3,237,149.92,287.6,394.4,514.73,755.58,fdq
4,140,895.66,1266.12,1386.59,1465.6,1739.44,fdq


In [15]:
# Column-wise totals per category; every column except the two label columns is summed.
# Rows with an empty-string category form their own (unnamed) group in the result.
value_columns = final_df.columns.difference(['item_code', 'category'])
final_df.groupby('category')[value_columns].sum()

Unnamed: 0_level_0,1,2,3,4,5
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,95854.28,160936.5,215570.25,287080.73,494330.71
csq,37124.39,64239.12,89539.37,125828.5,235929.11
dgq,9894.61,17764.36,24832.27,35249.0,75709.78
fdq,48835.33,78933.02,101198.64,126003.2,182691.81


In [16]:
# Copy the item-level frame (sub-total rows removed, category column included) to the
# system clipboard, e.g. for pasting into a spreadsheet.
# NOTE(review): this exports final_df, not the per-category totals computed above - confirm that is intended.
final_df.to_clipboard()