# Importing libraries

In [6]:
import pandas as pd
import pathlib

# Reading data

### Creating PLFS dataloader for reading txt files

The same functionality could just as easily be implemented as a plain function instead of a class.

In [3]:
class PLFSdataloader:
    """Read a fixed-width PLFS text file and build a lookup table of its columns.

    Parameters
    ----------
    file_path : location of the fixed-width text file, handed to ``pd.read_fwf``.
    col_names : names given to the parsed columns.
    col_desc : description of each column, in the same order as ``col_names``.
    col_specs : column boundaries, handed to ``pd.read_fwf`` as ``colspecs``.

    The file is read as soon as an instance is created; afterwards ``data_df``
    holds the records and ``col_df`` holds the descriptions indexed by variable.
    """

    def __init__(self, file_path, col_names, col_desc, col_specs):
        """Store the inputs and immediately load both data frames."""
        self.file_path = file_path
        self.col_names = col_names
        self.col_desc = col_desc
        self.col_specs = col_specs

        # Placeholders, replaced by load_data() right below
        self.data_df = None
        self.col_df = None

        self.load_data()

    def load_data(self):
        """Populate ``data_df`` (records) first, then ``col_df`` (column descriptions)."""
        self.data_df = self._read_records()
        self.col_df = self._build_column_table()

    def _read_records(self):
        """Parse the fixed-width file; ``dtype=str`` keeps every field as text."""
        return pd.read_fwf(
            self.file_path,
            colspecs=self.col_specs,
            names=self.col_names,
            dtype=str,
        )

    def _build_column_table(self):
        """Pair each variable name with its description, indexed by 'variable'."""
        lookup = pd.DataFrame(
            {
                "variable": self.col_names,
                "descriptions": self.col_desc,
            }
        )
        return lookup.set_index("variable")

    def get_dataframes(self):
        """Return the ``(data_df, col_df)`` pair."""
        return self.data_df, self.col_df

### Loading household files from first visit

In [10]:
# Sanity check: is the household file reachable from the current working directory?
hhv1_raw_path = pathlib.Path(r"..\Raw data\HHV1.TXT")
hhv1_raw_path.resolve().exists()

True

In [11]:
# Inputs for the household-level file of the first visit (HHV1)

# Location of the raw fixed-width text file
file_path = r'..\Raw data\HHV1.TXT'

# (start, end) character offsets of each field, in file order
col_specs = [
    (0, 4), (4, 7), (7, 9), (9, 11), (11, 12), (12, 14), (14, 16), (16, 19),
    (19, 21), (21, 23), (23, 24), (24, 28), (28, 33), (33, 34), (34, 35), (35, 37),
    (37, 39), (39, 40), (40, 41), (41, 42), (42, 44), (44, 45), (45, 46), (46, 47),
    (47, 55), (55, 63), (63, 71), (71, 79), (79, 87), (87, 95), (95, 97), (97, 105),
    (105, 109), (109, 112), (112, 115), (115, 125), (125, 126),
]

# Variable names hhvar1 ... hhvar37, one per field
col_names = [f"hhvar{position}" for position in range(1, 38)]

# Description of each variable, in the same order as col_names
col_desc = [
    "File Identification",
    "Schdule",
    "Quarter",
    "Visit",
    "Sector",
    "State/Ut Code",
    "District Code",
    "NSS-Region",
    "Stratum",
    "Sub-Stratum",
    "Sub-Sample",
    "Fod Sub-Region",
    "FSU",
    "Sample Sg/Sb No.",
    "Second Stage Stratum No.",
    "Sample Household Number",
    "Month of Survey",
    "Response Code",
    "Survey Code",
    "Reason for Substitution of original household",
    "Household Size",
    "Household Type",
    "Religion",
    "Social Group",
    "Household's usual consumer Expenditure in A Month for purposes out of Goods and Services(Rs.)",
    "Imputed value of usual consumption in a month out of Home Grown stock (Rs.)",
    "Imputed value of usual consumption in a Month from wages in kind,free collection, gifts etc. (Rs.)",
    "Household's Annual Expenditure on purchase of items like clothing, footwear etc.(Rs.)",
    "Household's Annual Expenditure on purchase of durables like Bedstead, TV, fridge etc.(Rs.)",
    "Household'S Usual Consumer Expenditure In A Month (Rs.)",
    "Informant Serial no.",
    "Survey Date",
    "Total Time Taken To Canvass Sch. 10.4",
    "Ns count for sector x stratum x substratum x sub-sample",
    "Ns count for sector x stratum x substratum",
    "Sub-sample wise Multiplier",
    "Count of contributing State x Sector x Stratum x SubStratum in 4 Quarters",
]


In [12]:
# Read the household file, then unpack the records and the column-description table
hhv1_loader = PLFSdataloader(
    file_path=file_path,
    col_names=col_names,
    col_desc=col_desc,
    col_specs=col_specs,
)
df_hhv1, coldesc_hhv1 = hhv1_loader.get_dataframes()


In [13]:
# Preview the first five household records
df_hhv1.head(n=5)

Unnamed: 0,hhvar1,hhvar2,hhvar3,hhvar4,hhvar5,hhvar6,hhvar7,hhvar8,hhvar9,hhvar10,...,hhvar28,hhvar29,hhvar30,hhvar31,hhvar32,hhvar33,hhvar34,hhvar35,hhvar36,hhvar37
0,FVH7,104,Q1,V1,1,2,4,21,1,14,...,10000,0,8733,3,11082023,70,2,4,246798,4
1,FVH7,104,Q1,V1,1,2,4,21,1,14,...,9000,0,7550,3,11082023,60,2,4,35596,4
2,FVH7,104,Q1,V1,1,2,4,21,1,14,...,10000,0,8483,1,11082023,64,2,4,35596,4
3,FVH7,104,Q1,V1,1,2,4,21,1,14,...,8000,0,7017,1,10082023,65,2,4,61700,4
4,FVH7,104,Q1,V1,1,2,4,21,1,14,...,8000,5000,9333,3,10082023,65,2,4,1613680,4


### Loading personal files from first visit

In [15]:
# Inputs for the person-level file of the first visit (PERV1)

# Location of the raw fixed-width text file
file_path = r'..\Raw data\PERV1.TXT'

# (start, end) character offsets of each field, in file order
col_specs = [
    (0, 4), (4, 7), (7, 9), (9, 11), (11, 12), (12, 14), (14, 16), (16, 19),
    (19, 21), (21, 23), (23, 24), (24, 28), (28, 33), (33, 34), (34, 35), (35, 37),
    (37, 39), (39, 40), (40, 41), (41, 44), (44, 45), (45, 47), (47, 49), (49, 51),
    (51, 53), (53, 54), (54, 55), (55, 57), (57, 58), (58, 59), (59, 60), (60, 62),
    (62, 67), (67, 70), (70, 71), (71, 73), (73, 75), (75, 76), (76, 77), (77, 78),
    (78, 79), (79, 80), (80, 82), (82, 87), (87, 90), (90, 92), (92, 94), (94, 95),
    (95, 96), (96, 97), (97, 98), (98, 99), (99, 100), (100, 101), (101, 102), (102, 103),
    (103, 104), (104, 105), (105, 107), (107, 108), (108, 110), (110, 112), (112, 114), (114, 119),
    (119, 121), (121, 123), (123, 125), (125, 130), (130, 132), (132, 134), (134, 136), (136, 138),
    (138, 140), (140, 145), (145, 147), (147, 149), (149, 151), (151, 156), (156, 158), (158, 160),
    (160, 162), (162, 164), (164, 166), (166, 171), (171, 173), (173, 175), (175, 177), (177, 182),
    (182, 184), (184, 186), (186, 188), (188, 190), (190, 192), (192, 197), (197, 199), (199, 201),
    (201, 203), (203, 208), (208, 210), (210, 212), (212, 214), (214, 216), (216, 218), (218, 223),
    (223, 225), (225, 227), (227, 229), (229, 234), (234, 236), (236, 238), (238, 240), (240, 242),
    (242, 244), (244, 249), (249, 251), (251, 253), (253, 255), (255, 260), (260, 262), (262, 264),
    (264, 266), (266, 268), (268, 270), (270, 275), (275, 277), (277, 279), (279, 281), (281, 286),
    (286, 288), (288, 290), (290, 292), (292, 294), (294, 297), (297, 305), (305, 313), (313, 316),
    (316, 319), (319, 329), (329, 330),
]

# Variable names pvar1 ... pvar139, one per field
col_names = [f"pvar{position}" for position in range(1, 140)]

# Description of each variable, in the same order as col_names.
# Every string sits on a single line: a literal split across lines is a syntax error.
col_desc = [
    # Identification block
    "File Identification",
    "Schdule",
    "Quarter",
    "Visit",
    "Sector",
    "State/Ut Code",
    "District Code",
    "NSS-Region",
    "Stratum",
    "Sub-Stratum",
    "Sub-Sample",
    "Fod Sub-Region",
    "FSU",
    "Sample Sg/Sb No.",
    "Second Stage Stratum No.",
    "Sample Household Number",
    # Demographics, education and training
    "Person Serial No.",
    "Relationship To Head",
    "Gender",
    "Age",
    "Marital Status",
    "General Educaion Level",
    "Technical Educaion Level",
    "No. of years in Formal Education",
    "Status of Current Attendance in Educational Institution",
    "Whether received any Vocational/Technical Training",
    "Whether Training completed during last 365 Days",
    "Field Of Training",
    "Duration Of Training",
    "Type Of Training",
    "Source Of Funding The Training",
    # Principal activity
    "Status Code",
    "Industry Code (NIC)",
    "Occupation Code (NCO)",
    "Whether Engaged In Any Work In Subsidiary Capacity",
    "(Principal)location Of Workplace Code",
    "(Principal) Enterprise Type Code",
    "(Principal) No. Of Workers In The Enterprise",
    "(Principal)  Type Of Job Contract",
    "(Principal) Eligble Of Paid Leave",
    "(Principal) Social Security Benefits",
    "(Principal) Usage of product of the economic activity",
    # Subsidiary activity
    "Status Code",
    "Industry Code (NIC)",
    "Occupation Code (NCO)",
    "(Subsidiary) location Of Workplace Code",
    "(Subsidiary)  Enterprise Type Code",
    "(Subsidiary)  No. Of Workers In The Enterprise",
    "(Subsidiary)   Type Of Job Contract",
    "(Subsidiary)  Eligble Of Paid Leave",
    "(Subsidiary)  Social Security Benefits",
    "(Subsidiary) Usage of product of the economic activity",
    # Work history and job search
    "Ever Worked Prior to last 365 days",
    "Duration of engagement in the economic activity in usual Principal Activity Status",
    "Duration of engagement in the economic activity in Subsidiary Activity Status",
    "Efforts undertaken to search work",
    "Duration of spell of Unemployment",
    "Whether Ever Worked ",
    " Reason for not working in last 365 days",
    "Main reason for being in Principal activity status (91 to 97) ",
    # 7th day
    "Status Code for activity 1",
    "Industry Code (NIC) for activity 1",
    "hours actuallly worked for activity 1 on 7 th day",
    "wage earning for activity 1 on 7 th day",
    "Status Code for activity 2",
    "Industry Code (NIC) for activity 2",
    "hours actuallly worked for activity 2 on 7 th day",
    "wage earning for activity 2 on 7 th day",
    "total hours actually worked on 7th day",
    "hours available for aditional worked on 7th day",
    # 6th day
    "Status Code for activity 1",
    "Industry Code (NIC) for activity 1",
    "hours actuallly worked for activity 1 on 6 th day",
    # Previously labelled "7 th day", a copy-paste slip inside the 6th-day group
    "wage earning for activity 1 on 6 th day",
    "Status Code for activity 2",
    "Industry Code (NIC) for activity 2",
    "hours actuallly worked for activity 2 on 6 th day",
    "wage earning for activity 2 on 6 th day",
    "total hours actually worked on 6th day",
    "hours available for aditional worked on 6th day",
    # 5th day
    "Status Code for activity 1",
    "Industry Code (NIC) for activity 1",
    "hours actuallly worked for activity 1 on5 th day",
    "wage earning for activity 1 on 5 th day",
    "Status Code for activity 2",
    "Industry Code (NIC) for activity 2",
    "hours actuallly worked for activity 2 on 5 th day",
    "wage earning for activity 2 on 5 th day",
    "total hours actually worked on 5th day",
    "hours available for aditional worked on 5th day",
    # 4th day
    "Status Code for activity 1",
    "Industry Code (NIC) for activity 1",
    "hours actuallly worked for activity 1 on 4th day",
    "wage earning for activity 1 on 4th day",
    "Status Code for activity 2",
    "Industry Code (NIC) for activity 2",
    "hours actuallly worked for activity 2 on 4th day",
    "wage earning for activity 2 on 4th day",
    "total hours actually worked on 4th day",
    "hours available for aditional worked on 4th day",
    # 3rd day
    "Status Code for activity 1",
    "Industry Code (NIC) for activity 1",
    "hours actuallly worked for activity 1 on 3rd day",
    "wage earning for activity 1 on 3rd day",
    "Status Code for activity 2",
    "Industry Code (NIC) for activity 2",
    "hours actuallly worked for activity 2 on 3rd day",
    "wage earning for activity 2 on 3 rd day",
    "total hours actually worked on 3rd day",
    "hours available for aditional worked on 3rd day",
    # 2nd day
    "Status Code for activity 1",
    "Industry Code (NIC) for activity 1",
    "hours actuallly worked for activity 1 on 2nd day",
    "wage earning for activity 1 on 2nd day",
    "Status Code for activity 2",
    "Industry Code (NIC) for activity 2",
    "hours actuallly worked for activity 2 on 2nd day",
    "wage earning for activity 2 on 2nd day",
    "total hours actually worked on 2nd day",
    "hours available for aditional worked on 2nd day",
    # 1st day
    "Status Code for activity 1",
    "Industry Code (NIC) for activity 1",
    "hours actuallly worked for activity 1 on 1st day",
    "wage earning for activity 1 on 1st day",
    "Status Code for activity 2",
    "Industry Code (NIC) for activity 2",
    "hours actuallly worked for activity 2 on 1st day",
    "wage earning for activity 2 on 1st day",
    "total hours actually worked on 1st day",
    "hours available for aditional worked on 1st day",
    # Current weekly status, earnings and weighting fields
    "Current Weekly Status (CWS)",
    "Industry Code (CWS)",
    "Occupation Code (CWS)",
    "Earnings For Regular Salaried/Wage Activity",
    "Earnings For Self Employed",
    "Ns count for sector x stratum x substratum x sub-sample",
    "Ns count for sector x stratum x substratum",
    "Sub-sample wise Multiplier",
    "Count of contributing State x Sector x Stratum x SubStratum in 4 Quarters",
]

In [16]:
# Read the person-level file, then unpack the records and the column-description table
perv1_loader = PLFSdataloader(
    file_path=file_path,
    col_names=col_names,
    col_desc=col_desc,
    col_specs=col_specs,
)
df_perv1, coldesc_perv1 = perv1_loader.get_dataframes()


In [17]:
# Preview the first five person records
df_perv1.head(n=5)

Unnamed: 0,pvar1,pvar2,pvar3,pvar4,pvar5,pvar6,pvar7,pvar8,pvar9,pvar10,...,pvar130,pvar131,pvar132,pvar133,pvar134,pvar135,pvar136,pvar137,pvar138,pvar139
0,FVP7,104,Q1,V1,1,2,4,21,1,14,...,0,94,,,0,0,2,4,246798,4
1,FVP7,104,Q1,V1,1,2,4,21,1,14,...,0,93,,,0,0,2,4,246798,4
2,FVP7,104,Q1,V1,1,2,4,21,1,14,...,0,11,1.0,611.0,0,6500,2,4,246798,4
3,FVP7,104,Q1,V1,1,2,4,21,1,14,...,3,21,1.0,611.0,0,0,2,4,246798,4
4,FVP7,104,Q1,V1,1,2,4,21,1,14,...,0,91,,,0,0,2,4,246798,4


### Merging household and personal data

In [86]:
# Household id in HHV1: quarter, visit, sector, FSU, sample sg/sb no.,
# second stage stratum no. and sample household number, concatenated in that order
hh_key_columns = ['hhvar3', 'hhvar4', 'hhvar5', 'hhvar13', 'hhvar14', 'hhvar15', 'hhvar16']

hh_key = df_hhv1[hh_key_columns[0]]
for key_column in hh_key_columns[1:]:
    hh_key = hh_key + df_hhv1[key_column]

df_hhv1["hhid"] = hh_key

In [89]:
# Household id in PERV1, built from the same seven fields as in HHV1
person_key_columns = ['pvar3', 'pvar4', 'pvar5', 'pvar13', 'pvar14', 'pvar15', 'pvar16']

person_hh_key = df_perv1[person_key_columns[0]]
for key_column in person_key_columns[1:]:
    person_hh_key = person_hh_key + df_perv1[key_column]

df_perv1["hhid"] = person_hh_key

# Person id in PERV1: the household id followed by the person serial number
df_perv1["perid"] = df_perv1["hhid"] + df_perv1['pvar17']

In [99]:
# Outer-join households to persons on hhid; the indicator adds a "_merge" column
# recording whether each row came from both frames or only one of them
df_merged = df_hhv1.merge(df_perv1, on="hhid", how="outer", indicator=True)

# Tabulate the indicator to check for unmerged rows
df_merged["_merge"].value_counts()

_merge
both          418159
left_only          0
right_only         0
Name: count, dtype: int64

### Generating annual weights

In [110]:
# Inspect pvar139 (count of contributing quarters) before it is used in the weights
quarter_counts = df_merged["pvar139"]
print(quarter_counts.describe())
print(quarter_counts.unique())

count     418159
unique         2
top            4
freq      418049
Name: pvar139, dtype: object
['4' '3']


In [None]:
# Generating weights for first visits

## Changing dtype to numeric
# pvar136 / pvar137: Ns counts, pvar138: sub-sample wise multiplier,
# pvar139: count of contributing quarters. errors='coerce' turns unparseable text into NaN.
weight_inputs = ['pvar138','pvar139','pvar136','pvar137']
df_merged[weight_inputs] = df_merged[weight_inputs].apply(pd.to_numeric, errors = 'coerce')

## Assigning weights
# weight = pvar138 / (pvar139 * 100) where pvar136 == pvar137, else pvar138 / (pvar139 * 200).
# Computed on whole columns instead of a row-wise apply, which calls a Python
# lambda once per row and is very slow on a frame of this size.
counts_match = df_merged['pvar136'] == df_merged['pvar137']
divisor = (df_merged['pvar139'] * 100).where(counts_match, df_merged['pvar139'] * 200)

# Round to the nearest integer and store as int64, as the row-wise round() did.
# A missing input still raises here instead of yielding a silent NaN weight.
df_merged["weights"] = (df_merged['pvar138'] / divisor).round().astype("int64")

In [122]:
# Sum of the weights, i.e. the total population as per PLFS
estimated_population = df_merged["weights"].sum()
estimated_population

np.int64(1204324589)

# Exporting files in dta format

In [128]:
# Write the merged frame to Stata format without the index.
# NOTE(review): this path has no leading "..\" while the CSV exports below write to
# "..\Extracted_files" - confirm which folder is intended.
stata_path = r"Extracted_files\HHV1_PERV1_merged.dta"
df_merged.to_stata(stata_path, write_index=False)

In [None]:
# Save the household and person column-description tables as CSV files
for description_table, csv_path in (
    (coldesc_hhv1, r"..\Extracted_files\hhv1 column description.csv"),
    (coldesc_perv1, r"..\Extracted_files\perv1 column descriptions.csv"),
):
    description_table.to_csv(csv_path)