In [27]:
import pandas as pd
import matplotlib.pyplot as plt

In [17]:
# Load the survey extract from its Stata file. convert_categoricals=False keeps
# value-labelled columns as raw codes rather than pandas Categoricals; the
# recoding cells further down compare against those raw codes (e.g. `== 1`).
df = pd.read_stata(
    "UP_BH_Individual_UDAYA 2_Uploaded.dta",
    convert_categoricals=False,
)

In [None]:
attributes = [
    # ---- Household ----
    "h15m",  # male members in the household (total)
    "h15f",  # female members in the household (total)
    "h15t",  # all members in the household (total)
    "h16",   # does a household member own the house?
    "h18",   # ownership of agricultural land
    "h19u",  # unit of agricultural land
    "h21",   # rooms in the household (count)
    "h22",   # toilet facility type

    # ---- Demographics and background ----
    "gg101b",  # age, in completed years
    "h05_1",   # age of the member
    "h26",     # religion
    "h27",     # caste
    # "h27o" (caste: specify) is currently left out
    "gg103",   # mother: highest level of schooling successfully completed
    "h08_1",   # father: highest level of schooling successfully completed (years of education)

    # ---- Habits and substance use ----
    "h35",     # anyone in the family consumed tobacco products
    "h36",     # anyone in the family consumed alcohol
    "h37",     # anyone in the family consumed drugs
    "gg1101",  # respondent ever consumed tobacco products
    "gg1102",  # respondent consumed tobacco products in the last month
    "gg1103",  # respondent ever had alcohol
    "gg1104",  # respondent had alcoholic drinks in the last month
    "gg1105",  # respondent ever used drugs
    "gg1106",  # respondent took drugs in the last month

    # ---- Health and well-being ----
    "gh1",     # weight (kg)
    "gh3",     # height (cm)
    "gg627a",  # last 2 weeks: trouble falling asleep, or sleeping too much
    "gg627b",  # last 2 weeks: feeling tired or having little energy
    "gg627c",  # last 2 weeks: poor appetite or overeating
    "gg627d",  # last 2 weeks: trouble concentrating on things
    "gg627g",  # last 2 weeks: feeling bad about yourself
    "gg612",   # last 3 months: injured in a road accident
    "gg613",   # last 3 months: injured under any other circumstances

    # ---- Education and school ----
    "gg106",  # currently attending school
    "gg107",  # attended school all days
    "gg110",  # class performance in the last academic year
    "gg201",  # last year: discussed school performance with mother or father
    "gg202",  # last year: discussed friendship with mother or father
    "gg203",  # discussed being teased with mother or father
    "gg204",  # last year: discussed physical changes (boys) / menstruation (girls) with mother/father

    # ---- Social relationships and role models ----
    "gg701",  # number of friends
    "gg801",  # ever had a boyfriend/girlfriend
    "gg802",  # number of boyfriends/girlfriends
    "gg803",  # ever hugged boyfriend(s)/girlfriend(s)
    "gg804",  # ever kissed boyfriend(s)/girlfriend(s) on the lips
    "gg805",  # ever had sexual intercourse with boyfriend(s)/girlfriend(s)
    "gg209",  # sees any adult(s) as a role model

    # ---- Employment and economic activity ----
    "gg160",  # currently seeking a job for pay

    # ---- Violence and abuse ----
    "gg179",  # father ever beat mother
    "gg180",  # father beat mother in the last 12 months
    "gg181",  # physically hurt by father and/or mother since age 10
    "gg182",  # forms of physical violence experienced
    "gg183",  # physically hurt by father and/or mother in the last 12 months

    # ---- Activities and behaviours ----
    "gg708",   # often played outdoor games / engaged in physical activities
    "gg1213",  # ever accessed pornographic material on the internet
    "gg1107",  # involved in physical fights in the last 12 months
    "gg1108",  # ever robbed anyone

    # ---- Technology and device usage ----
    "fng1207bh",  # hours spent on a mobile phone on the last day
]


In [19]:
# Outcome columns: suicidal-ideation and self-harm items.
label = [
    "gg627i",  # last 2 weeks: had thoughts that r would be better off dead
    "gg628",   # last year: ever seriously considered attempting suicide
    "gg629",   # last year: made a plan about how r would attempt suicide
    "gg630",   # last year: number of times r actually attempted suicide
    "gg631a",  # last year: when agitated, angry or sad, ever cut/beaten himself/herself
    "gg631b",  # last year: when agitated, angry or sad, ever pulled own hair
    "gg631c",  # last year: when agitated, angry or sad, ever banged/hit himself/herself
]

In [20]:
# Marks on a 1-10 scale given to each suicidal-behaviour item.
weight_label = dict(
    gg627i=5,
    gg628=8,
    gg629=7,
    gg630=9,
    gg631a=5,
    gg631b=3,
    gg631c=4,
)

In [21]:
# Keep only the selected predictor (attributes) and outcome (label) columns.
# The explicit .copy() makes the subset an independent DataFrame, so the column
# assignments in the following cells write to it directly instead of to a
# slice of the full survey frame (which triggers SettingWithCopyWarning).
df = df[attributes + label].copy()

In [22]:
# Recode 'h18' to a boolean: True where the code equals 1, False for every
# other value (missing values included). A vectorised comparison replaces the
# row-wise lambda and follows the same rule as the bulk 1 -> True conversion
# used for the substance-use columns.
df['h18'] = df['h18'] == 1

In [24]:
# Land-area conversion.
# Multipliers applied by convert_to_m2 to turn one unit of each measure into
# square metres.
# NOTE(review): katha and bigha sizes vary by region -- confirm these factors
# match the survey area.
conversion_factors = {
    'Katha': 126.44,
    'Bigha': 1337.8,
    'Acre': 4046.86
}


def convert_to_m2(value):
    """Convert a land-area string such as ``"2 bigha"`` to square metres.

    The input is expected to be ``"<amount> <unit>"`` separated by whitespace.
    Only the first letter of the unit is inspected (case-insensitively):
    ``k`` -> Katha, ``b`` -> Bigha, ``a`` or ``e`` -> Acre.

    Parameters
    ----------
    value : str or missing
        Raw cell value. Missing values, ``'0'`` and ``''`` count as no land.

    Returns
    -------
    float or int
        ``amount * conversion_factors[unit]``, or ``0`` when the value is
        missing/zero, is not a string, has no unit token, has a non-numeric
        amount, or names an unrecognised unit.
    """
    if pd.isna(value) or value in ['0', '']:
        return 0  # missing or explicitly zero area

    try:
        # Split into the numeric amount and the unit token.
        parts = value.split()
        num = float(parts[0])
        unit = parts[1].lower()

        # Match on the unit's first letter only.
        if unit.startswith('k'):
            return num * conversion_factors['Katha']
        elif unit.startswith('b'):
            return num * conversion_factors['Bigha']
        elif unit.startswith('a') or unit.startswith('e'):
            return num * conversion_factors['Acre']
        else:
            return 0  # unrecognised unit
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only parsing failures are treated as "no area": non-string input
        # (AttributeError/TypeError), a missing amount or unit token
        # (IndexError), or a non-numeric amount (ValueError). Unlike a bare
        # `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        return 0

In [25]:
# Replace every 'h19u' entry with the value convert_to_m2 returns for it
# (an area in square metres, or 0 when the entry cannot be parsed).
# NOTE(review): the file is read with convert_categoricals=False, so 'h19u'
# presumably holds numeric unit codes rather than "<amount> <unit>" strings;
# convert_to_m2 returns 0 for any non-string input, which would zero the whole
# column -- confirm against the raw data.
df['h19u']=df['h19u'].apply(convert_to_m2)

In [26]:
# Mother's education -> binary flag: 0 when the schooling code is 5 or less,
# 1 otherwise. The recode targets 'gg103' (mother's highest level of schooling)
# and not 'gg101b', which the attribute list documents as the respondent's age
# in completed years and which must stay untouched.
# NOTE(review): missing values fail the `<= 5` test and therefore become 1 --
# confirm that this is the intended treatment.
df['gg103'] = df['gg103'].apply(lambda x: 0 if x <= 5 else 1)

In [23]:
# Collapse the religion ('h26') and caste ('h27') codes into short string labels.
def _religion_group(code):
    """Map an 'h26' code: 1 -> 'H', 2 -> 'M', anything else -> 'O'."""
    if code == 1:
        return 'H'
    if code == 2:
        return 'M'
    return 'O'


def _caste_group(code):
    """Map an 'h27' code: 1 -> 'SC', 2 -> 'ST', 4 -> 'GEN', anything else -> 'OBC'."""
    if code == 1:
        return 'SC'
    if code == 2:
        return 'ST'
    if code == 4:
        return 'GEN'
    return 'OBC'


df['h26'] = df['h26'].apply(_religion_group)
df['h27'] = df['h27'].apply(_caste_group)


In [None]:
# Tobacco / alcohol / drug indicators (family-level h35-h37 and respondent-level
# gg1101-gg1106) that are recoded to booleans.
columns_to_transform = [
    "h35",
    "h36",
    "h37",
    "gg1101",
    "gg1102",
    "gg1103",
    "gg1104",
    "gg1105",
    "gg1106",
]

# True where the code is exactly 1; every other value becomes False.
df[columns_to_transform] = df[columns_to_transform].eq(1)