In [49]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import zscore

In [41]:
# Single, editable location of the input files. Change DATA_DIR (and nothing
# else) when running this notebook on another machine.
DATA_DIR = Path('/Users/novak/Columbia-IPHD Dropbox/CU_IPHD/HCAP/python file')

# Raw inputs, one CSV per study.
hrs_hcap = pd.read_csv(DATA_DIR / 'hrs-hcap.csv')
mex_cog = pd.read_csv(DATA_DIR / 'mex-cog.csv')

In [42]:
# Stack the two studies into one frame with a fresh 0..n-1 index. Columns that
# exist in only one study are filled with NaN for the other study's rows.
data = pd.concat([mex_cog, hrs_hcap], ignore_index=True)

# Keep respondents aged 65 or older. Rows with a missing 'rage' are dropped as
# well, because NaN >= 65 evaluates to False.
# The explicit .copy() makes `data` an independent frame, so the column
# assignments in later cells cannot raise SettingWithCopyWarning.
data = data[data['rage'] >= 65].copy()

# Replace 'hrs' with 0 where 'mex' is 1
# data.loc[data['mex'] == 1, 'hrs'] = 0

data.head()

Unnamed: 0,cunicah,np,id_mexcog,hrs,mex,wgt,binf1csidmental,binf1csidmemory,binf1csidput,binf1csidkept,...,ispouse,ichild,iothfam,inonfam,coresi,hhid,pn,id_hrs,binf1csidwordwrg,mmse
0,397.0,20.0,39720.0,0,1,1137.0,1.0,0.0,1.0,1.0,...,0,1,0,0,1.0,,,,,
1,457.0,10.0,45710.0,0,1,4483.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0.0,,,,,
2,460.0,10.0,46010.0,0,1,3041.0,0.0,0.0,1.0,0.0,...,0,1,0,0,1.0,,,,,
3,460.0,20.0,46020.0,0,1,3981.0,0.0,0.0,1.0,0.0,...,0,0,0,0,1.0,,,,,
4,533.0,10.0,53310.0,0,1,5450.0,0.0,0.0,1.0,0.0,...,0,0,0,1,0.0,,,,,


In [44]:
# create variables

# Informant-relationship columns to interact
vars_list = ['ispouse', 'ichild', 'iothfam', 'inonfam']

# Suffix of each new column -> the series the relationship column is multiplied by.
# NOTE(review): presumably 'ifemale' and 'coresi' are 0/1 indicators, which makes
# (1 - x) their complement -- confirm against the codebook.
multipliers = {
    'F': data['ifemale'],
    'M': 1 - data['ifemale'],
    'COR': data['coresi'],
    'NCOR': 1 - data['coresi'],
}

# For every relationship column, add one product column per suffix
# (e.g. ispouseF, ispouseM, ispouseCOR, ispouseNCOR).
for var in vars_list:
    for suffix, multiplier in multipliers.items():
        data[f'{var}{suffix}'] = data[var] * multiplier


'''
The loop generates new variables by interacting the original variables (ispouse, ichild, etc.) with other variables (ifemale, coresi) in the dataset.

g var'F = var'*ifemale:
This line creates a new variable (e.g., ispouseF) for each variable in the list, which is the product of the original variable (e.g., ispouse) and ifemale (presumably a binary variable indicating female gender, where 1 represents female and 0 represents male).
This interaction term will represent the relationship status specifically for female respondents.

g var'M = var'*(1-ifemale):
Similarly, this line creates a new variable (e.g., ispouseM) which is the product of the original variable (e.g., ispouse) and the inverse of ifemale.
This captures the relationship status for male respondents, assuming ifemale is 0 for males and 1 for females.

g var'COR = var'*coresi:
Here, a new variable (e.g., ispouseCOR) is created. It's the product of the original variable and coresi (which might indicate whether the respondent is co-residing, with 1 for co-residing and 0 otherwise).
This interaction term indicates the relationship status for respondents who are co-residing.

g var'NCOR = var'*(1-coresi):
This line creates a new variable (e.g., ispouseNCOR) which is the product of the original variable and the inverse of coresi.
It represents the relationship status for respondents who are not co-residing.

The overall purpose of these operations seems to be to create detailed interaction terms that distinguish respondent-informant relationships by gender and co-residing status. This allows for a more nuanced analysis of how these relationships vary across these different groups.
'''


"\nThe loop generates new variables by interacting the original variables (ispouse, ichild, etc.) with other variables (ifemale, coresi) in the dataset.\n\ng var'F = var'*ifemale:\nThis line creates a new variable (e.g., ispouseF) for each variable in the list, which is the product of the original variable (e.g., ispouse) and ifemale (presumably a binary variable indicating female gender, where 1 represents female and 0 represents male).\nThis interaction term will represent the relationship status specifically for female respondents.\n\ng var'M = var'*(1-ifemale):\nSimilarly, this line creates a new variable (e.g., ispouseM) which is the product of the original variable (e.g., ispouse) and the inverse of ifemale.\nThis captures the relationship status for male respondents, assuming ifemale is 0 for males and 1 for females.\n\ng var'COR = var'*coresi:\nHere, a new variable (e.g., ispouseCOR) is created. It's the product of the original variable and coresi (which might indicate whether 

In [45]:
# check for missing data

# Columns to audit
variables = [
    'rage', 'rfemale', 'reduc',
    'iage', 'ifemale', 'ieduc',
    'coresi', 'ispouse', 'ichild', 'iothfam', 'inonfam',
    'fgcp', 'fmem', 'fexf', 'flang', 'forient',
    'mmse',
]

# Identifier columns shown next to each missing value
id_columns = ['id_mexcog', 'id_hrs']

for var in variables:
    column = data[var]

    # Descriptive summary (plays the role of Stata's codebook)
    print(f"Summary of {var}:")
    print(column.describe())
    print("\n")

    # Rows where this column is missing, reduced to the identifiers plus the column
    is_missing = column.isna()
    missing_data = data.loc[is_missing, id_columns + [var]]
    if len(missing_data) > 0:
        print(f"Missing values for {var}:")
        print(missing_data)
        print("\n")


Summary of rage:
count    4045.000000
mean       75.504821
std         7.430508
min        65.000000
25%        69.000000
50%        75.000000
75%        81.000000
max       104.000000
Name: rage, dtype: float64


Summary of rfemale:
count    4045.000000
mean        0.583436
std         0.493050
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: rfemale, dtype: float64


Summary of reduc:
count    4036.000000
mean       10.591675
std         5.047852
min         0.000000
25%         7.000000
50%        12.000000
75%        14.000000
max        17.000000
Name: reduc, dtype: float64


Missing values for reduc:
      id_mexcog       id_hrs  reduc
34     172220.0          NaN    NaN
264    492520.0          NaN    NaN
297    550820.0          NaN    NaN
505    799410.0          NaN    NaN
647    905020.0          NaN    NaN
679    915113.0          NaN    NaN
1433  1334020.0          NaN    NaN
1631  1486320.0          NaN    NaN


In [46]:
# create study-specific vars

variables = [
    'rage', 'rfemale', 'reduc',
    'iage', 'ifemale', 'ieduc',
    'coresi', 'ispouse', 'ichild', 'iothfam', 'inonfam',
    'fgcp', 'fmem', 'fexf', 'flang', 'forient',
    'mmse',
]

# Multiply every listed column by the 'hrs' column and store the product as
# '<name>HRS'. Where hrs is 1 the value is kept, where hrs is 0 it becomes 0;
# missing values stay missing.
hrs_indicator = data['hrs']
for var in variables:
    data[f'{var}HRS'] = data[var].mul(hrs_indicator)


In [52]:
# Standardise 'bcsid' within each study (hrs == 1 and hrs == 0 separately) and
# store the result in a single 'zbcsid' column.
#
# nan_policy='omit' computes each study's mean/SD from the non-missing values
# only. With scipy's default ('propagate'), a single missing 'bcsid' would turn
# every z-score in that study into NaN. Missing inputs still yield NaN outputs.
#
# zscore uses ddof=0 (population SD) by default.
# NOTE(review): Stata's egen std() uses the sample SD (ddof=1) -- confirm which
# one is intended if these values must match Stata output.

# Rows whose 'hrs' is neither 1 nor 0 keep NaN.
data['zbcsid'] = np.nan

for study_flag in (1, 0):
    in_study = data['hrs'] == study_flag
    data.loc[in_study, 'zbcsid'] = zscore(
        data.loc[in_study, 'bcsid'], nan_policy='omit'
    )
