In [5]:
# --- 设置项目根目录路径，修复模块导入问题 ---
import sys
from pathlib import Path

project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# --- 导入配置模块 ---
from utils.config import load_config, abspath


## 🔧 加载配置与字段清单（统一字段控制 via candidate_vars_csv）

In [7]:
from utils.config import load_config, abspath
import pandas as pd

cfg, root = load_config()

# 加载处理后的 parquet 数据（特征）
df = pd.read_parquet(abspath(root, cfg["processing"]["outputs"]["features"]))
df_raw = pd.read_parquet(abspath(root, cfg["processing"]["outputs"]["indresp_selected"]))

# 加载字段候选列表
candidate_vars_path = abspath(root, cfg["ukhls"]["candidate_vars_csv"])
df_vars = pd.read_csv(candidate_vars_path)
usecols = df_vars["varname"].tolist()

print("候选字段总数：", len(usecols))
display(df_vars.head())


候选字段总数： 118


Unnamed: 0,theme,varname,label_in_dictionary
0,demographics,n_age_dv,"Age, derived from dob_dv and intdat_dv"
1,demographics,n_agegr5_dv,Age group (age_dv): 5 year intervals
2,demographics,n_englang,English is first language
3,demographics,n_ethn_dv,Ethnic group (derived from multiple sources)
4,demographics,n_gor_dv,Government Office Region


In [9]:
# import libraries
# ------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [11]:
# display setting
# ------------------------------------
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [13]:
# 0.Set File Path
# ------------------------------------
file_path = "abspath(root, cfg['ukhls']['indresp_dta'])"
output_dir = abspath(root, cfg['paths']['data_processed'])  # updated"abspath(root, cfg['paths']['data_processed'])"
output_file_name = "n_indresp_navigation.csv"
output_path = os.path.join(output_dir, output_file_name)

In [15]:
# 1.Variables to Extract (use_vars) - All prefixed with 'n_' for Wave 14
# ------------------------------------
use_vars = [
    # 1. Identifiers
    "pidp",     # Cross-wave person identifier (Core ID)
    "pid",      # personal identifier (BHPS cohort)
    "n_hidp",   # Household identifier 
    
    # 2. Mental Health Outcomes
    "n_scghq1_dv",  # GHQ-12 Score (Likert, 0-36) - Continuous target
    "n_scghq2_dv",  # GHQ-12 Caseness (Binary) - Classification target
    "n_sclfsato",   # Satisfaction with life overall (SWB proxy)
    "n_health",     # Self-reported general health (Control)

    # Specific Diagnosed Conditions
    "n_mhcond8",    # A phobia
    "n_mhcond9",    # Panic attacks
    "n_mhcond6",    # Post-traumatic stress disorder (PTSD)
    "n_mhcond19",   # Any other mental, emotional or neurological problem or condition
    "n_mhcond10",   # Attention deficit hyperactivity disorder (ADHD)
    "n_mhcond4",    # Bipolar disorder (or 'manic depression')
    "n_mhcond2",    # Depression
    "n_mhcond11",   # Post-natal depression
    "n_mhcond12",   # Dementia (including Alzheimer's)
    "n_mhcond5",    # An eating disorder
    "n_mhcond13",   # Nervous breakdown
    "n_mhcond14",   # A personality disorder
    "n_mhcond3",    # Psychosis or schizophrenia
    "n_mhcond15",   # Obsessive compulsive disorder (OCD)
    "n_mhcond16",   # Seasonal affective disorder
    "n_mhcond17",   # Alcohol or drug dependence
    "n_mhcond18",   # Any other anxiety disorder
    "n_mhcond97",   # Any other emotional, nervous or psychiatric problem or condition
    "n_mhcond96",   # None of these (Useful for validation)
    
    # 3. Demographics and Control Variables
    "n_sex",        # Sex
    "n_age_dv",     # Age
    "n_marstat",    # Marital Status
    "n_racel_dv",   # Ethnic Group
    "n_nchild_dv",  # Number of children in household
    "n_hhsize",     # Household size
    "n_urban_dv",   # Urban or rural area
    "n_finnow",     # Subjective financial situation - now
    "n_finfut",     # Subjective financial situation - future

    # Core Controls and Utility Variables (Finance, Date, Weight)
    # Household Net Monthly Income (Financial Control)
    "n_ienddaty",    # Interview Year (CRITICAL for Duration Calculation)
    "n_ienddatm",    # Interview Month (CRITICAL for Duration Calculation)
    "n_ienddatd",
    "n_indinub_lw", # Individual Longitudinal Weight (For Statistical Inference)

    # 4. Employment & Unemployment Features
    "n_jbstat",     # Main economic activity (Current status)
    "n_jbhas",      # Had paid work last week
    "n_jbsemp",     # Self-employed
    "n_julk4wk",    # Looked for work in the last 4 weeks
    "n_julkjb",     # Would like a regular paid job
    "n_jbhad",      # Ever had a paid job? (General history)
    "n_jubgn",      # Able to start work within 2 weeks
    "n_eprosh",     # Chance starting work within 12 months
    
    # Last Job End Date (for Current Unemployment Duration)
    "n_jlendm",     # Month left last job
    "n_jlendy",     # Year left last job

    # Unemployment History
    "n_nnmpsp_dv",  # No. non-employment spells since last interview
    "n_nmpsp_dv",   # No. employment spells since last interview
    "n_nunmpsp_dv", # No. unemployment spells since last interview   

    # First Job Status and Duration (Advanced Feature Construction)
    "n_j1none",     # Still in full-time education / never had a paid job (Used for "Never Employed")
    "n_j1still",    # Still in first job (Used to flag censored duration data)
    "n_j1mnth",     # First job start month
    "n_j1year",     # First job start year
    "n_j1endmnth",  # First job end month
    "n_j1endyear",  # First job end year
    
    # 5. Education Variables
    "n_edtype",     # Type of educational institution attending 
    "n_hiqual_dv",  # Highest educational qualification (UK standard)
    "n_isced11_dv", # Highest education qualification, short ISCED 2011
    "n_qfhigh",     # Highest qualification level
    "n_btec1",      # First certificate or general certificate (below level 2)
    "n_btec2",      # First diploma or general diploma (level 2)
    "n_btec3",      # National Certificate or National Diploma level (level 3)
    "n_btec4",      # Higher level (level 4 or higher)
    
    # Proxies for Total Years of Education (for CFPS comparability)
    "n_feend",      # Age finished further education
    "n_scend"       # Age finished school
]

In [17]:
# 2.Data Reading Code
# ------------------------------------
try:
    # Attempt to read the Stata file using the specified path and variables
    df = pd.read_stata(file_path, 
                       columns=use_vars)
    print(f"Successfully extracted data from {file_path}.")
    print(f"DataFrame dimensions: {df.shape}")
    print("\nFirst 5 rows of the data (Preview):")
    # print(df.head()) # Uncomment to view the head in your environment
    
    # Next Step: Missing Value Handling and Feature Engineering
    import numpy as np
    # Replace UKHLS special missing value codes with NaN
    # Common codes: -9 (missing), -8 (inapplicable), -7 (proxy), -2 (refusal), -1 (don't know)
    df = df.replace([-9, -8, -7, -2, -1], np.nan)
    print("\nInitial cleaning: UKHLS missing codes replaced with NaN.")
    
except FileNotFoundError:
    print(f"Error: File not found. Please confirm the file path is correct: {file_path}")
except ValueError as e:
    print(f"Error: An issue occurred during data reading (e.g., incorrect variable names or Stata file issue). Details: {e}")

Error: File not found. Please confirm the file path is correct: abspath(root, cfg['ukhls']['indresp_dta'])


In [14]:
# 3.Data Storage (Saving to CSV)
# ------------------------------------
if df is not None:
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    try:
        # Save the DataFrame to a CSV file
        df.to_csv(output_path, index=False)
        print(f"\nSuccessfully saved the extracteddata to:")
        print(f"--> {output_path}")
        
    except Exception as e:
        print(f"Error saving file to CSV: {e}")



Successfully saved the extracteddata to:
--> /Users/wanderer/Thesis/ModelProject/Processed_Data/n_indresp_navigation.csv


In [24]:
# Configuration
# ------------------------------------
processed_dir = "abspath(root, cfg['paths']['data_processed'])"
file_name = "n_indresp_navigation.csv"
file_path = os.path.join(processed_dir, file_name)

In [26]:
print("--- 1. Data Loading and Feature Engineering ---")

try:
    # Load the extracted and cleaned CSV file
    df = pd.read_csv(file_path)
    print(f"Successfully loaded file: {file_path}. Dimensions: {df.shape}")
except FileNotFoundError:
    print(f"Error: File not found. Please confirm the path is correct: {file_path}")
    exit()

--- 1. Data Loading and Feature Engineering ---
Successfully loaded file: /Users/wanderer/Thesis/ModelProject/Processed_Data/n_indresp_navigation.csv. Dimensions: (35471, 68)


In [28]:
df.head()

Unnamed: 0,pidp,pid,n_hidp,n_scghq1_dv,n_scghq2_dv,n_sclfsato,n_health,n_mhcond8,n_mhcond9,n_mhcond6,n_mhcond19,n_mhcond10,n_mhcond4,n_mhcond2,n_mhcond11,n_mhcond12,n_mhcond5,n_mhcond13,n_mhcond14,n_mhcond3,n_mhcond15,n_mhcond16,n_mhcond17,n_mhcond18,n_mhcond97,n_mhcond96,n_sex,n_age_dv,n_marstat,n_racel_dv,n_nchild_dv,n_hhsize,n_urban_dv,n_finnow,n_finfut,n_ienddaty,n_ienddatm,n_ienddatd,n_indinub_lw,n_jbstat,n_jbhas,n_jbsemp,n_julk4wk,n_julkjb,n_jbhad,n_jubgn,n_eprosh,n_jlendm,n_jlendy,n_nnmpsp_dv,n_nmpsp_dv,n_nunmpsp_dv,n_j1none,n_j1still,n_j1mnth,n_j1year,n_j1endmnth,n_j1endyear,n_edtype,n_hiqual_dv,n_isced11_dv,n_qfhigh,n_btec1,n_btec2,n_btec3,n_btec4,n_feend,n_scend
0,22445,10127798,276365626,24,7,Somewhat satisfied,No,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,female,37,Married,british/english/scottish/welsh/northern irish ...,2,4,urban area,Doing alright,or about the same?,2022,April,19,0.0,Paid employment(ft/pt),Yes,Employee,No,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,none,1,none,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,Degree,Masters or equivalent,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable
1,29925,10192697,617732426,23,11,Somewhat dissatisfied,Yes,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,female,45,Divorced,british/english/scottish/welsh/northern irish ...,2,3,urban area,Finding it very difficult,or about the same?,2022,August,1,0.0,Paid employment(ft/pt),Yes,Employee,No,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,none,1,none,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,Degree,Masters or equivalent,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable
2,76165,10689869,140161626,12,0,Mostly satisfied,No,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,female,39,Married,british/english/scottish/welsh/northern irish ...,2,4,urban area,Just about getting by,Better off,2022,March,2,0.0,Paid employment(ft/pt),Yes,Employee,No,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,none,1,none,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,A-level etc,Upper secondary,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable
3,280165,12430439,753467226,16,5,Somewhat dissatisfied,No,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,female,43,Married,british/english/scottish/welsh/northern irish ...,1,4,rural area,Just about getting by,or about the same?,2022,December,11,0.0,Self employed,No,inapplicable,Yes,inapplicable,inapplicable,Yes,Very likely,inapplicable,inapplicable,none,none,none,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,GCSE etc,Lower secondary,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable
4,469205,13857142,413725626,16,4,Mostly satisfied,No,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,female,32,"Single, nvr marr/civ p",british/english/scottish/welsh/northern irish ...,2,3,urban area,Doing alright,or about the same?,2022,May,10,0.0,Paid employment(ft/pt),Yes,Employee,No,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,none,1,none,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,Other qualification,Lower secondary,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable,inapplicable


In [44]:
def get_value_counts(
    df: pd.DataFrame,
    include_cols: list[str] | None = None
) -> pd.DataFrame:
    """
    返回一个整洁的DataFrame，仅统计指定列中的各个取值出现次数。
    
    参数：
    ----------
    df : pd.DataFrame
        要分析的DataFrame。
    include_cols : list[str], 可选
        只统计这些列。如果为 None，则默认统计所有列。
    
    输出列说明：
    ----------
    - column：列名
    - value：该列中的某个取值
    - count：该取值在该列中出现的次数
    """
    records = []

    # 1️⃣ 确定要处理的列
    cols_to_process = include_cols if include_cols else df.columns

    # 2️⃣ 统计每列取值频次
    for col in cols_to_process:
        if col not in df.columns:
            print(f"⚠️ 警告：'{col}' 不在 DataFrame 列中，已跳过。")
            continue

        counts = df[col].value_counts(dropna=False)
        for val, cnt in counts.items():
            records.append({
                'column': col,
                'value': val,
                'count': cnt
            })

    return (
        pd.DataFrame(records)
        .sort_values(['column', 'count'], ascending=[True, False])
        .reset_index(drop=True)
    )

In [46]:
cols_to_analyze = ['n_mhcond8', 'n_mhcond9', 'n_mhcond6', 'n_mhcond19', 'n_mhcond10', 'n_mhcond14', 'n_mhcond2', 'n_mhcond11', 'n_mhcond12', 'n_mhcond5']

result = get_value_counts(df, include_cols=cols_to_analyze)
print(result)

        column          value  count
0   n_mhcond10   inapplicable  26576
1   n_mhcond10  Not mentioned   8502
2   n_mhcond10  Yes mentioned    136
3   n_mhcond10          proxy    134
4   n_mhcond10        refusal     91
5   n_mhcond10     don't know     32
6   n_mhcond11   inapplicable  26576
7   n_mhcond11  Not mentioned   8451
8   n_mhcond11  Yes mentioned    187
9   n_mhcond11          proxy    134
10  n_mhcond11        refusal     91
11  n_mhcond11     don't know     32
12  n_mhcond12   inapplicable  26576
13  n_mhcond12  Not mentioned   8606
14  n_mhcond12          proxy    134
15  n_mhcond12        refusal     91
16  n_mhcond12     don't know     32
17  n_mhcond12  Yes mentioned     32
18  n_mhcond14   inapplicable  26576
19  n_mhcond14  Not mentioned   8564
20  n_mhcond14          proxy    134
21  n_mhcond14        refusal     91
22  n_mhcond14  Yes mentioned     74
23  n_mhcond14     don't know     32
24  n_mhcond19   inapplicable  26576
25  n_mhcond19  Not mentioned   7845
2

In [153]:
# import libraries
# ------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [155]:
# display setting
# ------------------------------------
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [157]:
# 0.Set File Path
# ------------------------------------
file_path = "abspath(root, cfg['ukhls']['indresp_dta'])"
output_dir = abspath(root, cfg['paths']['data_processed'])  # updated"abspath(root, cfg['paths']['data_processed'])"
output_file_name = "n_indresp_extracted.csv"
output_path = os.path.join(output_dir, output_file_name)

In [159]:
# 1.Variables to Extract (use_vars) - All prefixed with 'n_' for Wave 14
# ------------------------------------
use_vars = [
    # 1. Identifiers
    "pidp",     # Cross-wave person identifier (Core ID)
    "pid",      # personal identifier (BHPS cohort)
    "n_hidp",   # Household identifier 
    
    # 2. Mental Health Outcomes
    "n_scghq1_dv",  # GHQ-12 Score (Likert, 0-36) - Continuous target
    "n_scghq2_dv",  # GHQ-12 Caseness (Binary) - Classification target
    "n_sclfsato",   # Satisfaction with life overall (SWB proxy)
    "n_health",     # Self-reported general health (Control)

    # Specific Diagnosed Conditions
    "n_mhcond8",    # A phobia
    "n_mhcond9",    # Panic attacks
    "n_mhcond6",    # Post-traumatic stress disorder (PTSD)
    "n_mhcond19",   # Any other mental, emotional or neurological problem or condition
    "n_mhcond10",   # Attention deficit hyperactivity disorder (ADHD)
    "n_mhcond4",    # Bipolar disorder (or 'manic depression')
    "n_mhcond2",    # Depression
    "n_mhcond11",   # Post-natal depression
    "n_mhcond12",   # Dementia (including Alzheimer's)
    "n_mhcond5",    # An eating disorder
    "n_mhcond13",   # Nervous breakdown
    "n_mhcond14",   # A personality disorder
    "n_mhcond3",    # Psychosis or schizophrenia
    "n_mhcond15",   # Obsessive compulsive disorder (OCD)
    "n_mhcond16",   # Seasonal affective disorder
    "n_mhcond17",   # Alcohol or drug dependence
    "n_mhcond18",   # Any other anxiety disorder
    "n_mhcond97",   # Any other emotional, nervous or psychiatric problem or condition
    "n_mhcond96",   # None of these (Useful for validation)
    
    # 3. Demographics and Control Variables
    "n_sex",        # Sex
    "n_age_dv",     # Age
    "n_marstat",    # Marital Status
    "n_racel_dv",   # Ethnic Group
    "n_nchild_dv",  # Number of children in household
    "n_hhsize",     # Household size
    "n_urban_dv",   # Urban or rural area
    "n_finnow",     # Subjective financial situation - now
    "n_finfut",     # Subjective financial situation - future

    # Core Controls and Utility Variables (Finance, Date, Weight)
    # Household Net Monthly Income (Financial Control)
    "n_ienddaty",    # Interview Year (CRITICAL for Duration Calculation)
    "n_ienddatm",    # Interview Month (CRITICAL for Duration Calculation)
    "n_ienddatd",
    "n_indinub_lw", # Individual Longitudinal Weight (For Statistical Inference)

    # 4. Employment & Unemployment Features
    "n_jbstat",     # Main economic activity (Current status)
    "n_jbhas",      # Had paid work last week
    "n_jbsemp",     # Self-employed
    "n_julk4wk",    # Looked for work in the last 4 weeks
    "n_julkjb",     # Would like a regular paid job
    "n_jbhad",      # Ever had a paid job? (General history)
    "n_jubgn",      # Able to start work within 2 weeks
    "n_eprosh",     # Chance starting work within 12 months
    
    # Last Job End Date (for Current Unemployment Duration)
    "n_jlendm",     # Month left last job
    "n_jlendy",     # Year left last job

    # Unemployment History
    "n_nnmpsp_dv",  # No. non-employment spells since last interview
    "n_nmpsp_dv",   # No. employment spells since last interview
    "n_nunmpsp_dv", # No. unemployment spells since last interview   

    # First Job Status and Duration (Advanced Feature Construction)
    "n_j1none",     # Still in full-time education / never had a paid job (Used for "Never Employed")
    "n_j1still",    # Still in first job (Used to flag censored duration data)
    "n_j1mnth",     # First job start month
    "n_j1year",     # First job start year
    "n_j1endmnth",  # First job end month
    "n_j1endyear",  # First job end year
    
    # 5. Education Variables
    "n_edtype",     # Type of educational institution attending 
    "n_hiqual_dv",  # Highest educational qualification (UK standard)
    "n_isced11_dv", # Highest education qualification, short ISCED 2011
    "n_qfhigh",     # Highest qualification level
    "n_btec1",      # First certificate or general certificate (below level 2)
    "n_btec2",      # First diploma or general diploma (level 2)
    "n_btec3",      # National Certificate or National Diploma level (level 3)
    "n_btec4",      # Higher level (level 4 or higher)
    
    # Proxies for Total Years of Education (for CFPS comparability)
    "n_feend",      # Age finished further education
    "n_scend"       # Age finished school
]

In [161]:
# 2.Data Reading Code
# ------------------------------------
try:
    # Attempt to read the Stata file using the specified path and variables
    df = pd.read_stata(file_path, 
                       columns=use_vars, 
                       convert_categoricals=False)
    print(f"Successfully extracted data from {file_path}.")
    print(f"DataFrame dimensions: {df.shape}")
    print("\nFirst 5 rows of the data (Preview):")
    # print(df.head()) # Uncomment to view the head in your environment
    
    # Next Step: Missing Value Handling and Feature Engineering
    import numpy as np
    # Replace UKHLS special missing value codes with NaN
    # Common codes: -9 (missing), -8 (inapplicable), -7 (proxy), -2 (refusal), -1 (don't know)
    df = df.replace([-9, -8, -7, -2, -1], np.nan)
    print("\nInitial cleaning: UKHLS missing codes replaced with NaN.")
    
except FileNotFoundError:
    print(f"Error: File not found. Please confirm the file path is correct: {file_path}")
except ValueError as e:
    print(f"Error: An issue occurred during data reading (e.g., incorrect variable names or Stata file issue). Details: {e}")

Successfully extracted data from /Users/wanderer/Thesis/ModelProject/UKHLS_Data/n_indresp.dta.
DataFrame dimensions: (35471, 68)

First 5 rows of the data (Preview):

Initial cleaning: UKHLS missing codes replaced with NaN.


In [163]:
# 3.Data Storage (Saving to CSV)
# ------------------------------------
if df is not None:
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    try:
        # Save the DataFrame to a CSV file
        df.to_csv(output_path, index=False)
        print(f"\nSuccessfully saved the extracteddata to:")
        print(f"--> {output_path}")
        
    except Exception as e:
        print(f"Error saving file to CSV: {e}")



Successfully saved the extracteddata to:
--> /Users/wanderer/Thesis/ModelProject/Processed_Data/n_indresp_extracted.csv


In [165]:
# Configuration
# ------------------------------------
processed_dir = "abspath(root, cfg['paths']['data_processed'])"
file_name = "n_indresp_extracted.csv"
file_path = os.path.join(processed_dir, file_name)

In [167]:
print("--- 1. Data Loading and Feature Engineering ---")

try:
    # Load the extracted and cleaned CSV file
    df = pd.read_csv(file_path)
    print(f"Successfully loaded file: {file_path}. Dimensions: {df.shape}")
except FileNotFoundError:
    print(f"Error: File not found. Please confirm the path is correct: {file_path}")
    exit()

--- 1. Data Loading and Feature Engineering ---
Successfully loaded file: /Users/wanderer/Thesis/ModelProject/Processed_Data/n_indresp_extracted.csv. Dimensions: (35471, 68)


In [169]:
df.head()

Unnamed: 0,pidp,pid,n_hidp,n_scghq1_dv,n_scghq2_dv,n_sclfsato,n_health,n_mhcond8,n_mhcond9,n_mhcond6,n_mhcond19,n_mhcond10,n_mhcond4,n_mhcond2,n_mhcond11,n_mhcond12,n_mhcond5,n_mhcond13,n_mhcond14,n_mhcond3,n_mhcond15,n_mhcond16,n_mhcond17,n_mhcond18,n_mhcond97,n_mhcond96,n_sex,n_age_dv,n_marstat,n_racel_dv,n_nchild_dv,n_hhsize,n_urban_dv,n_finnow,n_finfut,n_ienddaty,n_ienddatm,n_ienddatd,n_indinub_lw,n_jbstat,n_jbhas,n_jbsemp,n_julk4wk,n_julkjb,n_jbhad,n_jubgn,n_eprosh,n_jlendm,n_jlendy,n_nnmpsp_dv,n_nmpsp_dv,n_nunmpsp_dv,n_j1none,n_j1still,n_j1mnth,n_j1year,n_j1endmnth,n_j1endyear,n_edtype,n_hiqual_dv,n_isced11_dv,n_qfhigh,n_btec1,n_btec2,n_btec3,n_btec4,n_feend,n_scend
0,22445,10127798.0,276365626,24.0,7.0,5.0,2.0,,,,,,,,,,,,,,,,,,,,2.0,37.0,2.0,1.0,2,4,1.0,2.0,3.0,2022.0,4.0,19.0,0.0,2.0,1.0,1.0,2.0,,,,,,,0.0,1.0,0.0,,,,,,,,1.0,7.0,,,,,,,
1,29925,10192697.0,617732426,23.0,11.0,3.0,1.0,,,,,,,,,,,,,,,,,,,,2.0,45.0,5.0,1.0,2,3,1.0,5.0,3.0,2022.0,8.0,1.0,0.0,2.0,1.0,1.0,2.0,,,,,,,0.0,1.0,0.0,,,,,,,,1.0,7.0,,,,,,,
2,76165,10689869.0,140161626,12.0,0.0,6.0,2.0,,,,,,,,,,,,,,,,,,,,2.0,39.0,2.0,1.0,2,4,1.0,3.0,1.0,2022.0,3.0,2.0,0.0,2.0,1.0,1.0,2.0,,,,,,,0.0,1.0,0.0,,,,,,,,3.0,3.0,,,,,,,
3,280165,12430439.0,753467226,16.0,5.0,3.0,2.0,,,,,,,,,,,,,,,,,,,,2.0,43.0,2.0,1.0,1,4,2.0,3.0,3.0,2022.0,12.0,11.0,0.0,1.0,2.0,,1.0,,,1.0,1.0,,,0.0,0.0,0.0,,,,,,,,4.0,2.0,,,,,,,
4,469205,13857142.0,413725626,16.0,4.0,6.0,2.0,,,,,,,,,,,,,,,,,,,,2.0,32.0,1.0,1.0,2,3,1.0,2.0,3.0,2022.0,5.0,10.0,0.0,2.0,1.0,1.0,2.0,,,,,,,0.0,1.0,0.0,,,,,,,,5.0,2.0,,,,,,,


In [175]:
print("--- 2. Exploratory Data Analysis: Initial Inspection ---")

# 1. Check the dimensions of the DataFrame (Rows, Columns)
print(f"\n[1] Data Dimensions (Rows, Columns): {df.shape}")

# 2. View the first 5 rows to get a quick look at the data structure
print("\n[2] Head of the Dataset:")
# print(df.head())

# 3. Get a concise summary of the DataFrame, including the number of non-null values and data types
print("\n[3] DataFrame Information (Data Types and Missing Value Overview):")
# The output of df.info() will show non-null counts, which helps identify missing data.
df.info()

# 4. Descriptive statistics for numerical columns: count, mean, std, min, max, quartiles
print("\n[4] Descriptive Statistics for Numerical Variables (df.describe()):")
# print(df.describe())

# 5. Descriptive statistics for categorical (object) columns: count, unique, top, freq
# 1. Generate the descriptive statistics for object columns
numerical_stats = df.describe(include=['number'])

# 2. Transpose the result using .T
transposed_stats = numerical_stats.T

# 3. Print the transposed DataFrame
print(transposed_stats)

--- 2. Exploratory Data Analysis: Initial Inspection ---

[1] Data Dimensions (Rows, Columns): (35471, 68)

[2] Head of the Dataset:

[3] DataFrame Information (Data Types and Missing Value Overview):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35471 entries, 0 to 35470
Data columns (total 68 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   pidp          35471 non-null  int64  
 1   pid           5814 non-null   float64
 2   n_hidp        35471 non-null  int64  
 3   n_scghq1_dv   34114 non-null  float64
 4   n_scghq2_dv   34114 non-null  float64
 5   n_sclfsato    34383 non-null  float64
 6   n_health      35261 non-null  float64
 7   n_mhcond8     8638 non-null   float64
 8   n_mhcond9     8638 non-null   float64
 9   n_mhcond6     8638 non-null   float64
 10  n_mhcond19    8638 non-null   float64
 11  n_mhcond10    8638 non-null   float64
 12  n_mhcond4     8638 non-null   float64
 13  n_mhcond2     8638 non-null   float64
 1

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [39]:
# Set up plotting style
sns.set_style("whitegrid")
# Set font for better compatibility (adjust if specific font is needed)
plt.rcParams['figure.figsize'] = (15, 10) 

In [41]:
# 0. Configuration
# ------------------------------------
processed_dir = "abspath(root, cfg['paths']['data_processed'])"
file_name = "n_indresp_extracted.csv"
file_path = os.path.join(processed_dir, file_name)

In [45]:
# --- GHQ Items List and UKHLS Missing Codes ---
UKHLS_MISSING_CODES = [-9.0, -8.0, -7.0, -2.0, -1.0]

# --- Helper Function 1: Clean Missing Codes (for float conversion) ---
def clean_missing_codes(df, cols):
    """Replaces UKHLS special negative codes with NaN and ensures float type."""
    # Note: Use errors='ignore' in astype(float) if the column might contain non-numeric strings
    for col in cols:
        if col in df.columns:
            df[col] = df[col].replace(UKHLS_MISSING_CODES, np.nan).astype(float, errors='ignore')
    return df

# --- Helper Function 2: Clean and Convert to Nullable Integer ---
def clean_and_convert_to_nullable_int(df, cols):
    """Converts UKHLS missing codes to NaN and converts to Pandas' nullable Int64."""
    for col in cols:
        if col in df.columns:
            df[col] = df[col].replace(UKHLS_MISSING_CODES, np.nan)
            try:
                # Convert to nullable Int64 (supports NaN)
                df[col] = df[col].astype('Int64')
            except Exception:
                # Fallback to float if conversion to Int64 fails
                df[col] = df[col].astype(float)
    return df

# ==============================================================================
# 0. Load Data and Initial Cleaning
# ==============================================================================
try:
    df = pd.read_csv(file_path) 
    print(f"Successfully loaded data from: {file_path}")
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check the path and filename.")
    exit() 
except Exception as e:
    print(f"An unexpected error occurred during file loading: {e}")
    exit()

# Apply base cleaning to all columns that might contain UKHLS codes
all_cols_to_clean = [col for col in df.columns if col not in ['pidp', 'pid', 'n_hidp']]
df = clean_missing_codes(df, all_cols_to_clean)

Successfully loaded data from: /Users/wanderer/Thesis/ModelProject/Processed_Data/n_indresp_extracted.csv


In [61]:
# ==============================================================================
# 1. Feature Engineering: Mental Health Status (Outcome Variable) - USER CODE
# ==============================================================================

# 1.1 Define Primary (Associated) and Secondary (Other) MH Variable Groups
primary_mh_vars = [
    'n_mhcond19', 'n_mhcond2', 'n_mhcond5', 'n_mhcond13', 
    'n_mhcond17', 'n_mhcond18', 'n_mhcond97'
]
secondary_mh_vars_no_none = [ # Exclude n_mhcond96 from the positive check for now
    'n_mhcond8', 'n_mhcond9', 'n_mhcond6', 'n_mhcond10', 
    'n_mhcond4', 'n_mhcond11', 'n_mhcond12', 'n_mhcond14', 
    'n_mhcond3', 'n_mhcond15', 'n_mhcond16'
]
all_positive_mh_vars = primary_mh_vars + secondary_mh_vars_no_none
all_mh_vars_including_96 = all_positive_mh_vars + ['n_mhcond96']

# 1.2 Clean and Prepare MH Data
# Convert UKHLS missing codes to NaN and ensure float type for calculation.
# This cleaning is handled by the initial clean_missing_codes call, but we ensure float type.
df[all_mh_vars_including_96] = df[all_mh_vars_including_96].astype(float)

# 1.3 Create Indicator Variables (1.0 = Yes mentioned, NaN = Missing, 0.0 = Not mentioned)
df['primary_mh_indicator'] = df[primary_mh_vars].max(axis=1, skipna=True)
df['secondary_mh_indicator'] = df[secondary_mh_vars_no_none].max(axis=1, skipna=True)
df['any_mh_problem'] = df[all_positive_mh_vars].max(axis=1, skipna=True)

# 1.4 Construct the Final Categorical Variable 'mental_health_status'
df['mental_health_status'] = np.nan # Default to NaN

# --- Prioritization Logic ---

# 1. Associated MH Issue (any primary condition is 1.0)
df.loc[df['primary_mh_indicator'] == 1.0, 
       'mental_health_status'] = 'Associated MH Issue'

# 2. Other MH Issue (secondary condition is 1.0 AND no primary condition is 1.0)
df.loc[(df['primary_mh_indicator'] == 0.0) & (df['secondary_mh_indicator'] == 1.0), 
       'mental_health_status'] = 'Other MH Issue'

# 3. No Reported MH Issue (Max of all positive MH vars must be 0.0)
mh_answered_but_no_problem = (df['any_mh_problem'] == 0.0)
df.loc[mh_answered_but_no_problem, 
       'mental_health_status'] = 'No Reported MH Issue'

# 4. Missing/Inapplicable 
df['mental_health_status'] = df['mental_health_status'].fillna('Missing/Inapplicable')

# 1.5 Clean up auxiliary columns
df = df.drop(columns=['primary_mh_indicator', 'secondary_mh_indicator', 'any_mh_problem'])
print("\n--- Feature Engineering 1: Mental Health Status Completed ---")
print(df['mental_health_status'].value_counts(dropna=False))


# ==============================================================================
# 2. Feature Engineering: Early Career Unemployment Risk - USER CODE
# ==============================================================================

# 2.1 Clean and Prepare Variables
time_vars = ['n_j1mnth', 'n_j1year', 'n_j1endmnth', 'n_j1endyear', 'n_j1none', 'n_j1still', 'n_jbstat']
df = clean_and_convert_to_nullable_int(df, time_vars)

# 2.2 Define Threshold for "Quick Exit"
QUICK_EXIT_THRESHOLD_MONTHS = 12

# 2.3 Calculate First Job Duration (in months)
df['first_job_duration_months'] = np.where(
    df['n_j1year'].notna() & df['n_j1endyear'].notna() & df['n_j1mnth'].notna() & df['n_j1endmnth'].notna(),
    (df['n_j1endyear'] - df['n_j1year']) * 12 + (df['n_j1endmnth'] - df['n_j1mnth']),
    np.nan
)

# 2.4 Construct the Early Career Risk Variable
df['early_career_unemp_risk'] = np.nan 

# Code 2: Never had a paid job
df.loc[df['n_j1none'] == 2, 'early_career_unemp_risk'] = 'Never Employed'

# Code 1: Still in first job
df.loc[df['n_j1still'] == 1, 'early_career_unemp_risk'] = 'Stable Start: Still in First Job'

# A. Early Failure: Duration < Threshold
df.loc[(df['n_j1still'] == 2) & (df['first_job_duration_months'] < QUICK_EXIT_THRESHOLD_MONTHS),
       'early_career_unemp_risk'] = 'Early Failure: Quick Exit'

# B. Stable Start: Left Long-Term Job: Duration >= Threshold
df.loc[(df['n_j1still'] == 2) & (df['first_job_duration_months'] >= QUICK_EXIT_THRESHOLD_MONTHS),
       'early_career_unemp_risk'] = 'Stable Start: Left Long-Term Job'

# C. Extended Unemployment after Early Failure (Unemployed now, n_jbstat=3)
df.loc[(df['early_career_unemp_risk'] == 'Early Failure: Quick Exit') & (df['n_jbstat'] == 3),
       'early_career_unemp_risk'] = 'Extended Unemp after Quick Exit'

# 2.5 Finalize Missing / Inapplicable
df['early_career_unemp_risk'] = df['early_career_unemp_risk'].fillna('Missing/Inapplicable')
       
print("\n--- Feature Engineering 2: Early Career Unemployment Risk Completed ---")
print(df['early_career_unemp_risk'].value_counts(dropna=False))


# ==============================================================================
# 3. Feature Engineering: Employment Volatility (Employment Stability) - USER CODE
# ==============================================================================

# 3.1 Clean and Prepare Variables
volatility_vars = ['n_nnmpsp_dv', 'n_nmpsp_dv', 'n_nunmpsp_dv']
df = clean_and_convert_to_nullable_int(df, volatility_vars)

# 3.2 Construct Total Spells (Job changes/state changes)
df['total_spells_since_last_int'] = df['n_nnmpsp_dv'] + df['n_nmpsp_dv']

# 3.3 Construct Employment Stability Index
df['employment_stability_level'] = np.nan

# Case 1: Highest Stability - Zero Spells
df.loc[df['total_spells_since_last_int'] == 0, 
       'employment_stability_level'] = 'High Stability: No change reported'
       
# Case 2: Moderate Stability - One Employment Spell, Zero Non-employment Spells
df.loc[(df['n_nmpsp_dv'] == 1) & (df['n_nnmpsp_dv'] == 0), 
       'employment_stability_level'] = 'Moderate Stability: Single Employment Spell'
       
# Case 3: Low Stability - Single Non-Employment Spell 
df.loc[(df['n_nmpsp_dv'] == 0) & (df['n_nnmpsp_dv'] >= 1), 
       'employment_stability_level'] = 'Low Stability: Non-Employment Spells Only'
       
# Case 4: High Volatility - Multiple Spells (Threshold >= 3)
df.loc[df['total_spells_since_last_int'] >= 3, 
       'employment_stability_level'] = 'High Volatility: Frequent Changes'

# 3.4 Finalize Missing/Inapplicable
df['employment_stability_level'] = df['employment_stability_level'].fillna('Missing/Inapplicable')

df['unemployment_spells_count'] = df['n_nunmpsp_dv']
df = df.drop(columns=['total_spells_since_last_int'])

print("\n--- Feature Engineering 3: Employment Stability Completed ---")
print(df['employment_stability_level'].value_counts(dropna=False))


# ==============================================================================
# A. GHQ-12 Score (Using Aggregated Variable n_scghq2_dv)
# ==============================================================================
ghq_caseness_var = 'n_scghq2_dv'

# Clean and rename n_scghq2_dv (already cleaned to float by the initial step)
df['ghq12_continuous_score'] = df[ghq_caseness_var].astype(float) 

print("\n--- Feature Engineering A: GHQ-12 Score (Aggregated) Completed ---")


# ==============================================================================
# B. Education Feature: Ordered Categorical Variable (ISCED-11 based)
# ==============================================================================

isced_vars = ['n_isced11_dv', 'n_hiqual_dv']
df = clean_and_convert_to_nullable_int(df, isced_vars) # Convert ISCED to Int for mapping

isced_var = 'n_isced11_dv'
education_mapping = {
    # Low Education: ISCED 0-3 (Codes 9, 2, 3)
    9: 'Low_Education',   
    2: 'Low_Education',   
    3: 'Low_Education',   
    
    # Medium Education: ISCED 4-5 (Codes 4, 5)
    4: 'Medium_Education', 
    5: 'Medium_Education', 
    
    # High Education: ISCED 6-8 (Codes 6, 7, 8)
    6: 'High_Education',   
    7: 'High_Education',   
    8: 'High_Education',   
}

df['education_level'] = df[isced_var].map(education_mapping).fillna('Missing/Inapplicable')
education_categories = ['Low_Education', 'Medium_Education', 'High_Education', 'Missing/Inapplicable']
df['education_level'] = pd.Categorical(df['education_level'], categories=education_categories, ordered=True)

print("--- Feature Engineering B: Education Level (Ordered) Created ---")

# ==============================================================================
# B.1. Feature Engineering: Vocational Qualification Indicator
# Goal: Create a binary indicator if the individual holds any BTEC qualification.
# ==============================================================================

vocational_vars = ['n_btec1', 'n_btec2', 'n_btec3', 'n_btec4']

# 1. Clean and ensure float type for calculation (already handled by initial cleaning, but re-assert)
df = clean_missing_codes(df, vocational_vars)

# 2. Identify Missingness: Check if all BTEC variables are NaN (Inapplicable/Missing)
df['all_btec_nan'] = df[vocational_vars].isnull().all(axis=1)

# 3. Create the binary indicator 'has_vocational_qual'
# Calculate the maximum value across the 4 BTEC columns. If any is 1.0 (Mentioned), the max is 1.0.
df['has_vocational_qual'] = df[vocational_vars].max(axis=1, skipna=True)

# 4. Handle Missingness: If all BTEC variables were missing, set the new indicator to NaN.
df.loc[df['all_btec_nan'], 'has_vocational_qual'] = np.nan

# 5. Convert to nullable integer
df = clean_and_convert_to_nullable_int(df, ['has_vocational_qual'])

# 6. Clean up auxiliary column
df = df.drop(columns=['all_btec_nan'])

print("\n--- Feature Engineering B.1: Vocational Qualification Indicator Completed ---")
print(df['has_vocational_qual'].value_counts(dropna=False))

# ==============================================================================
# C. Control Variables: Binary/Ordinal/Dummy Encoding - CORRECTED
# ==============================================================================

# Note: We assume core_categorical_vars cleaning (including n_sex, n_health, n_finnow) 
# to Int64 has already been executed successfully.

# C.1. Binary Variables (from codes)

# --- CORRECTED CODE FOR 'female' ---
# 1. Initialize the column with 0 (Male)
df['female'] = 0 
# 2. Use .loc to set the '2' (Female) code to 1
df.loc[df['n_sex'] == 2, 'female'] = 1
# 3. Use .loc to set rows where 'n_sex' is missing (NA) to NaN
df.loc[df['n_sex'].isna(), 'female'] = np.nan
df['female'] = df['female'].astype('Int64') # Convert final result to Nullable Int

# --- CORRECTED CODE FOR 'has_disability' ---
# n_health: 1=Yes, 2=No. Recode to 'has_disability' (1=Yes, 0=No).
# 1. Initialize the column with 0 (No)
df['has_disability'] = 0
# 2. Use .loc to set the '1' (Yes) code to 1
df.loc[df['n_health'] == 1, 'has_disability'] = 1
# 3. Use .loc to set rows where 'n_health' is missing (NA) to NaN
df.loc[df['n_health'].isna(), 'has_disability'] = np.nan
df['has_disability'] = df['has_disability'].astype('Int64')

# --- CORRECTED CODE FOR 'financial_difficulty' ---
# n_finnow: 4.0=Quite difficult, 5.0=Very difficult. Recode to 1=Difficulty, 0=No Difficulty.
# 1. Initialize the column with 0 (No Difficulty)
df['financial_difficulty'] = 0
# 2. Use .loc to set the difficulty codes (4, 5) to 1
df.loc[df['n_finnow'].isin([4, 5]), 'financial_difficulty'] = 1
# 3. Use .loc to set rows where 'n_finnow' is missing (NA) to NaN
df.loc[df['n_finnow'].isna(), 'financial_difficulty'] = np.nan
df['financial_difficulty'] = df['financial_difficulty'].astype('Int64')


# C.2. Categorical Variables: One-Hot Encoding (OHE)

# Marital Status (n_marstat)
df['n_marstat'] = df['n_marstat'].astype('category')
marstat_dummies = pd.get_dummies(df['n_marstat'], prefix='marstat', dummy_na=True, drop_first=False)
df = pd.concat([df, marstat_dummies], axis=1)

# Ethnicity (n_racel_dv)
df['n_racel_dv'] = df['n_racel_dv'].astype('category')
racel_dummies = pd.get_dummies(df['n_racel_dv'], prefix='race', dummy_na=True, drop_first=False)
df = pd.concat([df, racel_dummies], axis=1)

print("--- Feature Engineering C: Control Variables (Binary/OHE) Completed ---")

# ==============================================================================
# D. Advanced Employment Features: Status and Duration - (FIXED: Replacing np.where)
# ==============================================================================

# D.1. Current Unemployment Status (From n_jbstat)

# 1. Binary Indicator: Currently Unemployed
df['is_currently_unemployed'] = 0
# Use .loc for assignment based on Int64 column
df.loc[df['n_jbstat'] == 3, 'is_currently_unemployed'] = 1
# Preserve missingness
df.loc[df['n_jbstat'].isna(), 'is_currently_unemployed'] = np.nan
df['is_currently_unemployed'] = df['is_currently_unemployed'].astype('Int64')


# D.2. Key Non-Working/Inactive Indicators (Control Variables)

# D.2.1. Long-Term Sick/Disabled (Value 8.0)
df['inactive_lt_sick'] = 0
df.loc[df['n_jbstat'] == 8, 'inactive_lt_sick'] = 1
df.loc[df['n_jbstat'].isna(), 'inactive_lt_sick'] = np.nan
df['inactive_lt_sick'] = df['inactive_lt_sick'].astype('Int64')

# D.2.2. Looking After Family/Home (Value 6.0)
df['inactive_home_family'] = 0
df.loc[df['n_jbstat'] == 6, 'inactive_home_family'] = 1
df.loc[df['n_jbstat'].isna(), 'inactive_home_family'] = np.nan
df['inactive_home_family'] = df['inactive_home_family'].astype('Int64')


# D.3. Unemployment Duration (Months)
# Note: Duration calculation used only arithmetic, which is safe from this error.

date_vars = ['n_jlendm', 'n_jlendy', 'n_ienddatm', 'n_ienddaty']
# We assume date_vars were cleaned to Int64 earlier
# No change needed for this block, as it handles NA correctly with .notna() and np.where(..., np.nan)
# We re-run it here for completeness:
df['last_unemployment_duration_months'] = np.where(
    df['n_jlendy'].notna() & df['n_ienddaty'].notna() & df['n_jlendm'].notna() & df['n_ienddatm'].notna(),
    (df['n_ienddaty'] - df['n_jlendy']) * 12 + (df['n_ienddatm'] - df['n_jlendm']),
    np.nan
)
df.loc[df['n_jbstat'].isin([1, 2]), 'last_unemployment_duration_months'] = np.nan


# D.4. Job Search Intensity and Motivation (Auxiliary Indicators)

aux_employment_vars = ['n_jbhas', 'n_julk4wk', 'n_julkjb', 'n_jubgn', 'n_jbhad']
# We assume aux_employment_vars were cleaned to Int64 earlier

# D.4.1. Actively Seeking (Core ILO definition for the unemployed)
# n_julk4wk=1 (Yes, looked for work)
df['actively_seeking'] = 0
df.loc[df['n_julk4wk'] == 1, 'actively_seeking'] = 1
df.loc[df['n_julk4wk'].isna(), 'actively_seeking'] = np.nan
df['actively_seeking'] = df['actively_seeking'].astype('Int64')

# D.4.2. Marginal Attachment/Discouraged Worker 
# Condition: Did no paid work last week (n_jbhas=2) AND would like a job (n_julkjb=1)
df['marginally_attached'] = 0
df.loc[(df['n_jbhas'] == 2) & (df['n_julkjb'] == 1), 'marginally_attached'] = 1
# Identify missingness where either component is NA
missing_mask = df['n_jbhas'].isna() | df['n_julkjb'].isna()
df.loc[missing_mask, 'marginally_attached'] = np.nan
df['marginally_attached'] = df['marginally_attached'].astype('Int64')


# D.4.3. Subjective Employment Chance (n_eprosh)
# Recode subjective chance to be predictive (1=likely, 0=unlikely)
# 1/2 = Likely, 3/4 = Unlikely
df['subjective_job_chance_likely'] = 0
df.loc[df['n_eprosh'].isin([1, 2]), 'subjective_job_chance_likely'] = 1
df.loc[df['n_eprosh'].isna(), 'subjective_job_chance_likely'] = np.nan
df['subjective_job_chance_likely'] = df['subjective_job_chance_likely'].astype('Int64')

print("\n--- Feature Engineering D: Advanced Employment Features (Fixed) Completed ---")
print(df[['is_currently_unemployed', 'last_unemployment_duration_months', 'marginally_attached']].describe())

# ==============================================================================
# E. Demographic and Socioeconomic Control Variables (FIXED)
# Goal: Finalize encoding for age, marital status (using n_marstat), ethnicity, and financial perception.
# ==============================================================================

# Variables to ensure initial cleaning/Int64 conversion:
control_vars_to_clean = ['n_age_dv', 'n_health', 'n_marstat', 'n_racel_dv', 
                         'n_urban_dv', 'n_finnow', 'n_finfut']
df = clean_and_convert_to_nullable_int(df, control_vars_to_clean) # Assuming this function is correctly defined

# E.1. Continuous Variable: Age (n_age_dv)
# Age is already cleaned as Int64.

# E.2. Binary Variables (n_urban_dv)
# n_urban_dv: 1=Urban, 2=Rural. Create 'is_urban' (1=Urban, 0=Rural).
df['is_urban'] = 0
df.loc[df['n_urban_dv'] == 1, 'is_urban'] = 1
df.loc[df['n_urban_dv'].isna(), 'is_urban'] = np.nan
df['is_urban'] = df['is_urban'].astype('Int64')

# E.3. Ordinal/Binary Financial Variables 
# n_finnow -> financial_difficulty (already handled in previous steps using .loc)

# n_finfut: Subjective financial situation - future (1=Better, 2=Worse, 3=Same)
df['n_finfut'] = df['n_finfut'].astype('category')
finfut_dummies = pd.get_dummies(df['n_finfut'], prefix='finfut', dummy_na=True, drop_first=False)
df = pd.concat([df, finfut_dummies], axis=1)


# E.4. High-Dimensional Categorical Variable: Ethnicity (n_racel_dv)
# Goal: Group 17 detailed codes into broader, more statistically meaningful categories (White, Asian, Black, etc.).

# Define the grouping map based on standard UK research practice:
ethnic_group_map = {
    1: 'White', 2: 'White', 3: 'White', 4: 'White',
    5: 'Mixed', 6: 'Mixed', 7: 'Mixed', 8: 'Mixed',
    9: 'Asian', 10: 'Asian', 11: 'Asian', 12: 'Asian', 13: 'Asian',
    14: 'Black', 15: 'Black', 16: 'Black',
    17: 'Other', 97: 'Other'
}

df['ethnic_group'] = df['n_racel_dv'].map(ethnic_group_map)

# Apply One-Hot Encoding to the grouped variable
df['ethnic_group'] = df['ethnic_group'].astype('category')
ethnic_dummies = pd.get_dummies(df['ethnic_group'], prefix='race_grp', dummy_na=True, drop_first=False)
df = pd.concat([df, ethnic_dummies], axis=1)


# E.5. Categorical Variable: Marital Status (n_marstat) - FIXED
marital_var_name = 'n_marstat' # Use the correct column name

# 1. Handle the 'Under 16 years' code (0.0) if present, converting it to NaN/NA.
# This ensures that non-adults are excluded from the adult marital status analysis.
df[marital_var_name] = df[marital_var_name].replace({0.0: np.nan})

# 2. Apply One-Hot Encoding
df[marital_var_name] = df[marital_var_name].astype('category')
marstat_dummies_final = pd.get_dummies(df[marital_var_name], prefix='marstat_final', dummy_na=True, drop_first=False)
df = pd.concat([df, marstat_dummies_final], axis=1)


print("\n--- Feature Engineering E: Demographic and Control Variables (Fixed) Completed ---")


print("\n--- Feature Engineering E: Demographic and Control Variables Completed ---")

# ==============================================================================
# 5. Final Data Prep and Save (Complete and Optimized)
# Goal: Keep only final engineered features and essential IDs/controls.
# ==============================================================================

# 1. List of ALL desired final columns
columns_to_keep = [
    # 1. IDs/Raw Continuous/Counters 
    'pidp', 'n_age_dv', 'n_hhsize', 'n_nchild_dv', 'unemployment_spells_count', 
    
    # 2. Outcomes
    'mental_health_status', 'ghq12_continuous_score',
    
    # 3. Core Predictors (Categorical/Ordinal)
    'early_career_unemp_risk', 'employment_stability_level', 'education_level',
    
    # 4. Employment Features (Binary/Continuous) - Step D
    'is_currently_unemployed', 'last_unemployment_duration_months', 
    'inactive_lt_sick', 'inactive_home_family',
    'actively_seeking', 'marginally_attached', 'subjective_job_chance_likely',
    
    # 5. Control Variables (Binary/Simple) - Steps C & E
    'female', 'has_disability', 'financial_difficulty', 'is_urban',
    'has_vocational_qual', 
]

# 6. Add all generated dummy columns (One-Hot Encoded variables)
# Collect all prefixes used for OHE variables across all steps
dummy_prefixes = (
    'marstat_final_', # Final Marital Status (n_marstat_dv fix)
    'race_grp_',      # Ethnic Grouping (Step E)
    'finfut_',        # Future Financial Status (Step E)
    'marstat_',       # Old Marital Status (if any)
    'race_',          # Old Ethnicity (if any)
)

# Dynamically capture all columns starting with the specified prefixes
dummy_cols = [col for col in df.columns if col.startswith(dummy_prefixes)]

# Combine and ensure final columns only include those existing in the DataFrame
final_columns = list(set(columns_to_keep + dummy_cols))
final_columns = [col for col in final_columns if col in df.columns]

# Filter the DataFrame to keep only the final model columns
df_final = df[final_columns].copy()


# Save the optimized file
output_file_name = "n_indresp_FINAL_FEATURES_OPTIMIZED.csv"
output_file_path = os.path.join(processed_dir, output_file_name)
df_final.to_csv(output_file_path, index=False)
print(f"\n--- ALL Feature Engineering Stages Complete. Optimized data saved to: {output_file_path} ---")

  df.loc[df['primary_mh_indicator'] == 1.0,
  df.loc[df['n_j1none'] == 2, 'early_career_unemp_risk'] = 'Never Employed'
  df.loc[df['total_spells_since_last_int'] == 0,



--- Feature Engineering 1: Mental Health Status Completed ---
mental_health_status
Missing/Inapplicable    26833
No Reported MH Issue     6229
Associated MH Issue      2054
Other MH Issue            355
Name: count, dtype: int64

--- Feature Engineering 2: Early Career Unemployment Risk Completed ---
early_career_unemp_risk
Missing/Inapplicable                29180
Stable Start: Left Long-Term Job     4078
Early Failure: Quick Exit            1171
Stable Start: Still in First Job      691
Never Employed                        275
Extended Unemp after Quick Exit        76
Name: count, dtype: int64

--- Feature Engineering 3: Employment Stability Completed ---
employment_stability_level
Missing/Inapplicable                           14142
Moderate Stability: Single Employment Spell    12422
Low Stability: Non-Employment Spells Only       8269
High Stability: No change reported               606
High Volatility: Frequent Changes                 32
Name: count, dtype: int64

--- Feature E