In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Location of the SOEP "pgen" dataset (SPSS .sav file)
file_path = "/path/to/your/data/pgen.sav"

# Variables to read from the file; everything else is skipped at load time.
# NOTE(review): 'pgbilzt' is requested but never renamed or referenced
# afterwards in this cell — confirm it is actually needed.
usecols = [
    'pid', 'syear', 'pgbilzeit', 'hid', 'pgfamstd',
    'pgbilzt', 'pgtatzeit', 'pgemplst', 'pglfs', 'pglabgro',
]

# Read the selected variables and give the SOEP names descriptive labels
# in one chain (no in-place mutation of the loaded frame).
df1 = pd.read_spss(file_path, usecols=usecols).rename(
    columns={
        'pgbilzeit': 'education',
        'pgfamstd': 'marital_status',
        'pgtatzeit': 'working_hours',
        'pgemplst': 'empl_status',
        'pglfs': 'lf_status',
        'pglabgro': 'gross_income',
    }
)

# Keep only survey years 2011 through 2021 (both bounds included)
df1 = df1[df1['syear'].between(2011, 2021)]

# Recode table: SPSS value label -> numeric code.
# The table is assembled group by group; insertion order matches the
# original listing, and every label string is kept verbatim.
replacement_dict = {}

# Marital-status labels -> binary indicator
# (1 = married / registered partnership, 0 = every other status).
replacement_dict.update({
    '[1] Married': 1,
    '[2] Married, But Separated': 0,
    '[3] Single': 0,
    '[4] Divorced': 0,
    '[5] [de] Verwitwet / Lebenspartner/-in aus eingetragener gleichgeschlechtlicher Partnerschaft verstorben': 0,
    '[6] husband/wife abroad': 0,
    '[7] Registered same-sex partnership': 1,
    '[7] Registered Same-Sex Partnership, Living Together': 1,
    '[8] Registered same sex partnership, separated': 0,
    '[8] Registered Same-Sex Partnership, Living Apart': 0,
})

# Labour-force-status labels -> binary indicator (1 = counted as working).
# NOTE(review): '[13] NW: but paid secondary job (since 2017)' is coded 1,
# whereas the secondary-job labels '[8]' and '[10]' (1985-2016) are coded 0 —
# confirm this asymmetry is intended.
replacement_dict.update({
    '[1] Non-working (NW): without further information': 0,
    '[2] NW: age 65 and older': 0,
    '[3] NW: and currently in training/education': 0,
    '[4] NW: on parental leave (since 1991)': 0,
    '[5] NW: in military/community service': 0,
    '[6] NW: and registered unemployed': 0,
    '[8] NW: but occasional secondary job (1985-2016)': 0,
    '[9] NW: but paid work in past 7 days (since 1999)': 0,
    '[10] NW: but regular secondary job (1985-2016)': 0,
    '[11] Working': 1,
    '[12] Working, but inactive within past 7 days (since 2000)': 1,
    '[13] NW: but paid secondary job (since 2017)': 1,
})

# Negative "missing value" labels (in their various spellings) all become NaN.
replacement_dict.update(dict.fromkeys(
    (
        '[-1] No answer',
        '[-2] Does not apply',
        '[-3] Answer Improbable',
        '[-3] Not valid',
        '[-4] Inadmissible Multiple Answer',
        '[-4] Inadmissible multiple response',
        '[-5] Not Contained In Questionnaire',
        '[-5] Not Included In Questionnaire Version',
        '[-5] Not included in this version of the questionnaire',
        '[-6] Questionnaire Version With Modified Filtering',
    ),
    np.nan,
))

# Employment-status labels -> 1 = full-time, 2 = any other listed form of
# employment or training, 0 = not employed.
replacement_dict.update({
    '[1] Full-Time Employment': 1,
    '[2] Regular Part-Time Employment': 2,
    '[3] Vocational Training': 2,
    '[4] Marginal, Irregular Part-Time Employment': 2,
    '[5] Not Employed': 0,
    '[6] Sheltered workshop (since 1998)': 2,
})

# Recode every labelled value in one element-wise pass.
#
# The previous two-step approach (DataFrame.replace(..., inplace=True) followed
# by DataFrame.applymap) did the same lookup twice, mutated a filtered frame in
# place, and relied on applymap, which is deprecated since pandas 2.1.
def _recode_label(value):
    """Return the numeric code for a known value label.

    Strings found in ``replacement_dict`` are translated to their code (or NaN);
    unknown strings and all non-string values are returned unchanged.
    """
    if isinstance(value, str):
        return replacement_dict.get(value, value)
    return value


# Cast each column to object before mapping so the lookup runs on the individual
# cell values and the result dtype is inferred from the recoded values.
df1 = df1.apply(lambda column: column.astype(object).map(_recode_label))

print(df1)
# Display the columns of the DataFrame
print(df1.columns)


              hid         pid   syear  marital_status  gross_income  \
449736       60.0       602.0  2011.0             1.0         615.0   
449737       94.0       901.0  2011.0             0.0        1783.0   
449738      159.0      1501.0  2011.0             0.0        7000.0   
449739      167.0      1601.0  2011.0             0.0           NaN   
449740      230.0      2301.0  2011.0             0.0        4300.0   
...           ...         ...     ...             ...           ...   
771205  5554900.0  55549002.0  2021.0             0.0        2100.0   
771206  5556040.0  55560402.0  2021.0             0.0           NaN   
771207  5556240.0  55562402.0  2021.0             0.0        2300.0   
771208  5556470.0  55564702.0  2021.0             0.0           NaN   
771209  5557210.0  55572102.0  2021.0             0.0        1800.0   

       empl_status  lf_status  working_hours  education  
449736           2        1.0           15.0       18.0  
449737           1        1.0  

In [15]:
# Write the recoded covariates to CSV; the DataFrame index is not exported.
df1.to_csv(
    path_or_buf="/path/to/your/directory/N.covariates.csv",
    index=False,
)