In [1]:
import pandas as pd

# Load the dataset
file_path = r'C:\Users\u1246538\Downloads\liss-data\merged_health_data_2022_cleaned_with_mhi5.csv'
data = pd.read_csv(file_path)

# Define the columns and reverse scoring indicators for each trait
traits = {
    'extraversion': {
        'columns': {
            'cp22n020': 1,
            'cp22n025': -1,
            'cp22n030': 1,
            'cp22n035': -1,
            'cp22n040': 1,
            'cp22n045': -1,
            'cp22n050': 1,
            'cp22n055': -1,
            'cp22n060': 1,
            'cp22n065': -1
        }
    },
    'agreeableness': {
        'columns': {
            'cp22n021': -2,
            'cp22n026': 2,
            'cp22n031': -2,
            'cp22n036': 2,
            'cp22n041': -2,
            'cp22n046': 2,
            'cp22n051': -2,
            'cp22n056': 2,
            'cp22n061': 2,
            'cp22n066': 2
        }
    },
    'conscientiousness': {
        'columns': {
            'cp22n022': 3,
            'cp22n027': -3,
            'cp22n032': 3,
            'cp22n037': -3,
            'cp22n042': 3,
            'cp22n047': -3,
            'cp22n052': 3,
            'cp22n057': -3,
            'cp22n062': 3,
            'cp22n067': 3
        }
    },
    'emotional_stability': {
        'columns': {
            'cp22n023': -4,
            'cp22n028': 4,
            'cp22n033': -4,
            'cp22n038': 4,
            'cp22n043': -4,
            'cp22n048': -4,
            'cp22n053': -4,
            'cp22n058': -4,
            'cp22n063': -4,
            'cp22n068': -4
        }
    },
    'intellect_imagination': {
        'columns': {
            'cp22n024': 5,
            'cp22n029': -5,
            'cp22n034': 5,
            'cp22n039': -5,
            'cp22n044': 5,
            'cp22n049': -5,
            'cp22n054': 5,
            'cp22n059': 5,
            'cp22n064': 5,
            'cp22n069': 5
        }
    }
}

# Convert relevant columns to numeric to prevent errors
for trait, info in traits.items():
    for col in info['columns'].keys():
        data[col] = pd.to_numeric(data[col], errors='coerce')

# Calculate each trait
for trait, info in traits.items():
    # Reverse score columns where needed
    for col, reverse in info['columns'].items():
        if reverse < 0:
            data[col] = data[col].max() + data[col].min() - data[col]  # Reverse the score

    # Calculate the trait as the average of its columns
    data[trait] = data[list(info['columns'].keys())].mean(axis=1)

# Display the first few rows of the new columns for verification
print(data[['extraversion', 'agreeableness', 'conscientiousness', 'emotional_stability', 'intellect_imagination']].head())

# Optionally, save the final dataset with all traits
#data.to_csv(r'C:\Users\u1246538\Downloads\liss-data\merged_data_with_traits.csv', index=False)


   extraversion  agreeableness  conscientiousness  emotional_stability  \
0           2.1            3.7                4.4                  4.3   
1           3.5            4.0                4.4                  2.7   
2           3.7            3.7                3.8                  2.6   
3           3.6            3.5                3.1                  4.1   
4           3.7            3.1                3.2                  4.2   

   intellect_imagination  
0                    3.9  
1                    3.4  
2                    3.3  
3                    3.3  
4                    3.8  


In [3]:
import pandas as pd

# Load the dataset
file_path = r'C:\Users\u1246538\Downloads\liss-data\merged_health_data_2022_cleaned_with_mhi5.csv'
data = pd.read_csv(file_path)

# Convert columns to numeric, handling non-numeric values as NaN
columns = ['cp22n070', 'cp22n071', 'cp22n072', 'cp22n073', 'cp22n074', 
           'cp22n075', 'cp22n076', 'cp22n077', 'cp22n078', 'cp22n079']
data[columns] = data[columns].apply(pd.to_numeric, errors='coerce')

# Reverse score specified columns
reverse_scored_columns = ['cp22n074', 'cp22n079', 'cp22n078', 'cp22n077', 'cp22n072']
for col in reverse_scored_columns:
    data[col] = 8 - data[col]  # Reverse scoring on a 1-7 scale (8 - original score)

# Calculate the self-esteem score as the average of these columns
data['self_esteem'] = data[columns].mean(axis=1)

# Display the first few rows of the new column for verification
print(data[['self_esteem']].head())



   self_esteem
0          6.2
1          5.2
2          3.9
3          5.8
4          6.1


In [5]:
import pandas as pd

# Load the dataset
file_path = r'C:\Users\u1246538\Downloads\liss-data\merged_health_data_2022_cleaned_with_mhi5.csv'
data = pd.read_csv(file_path)

# Define the columns for instrumental and terminal values
instrumental_columns = [f'cp22n{str(i).zfill(3)}' for i in range(99, 117)]  # Q99 to Q116
terminal_columns = [f'cp22n{str(i).zfill(3)}' for i in range(117, 135)]  # Q117 to Q134

# Convert these columns to numeric, handling non-numeric values as NaN
all_value_columns = instrumental_columns + terminal_columns
data[all_value_columns] = data[all_value_columns].apply(pd.to_numeric, errors='coerce')

# Calculate the instrumental and terminal values as averages
data['instrumental_values'] = data[instrumental_columns].mean(axis=1)
data['terminal_values'] = data[terminal_columns].mean(axis=1)

# Calculate the composite values as the average of both instrumental and terminal values
data['composite_values'] = data[['instrumental_values', 'terminal_values']].mean(axis=1)

# Display the first few rows of the new columns for verification
print(data[['instrumental_values', 'terminal_values', 'composite_values']].head())



   instrumental_values  terminal_values  composite_values
0             5.777778         5.722222          5.750000
1             6.500000         6.500000          6.500000
2             4.333333         4.944444          4.638889
3             4.222222         5.333333          4.777778
4             5.000000         4.666667          4.833333


In [7]:
import pandas as pd
# this code is all the above in one and saved into a new dataset with the new column; extraversion, agrreableness,.... self-esteem,...composite_values
# Load the dataset
file_path = r'C:\Users\u1246538\Downloads\liss-data\merged_health_data_2022_cleaned_with_mhi5.csv'

data = pd.read_csv(file_path)

# Define the columns and reverse scoring indicators for each trait
traits = {
    'extraversion': {
        'columns': {
            'cp22n020': 1,
            'cp22n025': -1,
            'cp22n030': 1,
            'cp22n035': -1,
            'cp22n040': 1,
            'cp22n045': -1,
            'cp22n050': 1,
            'cp22n055': -1,
            'cp22n060': 1,
            'cp22n065': -1
        }
    },
    'agreeableness': {
        'columns': {
            'cp22n021': -2,
            'cp22n026': 2,
            'cp22n031': -2,
            'cp22n036': 2,
            'cp22n041': -2,
            'cp22n046': 2,
            'cp22n051': -2,
            'cp22n056': 2,
            'cp22n061': 2,
            'cp22n066': 2
        }
    },
    'conscientiousness': {
        'columns': {
            'cp22n022': 3,
            'cp22n027': -3,
            'cp22n032': 3,
            'cp22n037': -3,
            'cp22n042': 3,
            'cp22n047': -3,
            'cp22n052': 3,
            'cp22n057': -3,
            'cp22n062': 3,
            'cp22n067': 3
        }
    },
    'emotional_stability': {
        'columns': {
            'cp22n023': -4,
            'cp22n028': 4,
            'cp22n033': -4,
            'cp22n038': 4,
            'cp22n043': -4,
            'cp22n048': -4,
            'cp22n053': -4,
            'cp22n058': -4,
            'cp22n063': -4,
            'cp22n068': -4
        }
    },
    'intellect_imagination': {
        'columns': {
            'cp22n024': 5,
            'cp22n029': -5,
            'cp22n034': 5,
            'cp22n039': -5,
            'cp22n044': 5,
            'cp22n049': -5,
            'cp22n054': 5,
            'cp22n059': 5,
            'cp22n064': 5,
            'cp22n069': 5
        }
    }
}

# Convert relevant columns to numeric to prevent errors
for trait, info in traits.items():
    for col in info['columns'].keys():
        data[col] = pd.to_numeric(data[col], errors='coerce')

# Calculate each trait
for trait, info in traits.items():
    # Reverse score columns where needed
    for col, reverse in info['columns'].items():
        if reverse < 0:
            data[col] = data[col].max() + data[col].min() - data[col]  # Reverse the score
    # Calculate the trait as the average of its columns
    data[trait] = data[list(info['columns'].keys())].mean(axis=1)

# Self-esteem calculation
self_esteem_columns = ['cp22n070', 'cp22n071', 'cp22n072', 'cp22n073', 'cp22n074', 
                       'cp22n075', 'cp22n076', 'cp22n077', 'cp22n078', 'cp22n079']
data[self_esteem_columns] = data[self_esteem_columns].apply(pd.to_numeric, errors='coerce')
reverse_scored_columns = ['cp22n074', 'cp22n079', 'cp22n078', 'cp22n077', 'cp22n072']
for col in reverse_scored_columns:
    data[col] = 8 - data[col]  # Reverse scoring on a 1-7 scale

data['self_esteem'] = data[self_esteem_columns].mean(axis=1)

# Values calculation
instrumental_columns = [f'cp22n{str(i).zfill(3)}' for i in range(99, 117)]  # Q99 to Q116
terminal_columns = [f'cp22n{str(i).zfill(3)}' for i in range(117, 135)]  # Q117 to Q134
all_value_columns = instrumental_columns + terminal_columns
data[all_value_columns] = data[all_value_columns].apply(pd.to_numeric, errors='coerce')

data['instrumental_values'] = data[instrumental_columns].mean(axis=1)
data['terminal_values'] = data[terminal_columns].mean(axis=1)
data['composite_values'] = data[['instrumental_values', 'terminal_values']].mean(axis=1)

# Display the first few rows of the new columns for verification
print(data[['extraversion', 'agreeableness', 'conscientiousness', 
            'emotional_stability', 'intellect_imagination', 
            'self_esteem', 'instrumental_values', 'terminal_values', 
            'composite_values']].head())

# Save the final dataset with all traits and values
output_path = r'C:\Users\u1246538\Downloads\liss-data\merged_data_2022_with_traits_and_values.csv'
data.to_csv(output_path, index=False)

print(f"Updated dataset saved to {output_path}")


   extraversion  agreeableness  conscientiousness  emotional_stability  \
0           2.1            3.7                4.4                  4.3   
1           3.5            4.0                4.4                  2.7   
2           3.7            3.7                3.8                  2.6   
3           3.6            3.5                3.1                  4.1   
4           3.7            3.1                3.2                  4.2   

   intellect_imagination  self_esteem  instrumental_values  terminal_values  \
0                    3.9          6.2             5.777778         5.722222   
1                    3.4          5.2             6.500000         6.500000   
2                    3.3          3.9             4.333333         4.944444   
3                    3.3          5.8             4.222222         5.333333   
4                    3.8          6.1             5.000000         4.666667   

   composite_values  
0          5.750000  
1          6.500000  
2          4.6

  data['instrumental_values'] = data[instrumental_columns].mean(axis=1)
  data['terminal_values'] = data[terminal_columns].mean(axis=1)
  data['composite_values'] = data[['instrumental_values', 'terminal_values']].mean(axis=1)


In [9]:
import pandas as pd

# Load the merged dataset
file_path = r'C:\Users\u1246538\Downloads\liss-data\merged_data_2022_with_traits_and_values.csv'
data = pd.read_csv(file_path)

# List of columns to drop, including the new ones for values and traits
columns_to_drop = [
    'cp22n020', 'cp22n025', 'cp22n030', 'cp22n035', 'cp22n040', 
    'cp22n045', 'cp22n050', 'cp22n055', 'cp22n060', 'cp22n065',
    'cp22n021', 'cp22n026', 'cp22n031', 'cp22n036', 'cp22n041',
    'cp22n046', 'cp22n051', 'cp22n056', 'cp22n061', 'cp22n066',
    'cp22n022', 'cp22n027', 'cp22n032', 'cp22n037', 'cp22n042',
    'cp22n047', 'cp22n052', 'cp22n057', 'cp22n062', 'cp22n067',
    'cp22n023', 'cp22n028', 'cp22n033', 'cp22n038', 'cp22n043',
    'cp22n048', 'cp22n053', 'cp22n058', 'cp22n063', 'cp22n068',
    'cp22n024', 'cp22n029', 'cp22n034', 'cp22n039', 'cp22n044',
    'cp22n049', 'cp22n054', 'cp22n059', 'cp22n064', 'cp22n069',
    'cp22n070', 'cp22n071', 'cp22n072', 'cp22n073', 'cp22n074', 
    'cp22n075', 'cp22n076', 'cp22n077', 'cp22n078', 'cp22n079',
    
    # New columns to drop which form the "values"columns
    'cp22n099', 'cp22n100', 'cp22n101', 'cp22n102', 'cp22n103',
    'cp22n104', 'cp22n105', 'cp22n106', 'cp22n107', 'cp22n108',
    'cp22n109', 'cp22n110', 'cp22n111', 'cp22n112', 'cp22n113',
    'cp22n114', 'cp22n115', 'cp22n116', 'cp22n117', 'cp22n118',
    'cp22n119', 'cp22n120', 'cp22n121', 'cp22n122', 'cp22n123',
    'cp22n124', 'cp22n125', 'cp22n126', 'cp22n127', 'cp22n128',
    'cp22n129', 'cp22n130', 'cp22n131', 'cp22n132', 'cp22n133',
    'cp22n134'
]

# Drop the specified columns
data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save the updated dataset to a new CSV file
output_path = r'C:\Users\u1246538\Downloads\liss-data\merged_data_2022_with_traits_and_values_cleaned.csv'
data.to_csv(output_path, index=False)

print(f"Updated dataset saved to {output_path}")


Updated dataset saved to C:\Users\u1246538\Downloads\liss-data\merged_data_2022_with_traits_and_values_cleaned.csv


In [11]:
import pandas as pd

# Load the cleaned dataset
file_path = r'C:\Users\u1246538\Downloads\liss-data\merged_data_2022_with_traits_and_values_cleaned.csv'
data = pd.read_csv(file_path)

# Check the shape of the dataset
shape = data.shape
print(f"The shape of the dataset is: {shape}")

# Print the column names
print("The columns in the dataset are:")
print(data.columns.tolist())


The shape of the dataset is: (995, 52)
The columns in the dataset are:
['ch22o011', 'ch22o012', 'ch22o013', 'ch22o014', 'ch22o015', 'ch22o162', 'nomem_encr', 'uy22a016', 'uy22a015', 'cr22o143', 'cr22o144', 'cs22o439', 'cs22o487', 'cs22o280', 'cs22o436', 'cs22o472', 'cs22o473', 'cs22o474', 'cs22o577', 'cs22o578', 'cs22o579', 'cs22o580', 'cs22o581', 'cv23o012', 'geslacht', 'gender', 'gebjaar', 'leeftijd', 'lftdcat', 'lftdhhh', 'burgstat', 'woonvorm', 'herkomstgroep', 'oplcat', 'brutohh_f', 'nettohh_f', 'brutoink', 'brutoink_f', 'nettoink', 'nettoink_f', 'standardized_score', 'mental_health_class', 'class', 'extraversion', 'agreeableness', 'conscientiousness', 'emotional_stability', 'intellect_imagination', 'self_esteem', 'instrumental_values', 'terminal_values', 'composite_values']


In [13]:
import pandas as pd

# Load the cleaned dataset
file_path = r'C:\Users\u1246538\Downloads\liss-data\merged_data_2022_with_traits_and_values_cleaned.csv'
data = pd.read_csv(file_path)

# Define the columns to drop (MHi-5 columns)
mhi5_columns_to_drop = ['ch22o011', 'ch22o012', 'ch22o013', 'ch22o014', 'ch22o015']

# Drop the specified columns from the DataFrame
data.drop(columns=mhi5_columns_to_drop, inplace=True)

# Check the shape of the dataset after dropping columns
shape = data.shape
print(f"The shape of the dataset after dropping MHi-5 columns is: {shape}")

# Print the remaining column names
print("The remaining columns in the dataset are:")
print(data.columns.tolist())

# Optionally, save the cleaned dataset to a new CSV file (if you want to keep a copy)
output_path = r'C:\Users\u1246538\Downloads\liss-data\merged_data_2022_with_traits_and_values_cleaned_final.csv'
data.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to {output_path}")


The shape of the dataset after dropping MHi-5 columns is: (995, 47)
The remaining columns in the dataset are:
['ch22o162', 'nomem_encr', 'uy22a016', 'uy22a015', 'cr22o143', 'cr22o144', 'cs22o439', 'cs22o487', 'cs22o280', 'cs22o436', 'cs22o472', 'cs22o473', 'cs22o474', 'cs22o577', 'cs22o578', 'cs22o579', 'cs22o580', 'cs22o581', 'cv23o012', 'geslacht', 'gender', 'gebjaar', 'leeftijd', 'lftdcat', 'lftdhhh', 'burgstat', 'woonvorm', 'herkomstgroep', 'oplcat', 'brutohh_f', 'nettohh_f', 'brutoink', 'brutoink_f', 'nettoink', 'nettoink_f', 'standardized_score', 'mental_health_class', 'class', 'extraversion', 'agreeableness', 'conscientiousness', 'emotional_stability', 'intellect_imagination', 'self_esteem', 'instrumental_values', 'terminal_values', 'composite_values']
Cleaned dataset saved to C:\Users\u1246538\Downloads\liss-data\merged_data_2022_with_traits_and_values_cleaned_final.csv


In [15]:
import pandas as pd

# Load the dataset
file_path = r'C:/Users/u1246538/Downloads/liss-data/merged_df_2023_cleaned.csv'
df = pd.read_csv(file_path)

# Display the shape of the DataFrame
print("Shape of the DataFrame:", df.shape)

# Display all columns in the DataFrame
print("\nColumns in the DataFrame:")
print(df.columns.tolist())



Shape of the DataFrame: (895, 139)

Columns in the DataFrame:
['ch23p011', 'ch23p012', 'ch23p013', 'ch23p014', 'ch23p015', 'nomem_encr', 'ch22o162', 'cp22n020', 'cp22n021', 'cp22n022', 'cp22n023', 'cp22n024', 'cp22n025', 'cp22n026', 'cp22n027', 'cp22n028', 'cp22n029', 'cp22n030', 'cp22n031', 'cp22n032', 'cp22n033', 'cp22n034', 'cp22n035', 'cp22n036', 'cp22n037', 'cp22n038', 'cp22n039', 'cp22n040', 'cp22n041', 'cp22n042', 'cp22n043', 'cp22n044', 'cp22n045', 'cp22n046', 'cp22n047', 'cp22n048', 'cp22n049', 'cp22n050', 'cp22n051', 'cp22n052', 'cp22n053', 'cp22n054', 'cp22n055', 'cp22n056', 'cp22n057', 'cp22n058', 'cp22n059', 'cp22n060', 'cp22n061', 'cp22n062', 'cp22n063', 'cp22n064', 'cp22n065', 'cp22n066', 'cp22n067', 'cp22n068', 'cp22n069', 'cp22n070', 'cp22n071', 'cp22n072', 'cp22n073', 'cp22n074', 'cp22n075', 'cp22n076', 'cp22n077', 'cp22n078', 'cp22n079', 'cp22n099', 'cp22n100', 'cp22n101', 'cp22n102', 'cp22n103', 'cp22n104', 'cp22n105', 'cp22n106', 'cp22n107', 'cp22n108', 'cp22n109',

In [23]:
import pandas as pd
# this code is all the above in one and saved into a new dataset with the new column; extraversion, agrreableness,.... self-esteem,...composite_values
# Load the dataset
file_path = r'C:\Users\u1246538\Downloads\liss-data\merged_df_2023_cleaned.csv'

data = pd.read_csv(file_path)

# Define the columns and reverse scoring indicators for each trait
traits = {
    'extraversion': {
        'columns': {
            'cp22n020': 1,
            'cp22n025': -1,
            'cp22n030': 1,
            'cp22n035': -1,
            'cp22n040': 1,
            'cp22n045': -1,
            'cp22n050': 1,
            'cp22n055': -1,
            'cp22n060': 1,
            'cp22n065': -1
        }
    },
    'agreeableness': {
        'columns': {
            'cp22n021': -2,
            'cp22n026': 2,
            'cp22n031': -2,
            'cp22n036': 2,
            'cp22n041': -2,
            'cp22n046': 2,
            'cp22n051': -2,
            'cp22n056': 2,
            'cp22n061': 2,
            'cp22n066': 2
        }
    },
    'conscientiousness': {
        'columns': {
            'cp22n022': 3,
            'cp22n027': -3,
            'cp22n032': 3,
            'cp22n037': -3,
            'cp22n042': 3,
            'cp22n047': -3,
            'cp22n052': 3,
            'cp22n057': -3,
            'cp22n062': 3,
            'cp22n067': 3
        }
    },
    'emotional_stability': {
        'columns': {
            'cp22n023': -4,
            'cp22n028': 4,
            'cp22n033': -4,
            'cp22n038': 4,
            'cp22n043': -4,
            'cp22n048': -4,
            'cp22n053': -4,
            'cp22n058': -4,
            'cp22n063': -4,
            'cp22n068': -4
        }
    },
    'intellect_imagination': {
        'columns': {
            'cp22n024': 5,
            'cp22n029': -5,
            'cp22n034': 5,
            'cp22n039': -5,
            'cp22n044': 5,
            'cp22n049': -5,
            'cp22n054': 5,
            'cp22n059': 5,
            'cp22n064': 5,
            'cp22n069': 5
        }
    }
}

# Convert relevant columns to numeric to prevent errors
for trait, info in traits.items():
    for col in info['columns'].keys():
        data[col] = pd.to_numeric(data[col], errors='coerce')

# Calculate each trait
for trait, info in traits.items():
    # Reverse score columns where needed
    for col, reverse in info['columns'].items():
        if reverse < 0:
            data[col] = data[col].max() + data[col].min() - data[col]  # Reverse the score
    # Calculate the trait as the average of its columns
    data[trait] = data[list(info['columns'].keys())].mean(axis=1)

# Self-esteem calculation
self_esteem_columns = ['cp22n070', 'cp22n071', 'cp22n072', 'cp22n073', 'cp22n074', 
                       'cp22n075', 'cp22n076', 'cp22n077', 'cp22n078', 'cp22n079']
data[self_esteem_columns] = data[self_esteem_columns].apply(pd.to_numeric, errors='coerce')
reverse_scored_columns = ['cp22n074', 'cp22n079', 'cp22n078', 'cp22n077', 'cp22n072']
for col in reverse_scored_columns:
    data[col] = 8 - data[col]  # Reverse scoring on a 1-7 scale

data['self_esteem'] = data[self_esteem_columns].mean(axis=1)

# Values calculation
instrumental_columns = [f'cp22n{str(i).zfill(3)}' for i in range(99, 117)]  # Q99 to Q116
terminal_columns = [f'cp22n{str(i).zfill(3)}' for i in range(117, 135)]  # Q117 to Q134
all_value_columns = instrumental_columns + terminal_columns
data[all_value_columns] = data[all_value_columns].apply(pd.to_numeric, errors='coerce')

data['instrumental_values'] = data[instrumental_columns].mean(axis=1)
data['terminal_values'] = data[terminal_columns].mean(axis=1)
data['composite_values'] = data[['instrumental_values', 'terminal_values']].mean(axis=1)

# Display the first few rows of the new columns for verification
print(data[['extraversion', 'agreeableness', 'conscientiousness', 
            'emotional_stability', 'intellect_imagination', 
            'self_esteem', 'instrumental_values', 'terminal_values', 
            'composite_values']].head())

# Save the final dataset with all traits and values
output_path = r'C:\Users\u1246538\Downloads\liss-data\merged_df_2023_cleaned.csv'
data.to_csv(output_path, index=False)

print(f"Updated dataset saved to {output_path}")

   extraversion  agreeableness  conscientiousness  emotional_stability  \
0           2.1            3.7                4.4                  4.3   
1           3.5            4.0                4.4                  2.7   
2           3.7            3.7                3.8                  2.6   
3           3.6            3.5                3.1                  4.1   
4           3.7            3.1                3.2                  4.2   

   intellect_imagination  self_esteem  instrumental_values  terminal_values  \
0                    3.9          6.2             5.777778         5.722222   
1                    3.4          5.2             6.500000         6.500000   
2                    3.3          3.9             4.333333         4.944444   
3                    3.3          5.8             4.222222         5.333333   
4                    3.8          6.1             5.000000         4.666667   

   composite_values  
0          5.750000  
1          6.500000  
2          4.6

  data['instrumental_values'] = data[instrumental_columns].mean(axis=1)
  data['terminal_values'] = data[terminal_columns].mean(axis=1)
  data['composite_values'] = data[['instrumental_values', 'terminal_values']].mean(axis=1)


In [25]:
import pandas as pd

# Load the dataset
file_path = r'C:/Users/u1246538/Downloads/liss-data/merged_df_2023_cleaned.csv'
df = pd.read_csv(file_path)

# Display the shape of the DataFrame
print("Shape of the DataFrame:", df.shape)

# Display all columns in the DataFrame
print("\nColumns in the DataFrame:")
print(df.columns.tolist())

Shape of the DataFrame: (895, 148)

Columns in the DataFrame:
['ch23p011', 'ch23p012', 'ch23p013', 'ch23p014', 'ch23p015', 'nomem_encr', 'ch22o162', 'cp22n020', 'cp22n021', 'cp22n022', 'cp22n023', 'cp22n024', 'cp22n025', 'cp22n026', 'cp22n027', 'cp22n028', 'cp22n029', 'cp22n030', 'cp22n031', 'cp22n032', 'cp22n033', 'cp22n034', 'cp22n035', 'cp22n036', 'cp22n037', 'cp22n038', 'cp22n039', 'cp22n040', 'cp22n041', 'cp22n042', 'cp22n043', 'cp22n044', 'cp22n045', 'cp22n046', 'cp22n047', 'cp22n048', 'cp22n049', 'cp22n050', 'cp22n051', 'cp22n052', 'cp22n053', 'cp22n054', 'cp22n055', 'cp22n056', 'cp22n057', 'cp22n058', 'cp22n059', 'cp22n060', 'cp22n061', 'cp22n062', 'cp22n063', 'cp22n064', 'cp22n065', 'cp22n066', 'cp22n067', 'cp22n068', 'cp22n069', 'cp22n070', 'cp22n071', 'cp22n072', 'cp22n073', 'cp22n074', 'cp22n075', 'cp22n076', 'cp22n077', 'cp22n078', 'cp22n079', 'cp22n099', 'cp22n100', 'cp22n101', 'cp22n102', 'cp22n103', 'cp22n104', 'cp22n105', 'cp22n106', 'cp22n107', 'cp22n108', 'cp22n109',

In [27]:
import pandas as pd

# Load the merged dataset
file_path = r'C:\Users\u1246538\Downloads\liss-data\merged_df_2023_cleaned.csv'
data = pd.read_csv(file_path)

# List of columns to drop, including the new ones for values and traits
columns_to_drop = [
    'cp22n020', 'cp22n025', 'cp22n030', 'cp22n035', 'cp22n040', 
    'cp22n045', 'cp22n050', 'cp22n055', 'cp22n060', 'cp22n065',
    'cp22n021', 'cp22n026', 'cp22n031', 'cp22n036', 'cp22n041',
    'cp22n046', 'cp22n051', 'cp22n056', 'cp22n061', 'cp22n066',
    'cp22n022', 'cp22n027', 'cp22n032', 'cp22n037', 'cp22n042',
    'cp22n047', 'cp22n052', 'cp22n057', 'cp22n062', 'cp22n067',
    'cp22n023', 'cp22n028', 'cp22n033', 'cp22n038', 'cp22n043',
    'cp22n048', 'cp22n053', 'cp22n058', 'cp22n063', 'cp22n068',
    'cp22n024', 'cp22n029', 'cp22n034', 'cp22n039', 'cp22n044',
    'cp22n049', 'cp22n054', 'cp22n059', 'cp22n064', 'cp22n069',
    'cp22n070', 'cp22n071', 'cp22n072', 'cp22n073', 'cp22n074', 
    'cp22n075', 'cp22n076', 'cp22n077', 'cp22n078', 'cp22n079',
    
    # New columns to drop which form the "values"columns
    'cp22n099', 'cp22n100', 'cp22n101', 'cp22n102', 'cp22n103',
    'cp22n104', 'cp22n105', 'cp22n106', 'cp22n107', 'cp22n108',
    'cp22n109', 'cp22n110', 'cp22n111', 'cp22n112', 'cp22n113',
    'cp22n114', 'cp22n115', 'cp22n116', 'cp22n117', 'cp22n118',
    'cp22n119', 'cp22n120', 'cp22n121', 'cp22n122', 'cp22n123',
    'cp22n124', 'cp22n125', 'cp22n126', 'cp22n127', 'cp22n128',
    'cp22n129', 'cp22n130', 'cp22n131', 'cp22n132', 'cp22n133',
    'cp22n134'
]

# Drop the specified columns
data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save the updated dataset to a new CSV file
output_path = r'C:\Users\u1246538\Downloads\liss-data\merged_df_2023_cleaned_with_traits_and_values_cleaned.csv'
data.to_csv(output_path, index=False)

print(f"Updated dataset saved to {output_path}")


Updated dataset saved to C:\Users\u1246538\Downloads\liss-data\merged_df_2023_cleaned_with_traits_and_values_cleaned.csv


In [29]:
import pandas as pd

# Load the dataset
file_path = r'C:/Users/u1246538/Downloads/liss-data/merged_df_2023_cleaned_with_traits_and_values_cleaned.csv'
df = pd.read_csv(file_path)

# Display the shape of the DataFrame
print("Shape of the DataFrame:", df.shape)

# Display all columns in the DataFrame
print("\nColumns in the DataFrame:")
print(df.columns.tolist())

Shape of the DataFrame: (895, 52)

Columns in the DataFrame:
['ch23p011', 'ch23p012', 'ch23p013', 'ch23p014', 'ch23p015', 'nomem_encr', 'ch22o162', 'uy22a016', 'uy22a015', 'cr22o143', 'cr22o144', 'cs22o439', 'cs22o487', 'cs22o280', 'cs22o436', 'cs22o472', 'cs22o473', 'cs22o474', 'cs22o577', 'cs22o578', 'cs22o579', 'cs22o580', 'cs22o581', 'cv23o012', 'geslacht', 'gender', 'gebjaar', 'leeftijd', 'lftdcat', 'lftdhhh', 'burgstat', 'woonvorm', 'herkomstgroep', 'oplcat', 'brutohh_f', 'nettohh_f', 'brutoink', 'brutoink_f', 'nettoink', 'nettoink_f', 'standardized_score', 'mental_health_class', 'class', 'extraversion', 'agreeableness', 'conscientiousness', 'emotional_stability', 'intellect_imagination', 'self_esteem', 'instrumental_values', 'terminal_values', 'composite_values']


In [33]:
import pandas as pd

# Load the cleaned dataset
file_path = r'C:\Users\u1246538\Downloads\liss-data\merged_df_2023_cleaned_with_traits_and_values_cleaned.csv'
data = pd.read_csv(file_path)

# Define the columns to drop (MHi-5 columns)
mhi5_columns_to_drop = ['ch23p011', 'ch23p012', 'ch23p013', 'ch23p014', 'ch23p015']

# Drop the specified columns from the DataFrame
data.drop(columns=mhi5_columns_to_drop, inplace=True)

# Check the shape of the dataset after dropping columns
shape = data.shape
print(f"The shape of the dataset after dropping MHi-5 columns is: {shape}")

# Print the remaining column names
print("The remaining columns in the dataset are:")
print(data.columns.tolist())

# Optionally, save the cleaned dataset to a new CSV file (if you want to keep a copy)
output_path = r'C:\Users\u1246538\Downloads\liss-data\merged_df_2023_cleaned_with_traits_and_values_cleaned_final.csv'
data.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to {output_path}")

The shape of the dataset after dropping MHi-5 columns is: (895, 47)
The remaining columns in the dataset are:
['nomem_encr', 'ch22o162', 'uy22a016', 'uy22a015', 'cr22o143', 'cr22o144', 'cs22o439', 'cs22o487', 'cs22o280', 'cs22o436', 'cs22o472', 'cs22o473', 'cs22o474', 'cs22o577', 'cs22o578', 'cs22o579', 'cs22o580', 'cs22o581', 'cv23o012', 'geslacht', 'gender', 'gebjaar', 'leeftijd', 'lftdcat', 'lftdhhh', 'burgstat', 'woonvorm', 'herkomstgroep', 'oplcat', 'brutohh_f', 'nettohh_f', 'brutoink', 'brutoink_f', 'nettoink', 'nettoink_f', 'standardized_score', 'mental_health_class', 'class', 'extraversion', 'agreeableness', 'conscientiousness', 'emotional_stability', 'intellect_imagination', 'self_esteem', 'instrumental_values', 'terminal_values', 'composite_values']
Cleaned dataset saved to C:\Users\u1246538\Downloads\liss-data\merged_df_2023_cleaned_with_traits_and_values_cleaned_final.csv


In [35]:
import pandas as pd

# Load the dataset
file_path = r'C:/Users/u1246538/Downloads/liss-data/merged_health_data_2022_cleaned_with_difference_mhi5.csv'
df = pd.read_csv(file_path)

# Display the shape of the DataFrame
print("Shape of the DataFrame:", df.shape)

# Display all columns in the DataFrame
print("\nColumns in the DataFrame:")
print(df.columns.tolist())

Shape of the DataFrame: (995, 138)

Columns in the DataFrame:
['ch22o011', 'ch22o012', 'ch22o013', 'ch22o014', 'ch22o015', 'ch22o162', 'nomem_encr', 'cp22n020', 'cp22n021', 'cp22n022', 'cp22n023', 'cp22n024', 'cp22n025', 'cp22n026', 'cp22n027', 'cp22n028', 'cp22n029', 'cp22n030', 'cp22n031', 'cp22n032', 'cp22n033', 'cp22n034', 'cp22n035', 'cp22n036', 'cp22n037', 'cp22n038', 'cp22n039', 'cp22n040', 'cp22n041', 'cp22n042', 'cp22n043', 'cp22n044', 'cp22n045', 'cp22n046', 'cp22n047', 'cp22n048', 'cp22n049', 'cp22n050', 'cp22n051', 'cp22n052', 'cp22n053', 'cp22n054', 'cp22n055', 'cp22n056', 'cp22n057', 'cp22n058', 'cp22n059', 'cp22n060', 'cp22n061', 'cp22n062', 'cp22n063', 'cp22n064', 'cp22n065', 'cp22n066', 'cp22n067', 'cp22n068', 'cp22n069', 'cp22n070', 'cp22n071', 'cp22n072', 'cp22n073', 'cp22n074', 'cp22n075', 'cp22n076', 'cp22n077', 'cp22n078', 'cp22n079', 'cp22n099', 'cp22n100', 'cp22n101', 'cp22n102', 'cp22n103', 'cp22n104', 'cp22n105', 'cp22n106', 'cp22n107', 'cp22n108', 'cp22n109',

In [39]:
import pandas as pd
# this code is all the above in one and saved into a new dataset with the new column; extraversion, agrreableness,.... self-esteem,...composite_values
# Load the dataset
file_path = r'C:\Users\u1246538\Downloads\liss-data\merged_health_data_2022_cleaned_with_difference_mhi5.csv'

data = pd.read_csv(file_path)

# Define the columns and reverse scoring indicators for each trait
traits = {
    'extraversion': {
        'columns': {
            'cp22n020': 1,
            'cp22n025': -1,
            'cp22n030': 1,
            'cp22n035': -1,
            'cp22n040': 1,
            'cp22n045': -1,
            'cp22n050': 1,
            'cp22n055': -1,
            'cp22n060': 1,
            'cp22n065': -1
        }
    },
    'agreeableness': {
        'columns': {
            'cp22n021': -2,
            'cp22n026': 2,
            'cp22n031': -2,
            'cp22n036': 2,
            'cp22n041': -2,
            'cp22n046': 2,
            'cp22n051': -2,
            'cp22n056': 2,
            'cp22n061': 2,
            'cp22n066': 2
        }
    },
    'conscientiousness': {
        'columns': {
            'cp22n022': 3,
            'cp22n027': -3,
            'cp22n032': 3,
            'cp22n037': -3,
            'cp22n042': 3,
            'cp22n047': -3,
            'cp22n052': 3,
            'cp22n057': -3,
            'cp22n062': 3,
            'cp22n067': 3
        }
    },
    'emotional_stability': {
        'columns': {
            'cp22n023': -4,
            'cp22n028': 4,
            'cp22n033': -4,
            'cp22n038': 4,
            'cp22n043': -4,
            'cp22n048': -4,
            'cp22n053': -4,
            'cp22n058': -4,
            'cp22n063': -4,
            'cp22n068': -4
        }
    },
    'intellect_imagination': {
        'columns': {
            'cp22n024': 5,
            'cp22n029': -5,
            'cp22n034': 5,
            'cp22n039': -5,
            'cp22n044': 5,
            'cp22n049': -5,
            'cp22n054': 5,
            'cp22n059': 5,
            'cp22n064': 5,
            'cp22n069': 5
        }
    }
}

# Convert relevant columns to numeric to prevent errors
for trait, info in traits.items():
    for col in info['columns'].keys():
        data[col] = pd.to_numeric(data[col], errors='coerce')

# Calculate each trait
for trait, info in traits.items():
    # Reverse score columns where needed
    for col, reverse in info['columns'].items():
        if reverse < 0:
            data[col] = data[col].max() + data[col].min() - data[col]  # Reverse the score
    # Calculate the trait as the average of its columns
    data[trait] = data[list(info['columns'].keys())].mean(axis=1)

# Self-esteem calculation
self_esteem_columns = ['cp22n070', 'cp22n071', 'cp22n072', 'cp22n073', 'cp22n074', 
                       'cp22n075', 'cp22n076', 'cp22n077', 'cp22n078', 'cp22n079']
data[self_esteem_columns] = data[self_esteem_columns].apply(pd.to_numeric, errors='coerce')
reverse_scored_columns = ['cp22n074', 'cp22n079', 'cp22n078', 'cp22n077', 'cp22n072']
for col in reverse_scored_columns:
    data[col] = 8 - data[col]  # Reverse scoring on a 1-7 scale

data['self_esteem'] = data[self_esteem_columns].mean(axis=1)

# Values calculation
instrumental_columns = [f'cp22n{str(i).zfill(3)}' for i in range(99, 117)]  # Q99 to Q116
terminal_columns = [f'cp22n{str(i).zfill(3)}' for i in range(117, 135)]  # Q117 to Q134
all_value_columns = instrumental_columns + terminal_columns
data[all_value_columns] = data[all_value_columns].apply(pd.to_numeric, errors='coerce')

data['instrumental_values'] = data[instrumental_columns].mean(axis=1)
data['terminal_values'] = data[terminal_columns].mean(axis=1)
data['composite_values'] = data[['instrumental_values', 'terminal_values']].mean(axis=1)

# Display the first few rows of the new columns for verification
print(data[['extraversion', 'agreeableness', 'conscientiousness', 
            'emotional_stability', 'intellect_imagination', 
            'self_esteem', 'instrumental_values', 'terminal_values', 
            'composite_values']].head())

# Save the final dataset with all traits and values
output_path = r'C:\Users\u1246538\Downloads\liss-data\merged_health_data_2022_cleaned_with_difference_mhi5.csv'
data.to_csv(output_path, index=False)

print(f"Updated dataset saved to {output_path}")

   extraversion  agreeableness  conscientiousness  emotional_stability  \
0           2.1            3.7                4.4                  4.3   
1           3.5            4.0                4.4                  2.7   
2           3.7            3.7                3.8                  2.6   
3           3.6            3.5                3.1                  4.1   
4           3.7            3.1                3.2                  4.2   

   intellect_imagination  self_esteem  instrumental_values  terminal_values  \
0                    3.9          6.2             5.777778         5.722222   
1                    3.4          5.2             6.500000         6.500000   
2                    3.3          3.9             4.333333         4.944444   
3                    3.3          5.8             4.222222         5.333333   
4                    3.8          6.1             5.000000         4.666667   

   composite_values  
0          5.750000  
1          6.500000  
2          4.6

  data['instrumental_values'] = data[instrumental_columns].mean(axis=1)
  data['terminal_values'] = data[terminal_columns].mean(axis=1)
  data['composite_values'] = data[['instrumental_values', 'terminal_values']].mean(axis=1)


In [41]:
import pandas as pd

# Load the merged dataset
file_path = r'C:\Users\u1246538\Downloads\liss-data\merged_health_data_2022_cleaned_with_difference_mhi5.csv'
data = pd.read_csv(file_path)

# List of columns to drop, including the new ones for values and traits
columns_to_drop = [
    'cp22n020', 'cp22n025', 'cp22n030', 'cp22n035', 'cp22n040', 
    'cp22n045', 'cp22n050', 'cp22n055', 'cp22n060', 'cp22n065',
    'cp22n021', 'cp22n026', 'cp22n031', 'cp22n036', 'cp22n041',
    'cp22n046', 'cp22n051', 'cp22n056', 'cp22n061', 'cp22n066',
    'cp22n022', 'cp22n027', 'cp22n032', 'cp22n037', 'cp22n042',
    'cp22n047', 'cp22n052', 'cp22n057', 'cp22n062', 'cp22n067',
    'cp22n023', 'cp22n028', 'cp22n033', 'cp22n038', 'cp22n043',
    'cp22n048', 'cp22n053', 'cp22n058', 'cp22n063', 'cp22n068',
    'cp22n024', 'cp22n029', 'cp22n034', 'cp22n039', 'cp22n044',
    'cp22n049', 'cp22n054', 'cp22n059', 'cp22n064', 'cp22n069',
    'cp22n070', 'cp22n071', 'cp22n072', 'cp22n073', 'cp22n074', 
    'cp22n075', 'cp22n076', 'cp22n077', 'cp22n078', 'cp22n079',
    
    # New columns to drop which form the "values"columns
    'cp22n099', 'cp22n100', 'cp22n101', 'cp22n102', 'cp22n103',
    'cp22n104', 'cp22n105', 'cp22n106', 'cp22n107', 'cp22n108',
    'cp22n109', 'cp22n110', 'cp22n111', 'cp22n112', 'cp22n113',
    'cp22n114', 'cp22n115', 'cp22n116', 'cp22n117', 'cp22n118',
    'cp22n119', 'cp22n120', 'cp22n121', 'cp22n122', 'cp22n123',
    'cp22n124', 'cp22n125', 'cp22n126', 'cp22n127', 'cp22n128',
    'cp22n129', 'cp22n130', 'cp22n131', 'cp22n132', 'cp22n133',
    'cp22n134'
]

# Drop the specified columns
data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save the updated dataset to a new CSV file
output_path = r'C:\Users\u1246538\Downloads\liss-data\merged_difference_cleaned_with_traits_and_values_cleaned.csv'
data.to_csv(output_path, index=False)

print(f"Updated dataset saved to {output_path}")


Updated dataset saved to C:\Users\u1246538\Downloads\liss-data\merged_difference_cleaned_with_traits_and_values_cleaned.csv


In [45]:
import pandas as pd

# Load the cleaned dataset
file_path = r'C:\Users\u1246538\Downloads\liss-data\merged_difference_cleaned_with_traits_and_values_cleaned.csv'
data = pd.read_csv(file_path)

# Define the columns to drop (MHi-5 columns)
mhi5_columns_to_drop = ['ch22o011', 'ch22o012', 'ch22o013', 'ch22o014', 'ch22o015']

# Drop the specified columns from the DataFrame
data.drop(columns=mhi5_columns_to_drop, inplace=True)

# Check the shape of the dataset after dropping columns
shape = data.shape
print(f"The shape of the dataset after dropping MHi-5 columns is: {shape}")

# Print the remaining column names
print("The remaining columns in the dataset are:")
print(data.columns.tolist())

# Optionally, save the cleaned dataset to a new CSV file (if you want to keep a copy)
output_path = r'C:\Users\u1246538\Downloads\liss-data\merged_difference_cleaned_with_traits_and_values_cleaned_final.csv'
data.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to {output_path}")

The shape of the dataset after dropping MHi-5 columns is: (995, 46)
The remaining columns in the dataset are:
['ch22o162', 'nomem_encr', 'uy22a016', 'uy22a015', 'cr22o143', 'cr22o144', 'cs22o439', 'cs22o487', 'cs22o280', 'cs22o436', 'cs22o472', 'cs22o473', 'cs22o474', 'cs22o577', 'cs22o578', 'cs22o579', 'cs22o580', 'cs22o581', 'cv23o012', 'geslacht', 'gender', 'gebjaar', 'leeftijd', 'lftdcat', 'lftdhhh', 'burgstat', 'woonvorm', 'herkomstgroep', 'oplcat', 'brutohh_f', 'nettohh_f', 'brutoink', 'brutoink_f', 'nettoink', 'nettoink_f', 'difference_mhi5', 'change_class', 'extraversion', 'agreeableness', 'conscientiousness', 'emotional_stability', 'intellect_imagination', 'self_esteem', 'instrumental_values', 'terminal_values', 'composite_values']
Cleaned dataset saved to C:\Users\u1246538\Downloads\liss-data\merged_difference_cleaned_with_traits_and_values_cleaned_final.csv


In [51]:
import pandas as pd

# Load the dataset
file_path = r'C:\Users\u1246538\Downloads\liss-data\merged_df_2023_cleaned_with_traits_and_values_cleaned_final.csv'
data = pd.read_csv(file_path)

# Define the column groups for correlation checks
group1 = ['uy22a016', 'uy22a015']
group2 = ['cr22o143', 'cr22o144']
group3 = [
    'cs22o439', 'cs22o487', 'cs22o280', 'cs22o436', 'cs22o472', 'cs22o473',
    'cs22o474', 'cs22o577', 'cs22o578', 'cs22o579', 'cs22o580', 'cs22o581'
]

# Replace empty strings with NaN to handle missing data
data.replace(" ", pd.NA, inplace=True)

# Convert relevant columns to numeric to ensure compatibility
data[group1 + group2 + group3] = data[group1 + group2 + group3].apply(pd.to_numeric, errors='coerce')

# Calculate and display correlation for each group
def display_correlation(data, columns, group_name):
    print(f"\nCorrelation Matrix for {group_name}:")
    correlation_matrix = data[columns].corr()
    print(correlation_matrix)
    return correlation_matrix

# Group 1 Correlation
corr_group1 = display_correlation(data, group1, 'Group 1 (uy22a016, uy22a015)')

# Group 2 Correlation
corr_group2 = display_correlation(data, group2, 'Group 2 (cr22o143, cr22o144)') # vraag143 heb je eeen geloof( ja1 of nee2) vraag 144zoja welke geloof? Correlation requires paired numeric data across both columns for all observations. Since cr22o144 is missing for non-religious individuals, calculating a standard correlation isn’t feasible or meaningful.

# Group 3 Correlation
corr_group3 = display_correlation(data, group3, 'Group 3 (Other Selected Columns)')
## results show that none of these columns correlate highly with each other, so i leave them all in the datasets.


Correlation Matrix for Group 1 (uy22a016, uy22a015):
          uy22a016  uy22a015
uy22a016   1.00000   0.42535
uy22a015   0.42535   1.00000

Correlation Matrix for Group 2 (cr22o143, cr22o144):
          cr22o143  cr22o144
cr22o143       1.0       NaN
cr22o144       NaN       1.0

Correlation Matrix for Group 3 (Other Selected Columns):
          cs22o439  cs22o487  cs22o280  cs22o436  cs22o472  cs22o473  \
cs22o439  1.000000  0.270471  0.267300  0.321925 -0.243749 -0.099576   
cs22o487  0.270471  1.000000  0.193571  0.131803 -0.153342 -0.095871   
cs22o280  0.267300  0.193571  1.000000  0.145708 -0.005329 -0.031029   
cs22o436  0.321925  0.131803  0.145708  1.000000 -0.428616 -0.154808   
cs22o472 -0.243749 -0.153342 -0.005329 -0.428616  1.000000  0.150654   
cs22o473 -0.099576 -0.095871 -0.031029 -0.154808  0.150654  1.000000   
cs22o474 -0.119935 -0.047321 -0.050278 -0.164461  0.110556  0.213500   
cs22o577 -0.276233 -0.157967 -0.072769 -0.404556  0.392346  0.223106   
cs22o578 -0.

In [53]:
import pandas as pd

# Load the data
file_path = r'C:\Users\u1246538\Downloads\liss-data\merged_df_2023_cleaned_with_traits_and_values_cleaned_final.csv'
data = pd.read_csv(file_path)

# Select only the columns of interest
income_columns = ['brutohh_f', 'nettohh_f', 'brutoink', 'brutoink_f', 'nettoink', 'nettoink_f']
income_data = data[income_columns]

# Ensure columns are numeric
income_data = income_data.apply(pd.to_numeric, errors='coerce')

# Calculate the correlation matrix
income_corr_matrix = income_data.corr()

# Display the correlation matrix
print("Correlation Matrix for Income Columns:")
print(income_corr_matrix)

Correlation Matrix for Income Columns:
            brutohh_f  nettohh_f  brutoink  brutoink_f  nettoink  nettoink_f
brutohh_f    1.000000   0.967664  0.567802    0.567833  0.517508    0.515102
nettohh_f    0.967664   1.000000  0.356771    0.497512  0.494849    0.494849
brutoink     0.567802   0.356771  1.000000    1.000000  0.722010    0.709469
brutoink_f   0.567833   0.497512  1.000000    1.000000  0.962813    0.966688
nettoink     0.517508   0.494849  0.722010    0.962813  1.000000    1.000000
nettoink_f   0.515102   0.494849  0.709469    0.966688  1.000000    1.000000


In [61]:
import pandas as pd
# here i drop all vriables that are highly correlated with each other and also columns that i dont need for moddeling.
# Define the file paths for the three files
file_paths = [
    r'C:\Users\u1246538\Downloads\liss-data\merged_difference_cleaned_with_traits_and_values_cleaned_final.csv',
    r'C:\Users\u1246538\Downloads\liss-data\merged_df_2023_cleaned_with_traits_and_values_cleaned_final.csv',
    r'C:\Users\u1246538\Downloads\liss-data\merged_data_2022_with_traits_and_values_cleaned_final.csv'
]

# List of columns to drop
columns_to_drop = ['gender', 'gebjaar', 'lftdcat', 'lftdhhh', 'brutoink', 'brutoink_f', 'nettoink', 'brutohh_f']

# Loop through each file and clean it
for file_path in file_paths:
    # Load the dataset
    data = pd.read_csv(file_path)
    
    # Drop the specified columns
    data_cleaned = data.drop(columns=columns_to_drop)
    
    # Save the cleaned dataset to a new file
    output_path = file_path.replace('.csv', '_cleaned.csv')
    data_cleaned.to_csv(output_path, index=False)
    
    # Print the shape of the cleaned dataset and all columns
    print(f"Cleaned dataset saved to {output_path}")
    print(f"Shape of the cleaned dataset: {data_cleaned.shape}")
    print("Columns in the cleaned dataset:")
    print(data_cleaned.columns.tolist())
    print("\n" + "="*50 + "\n")  # Separator between the outputs for better readability


Cleaned dataset saved to C:\Users\u1246538\Downloads\liss-data\merged_difference_cleaned_with_traits_and_values_cleaned_final_cleaned.csv
Shape of the cleaned dataset: (995, 38)
Columns in the cleaned dataset:
['ch22o162', 'nomem_encr', 'uy22a016', 'uy22a015', 'cr22o143', 'cr22o144', 'cs22o439', 'cs22o487', 'cs22o280', 'cs22o436', 'cs22o472', 'cs22o473', 'cs22o474', 'cs22o577', 'cs22o578', 'cs22o579', 'cs22o580', 'cs22o581', 'cv23o012', 'geslacht', 'leeftijd', 'burgstat', 'woonvorm', 'herkomstgroep', 'oplcat', 'nettohh_f', 'nettoink_f', 'difference_mhi5', 'change_class', 'extraversion', 'agreeableness', 'conscientiousness', 'emotional_stability', 'intellect_imagination', 'self_esteem', 'instrumental_values', 'terminal_values', 'composite_values']


Cleaned dataset saved to C:\Users\u1246538\Downloads\liss-data\merged_df_2023_cleaned_with_traits_and_values_cleaned_final_cleaned.csv
Shape of the cleaned dataset: (895, 39)
Columns in the cleaned dataset:
['nomem_encr', 'ch22o162', 'uy22a0