In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the merged 2022 dataset from the working directory
data = pd.read_csv(
    'merged_data_2022_with_traits_and_values_cleaned_final_cleaned.csv'
)

# 1. Column dtypes and non-null counts first (info() prints its own report),
#    then descriptive statistics for the numeric columns
print("Dataset Information:")
data.info()

print("\nSummary Statistics:", data.describe(), sep="\n")




Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995 entries, 0 to 994
Data columns (total 39 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ch22o162               995 non-null    object 
 1   nomem_encr             995 non-null    int64  
 2   uy22a016               995 non-null    object 
 3   uy22a015               995 non-null    object 
 4   cr22o143               995 non-null    float64
 5   cr22o144               995 non-null    object 
 6   cs22o439               995 non-null    object 
 7   cs22o487               995 non-null    object 
 8   cs22o280               995 non-null    object 
 9   cs22o436               995 non-null    object 
 10  cs22o472               995 non-null    object 
 11  cs22o473               995 non-null    object 
 12  cs22o474               995 non-null    object 
 13  cs22o577               995 non-null    object 
 14  cs22o578               995 non-null  

In [2]:
# Show the raw origin-group column, then tally every distinct value (NaN included)
print(
    data['herkomstgroep'],
    data['herkomstgroep'].value_counts(dropna=False),
    sep='\n',
)

0      102
1      102
2        0
3        0
4        0
      ... 
990    101
991      0
992      0
993      0
994      0
Name: herkomstgroep, Length: 995, dtype: object
herkomstgroep
0      760
102     73
201     50
202     42
101     36
        34
Name: count, dtype: int64


In [3]:
import pandas as pd

# Turn the origin-group codes into numbers; entries that cannot be parsed
# as a number are coerced to NaN instead of raising.
data['herkomstgroep'] = data['herkomstgroep'].pipe(pd.to_numeric, errors='coerce')

# Recoding logic
def recode_background(value):
    """Collapse a numeric ``herkomstgroep`` code into a two-level indicator.

    Returns:
        1 for the codes 0, 101 and 201 (labelled "Western background"),
        2 for the codes 102 and 202 (labelled "Non-Western/Ethnic background"),
        None for anything else, including 999 (unknown origin) and NaN.
    """
    western_codes = (0, 101, 201)
    non_western_codes = (102, 202)

    if value in western_codes:
        return 1  # Western background
    if value in non_western_codes:
        return 2  # Non-Western/Ethnic background
    # 999 (unknown origin) and any other unexpected value are treated as missing.
    return None

# Derive the two-level background indicator from the numeric origin codes
data['Ethnic_Background'] = data['herkomstgroep'].map(recode_background)

# Show the category tallies (NaN included), followed by the new column itself
ethnic_background = data['Ethnic_Background']
print(ethnic_background.value_counts(dropna=False))
print(ethnic_background)


Ethnic_Background
1.0    846
2.0    115
NaN     34
Name: count, dtype: int64
0      2.0
1      2.0
2      1.0
3      1.0
4      1.0
      ... 
990    1.0
991    1.0
992    1.0
993    1.0
994    1.0
Name: Ethnic_Background, Length: 995, dtype: float64


In [7]:
# Survey variable code -> descriptive column name.
# Codes that are absent from the frame are silently skipped by rename().
column_renames = {
    'ch22o162': 'used_hallucinogens',
    'uy22a015': 'LargeSnacks_Weekly',
    'uy22a016': 'SmallSnacks_Daily',
    'cr22o143': 'Religious_Membership',
    'cr22o144': 'Religion',
    'cs22o439': 'hrs_social_media_viewing',
    'cs22o487': 'hrs_social_media_posting',
    'cs22o280': 'hrs_messaging',
    'cs22o436': 'social_media_frequency',
    'cs22o472': 'days_on_facebook',
    'cs22o473': 'days_on_twitter',
    'cs22o474': 'days_on_LinkedIn',
    'cs22o577': 'days_on_Insta',
    'cs22o578': 'days_on_Snapchat',
    'cs22o579': 'days_on_TikTok',
    'cs22o580': 'days_on_Youtube',
    'cs22o581': 'days_pinterest',
    'cv23o012': 'political_interest',
    'geslacht': 'sex',
    'leeftijd': 'age',
    'oplcat': 'education',
    'burgstat': 'marital_status',
    'woonvorm': 'living_arrangement',
    'nettoink_f': 'personal_net_income_eur',
    'standardized_score': 'mhi5_std_score_2022',
    'class': 'mhi5_class_2022',
}

# Apply the mapping to the existing frame (mutates `data` in place)
data.rename(columns=column_renames, inplace=True)

# Print the resulting column index so the renames can be checked by eye
print("Updated column names:\n", data.columns)

# Dtypes and non-null counts under the new names
print("Dataset Information:")
data.info()


Updated column names:
 Index(['used_hallucinogens', 'nomem_encr', 'SmallSnacks_Daily',
       'LargeSnacks_Weekly', 'Religious_Membership', 'Religion',
       'hrs_social_media_viewing', 'hrs_social_media_posting', 'hrs_messaging',
       'social_media_frequency', 'days_on_facebook', 'days_on_twitter',
       'days_on_LinkedIn', 'days_on_Insta', 'days_on_Snapchat',
       'days_on_TikTok', 'days_on_Youtube', 'days_pinterest',
       'political_interest', 'sex', 'age', 'marital_status',
       'living_arrangement', 'herkomstgroep', 'education', 'nettohh_f',
       'personal_net_income_eur', 'mhi5_std_score_2022', 'mental_health_class',
       'mhi5_class_2022', 'extraversion', 'agreeableness', 'conscientiousness',
       'emotional_stability', 'intellect_imagination', 'self_esteem',
       'instrumental_values', 'terminal_values', 'composite_values',
       'Ethnic_Background'],
      dtype='object')
Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995 entries, 0 t

In [9]:
# Columns whose values should be numeric (order is kept for the dtype report below)
columns_to_convert = [
    # substance use, snacking, religion
    'used_hallucinogens', 'LargeSnacks_Weekly', 'SmallSnacks_Daily',
    'Religious_Membership', 'Religion',
    # social media: hours and overall frequency
    'hrs_social_media_viewing', 'hrs_social_media_posting', 'hrs_messaging',
    'social_media_frequency',
    'political_interest',
    # social media: days per platform
    'days_on_facebook', 'days_on_twitter', 'days_on_LinkedIn', 'days_on_Insta',
    'days_on_Snapchat', 'days_on_TikTok', 'days_on_Youtube', 'days_pinterest',
    # education and income
    'education', 'nettohh_f', 'personal_net_income_eur',
]

# Column-wise numeric parsing; values that cannot be parsed are coerced to NaN
data[columns_to_convert] = data[columns_to_convert].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

# Report the resulting dtypes to confirm the conversion
print(data[columns_to_convert].dtypes)


used_hallucinogens          float64
LargeSnacks_Weekly          float64
SmallSnacks_Daily           float64
Religious_Membership        float64
Religion                    float64
hrs_social_media_viewing    float64
hrs_social_media_posting    float64
hrs_messaging               float64
social_media_frequency      float64
political_interest          float64
days_on_facebook            float64
days_on_twitter             float64
days_on_LinkedIn            float64
days_on_Insta               float64
days_on_Snapchat            float64
days_on_TikTok              float64
days_on_Youtube             float64
days_pinterest              float64
education                   float64
nettohh_f                   float64
personal_net_income_eur     float64
dtype: object


In [11]:
# Drop three columns: the raw 'herkomstgroep' codes (already recoded into
# 'Ethnic_Background'), 'mental_health_class' and 'nettohh_f'.
# errors='ignore' keeps this cell safe to re-run: `data` is rebound here, so a
# second execution would otherwise raise KeyError because the columns are gone.
data = data.drop(
    columns=['herkomstgroep', 'mental_health_class', 'nettohh_f'],
    errors='ignore',
)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995 entries, 0 to 994
Data columns (total 37 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   used_hallucinogens        992 non-null    float64
 1   nomem_encr                995 non-null    int64  
 2   SmallSnacks_Daily         991 non-null    float64
 3   LargeSnacks_Weekly        991 non-null    float64
 4   Religious_Membership      995 non-null    float64
 5   Religion                  218 non-null    float64
 6   hrs_social_media_viewing  901 non-null    float64
 7   hrs_social_media_posting  907 non-null    float64
 8   hrs_messaging             871 non-null    float64
 9   social_media_frequency    986 non-null    float64
 10  days_on_facebook          950 non-null    float64
 11  days_on_twitter           950 non-null    float64
 12  days_on_LinkedIn          950 non-null    float64
 13  days_on_Insta             950 non-null    float64
 14  days_on_Sn

In [13]:
# Save the DataFrame to a CSV file in the current directory (row index omitted)
output_path = "Renamed_col_dataset_2022.csv"
data.to_csv(output_path, index=False)

# Report the file name that was actually written, not a hard-coded label
print(f"File saved successfully as '{output_path}'")



File saved successfully as 'renamed.csv'


In [None]:
# 2. Distribution plots (individual plots for each variable)
# NOTE(review): the names below are placeholders; none of them appears in the
# column list printed after the rename step, so replace them before running.
predictors = ['Predictor1', 'Predictor2', 'Predictor3']  # Replace with actual predictor variable names
target_2022 = 'MHI5_2022'
target_2023 = 'MHI5_2023'

# Skewness of every predictor and both targets
print("\nSkewness of variables:")
skewness = data[predictors + [target_2022, target_2023]].skew()
print(skewness)

# One histogram with a KDE overlay per variable, each on its own figure
for column_name in predictors + [target_2022, target_2023]:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(data[column_name], kde=True, ax=ax)
    ax.set_title(f"Distribution of {column_name}")
    ax.set_xlabel(column_name)
    ax.set_ylabel("Frequency")
    plt.show()

# 3. Boxplots for each variable
for col in predictors + [target_2022, target_2023]:
    plt.figure(figsize=(8, 6))
    # Pass the values as `x=` explicitly. A positional Series is bound to
    # `data` in recent seaborn releases (and only accepted with a warning in
    # older ones), which changes the box orientation; `x=` keeps the values on
    # the x-axis so they match the x-label set below.
    sns.boxplot(x=data[col])
    plt.title(f"Boxplot of {col}")
    plt.xlabel(col)
    plt.show()

# Check for potential outliers
# A value is counted as an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile. NaN values are never counted,
# because both comparisons evaluate to False for them.
outlier_counts = {}
for col in predictors + [target_2022, target_2023]:
    values = data[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    is_outlier = (values < q1 - whisker) | (values > q3 + whisker)
    # int() keeps the dict printing plain Python integers
    outlier_counts[col] = int(is_outlier.sum())

print("\nOutlier counts for each variable:")
print(outlier_counts)

# 4. Correlation Matrix
# Correlate numeric columns only. Columns such as 'sex' or 'marital_status' are
# never passed through pd.to_numeric in this notebook and may still hold
# strings; DataFrame.corr() raises on non-numeric data in pandas >= 2.0
# (older versions dropped such columns silently). Selecting by dtype gives the
# same result on every pandas version.
corr_matrix = data.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix Heatmap")
plt.show()

# 5. Scatter plots for each predictor vs target variables
# Every predictor is plotted against the first target, then against the second
# (same figure order as two separate loops). Titles and y-labels are taken from
# the target name itself, so they stay correct when the target columns are
# renamed.
for target in (target_2022, target_2023):
    for predictor in predictors:
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x=data[predictor], y=data[target])
        plt.title(f"Scatter Plot: {predictor} vs {target}")
        plt.xlabel(predictor)
        plt.ylabel(target)
        plt.show()