In [None]:
## SunnyX9
## 10.4.2024
## PreFer data challenge

In [None]:
import pandas as pd

# Read the PreFer background training data into `df`; every later cell works
# on this frame.
file_path = "/PreFer_train_background_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Quick sanity check: show the first five rows.
print(df.head(n=5))

In [None]:
## NA per case
#
# case           = person, identified by the first column of df
# missing values = NaN cells summed over all predictor columns of that person's rows
# instances      = total rows per person

case_ids = df.iloc[:, 0]
total_variables = len(df.columns) - 1

# Count the NaNs of every row once, then aggregate per person in a single
# pass. This replaces re-filtering the whole frame for each case.
# sort=False keeps the order of first appearance (same order as .unique());
# dropna=False keeps rows whose identifier is itself missing as their own group.
missing_per_row = df.iloc[:, 1:].isnull().sum(axis=1)
case_summary = (
    missing_per_row
    .groupby(case_ids, sort=False, dropna=False)
    .agg(["sum", "size"])
)
unique_cases = case_summary.index.to_numpy()

for case, missing_values, total_instances in case_summary.itertuples(name=None):
    total_possible_values = total_instances * total_variables

    # A frame without predictor columns has nothing that could be missing;
    # report NaN instead of dividing by zero.
    if total_possible_values:
        share_of_missing = (missing_values / total_possible_values) * 100
    else:
        share_of_missing = float("nan")

    print(f"case {case}: {missing_values} missing values out of {total_possible_values} possible values, {share_of_missing:.2f}% missing, {total_instances} instances")

In [None]:
# NA per predictor
# Count the missing values of every column except the first one, then report
# them one line per column.

na_counts = df.iloc[:, 1:].isnull().sum()
for column, missing_count in na_counts.items():
    print(f"{column} contains {missing_count} missing values")

# Predictors with lots of NAs:
# - gross and net income
# - level of urbanisation (sted)
# - edu level CBS (oplcat)
# - recruiting wave (werving)
# - migration background

In [None]:
# correlation

import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation of all columns in df.
corr = df.corr()
axis_labels = corr.columns

# Annotated heatmap on an explicitly created 10x8 figure; every cell shows the
# coefficient rounded to two decimals.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    xticklabels=axis_labels,
    yticklabels=axis_labels,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Drop some income predictors to speed things up.
# Open question: how does our algorithm react to multicollinearity?

columns_to_drop = ['brutocat', 'nettocat', 'brutohh_f', 'nettohh_f']

# Rebind `df` instead of mutating it in place, and tolerate columns that are
# already gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")


In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import time

In [None]:
## impute missing values (KNN, 3 neighbours), then standardise

# The first column is the person identifier (the "NA per case" cell groups on
# it). It is not a predictor: using it in the KNN distance or z-scoring it
# would destroy the key needed to link rows back to a person. So only the
# remaining columns are imputed and scaled, and the identifier is reattached
# unchanged afterwards.
# NOTE(review): other id-like columns may exist further right in the frame —
# confirm against the codebook.
id_values = df.iloc[:, 0]
predictors = df.iloc[:, 1:]

print("Starting imputation ")
start_time = time.time()
print(start_time)
imputer = KNNImputer(n_neighbors=3)
df_imp = imputer.fit_transform(predictors)
print("Imputation completed in {:.2f} seconds.".format(time.time() - start_time))

print("Starting scaling ")
start_time = time.time()
print(start_time)
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_imp)
print("Scaling completed in {:.2f} seconds.".format(time.time() - start_time))

# Rebuild a labelled frame on the original row index so that the identifier
# column lines up row by row when it is put back in first position.
df_imputed = pd.DataFrame(df_scaled, columns=predictors.columns, index=df.index)
df_imputed.insert(0, df.columns[0], id_values)

print("Pipeline execution completed.")


In [None]:
# Write the imputed and standardised frame to disk, without the row index.
imputed_csv_path = "df_imputed.csv"
df_imputed.to_csv(imputed_csv_path, index=False)

In [None]:
# pipeline for only scaling (no imputation; missing values are left as they are)

pipe_scaled = Pipeline([
    ('scaler', StandardScaler()),
])

# The first column is the person identifier (the "NA per case" cell groups on
# it). Standardising it would destroy the key, so only the remaining columns
# go through the scaler and the identifier is reattached unchanged.
scaled_predictors = pipe_scaled.fit_transform(df.iloc[:, 1:])

# data only scaled
df_scaled = pd.DataFrame(scaled_predictors, columns=df.columns[1:], index=df.index)
df_scaled.insert(0, df.columns[0], df.iloc[:, 0])

In [None]:
# Write the scaled frame to disk, without the row index.
scaled_csv_path = "df_scaled.csv"
df_scaled.to_csv(scaled_csv_path, index=False)