In [None]:
!pip install ace_tools



In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Load the dataset
file_path = "Prevalence_Sex_Age_Year_ICD.csv"  # Change this to the correct path
df = pd.read_csv(file_path)

# Step 1: Handling Missing Data using KNN Imputation
#
# KNNImputer finds the neighbours of a row from the columns that ARE observed
# in that row. If the imputer is given only the 'p' column, a row whose 'p' is
# missing has no observed feature left, every distance is undefined, and the
# imputer silently falls back to the column mean. The neighbour search
# therefore has to see the descriptive columns as well; only the imputed 'p'
# is copied back afterwards.
categorical_cols = ['sex', 'Age_Group', 'icd_code']
knn_features = df[categorical_cols + ['year', 'p']].copy()  # assumes 'year' and 'p' are numeric

for col in categorical_cols:
    codes = knn_features[col].astype('category').cat.codes.astype('float64')
    # cat.codes uses -1 for a missing category; turn it back into NaN so the
    # imputer treats it as "unknown" rather than as a real category value.
    knn_features[col] = codes.where(codes >= 0)

# KNNImputer drops columns that contain no observed value at all, which would
# shift the column positions of its output. Remove such columns up front so
# the output columns line up with knn_features.columns.
knn_features = knn_features.dropna(axis=1, how='all')
if 'p' not in knn_features.columns:
    raise ValueError("Column 'p' has no observed values; nothing to impute from.")

# NOTE(review): the integer category codes are an arbitrary ordinal encoding
# and the features are not rescaled, so distances are only a rough notion of
# similarity. Revisit if imputation quality matters.
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(knn_features)

# Keep the original (string) categorical columns; replace only 'p'.
df_impute = df.copy()
df_impute['p'] = imputed_values[:, knn_features.columns.get_loc('p')]

# Step 2: Data Transformation

# Rescale the prevalence column 'p' to the [0, 1] range.
scaler = MinMaxScaler()
df_impute['p'] = scaler.fit_transform(df_impute[['p']]).ravel()

# One-hot encode the categorical columns; drop='first' removes the first
# level of each column so the resulting indicators are not redundant.
onehot_cols = ['sex', 'Age_Group']
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_columns = encoder.fit_transform(df_impute[onehot_cols])
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(onehot_cols),
)

# Swap the raw categorical columns for their one-hot indicators.
df_transformed = pd.concat(
    [df_impute.drop(columns=onehot_cols), encoded_df],
    axis=1,
)

# Step 3: Temporal Splitting
def assign_period(year):
    """Return the study-period label for ``year``.

    The three periods are inclusive ranges: 1997-2002, 2003-2008 and
    2009-2014. Any value outside all of them yields 'Other'; this includes
    values that fall between two ranges and NaN, for which every comparison
    is false.
    """
    periods = (
        (1997, 2002, '1997-2002'),
        (2003, 2008, '2003-2008'),
        (2009, 2014, '2009-2014'),
    )
    for first_year, last_year, label in periods:
        if first_year <= year <= last_year:
            return label
    return 'Other'

# Tag every row with the study period its 'year' value falls into.
df_transformed['time_period'] = df_transformed['year'].map(assign_period)

# Write the preprocessed table to disk, leaving out the row index.
output_path = "Preprocessed_Comorbidity_Data.csv"
df_transformed.to_csv(output_path, index=False)

print("Preprocessing complete. Data saved to 'Preprocessed_Comorbidity_Data.csv'.")


Preprocessing complete. Data saved to 'Preprocessed_Comorbidity_Data.csv'.
