In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
print(df.head())

   patient_id sex  age_group  height_cm  weight_kg  waist_circumference_cm  \
0       12675   M         60        165         55                    76.0   
1        3097   M         40        175         80                    94.0   
2       45345   M         35        180         90                    98.0   
3       40390   M         40        175         70                    77.8   
4       36755   M         45        175         65                    78.0   

   vision_left  vision_right  hearing_left  hearing_right  ...  \
0          1.0           1.0           1.0            1.0  ...   
1          1.0           1.0           1.0            1.0  ...   
2          1.2           1.5           1.0            1.0  ...   
3          1.0           1.2           1.0            1.0  ...   
4          1.5           1.5           1.0            1.0  ...   

   hemoglobin_level  urine_protein_level  serum_creatinine  ast_enzyme_level  \
0              14.1                  1.0              

In [3]:
id_column = df_test[['patient_id']]
id_column.to_csv('data/ids.csv', index=False)
print("✅ ID column saved to 'ids.csv'")
df_test = df_test.drop('patient_id', axis=1)
print("\nDataFrame head after dropping ID:")
print(f"\nRemaining columns in main DataFrame: {df_test.columns.tolist()}")
df = df.drop('patient_id', axis=1)
print("\nDataFrame head after dropping ID:")
print(f"\nRemaining columns in main DataFrame: {df.columns.tolist()}")

✅ ID column saved to 'ids.csv'

DataFrame head after dropping ID:

Remaining columns in main DataFrame: ['sex', 'age_group', 'height_cm', 'weight_kg', 'waist_circumference_cm', 'vision_left', 'vision_right', 'hearing_left', 'hearing_right', 'bp_systolic', 'bp_diastolic', 'fasting_glucose', 'total_cholesterol', 'triglycerides', 'hdl_cholesterol', 'ldl_cholesterol', 'hemoglobin_level', 'urine_protein_level', 'serum_creatinine', 'ast_enzyme_level', 'alt_enzyme_level', 'ggt_enzyme_level', 'oral_health_status', 'dental_cavity_status', 'tartar_presence']

DataFrame head after dropping ID:

Remaining columns in main DataFrame: ['sex', 'age_group', 'height_cm', 'weight_kg', 'waist_circumference_cm', 'vision_left', 'vision_right', 'hearing_left', 'hearing_right', 'bp_systolic', 'bp_diastolic', 'fasting_glucose', 'total_cholesterol', 'triglycerides', 'hdl_cholesterol', 'ldl_cholesterol', 'hemoglobin_level', 'urine_protein_level', 'serum_creatinine', 'ast_enzyme_level', 'alt_enzyme_level', 'ggt_e

In [4]:
sex_mapping = {'M': 1, 'F': 0}
df['sex_encoded'] = df['sex'].map(sex_mapping)
df = df.drop('sex', axis=1)
df_test['sex_encoded'] = df_test['sex'].map(sex_mapping)
df_test = df_test.drop('sex', axis=1)
print("✅ 'sex' column encoded.")

yes_no_mapping = {'Y': 1, 'N': 0}
df['oral_health_status_encoded'] = df['oral_health_status'].map(yes_no_mapping)
df['tartar_presence_encoded'] = df['tartar_presence'].map(yes_no_mapping)
df = df.drop(['oral_health_status', 'tartar_presence'], axis=1)
df_test['oral_health_status_encoded'] = df_test['oral_health_status'].map(yes_no_mapping)
df_test['tartar_presence_encoded'] = df_test['tartar_presence'].map(yes_no_mapping)
df_test = df_test.drop(['oral_health_status', 'tartar_presence'], axis=1)
print("✅ 'oral_health_status' and 'tartar_presence' encoded.")

df_test.to_csv('data/test_processed.csv', index=False)
print("✅ Encoded test features saved to 'test_processed.csv'.")
df.to_csv('data/train_processed.csv', index=False)
print("✅ Encoded test features saved to 'train_processed.csv'.")

✅ 'sex' column encoded.
✅ 'oral_health_status' and 'tartar_presence' encoded.
✅ Encoded test features saved to 'test_processed.csv'.
✅ Encoded test features saved to 'train_processed.csv'.


In [5]:
print(df.head())
print(f"\nRemaining columns in main DataFrame: {df.columns.tolist()}")

   age_group  height_cm  weight_kg  waist_circumference_cm  vision_left  \
0         60        165         55                    76.0          1.0   
1         40        175         80                    94.0          1.0   
2         35        180         90                    98.0          1.2   
3         40        175         70                    77.8          1.0   
4         45        175         65                    78.0          1.5   

   vision_right  hearing_left  hearing_right  bp_systolic  bp_diastolic  ...  \
0           1.0           1.0            1.0         96.0          70.0  ...   
1           1.0           1.0            1.0        126.0          79.0  ...   
2           1.5           1.0            1.0        117.0          76.0  ...   
3           1.2           1.0            1.0        118.0          66.0  ...   
4           1.5           1.0            1.0        112.0          78.0  ...   

   urine_protein_level  serum_creatinine  ast_enzyme_level  alt_enzy