In [2]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Fixed seed for NumPy's global random state.
seed = 1
np.random.seed(seed)

# Project root. The original machine-specific location remains the default so
# existing behaviour is unchanged; set the BMI_PROJECT_ROOT environment variable
# to run this notebook from a different checkout without editing the cell.
root_dir = os.environ.get(
    "BMI_PROJECT_ROOT",
    "E:\\Repositories\\personal-projects\\ai-ml-projects\\bmi-classification",
)
# Switch the working directory to the project root.
os.chdir(root_dir)

In [3]:
# Directory under the project root that is handed to Dataset as its data_path.
data_path = os.path.join(
    root_dir,
    "data",
    "intermediate",
)

In [4]:
class Dataset:
    """Load CSV files from a directory and preprocess them into one frame.

    Parameters
    ----------
    data_path : str
        Directory scanned for ``.csv`` files. ``preprocessing`` also writes the
        fitted scaler and the scaled column names into this directory.
    std_scaler : optional
        Object exposing ``fit_transform``. When omitted, a fresh
        ``StandardScaler()`` is created for this instance.
    one_hot_encoder : optional
        Object exposing ``fit_transform`` and ``get_feature_names_out``. When
        omitted, a fresh ``OneHotEncoder(sparse_output=False)`` is created for
        this instance.
    """

    def __init__(self, data_path, std_scaler=None, one_hot_encoder=None):
        self.data_path = data_path
        # Build the defaults here rather than in the signature: a default such
        # as ``StandardScaler()`` is evaluated once when the class is defined,
        # so every Dataset would otherwise share (and refit) the same object.
        self.std_scaler = StandardScaler() if std_scaler is None else std_scaler
        self.one_hot_encoder = (
            OneHotEncoder(sparse_output=False)
            if one_hot_encoder is None
            else one_hot_encoder
        )

    def load_data(self):
        """Read every ``.csv`` file in ``self.data_path`` into one DataFrame.

        Files are read in sorted name order so the row order of the result does
        not depend on the order in which the OS lists the directory.

        Returns
        -------
        pandas.DataFrame or None
            All rows concatenated with a fresh 0..n-1 index, or ``None`` (after
            printing a notice) when the directory contains no CSV files.
        """
        csv_names = sorted(
            name for name in os.listdir(self.data_path) if name.endswith(".csv")
        )
        if not csv_names:
            print("No CSV files found in the specified directory.")
            return None

        frames = [
            pd.read_csv(os.path.join(self.data_path, name)) for name in csv_names
        ]
        # Combine all data frames into a single data frame
        return pd.concat(frames, ignore_index=True)

    def preprocessing(self):
        """One-hot encode the ``Index`` column and standardize all other columns.

        Side effects: writes ``std_scaler.pkl`` (the fitted scaler) and
        ``feature_columns.pkl`` (the names of the scaled columns) into
        ``self.data_path`` via ``joblib.dump``.

        Returns
        -------
        pandas.DataFrame or None
            The scaled feature columns followed by the one-hot ``Index_*``
            columns, or ``None`` when ``load_data`` found no CSV files.
        """
        data = self.load_data()
        if data is None:
            return None

        # Encode the 'Index' column
        index_encoded = self.one_hot_encoder.fit_transform(data[['Index']])
        index_encoded_df = pd.DataFrame(
            index_encoded,
            columns=self.one_hot_encoder.get_feature_names_out(['Index']),
            # Same row labels as ``data`` so the axis=1 concat below aligns rows.
            index=data.index,
        )

        # Drop the original 'Index' column and concatenate the encoded columns
        data = data.drop('Index', axis=1)
        data = pd.concat([data, index_encoded_df], axis=1)

        # Standardize the features: every column that is not a one-hot column.
        # ``Index.difference`` returns the names sorted, which is the column
        # order the scaler is fitted on and the order saved to disk.
        # NOTE(review): the scaler is fitted on the whole dataset, before any
        # train/test split -- confirm this is intended.
        feature_columns = data.columns.difference(index_encoded_df.columns)
        data[feature_columns] = self.std_scaler.fit_transform(data[feature_columns])
        joblib.dump(self.std_scaler, os.path.join(self.data_path, "std_scaler.pkl"))
        joblib.dump(feature_columns, os.path.join(self.data_path, "feature_columns.pkl"))

        return data

# Build the Dataset on the intermediate-data directory and run its
# preprocessing step; `data` is what the later save cell writes out.
dataset = Dataset(data_path)
data = dataset.preprocessing()

# preprocessing() returns None when no CSV files were found, so only preview
# the first rows when a frame actually came back.
if data is not None:
    print(data.head())


     Gender    Height    Weight  Index_0  Index_1  Index_2  Index_3  Index_4  \
0  1.011174  0.235303 -0.310062      0.0      0.0      0.0      0.0      1.0   
1  1.011174  1.147330 -0.588376      0.0      0.0      1.0      0.0      0.0   
2 -0.988950  0.904123  0.122870      0.0      0.0      0.0      0.0      1.0   
3 -0.988950  1.512141 -0.062672      0.0      0.0      0.0      1.0      0.0   
4  1.011174 -1.284742 -1.392394      0.0      0.0      0.0      1.0      0.0   

   Index_5  
0      0.0  
1      0.0  
2      0.0  
3      0.0  
4      0.0  


In [12]:
# Persist the preprocessed frame to <root_dir>/data/processed/bmi_data.csv.
if data is None:
    # preprocessing() returns None when no CSV files were found; fail with a
    # clear message instead of an AttributeError on None.
    raise RuntimeError("No preprocessed data available to save.")

processed_dir = os.path.join(root_dir, 'data', 'processed')
# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs(processed_dir, exist_ok=True)
data.to_csv(os.path.join(processed_dir, 'bmi_data.csv'), index=False)