In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from scipy.stats import shapiro, kstest, norm, probplot, chi2_contingency
import argparse
import configparser
import subprocess
from sklearn.model_selection import train_test_split
pd.set_option('display.max_rows', None)  # Display all rows

In [None]:
def open_config_file(config_file):
    """Checks that the configuration file exists and reports which one is used.

    The file is not opened or parsed here; this is only an existence check
    that prints a status message.

    Args:
        config_file: Path to the configuration file.

    Returns:
        True if ``config_file`` is an existing regular file, False otherwise.
    """
    # isfile() rather than exists(): a directory with that name is not a usable
    # config file and would otherwise be silently skipped by the parser later.
    if not os.path.isfile(config_file):
        print(f"Configuration file '{config_file}' not found.")
        return False
    print(f"Using configuration file: {config_file}")
    return True

def load_paths_and_suffix(config_file):
    """Reads the workflow folders and the cleaned-file suffix from a config file.

    The options ``input_folder_clean``, ``output_folder_clean`` and
    ``cleaned_file_suffix`` are looked up in the ``[Paths]`` section. Every
    option that cannot be found falls back to its built-in default; this also
    applies when the ``[Paths]`` section is missing entirely.

    Args:
        config_file: Path to an INI-style configuration file.

    Returns:
        dict with the keys "input_folder", "output_folder" and
        "cleaned_file_suffix".
    """
    config = configparser.ConfigParser()
    # read() skips files it cannot open, so the parser may be empty afterwards.
    config.read(config_file)

    # result key -> (option name in [Paths], default value)
    option_defaults = {
        "input_folder": ("input_folder_clean", "../OTH_DATA/training_data"),
        "output_folder": ("output_folder_clean", "../OTH_DATA/cleaned_data"),
        "cleaned_file_suffix": ("cleaned_file_suffix", "_v1"),
    }
    # get(..., fallback=...) covers a missing option as well as a missing
    # section, whereas config["Paths"] raises KeyError without the section.
    return {
        key: config.get("Paths", option, fallback=default)
        for key, (option, default) in option_defaults.items()
    }

def clean_data(data, drop_columns=None, add_target=False, target_column_name="target"):
    """Returns a cleaned copy of ``data``; the input frame is left untouched.

    Steps, in order:
      1. Drop ``drop_columns`` (names that are not present are ignored).
      2. If both "Height" and "Weight" exist, add "BMI" = Weight / Height**2,
         rounded to 2 decimals (assumes metres and kilograms -- TODO confirm
         against the data source).
      3. Replace every object-dtype column with its integer category codes.
         Missing values receive the code -1.
      4. If ``add_target`` is set and the column does not exist yet, append an
         empty column named ``target_column_name``.

    Args:
        data: Input pandas DataFrame.
        drop_columns: Optional list of column names to remove.
        add_target: Whether to append an empty target column.
        target_column_name: Name of that target column.

    Returns:
        A new DataFrame with the transformations applied.
    """
    if drop_columns:
        data = data.drop(columns=drop_columns, errors="ignore")
    else:
        # Nothing to drop: copy explicitly so the assignments below never
        # write into the caller's frame.
        data = data.copy()

    if "Height" in data.columns and "Weight" in data.columns:
        data["BMI"] = (data["Weight"] / (data["Height"] ** 2)).round(2)

    categorical_cols = data.select_dtypes(include=["object"]).columns
    for col in categorical_cols:
        data[col] = data[col].astype("category").cat.codes

    # Added only after the encoding step: an all-None column has object dtype
    # and would otherwise be converted to a column full of -1 codes.
    if add_target and target_column_name not in data.columns:
        data[target_column_name] = None
    return data

def generate_encoding_summary(original_data, encoded_data):
    """Builds a code-to-category lookup for each encoded text column.

    Only object-dtype columns of ``original_data`` that are also present in
    ``encoded_data`` are reported. The categories are derived from
    ``original_data`` by casting the column to the pandas category dtype.

    Returns:
        dict mapping column name -> {integer code: category value}.
    """
    text_columns = original_data.select_dtypes(include=["object"]).columns
    kept_columns = [name for name in text_columns if name in encoded_data.columns]
    return {
        name: dict(enumerate(original_data[name].astype("category").cat.categories))
        for name in kept_columns
    }

def print_encoding_summary(encoding_summary):
    """Writes each column's code -> category table to stdout.

    Every column gets a blank line, a "Column: <name>" header, and then one
    indented "<code> -> <category>" line per entry.
    """
    for column in encoding_summary:
        entries = encoding_summary[column]
        block = [f"\nColumn: {column}"]
        block.extend(f"  {code} -> {entries[code]}" for code in entries)
        print("\n".join(block))


In [3]:
# Interactive workflow: pick a CSV from the configured input folder, clean it,
# save it, optionally split it into train/validation/test files, and finally
# print the category encodings that were applied.

# Step 1: Open configuration file and confirm paths
config_file = "../SCRIPTS_CFG/config.txt"

if not open_config_file(config_file):
    print("Exiting due to missing or inaccessible config file.")
else:
    # Load the folders and the output-file suffix from the configuration file
    paths = load_paths_and_suffix(config_file)
    input_folder = paths["input_folder"]
    output_folder = paths["output_folder"]
    cleaned_file_suffix = paths["cleaned_file_suffix"]

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Step 2: List the CSV files. Sorted because os.listdir() returns entries
    # in arbitrary order and the menu numbers should be stable between runs.
    files = sorted(f for f in os.listdir(input_folder) if f.endswith(".csv"))
    if not files:
        print("No CSV files found in the input folder.")
    else:
        print("\nAvailable files:")
        for i, file in enumerate(files, 1):
            print(f"{i}. {file}")

        # User selects a file to clean
        try:
            choice = int(input("Enter the number of the file you want to clean: "))
            # Reject 0 and negative numbers explicitly: files[choice - 1] would
            # otherwise wrap around and silently pick a file from the end.
            if not 1 <= choice <= len(files):
                raise IndexError(choice)
            selected_file = files[choice - 1]
            print(f"Selected file: {selected_file}")
        except (ValueError, IndexError):
            print("Invalid selection.")
            selected_file = None

        if selected_file:
            # Step 3: Load the selected file
            input_path = os.path.join(input_folder, selected_file)
            data = pd.read_csv(input_path)

            # Ask for additional columns to drop ("Patient ID" is always dropped)
            user_input = input("Enter additional columns to drop (comma-separated) or press Enter to skip: ")
            drop_columns = ["Patient ID"]
            if user_input:
                # Skip empty entries produced by stray or trailing commas
                drop_columns += [col.strip() for col in user_input.split(",") if col.strip()]

            add_target = input("Add an empty target column? (yes/no): ").strip().lower() == "yes"
            target_column_name = "target"
            if add_target:
                # A blank answer falls back to the default name instead of
                # creating a column whose name is the empty string.
                target_column_name = input("Enter target column name: ").strip() or "target"

            # Clean the data
            cleaned_data = clean_data(data, drop_columns=drop_columns, add_target=add_target, target_column_name=target_column_name)

            # splitext() removes only the final extension, so a name such as
            # "data.v2.csv" keeps its "data.v2" stem in the output file names.
            base_name = os.path.splitext(selected_file)[0]

            # Save the cleaned data
            output_filename = f"cleaned_{base_name}{cleaned_file_suffix}.csv"
            output_path = os.path.join(output_folder, output_filename)
            cleaned_data.to_csv(output_path, index=False)
            print(f"Cleaned data saved to: {output_path}")

            # Reload the cleaned data so the split works on exactly what was saved
            cleaned_data = pd.read_csv(output_path)

            # Step 4: Ask the user if they want to split the data
            split_data = input("Do you want to split the data into training and testing sets? (yes/no): ").strip().lower() == "yes"
            if split_data:
                try:
                    # Prompt user for split percentages
                    train_percentage = float(input("Enter the percentage for training data (e.g., 80 for 80%): ")) / 100
                    validation_percentage = float(input("Enter the percentage for validation data (e.g., 10 for 10%): ")) / 100
                    test_percentage = float(input("Enter the percentage for test data (e.g., 10 for 10%): ")) / 100

                    # Ensure the percentages add up to 1
                    if not abs(train_percentage + validation_percentage + test_percentage - 1) < 1e-5:
                        raise ValueError("Percentages must add up to 100%!")
                    # Shares such as 100/0/0 or 120/-20/0 also sum to 100% but
                    # would hand train_test_split a zero or negative test_size.
                    if train_percentage <= 0 or test_percentage <= 0 or validation_percentage < 0:
                        raise ValueError("Training and test percentages must be greater than 0 and validation must not be negative!")
                except ValueError as e:
                    print(f"Invalid input: {e}")
                    print("Using default split: 70% train, 15% validation, 15% test.")
                    train_percentage, validation_percentage, test_percentage = 0.7, 0.15, 0.15

                # Perform the data splitting (fixed random_state keeps it reproducible)
                if validation_percentage == 0:
                    # Split into training and testing only
                    train_data, test_data = train_test_split(cleaned_data, test_size=test_percentage, random_state=42)
                    validation_data = None
                else:
                    # First carve off validation+test, then divide that remainder
                    # between validation and test in the requested proportion.
                    holdout_percentage = validation_percentage + test_percentage
                    train_data, temp_data = train_test_split(cleaned_data, test_size=holdout_percentage, random_state=42)
                    validation_data, test_data = train_test_split(temp_data, test_size=(test_percentage / holdout_percentage), random_state=42)

                # Save the split datasets
                train_output_path = os.path.join(output_folder, f"train_{base_name}{cleaned_file_suffix}.csv")
                test_output_path = os.path.join(output_folder, f"test_{base_name}{cleaned_file_suffix}.csv")
                train_data.to_csv(train_output_path, index=False)
                test_data.to_csv(test_output_path, index=False)
                print(f"Saved training data to {train_output_path}")
                print(f"Saved testing data to {test_output_path}")

                # Save validation data if it exists
                if validation_data is not None:
                    validation_output_path = os.path.join(output_folder, f"validation_{base_name}{cleaned_file_suffix}.csv")
                    validation_data.to_csv(validation_output_path, index=False)
                    print(f"Saved validation data to {validation_output_path}")

            # Step 5: Generate and print encoding summary (the mapping is taken
            # from the raw frame; the cleaned frame only selects the columns)
            encoding_summary = generate_encoding_summary(data, cleaned_data)
            print_encoding_summary(encoding_summary)


Using configuration file: ../SCRIPTS_CFG/config.txt

Available files:
1. MS_2_Scenario_data.csv


Enter the number of the file you want to clean:  1


Selected file: MS_2_Scenario_data.csv


Enter additional columns to drop (comma-separated) or press Enter to skip:  
Add an empty target column? (yes/no):  no


Cleaned data saved to: ../OTH_DATA/cleaned_data\cleaned_MS_2_Scenario_data_TESTCASE3.csv


Do you want to split the data into training and testing sets? (yes/no):  yes
Enter the percentage for training data (e.g., 80 for 80%):  80
Enter the percentage for validation data (e.g., 10 for 10%):  0
Enter the percentage for test data (e.g., 10 for 10%):  20


Saved training data to ../OTH_DATA/cleaned_data\train_MS_2_Scenario_data_TESTCASE3.csv
Saved testing data to ../OTH_DATA/cleaned_data\test_MS_2_Scenario_data_TESTCASE3.csv

Column: Gender
  0 -> Female
  1 -> Male

Column: fam_hist_over-wt
  0 -> no
  1 -> yes

Column: FAVC
  0 -> no
  1 -> yes

Column: CAEC
  0 -> Always
  1 -> Frequently
  2 -> Sometimes
  3 -> no

Column: SMOKE
  0 -> no
  1 -> yes

Column: SCC
  0 -> no
  1 -> yes

Column: CALC
  0 -> Frequently
  1 -> Sometimes
  2 -> no

Column: MTRANS
  0 -> Automobile
  1 -> Bike
  2 -> Motorbike
  3 -> Public_Transportation
  4 -> Walking

Column: Obesity_Level
  0 -> Insufficient_Weight
  1 -> Normal_Weight
  2 -> Obesity_Type_I
  3 -> Obesity_Type_II
  4 -> Obesity_Type_III
  5 -> Overweight_Level_I
  6 -> Overweight_Level_II


Enter additional columns to drop (comma-separated) or press Enter to skip:  
Add an empty target column? (yes/no):  no


Cleaned data saved to: ../OTH_DATA/cleaned_data\cleaned_MS_2_Scenario_data_TESTCASE2.csv

Column: Gender
  0 -> Female
  1 -> Male

Column: fam_hist_over-wt
  0 -> no
  1 -> yes

Column: FAVC
  0 -> no
  1 -> yes

Column: CAEC
  0 -> Always
  1 -> Frequently
  2 -> Sometimes
  3 -> no

Column: SMOKE
  0 -> no
  1 -> yes

Column: SCC
  0 -> no
  1 -> yes

Column: CALC
  0 -> Frequently
  1 -> Sometimes
  2 -> no

Column: MTRANS
  0 -> Automobile
  1 -> Bike
  2 -> Motorbike
  3 -> Public_Transportation
  4 -> Walking

Column: Obesity_Level
  0 -> Insufficient_Weight
  1 -> Normal_Weight
  2 -> Obesity_Type_I
  3 -> Obesity_Type_II
  4 -> Obesity_Type_III
  5 -> Overweight_Level_I
  6 -> Overweight_Level_II
