In [78]:
import argparse
import ast
import configparser
import os
import subprocess
import sys

import joblib
import numpy as np
import pandas as pd
from scipy.stats import shapiro, kstest, norm, probplot, chi2_contingency
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error,confusion_matrix, precision_score, recall_score, auc,roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [79]:
# def open_config_file(config_file):
#     """Opens the configuration file for user review."""
#     # Resolve the absolute path of the configuration file
#     absolute_path = os.path.abspath(config_file)

#     if not os.path.exists(absolute_path):
#         print(f"Configuration file '{absolute_path}' not found. Please ensure the file exists.")
#         return False

#     try:
#         # Open the file in the default text editor
#         if os.name == 'nt':  # Windows
#             os.startfile(absolute_path)
#         else:
#             subprocess.Popen(['open' if os.name == 'posix' else 'xdg-open', absolute_path])
        
#         print(f"Opened configuration file: {absolute_path}")
#         input("Press Enter once you've reviewed and saved the configuration file...")
#         return True
#     except Exception as e:
#         print(f"Failed to open the configuration file: {e}")
#         return False

def open_config_file(config_file):
    """Open the configuration file in the default application and wait for the user.

    Args:
        config_file: Path to the configuration file.

    Returns:
        bool: True once the user has pressed Enter; False if the file does
        not exist or an unexpected error occurred (e.g. the prompt could not
        be read).
    """
    # Check if the config file exists
    if not os.path.exists(config_file):
        print(f"Configuration file '{config_file}' not found. Please make sure it exists.")
        return False

    try:
        print(f"Opening configuration file '{config_file}' for review...")
        # An absolute path is independent of the viewer's working directory
        # and is normalised to native separators on Windows.
        target = os.path.abspath(config_file)
        try:
            if os.name == 'nt':
                # Native "open with the associated application" call.
                os.startfile(target)
            else:
                # Pass an argument list WITHOUT shell=True: with shell=True on
                # POSIX only the first list item is executed, so the file path
                # would never reach the opener.
                opener = 'open' if sys.platform == 'darwin' else 'xdg-open'
                subprocess.Popen([opener, target])
        except OSError as launch_error:
            # Opening a viewer is a convenience (it fails on headless hosts);
            # the file itself is still usable, so keep going.
            print(f"Could not launch a viewer automatically ({launch_error}). "
                  f"Please open the file manually.")
        input("Press Enter when you're ready to proceed with model training...")
        return True
    except Exception as e:
        print(f"Failed to open the configuration file: {e}")
        return False

# def open_config_file(config_file):
#     """Opens the configuration file for review."""
#     if not os.path.exists(config_file):
#         print(f"Configuration file '{config_file}' not found.")
#         return False
#     print(f"Using configuration file: {config_file}")
#     return True

def load_model_params(config_file, model_name):
    """Load one model's keyword arguments from the configuration file.

    Each value in section ``[model_name]`` is parsed as a Python literal
    (number, bool, None, string, tuple, list, dict). Anything that is not a
    literal is kept as a plain string with surrounding quotes removed, so
    ``kernel = rbf`` yields ``"rbf"``.

    Args:
        config_file: Path to the INI-style configuration file.
        model_name: Name of the section holding the model's parameters.

    Returns:
        dict: Mapping of parameter name to parsed value.

    Raises:
        KeyError: If the section is missing (also the case when the file
            itself cannot be read, because ``ConfigParser.read`` skips
            unreadable files silently).
    """
    config = configparser.ConfigParser()
    # Keep option names exactly as written. The default optionxform
    # lower-cases them, which breaks case-sensitive estimator arguments
    # such as ``C``.
    config.optionxform = str
    config.read(config_file)
    if model_name not in config:
        raise KeyError(
            f"Section [{model_name}] not found in configuration file '{config_file}'."
        )

    params = {}
    for key, value in config[model_name].items():
        try:
            # literal_eval instead of eval: it cannot execute code and never
            # resolves a bare word (e.g. ``log_loss``) to an imported object.
            params[key] = ast.literal_eval(value)
        except (ValueError, TypeError, SyntaxError):
            params[key] = value.strip("'\"")
    return params

def load_paths_and_suffix(config_file):
    """Load input/output paths and the model-name suffix from the configuration file.

    Values are read from the ``[Paths]`` section. A missing option, or a
    missing ``[Paths]`` section altogether, falls back to the built-in
    default for that key.

    Args:
        config_file: Path to the INI-style configuration file.

    Returns:
        dict: Keys ``input_folder``, ``output_folder`` and
        ``model_name_suffix`` mapped to strings.
    """
    defaults = {
        "input_folder": "../OTH_DATA/cleaned_data",
        "output_folder": "../ML_DATA/model_outputs",
        "model_name_suffix": "_v1",
    }
    config = configparser.ConfigParser()
    config.read(config_file)
    # ConfigParser.get with ``fallback`` covers both a missing option and a
    # missing section, unlike indexing config["Paths"] which raises KeyError.
    return {
        key: config.get("Paths", key, fallback=default)
        for key, default in defaults.items()
    }

def select_training_file(input_folder):
    """Prompt the user to select a training file from the input folder.

    Args:
        input_folder: Directory that is searched for ``.csv`` files.

    Returns:
        str or None: Path of the chosen file (``input_folder`` joined with
        the file name), or None when the folder holds no CSV file or the
        entered number is not a valid choice.
    """
    # os.listdir returns entries in arbitrary order; sort so the menu
    # numbering is stable between runs.
    files = sorted(f for f in os.listdir(input_folder) if f.endswith('.csv'))
    if not files:
        print("No CSV files found in the specified folder.")
        return None
    print("Available files for training:")
    for idx, file in enumerate(files, 1):
        print(f"{idx}. {file}")
    try:
        choice = int(input("Select the file number to use for training: "))
    except ValueError:
        print("Invalid selection.")
        return None
    # Explicit range check: "0" or a negative number would otherwise wrap
    # around through Python's negative indexing and pick the wrong file.
    if not 1 <= choice <= len(files):
        print("Invalid selection.")
        return None
    return os.path.join(input_folder, files[choice - 1])


In [80]:
def select_models(config_file):
    """Ask the user which models to train and build them from the configuration.

    Only the selected models are instantiated, so only their sections need
    to be present in the configuration file.

    Args:
        config_file: Path to the configuration file passed on to
            ``load_model_params``.

    Returns:
        dict: Model name mapped to an unfitted estimator, in menu order.
        Empty when no valid number was entered.
    """
    model_classes = {
        "RandomForest": RandomForestClassifier,
        "AdaBoost": AdaBoostClassifier,
        "GradientBoosting": GradientBoostingClassifier,
        "KNeighbors": KNeighborsClassifier,
        "SVC": SVC,
        "DecisionTree": DecisionTreeClassifier,
        "LogisticRegression": LogisticRegression,
        "NaiveBayes": GaussianNB,
        "NeuralNetwork": MLPClassifier,
        "XGBoost": XGBClassifier,
    }
    model_names = list(model_classes)

    # Display model selection prompt
    print("Available models for training:")
    for idx, model_name in enumerate(model_names, start=1):
        print(f"{idx}. {model_name}")

    selection = input("Enter the model numbers to train (comma-separated) or 'all' to train all models: ").strip()
    if selection.lower() == 'all':
        chosen = set(range(len(model_names)))
    else:
        chosen = set()
        for token in selection.split(","):
            token = token.strip()
            if not token:
                continue  # tolerate trailing or doubled commas
            try:
                position = int(token) - 1
            except ValueError:
                position = -1  # not a number: reported as invalid below
            if 0 <= position < len(model_names):
                chosen.add(position)
            else:
                print(f"Ignoring invalid model selection: '{token}'")

    # Instantiate lazily so an unselected model with a missing or broken
    # config section cannot abort the run.
    return {
        name: model_classes[name](**load_model_params(config_file, name))
        for position, name in enumerate(model_names)
        if position in chosen
    }

# def train_and_save_models(X, y, paths, models):
#     """Trains and saves selected models."""
#     os.makedirs(paths["output_folder"], exist_ok=True)
#     categorical_cols = X.select_dtypes(include=['object', 'category']).columns
#     numeric_cols = X.select_dtypes(include=[np.number]).columns
#     preprocessor = ColumnTransformer([
#         ('num', StandardScaler(), numeric_cols),
#         ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
#     ])
#     for model_name, model in models.items():
#         pipeline = Pipeline([
#             ('preprocessor', preprocessor),
#             ('model', model),
#         ])
#         pipeline.fit(X, y)
#         model_path = os.path.join(paths["output_folder"], f"{model_name}{paths['model_name_suffix']}.pkl")
#         joblib.dump(pipeline, model_path)
#         print(f"Saved model: {model_name} to {model_path}")

def train_and_save_models(X, y, paths, models):
    """Fit a preprocessing + model pipeline per model and save each to disk.

    Numeric columns are standardised and object/category columns are one-hot
    encoded. Each fitted pipeline is written with joblib to
    ``<output_folder>/<model_name><model_name_suffix>.pkl``.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with ``X``.
        paths: Mapping with keys ``output_folder`` and ``model_name_suffix``.
        models: Mapping of model name to unfitted estimator.

    Returns:
        None
    """
    # exist_ok avoids the check-then-create race and also creates parents.
    os.makedirs(paths["output_folder"], exist_ok=True)

    categorical_cols = X.select_dtypes(include=['object', 'category']).columns
    numeric_cols = X.select_dtypes(include=[np.number]).columns

    # ColumnTransformer drops columns it was not told about (remainder
    # defaults to 'drop'), so tell the user rather than losing them silently.
    ignored_cols = [
        col for col in X.columns
        if col not in numeric_cols and col not in categorical_cols
    ]
    if ignored_cols:
        print(f"Warning: columns with unsupported dtypes are not used for training: {ignored_cols}")

    for model_name, model in models.items():
        # Build a fresh preprocessor per model so no fitted transformer
        # instance is shared between pipelines.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numeric_cols),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
            ])
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        # Fit the pipeline
        pipeline.fit(X, y)

        # Store the post-preprocessing feature names on the pipeline so they
        # are pickled along with it.
        pipeline.feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

        # Save the trained model with the model name and specified suffix
        model_path = os.path.join(
            paths["output_folder"],
            f"{model_name}{paths['model_name_suffix']}.pkl"
        )
        joblib.dump(pipeline, model_path)
        print(f"Trained and saved model: {model_name} to {model_path}")


In [81]:
def select_features(data):
    """Ask the user which columns of ``data`` to keep.

    Accepted input: ``all`` (case-insensitive) keeps every column, an empty
    line keeps the default columns 2-6, otherwise a comma-separated list of
    1-based column numbers.

    Args:
        data: DataFrame whose columns are offered for selection.

    Returns:
        DataFrame: ``data`` itself for ``all``, otherwise the selected
        columns in the order they were entered.

    Raises:
        ValueError: If an entry is not an integer or is outside
            ``1..number of columns``.
    """
    print("Available features:")
    for idx, column in enumerate(data.columns):
        print(f"{idx + 1}: {column}")
    selected = input("Enter the feature numbers to use (comma-separated), type 'all' to select all, or press Enter for default (2,3,4,5,6): ").strip()

    if selected.lower() == 'all':
        return data

    n_columns = len(data.columns)
    if selected == '':  # Default option if no input
        # 2,3,4,5,6 are 0-based positions 1..5; keep only those that exist so
        # a narrow frame does not raise IndexError.
        default_indices = [pos for pos in range(1, 6) if pos < n_columns]
        return data.iloc[:, default_indices]

    selected_indices = []
    for token in selected.split(','):
        token = token.strip()
        if not token:
            continue  # tolerate trailing or doubled commas
        try:
            number = int(token)
        except ValueError:
            raise ValueError(f"Invalid feature number: '{token}'") from None
        # Reject 0 and negatives explicitly: iloc would wrap them around to
        # columns at the end of the frame.
        if not 1 <= number <= n_columns:
            raise ValueError(f"Feature number {number} is out of range (1-{n_columns}).")
        selected_indices.append(number - 1)
    return data.iloc[:, selected_indices]

In [82]:
def normalize_features(data):
    """Optionally apply a ``log1p`` transformation to user-selected columns.

    The user enters comma-separated 1-based column numbers, or an empty line
    to skip. Selected columns that are non-numeric or contain values <= -1
    (where ``log1p`` is undefined) are skipped with a message.

    Args:
        data: Feature DataFrame.

    Returns:
        DataFrame: ``data`` itself when nothing was selected, otherwise a
        transformed copy; the caller's frame is left unmodified.

    Raises:
        ValueError: If an entry is not an integer or is outside
            ``1..number of columns``.
    """
    print("\nAvailable features for normalization:")
    for idx, column in enumerate(data.columns):
        print(f"{idx + 1}: {column}")
    normalize = input("Enter the feature numbers to normalize (comma-separated) or press Enter to skip: ").strip()

    if not normalize:
        return data

    n_columns = len(data.columns)
    selected_indices = []
    for token in normalize.split(','):
        token = token.strip()
        if not token:
            continue  # tolerate trailing or doubled commas
        try:
            number = int(token)
        except ValueError:
            raise ValueError(f"Invalid feature number: '{token}'") from None
        # Reject 0 and negatives explicitly: they would wrap around to
        # columns at the end of the frame.
        if not 1 <= number <= n_columns:
            raise ValueError(f"Feature number {number} is out of range (1-{n_columns}).")
        selected_indices.append(number - 1)

    # Work on a copy so the frame passed in is not changed behind the
    # caller's back (and to avoid chained-assignment warnings on slices).
    data = data.copy()
    for feature in data.columns[selected_indices]:
        if not pd.api.types.is_numeric_dtype(data[feature]):
            print(f"Skipped {feature}: log transformation requires a numeric column")
            continue
        if (data[feature] <= -1).any():
            print(f"Skipped {feature}: log transformation requires values greater than -1")
            continue
        data[feature] = np.log1p(data[feature])  # Log transformation
        print(f"Applied log transformation on {feature}")
    return data

In [83]:
def select_training_file(input_folder):
    """Prompt the user to pick one CSV file from ``input_folder``.

    Note: this cell redefines ``select_training_file``; once it has run,
    this definition replaces the one from the earlier cell.

    Args:
        input_folder: Directory that is searched for ``.csv`` files.

    Returns:
        str or None: Path of the chosen file, or None when the folder holds
        no CSV file or the entered number is not a valid choice.
    """
    # os.listdir returns entries in arbitrary order; sort so the menu
    # numbering is stable between runs.
    files = sorted(f for f in os.listdir(input_folder) if f.endswith('.csv'))
    if not files:
        print("No CSV files found in the specified input folder.")
        return None
    print("Available files for model training:")
    for idx, file in enumerate(files, 1):
        print(f"{idx}. {file}")
    try:
        choice = int(input("Select the file number to use for training: "))
    except ValueError:
        # Non-numeric input used to crash the cell; report it instead.
        print("Invalid selection.")
        return None
    # Explicit range check: "0" or a negative number would otherwise wrap
    # around through negative indexing, and a too-large one raise IndexError.
    if not 1 <= choice <= len(files):
        print("Invalid selection.")
        return None
    return os.path.join(input_folder, files[choice - 1])

In [84]:
# Interactive driver: review the config, pick a training file, choose
# features and models, then train and save every selected model.
config_file = "../SCRIPTS_CFG/config.txt"
if not open_config_file(config_file):
    print("Exiting due to missing or inaccessible config file.")

else:
    # Load paths and model name suffix
    paths = load_paths_and_suffix(config_file)

    # List files in the input folder and prompt user to select one
    data_path = select_training_file(paths["input_folder"])
    if data_path is None:
        print("No valid file selected. Exiting.")

    else:
        raw_data = pd.read_csv(data_path)

        # Validate the target against the file as loaded, BEFORE feature
        # selection. Checking afterwards made the default selection (2-6)
        # fail with a misleading "does not exist" error, because the label
        # column was simply not among the chosen features.
        # target_col = input("Enter the target column (label) by name for training (e.g., 'Survived'): ")
        target_col = "Obesity_Level"
        if target_col not in raw_data.columns:
            raise ValueError(f"The specified target column '{target_col}' does not exist in the data.")

        # Select features
        data = select_features(raw_data)

        # Separate features and target variable. Labels always come from the
        # loaded file; the target is never allowed to remain a feature.
        X = data.drop(columns=[target_col], errors="ignore")
        y = raw_data[target_col]
        if X.shape[1] == 0:
            raise ValueError("No feature columns selected besides the target column.")

        # Select models to train
        models = select_models(config_file)

        # Normalize features if needed
        X = normalize_features(X)

        # Train and save models
        train_and_save_models(X, y, paths, models)
        print("Training completed and models saved.")

Opening configuration file '../SCRIPTS_CFG/config.txt' for review...


Press Enter when you're ready to proceed with model training... 


Available files for model training:
1. cleaned_MS_2_Scenario_data_TESTCASE2.csv
2. cleaned_MS_2_Scenario_data_TESTCASE3.csv
3. cleaned_MS_2_Scenario_data_testSplit.csv
4. cleaned_MS_2_Scenario_data_v1.csv
5. test_MS_2_Scenario_data_TESTCASE2.csv
6. test_MS_2_Scenario_data_TESTCASE3.csv
7. test_MS_2_Scenario_data_testSplit.csv
8. test_MS_2_Scenario_data_v1.csv
9. train_MS_2_Scenario_data_TESTCASE2.csv
10. train_MS_2_Scenario_data_TESTCASE3.csv
11. train_MS_2_Scenario_data_testSplit.csv
12. train_MS_2_Scenario_data_v1.csv
13. validation_MS_2_Scenario_data_TESTCASE2.csv
14. validation_MS_2_Scenario_data_TESTCASE3.csv
15. validation_MS_2_Scenario_data_testSplit.csv


Select the file number to use for training:  10


Available features:
1: Gender
2: Age
3: Height
4: Weight
5: fam_hist_over-wt
6: FAVC
7: FCVC
8: NCP
9: CAEC
10: SMOKE
11: CH2O
12: SCC
13: FAF
14: TUE
15: CALC
16: MTRANS
17: Obesity_Level
18: BMI


Enter the feature numbers to use (comma-separated), type 'all' to select all, or press Enter for default (2,3,4,5,6):  all


Available models for training:
1. RandomForest
2. AdaBoost
3. GradientBoosting
4. KNeighbors
5. SVC
6. DecisionTree
7. LogisticRegression
8. NaiveBayes
9. NeuralNetwork
10. XGBoost


Enter the model numbers to train (comma-separated) or 'all' to train all models:  all



Available features for normalization:
1: Gender
2: Age
3: Height
4: Weight
5: fam_hist_over-wt
6: FAVC
7: FCVC
8: NCP
9: CAEC
10: SMOKE
11: CH2O
12: SCC
13: FAF
14: TUE
15: CALC
16: MTRANS
17: BMI


Enter the feature numbers to normalize (comma-separated) or press Enter to skip:  


Trained and saved model: RandomForest to ../ML_DATA/model_outputs\RandomForest_demo.pkl




Trained and saved model: AdaBoost to ../ML_DATA/model_outputs\AdaBoost_demo.pkl
Trained and saved model: GradientBoosting to ../ML_DATA/model_outputs\GradientBoosting_demo.pkl
Trained and saved model: KNeighbors to ../ML_DATA/model_outputs\KNeighbors_demo.pkl
Trained and saved model: SVC to ../ML_DATA/model_outputs\SVC_demo.pkl
Trained and saved model: DecisionTree to ../ML_DATA/model_outputs\DecisionTree_demo.pkl
Trained and saved model: LogisticRegression to ../ML_DATA/model_outputs\LogisticRegression_demo.pkl
Trained and saved model: NaiveBayes to ../ML_DATA/model_outputs\NaiveBayes_demo.pkl
Trained and saved model: NeuralNetwork to ../ML_DATA/model_outputs\NeuralNetwork_demo.pkl


Parameters: { "use_label_encoder" } are not used.



Trained and saved model: XGBoost to ../ML_DATA/model_outputs\XGBoost_demo.pkl
Training completed and models saved.
