In [None]:
import argparse
import ast
import configparser
import os
import subprocess
import sys

import joblib
import numpy as np
import pandas as pd
from scipy.stats import shapiro, kstest, norm, probplot, chi2_contingency
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error,confusion_matrix, precision_score, recall_score, auc,roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [None]:
def open_config_file(config_file):
    """Open the configuration file in the OS default application for review.

    After handing the file to the platform opener, the function blocks on an
    ``input()`` prompt until the user confirms the review.

    Args:
        config_file: Path to the configuration file.

    Returns:
        True once the opener was launched and the user pressed Enter;
        False if the file does not exist or opening/prompting raised.
    """
    if not os.path.exists(config_file):
        print(f"Configuration file '{config_file}' not found.")
        return False
    try:
        if os.name == 'nt':  # Windows
            os.startfile(config_file)
        else:
            # os.name is 'posix' on both macOS and Linux, so it cannot choose
            # between the two openers; sys.platform distinguishes them.
            opener = 'open' if sys.platform == 'darwin' else 'xdg-open'
            subprocess.Popen([opener, config_file])
        input("Press Enter once you've reviewed the configuration file...")
        return True
    except Exception as e:
        # Intentionally broad: any failure to launch the viewer or to read
        # the confirmation is reported and signalled to the caller as False.
        print(f"Failed to open the configuration file: {e}")
        return False

def load_model_params(config_file, model_name):
    """Load one model's keyword arguments from the configuration file.

    Each option in the ``[model_name]`` section is parsed as a Python literal
    (number, bool, None, string, tuple, list, dict). Values that are not
    valid literals are kept as plain strings with surrounding quotes removed,
    so ``kernel = rbf`` and ``kernel = 'rbf'`` both yield ``'rbf'``.

    Args:
        config_file: Path to the INI-style configuration file.
        model_name: Name of the section holding the model's parameters.

    Returns:
        Dict mapping option names (case preserved) to parsed values.

    Raises:
        KeyError: If the file has no ``[model_name]`` section.
    """
    config = configparser.ConfigParser()
    # Keep option names exactly as written. The default transform lower-cases
    # them, which would turn a case-sensitive keyword such as ``C`` into ``c``.
    config.optionxform = str
    config.read(config_file)
    params = {}
    for key, value in config[model_name].items():
        try:
            # literal_eval only accepts literals, so config text can never
            # execute code or resolve to a builtin (unlike eval()).
            params[key] = ast.literal_eval(value)
        except (ValueError, TypeError, SyntaxError):
            params[key] = value.strip("'\"")
    return params

def load_paths_and_suffix(config_file):
    """Load input/output folders and the model-name suffix from the config.

    Values are read from the ``[Paths]`` section. Any option that is absent,
    including the case where the whole section or file is missing, falls
    back to its built-in default.

    Args:
        config_file: Path to the INI-style configuration file.

    Returns:
        Dict with the keys ``input_folder``, ``output_folder`` and
        ``model_name_suffix``.
    """
    config = configparser.ConfigParser()
    config.read(config_file)
    defaults = {
        "input_folder": "../OTH_DATA/cleaned_data",
        "output_folder": "../ML_DATA/model_outputs",
        "model_name_suffix": "_v1",
    }
    # config.get(..., fallback=...) covers both a missing option and a
    # missing section; indexing config["Paths"] would raise KeyError instead.
    return {
        key: config.get("Paths", key, fallback=default)
        for key, default in defaults.items()
    }

def select_training_file(input_folder):
    """Prompt the user to pick one CSV file from ``input_folder``.

    The CSV files are listed in sorted order with 1-based numbers and the
    user's answer is read with ``input()``.

    Args:
        input_folder: Directory to scan for ``.csv`` files.

    Returns:
        Path of the chosen file joined onto ``input_folder``, or None when
        the folder holds no CSV files or the answer is not a listed number.

    Raises:
        OSError: Propagated from ``os.listdir`` if the folder cannot be read.
    """
    # Sorted so the numbering is stable between runs; os.listdir makes no
    # ordering promise.
    files = sorted(f for f in os.listdir(input_folder) if f.endswith('.csv'))
    if not files:
        print("No CSV files found in the specified folder.")
        return None
    print("Available files for training:")
    for idx, file in enumerate(files, 1):
        print(f"{idx}. {file}")
    try:
        choice = int(input("Select the file number to use for training: "))
    except ValueError:
        print("Invalid selection.")
        return None
    # Explicit range check: 0 or a negative number would otherwise index
    # from the end of the list and silently pick the wrong file.
    if not 1 <= choice <= len(files):
        print("Invalid selection.")
        return None
    return os.path.join(input_folder, files[choice - 1])


In [None]:
def select_models(config_file):
    """Ask the user which models to train and build them from the config.

    The available models are printed as a numbered menu. The user answers
    with ``all`` or a comma-separated list of menu numbers. Entries that are
    not a listed number are reported and skipped.

    Args:
        config_file: Path to the configuration file that holds one section of
            constructor arguments per model (read via ``load_model_params``).

    Returns:
        Dict mapping model name to a configured, unfitted estimator, in menu
        order. Empty if no valid number was entered.
    """
    # Only the classes are listed here; estimators are instantiated after the
    # selection is known, so an unselected model needs no config section.
    model_classes = {
        "RandomForest": RandomForestClassifier,
        "AdaBoost": AdaBoostClassifier,
        "GradientBoosting": GradientBoostingClassifier,
        "KNeighbors": KNeighborsClassifier,
        "SVC": SVC,
        "DecisionTree": DecisionTreeClassifier,
        "LogisticRegression": LogisticRegression,
        "NaiveBayes": GaussianNB,
        "NeuralNetwork": MLPClassifier,
        "XGBoost": XGBClassifier,
    }
    names = list(model_classes)
    print("Available models:")
    for idx, model_name in enumerate(names, 1):
        print(f"{idx}. {model_name}")
    selection = input("Enter the model numbers to train (comma-separated) or 'all': ").strip()

    if selection.lower() == 'all':
        chosen = set(range(len(names)))
    else:
        chosen = set()
        for token in selection.split(","):
            token = token.strip()
            if not token:
                continue  # tolerate stray commas such as "1,,2" or "1,"
            try:
                number = int(token)
            except ValueError:
                print(f"Ignoring invalid model selection '{token}': not a number.")
                continue
            if 1 <= number <= len(names):
                chosen.add(number - 1)
            else:
                print(f"Ignoring model selection '{token}': expected a number from 1 to {len(names)}.")

    return {
        name: model_classes[name](**load_model_params(config_file, name))
        for idx, name in enumerate(names)
        if idx in chosen
    }

def train_and_save_models(X, y, paths, models):
    """Fit every selected model inside a preprocessing pipeline and persist it.

    Numeric columns of ``X`` are standardised and object/category columns are
    one-hot encoded (unknown categories ignored). Each model is wrapped with
    that preprocessor in a ``Pipeline``, fitted on ``(X, y)`` and written to
    ``<output_folder>/<model_name><model_name_suffix>.pkl`` with joblib.

    Args:
        X: Feature DataFrame.
        y: Target values passed to ``Pipeline.fit`` alongside ``X``.
        paths: Mapping providing ``output_folder`` and ``model_name_suffix``.
        models: Mapping of model name to estimator instance.
    """
    output_dir = paths["output_folder"]
    os.makedirs(output_dir, exist_ok=True)

    # Split columns by dtype so each group gets the matching transformer.
    cat_features = X.select_dtypes(include=['object', 'category']).columns
    num_features = X.select_dtypes(include=[np.number]).columns
    transformers = [
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ]
    # One preprocessor instance is reused by every pipeline below.
    preprocessor = ColumnTransformer(transformers)

    for model_name, model in models.items():
        steps = [('preprocessor', preprocessor), ('model', model)]
        fitted_pipeline = Pipeline(steps)
        fitted_pipeline.fit(X, y)

        filename = f"{model_name}{paths['model_name_suffix']}.pkl"
        model_path = os.path.join(output_dir, filename)
        joblib.dump(fitted_pipeline, model_path)
        print(f"Saved model: {model_name} to {model_path}")


In [None]:
def select_features(data):
    """Prompt the user to choose which columns of ``data`` to keep.

    The columns are printed with 1-based numbers. The user may answer with
    ``all``, a comma-separated list of numbers, or an empty line to accept
    the default selection (columns 2, 3, 4, 5, 6).

    Args:
        data: DataFrame whose columns are offered for selection.

    Returns:
        ``data`` itself for ``all``; otherwise a DataFrame holding the chosen
        columns in the order they were entered.

    Raises:
        ValueError: If the answer contains something that is not an integer.
        IndexError: If a number is outside ``1..number of columns``.
    """
    print("Available features:")
    for idx, column in enumerate(data.columns):
        print(f"{idx + 1}: {column}")
    selected = input("Enter the feature numbers to use (comma-separated), type 'all' to select all, or press Enter for default (2,3,4,5,6): ").strip()

    if selected.lower() == 'all':
        return data

    if selected == '':
        numbers = [2, 3, 4, 5, 6]  # default selection, 1-based as shown in the menu
    else:
        try:
            numbers = [int(token) for token in selected.split(',')]
        except ValueError:
            raise ValueError(
                f"Feature selection must be comma-separated column numbers, got '{selected}'."
            ) from None

    # Validate explicitly: 0 or a negative number would otherwise be turned
    # into a from-the-end index and silently select the wrong column.
    n_columns = data.shape[1]
    out_of_range = [n for n in numbers if not 1 <= n <= n_columns]
    if out_of_range:
        raise IndexError(
            f"Feature number(s) {out_of_range} are outside the valid range 1-{n_columns}."
        )
    return data.iloc[:, [n - 1 for n in numbers]]

In [None]:
def normalize_features(data):
    """Prompt for columns to transform and apply ``log1p`` to each of them.

    The columns are printed with 1-based numbers and the user enters a
    comma-separated list, or an empty line to skip. Non-numeric columns and
    columns containing values <= -1 (where ``log1p`` is undefined) are
    reported and left unchanged. A number entered more than once is
    transformed only once.

    Args:
        data: Feature DataFrame.

    Returns:
        ``data`` itself when nothing was selected; otherwise a transformed
        copy, leaving the caller's DataFrame untouched.

    Raises:
        ValueError: If the answer contains something that is not an integer.
        IndexError: If a number is outside ``1..number of columns``.
    """
    print("\nAvailable features for normalization:")
    for idx, column in enumerate(data.columns):
        print(f"{idx + 1}: {column}")
    normalize = input("Enter the feature numbers to normalize (comma-separated) or press Enter to skip: ").strip()

    if not normalize:
        return data

    try:
        numbers = [int(token) for token in normalize.split(',')]
    except ValueError:
        raise ValueError(
            f"Normalization selection must be comma-separated column numbers, got '{normalize}'."
        ) from None

    # Reject 0/negative/too-large numbers instead of letting them wrap around
    # to a column counted from the end.
    n_columns = data.shape[1]
    out_of_range = [n for n in numbers if not 1 <= n <= n_columns]
    if out_of_range:
        raise IndexError(
            f"Feature number(s) {out_of_range} are outside the valid range 1-{n_columns}."
        )

    data = data.copy()  # do not mutate the caller's frame
    # dict.fromkeys de-duplicates while keeping order, so repeating a number
    # cannot apply the log transformation twice.
    for number in dict.fromkeys(numbers):
        feature = data.columns[number - 1]
        column = data[feature]
        if not pd.api.types.is_numeric_dtype(column):
            print(f"Skipped {feature}: log transformation requires a numeric column")
            continue
        if (column <= -1).any():
            print(f"Skipped {feature}: log transformation is undefined for values <= -1")
            continue
        data[feature] = np.log1p(column)  # Log transformation
        print(f"Applied log transformation on {feature}")
    return data

In [None]:
def select_training_file(input_folder):
    """Prompt the user to pick one CSV file from ``input_folder``.

    NOTE: this re-defines the ``select_training_file`` from an earlier cell
    of this notebook; being later, this is the definition in effect when the
    driver cell runs. It returns None on an invalid answer, which is the
    signal the driver cell checks for.

    Args:
        input_folder: Directory to scan for ``.csv`` files.

    Returns:
        Path of the chosen file joined onto ``input_folder``, or None when
        the folder holds no CSV files or the answer is not a listed number.

    Raises:
        OSError: Propagated from ``os.listdir`` if the folder cannot be read.
    """
    # Sorted so the numbering is stable between runs; os.listdir makes no
    # ordering promise.
    files = sorted(f for f in os.listdir(input_folder) if f.endswith('.csv'))
    if not files:
        print("No CSV files found in the specified input folder.")
        return None
    print("Available files for model training:")
    for idx, file in enumerate(files, 1):
        print(f"{idx}. {file}")
    try:
        choice = int(input("Select the file number to use for training: "))
    except ValueError:
        print("Invalid selection.")
        return None
    # Explicit range check: 0 or a negative number would otherwise index
    # from the end of the list, and a too-large one would raise IndexError.
    if not 1 <= choice <= len(files):
        print("Invalid selection.")
        return None
    return os.path.join(input_folder, files[choice - 1])

In [None]:
# Driver cell: review config -> pick training CSV -> pick features ->
# pick models -> optional log transform -> train and save.
config_file = "../SCRIPTS_CFG/config.txt"
if not open_config_file(config_file):
    print("Exiting due to missing or inaccessible config file.")

else:
    # Load paths and model name suffix
    paths = load_paths_and_suffix(config_file)

    # List files in the input folder and prompt user to select one
    data_path = select_training_file(paths["input_folder"])
    if data_path is None:
        print("No valid file selected. Exiting.")

    else:
        raw_data = pd.read_csv(data_path)

        # Validate the target against the file as loaded, before any
        # prompting, so the error really means "not in the data" rather than
        # "not among the features the user happened to select".
        # target_col = input("Enter the target column (label) by name for training (e.g., 'Survived'): ")
        target_col = "Obesity_Level"
        if target_col not in raw_data.columns:
            raise ValueError(f"The specified target column '{target_col}' does not exist in the data.")

        # Select features
        data = select_features(raw_data)

        # Separate features and target variable. The label is taken from the
        # raw frame, so it is available even if it was not part of the
        # feature selection; errors="ignore" covers that same case for X.
        X = data.drop(columns=[target_col], errors="ignore")
        y = raw_data[target_col]
        if X.shape[1] == 0:
            raise ValueError("No feature columns were selected besides the target column.")

        # Select models to train
        models = select_models(config_file)

        # Normalize features if needed
        X = normalize_features(X)

        # Train and save models
        train_and_save_models(X, y, paths, models)
        print("Training completed and models saved.")

# if __name__ == "__main__":
#     parser = argparse.ArgumentParser(description="Train various models on a selected dataset.")
#     parser.add_argument('--config_file', type=str, default="config.txt", help="Path to the model parameter file.")
    
#     args = parser.parse_args()
#     main(args.config_file)