In [None]:
# data_loader.py


import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import config


def _find_excel_file(data_path):
    """Return the path of the workbook to load from *data_path*.

    Only regular files with an ``.xlsx`` extension (case-insensitive) are
    considered. Candidates are sorted by name so the choice is reproducible:
    ``os.listdir`` returns entries in arbitrary order.

    Raises:
        FileNotFoundError: If the directory holds no usable ``.xlsx`` file.
    """
    candidates = sorted(
        name
        for name in os.listdir(data_path)
        if name.lower().endswith(".xlsx")
        # "~$<name>.xlsx" is the owner/lock file Excel writes next to an open
        # workbook; it is not a readable workbook and must never be selected.
        and not name.startswith("~$")
        and os.path.isfile(os.path.join(data_path, name))
    )

    if not candidates:
        raise FileNotFoundError("No Excel file found in given path.")

    return os.path.join(data_path, candidates[0])


def load_and_prepare_data(data_path):
    """Load an Excel dataset and return scaled train/test arrays.

    Pipeline:
    - Pick the first ``.xlsx`` workbook (by sorted name) in ``data_path``.
    - Discard the first row and the first column of the sheet, then use the
      next row as the column names.
    - Coerce every cell to numeric and drop any row that contains a
      non-numeric or missing value in *any* column.
    - Check that all ``config.FEATURE_COLUMNS`` and ``config.TARGET_COLUMNS``
      are present.
    - Split with ``config.TEST_SIZE`` / ``config.RANDOM_STATE``.
    - Standard-scale the features (targets are returned unscaled).

    Args:
        data_path: Directory that contains the Excel workbook.

    Returns:
        X_train, X_test, y_train, y_test

    Raises:
        FileNotFoundError: If no usable ``.xlsx`` file exists in ``data_path``.
        ValueError: If the sheet is too short to contain a header row, a
            configured column is missing, or no rows survive cleaning.
    """

    # ----------------------------
    # 1. Load Excel File
    # ----------------------------
    excel_path = _find_excel_file(data_path)

    # header=None: the real header is not on the first row, so read raw cells.
    df_raw = pd.read_excel(excel_path, header=None)

    print("Original Shape:", df_raw.shape)

    # ----------------------------
    # 2. Clean Dataset
    # ----------------------------
    # Row 0 is discarded and row 1 becomes the header, so at least two rows
    # are required; fail with a clear message instead of a KeyError/IndexError.
    if len(df_raw) < 2:
        raise ValueError(
            "Excel sheet has too few rows: expected a discarded first row "
            "followed by a header row."
        )

    df_raw = df_raw.drop(index=0).reset_index(drop=True)
    df_raw = df_raw.drop(columns=[0])
    df_raw.columns = df_raw.iloc[0]
    df = df_raw[1:].reset_index(drop=True)

    # Non-numeric cells become NaN and are removed together with their row.
    # NOTE(review): dropna() looks at every column, so a NaN in a column that
    # is neither a feature nor a target still removes the row - confirm that
    # this is intended.
    df = df.apply(pd.to_numeric, errors="coerce")
    df = df.dropna().reset_index(drop=True)

    print("Cleaned Shape:", df.shape)
    print("Columns:", df.columns.tolist())

    # ----------------------------
    # 3. Validate Columns
    # ----------------------------
    for col in config.FEATURE_COLUMNS + config.TARGET_COLUMNS:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in dataset.")

    if len(df) == 0:
        # train_test_split would also raise ValueError here, but with a
        # message that does not point at the cleaning step.
        raise ValueError("No rows left in dataset after cleaning.")

    # ----------------------------
    # 4. Feature / Target Split
    # ----------------------------
    X = df[config.FEATURE_COLUMNS].values
    y = df[config.TARGET_COLUMNS].values

    # ----------------------------
    # 5. Train-Test Split
    # ----------------------------
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=config.TEST_SIZE,
        random_state=config.RANDOM_STATE,
    )

    # ----------------------------
    # 6. Standard Scaling
    # ----------------------------
    # Fit on the training split only so test-set statistics do not leak into
    # the scaling parameters.
    scaler = StandardScaler()

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    print("Data preparation completed successfully.")

    return X_train, X_test, y_train, y_test