In [None]:
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import os
from datetime import datetime

# -------------------------------
# Config
# -------------------------------
# Input CSV plus the paths of the artifacts this script writes.
DATA_FILE = "HistoricalQuotes.csv"   # Your dataset CSV
MODEL_PATH = "xgb_apple.model"
SCALER_PATH = "scaler.pkl"
FEATURES_PATH = "features.pkl"
LOG_PATH = "training_log.txt"

# -------------------------------
# Load Data
# -------------------------------
# Stop immediately with a readable message when the CSV is absent;
# otherwise read it into the working frame.
if os.path.exists(DATA_FILE):
    df = pd.read_csv(DATA_FILE)
else:
    raise FileNotFoundError(f"❌ CSV not found: {DATA_FILE}")

# --- Flatten MultiIndex headers if present ---
# Each multi-level column tuple is collapsed into one string by joining its
# non-empty levels with "_".
if isinstance(df.columns, pd.MultiIndex):
    df.columns = [
        "_".join(str(level) for level in col if level).strip()
        for col in df.columns
    ]

# --- Clean headers ---
# Trim whitespace and lower-case so the alias lookup below is an exact match.
df.columns = [str(c).strip().lower() for c in df.columns]

# --- Rename common variations ---
# Maps a cleaned header (alias) to the canonical column name used by the rest
# of the script. Order matters: when several aliases of the same canonical
# name are present, the one listed first here wins.
rename_map = {
    "date": "Date",
    "timestamp": "Date",
    "close/last": "Close",
    "adj close": "Close",
    "close": "Close",
    "price": "Close",
    "open": "Open",
    "high": "High",
    "low": "Low",
    "volume": "Volume",
}

# A file can carry more than one alias of the same field (for example both
# "close" and "adj close"). Renaming all of them would produce duplicate
# "Close" columns, after which df["Close"] returns a DataFrame instead of a
# Series and the numeric clean-up fails. Promote only the first alias found
# per canonical name; the others keep their cleaned lower-case header.
column_renames = {}
claimed = set()
for alias, canonical in rename_map.items():
    if alias in df.columns and canonical not in claimed:
        column_renames[alias] = canonical
        claimed.add(canonical)
df.rename(columns=column_renames, inplace=True)

print("🔎 Columns after cleaning:", df.columns.tolist())

# --- Ensure required columns ---
# Abort with the list of absent names if any required column is missing.
required_cols = ["Date", "Close", "Volume"]
missing = [col for col in required_cols if col not in df.columns]
if missing:
    raise ValueError(f"❌ Missing required columns: {missing}")

# -------------------------------
# Preprocess Data
# -------------------------------
# Price/volume fields may arrive as text containing "$" or ",". Strip those
# characters and parse as numbers; values that still fail to parse become NaN.
numeric_candidates = ("Close", "Open", "High", "Low", "Volume")
for name in numeric_candidates:
    if name not in df.columns:
        continue
    as_text = df[name].astype(str)
    stripped = as_text.str.replace(r"[\$,]", "", regex=True)
    df[name] = pd.to_numeric(stripped, errors="coerce")

# Unparseable dates become NaT. Rows lacking a date or a close are dropped
# and the remainder is ordered by date.
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
df = df.dropna(subset=["Date", "Close"]).sort_values("Date")

# -------------------------------
# Feature Engineering
# -------------------------------
# Every feature is derived from the date-sorted Close series:
#   Return     - row-over-row fractional change of Close
#   MA5 / MA10 - trailing 5- and 10-row means of Close
#   Volatility - trailing 5-row standard deviation of Return
# NOTE(review): MA5/MA10 include the same row's Close that is used as the
# target below, so the scores describe a contemporaneous fit rather than a
# forecast - confirm whether a shifted (next-row) target is intended.
df["Return"] = df["Close"].pct_change()
df["MA5"] = df["Close"].rolling(5).mean()
df["MA10"] = df["Close"].rolling(10).mean()
df["Volatility"] = df["Return"].rolling(5).std()

features = ["Return", "MA5", "MA10", "Volatility"]

# A Close of 0 followed by a non-zero value makes pct_change() yield +/-inf,
# which StandardScaler rejects; treat it like any other missing value.
df[features] = df[features].replace([np.inf, -np.inf], np.nan)

# Drop only the rows that lack a feature or the target (mainly the rolling
# warm-up rows). A blanket dropna() would also discard rows because of gaps
# in columns the model never reads, and a single all-empty extra column
# would wipe the whole dataset.
df.dropna(subset=features + ["Close"], inplace=True)

# train_test_split needs at least one row on each side of the split.
if len(df) < 2:
    raise ValueError(
        f"❌ Not enough usable rows after feature engineering: {len(df)}"
    )

X = df[features]
y = df["Close"]

# -------------------------------
# Train/Test Split
# -------------------------------
# shuffle=False keeps row order, so the earliest 80% of rows train the model
# and the latest 20% are held out.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Fit the scaler on the training rows only, so that statistics of the
# held-out period do not leak into the transformation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole feature matrix under the train-fitted scaling (name kept from the
# previous version of this script).
X_scaled = scaler.transform(X)

# -------------------------------
# Train Model
# -------------------------------
# Hyperparameters are gathered in one mapping and unpacked into the
# regressor, which is then fitted on the training split.
xgb_params = {
    "objective": "reg:squarederror",
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# -------------------------------
# Evaluate
# -------------------------------
# Score predictions on the held-out split: RMSE is the square root of the
# mean squared error, reported next to R².
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"✅ Training done: RMSE={rmse:.2f}, R²={r2:.2f}")

# -------------------------------
# Save Model + Artifacts
# -------------------------------
# The booster is written by XGBoost itself; the scaler and the feature-name
# list are serialized with joblib.
model.save_model(MODEL_PATH)
joblib_targets = (
    (scaler, SCALER_PATH),
    (features, FEATURES_PATH),
)
for artifact, destination in joblib_targets:
    joblib.dump(artifact, destination)

# -------------------------------
# Save Training Log (UTF-8 safe)
# -------------------------------
# The log is overwritten on every run; UTF-8 is required for the "R²" line.
log_lines = [
    f"Training Log: {datetime.now()}\n",
    f"Features used: {features}\n",
    f"Dataset size: {len(df)} rows\n",
    f"RMSE: {rmse:.2f}\n",
    f"R²: {r2:.2f}\n",
]
with open(LOG_PATH, "w", encoding="utf-8") as log_file:
    log_file.writelines(log_lines)

print("📌 Model, scaler, features, and log saved.")
