In [None]:
"""
ETA Estimation Algorithm for Manga/Comic Conversion

Based on empirical data:
- Witch Hat Atelier v03: 189 pages, ~422 seconds total
  - MuPDF extraction: 141 seconds (33.4%)
  - Image processing: 281 seconds (66.6%)
  - HTML building: 0.1 seconds (negligible)

Performance factors:
- File type (PDF vs archives vs images)
- File size (correlates with page count and image quality)
- Device profile resolution (higher = more processing)
- Advanced options (quality, upscaling, color, etc.)
- Available system resources
"""

import os
import time
import pandas as pd
import sqlalchemy
from sqlalchemy import func, text
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib
import json
from typing import Dict, Any, Optional
from database import SessionLocal, ConversionJob
from utils.enhanced_logger import setup_enhanced_logging, log_with_context

# Module-level logger, obtained from the project's enhanced-logging helper.
logger = setup_enhanced_logging()


# Column names used to build the ETA model's training frame.
# Categorical inputs: one-hot encoded by the preprocessing step in train_model().
CATEGORICAL_FEATURES = ['device_profile']
# Numerical inputs: handed to the regressors unchanged (remainder='passthrough').
# NOTE(review): output_file_size is not known before a conversion runs;
# estimate_eta() substitutes 0.8 * input_file_size when it is not supplied.
NUMERICAL_FEATURES = ['input_file_size', 'output_file_size']
# Regression target. Kept as a one-element list so it can be concatenated with
# the feature lists when selecting columns (presumably seconds — TODO confirm).
TARGET_FEATURE = ['actual_duration']

def retrieve_data():
    """Load every row of the ``conversion_jobs`` table into a DataFrame.

    A session is obtained from ``database.models.get_db_session`` (imported
    inside the function), the raw ``SELECT *`` query is executed, and the
    result set is turned into a frame whose columns are the result's keys.

    Returns:
        pandas.DataFrame: one row per record returned by the query.
    """
    from database.models import get_db_session

    db = get_db_session()
    try:
        query = sqlalchemy.text("SELECT * FROM conversion_jobs")
        cursor = db.execute(query)
        # Read the column names first, then the rows; both feed the frame.
        header = cursor.keys()
        records = cursor.fetchall()
        return pd.DataFrame(records, columns=header)
    finally:
        # Release the session whether or not the query/conversion succeeded.
        db.close()

def sanitize_data(df):
    """Reduce *df* to the modelling columns and discard unusable rows.

    Only the columns named in ``CATEGORICAL_FEATURES``, ``NUMERICAL_FEATURES``
    and ``TARGET_FEATURE`` are kept (in that order); rows containing any
    missing value are dropped, followed by exact duplicate rows.

    Args:
        df: DataFrame containing at least the feature and target columns.

    Returns:
        A new DataFrame with the selected columns and cleaned rows.
    """
    wanted_columns = [*CATEGORICAL_FEATURES, *NUMERICAL_FEATURES, *TARGET_FEATURE]
    return df[wanted_columns].dropna().drop_duplicates()

def train_model():
    """Train several regressors on past conversion jobs and persist the best.

    Rows come from ``retrieve_data()`` and are cleaned by ``sanitize_data()``.
    Each candidate (decision tree, random forest, gradient boosting, linear
    regression) is fitted on an 80 % split and scored by mean squared error
    on the remaining 20 %. The pipeline (preprocessing + regressor) with the
    lowest MSE is written to ``best_model.pkl`` in the current working
    directory.

    Raises:
        ValueError: if fewer than two usable rows are available, because a
            train/test split is impossible in that case.
        RuntimeError: if no candidate produced a comparable (finite) MSE.
    """
    df = sanitize_data(retrieve_data())
    if len(df) < 2:
        raise ValueError(
            f"Need at least 2 usable conversion jobs to train the ETA model, got {len(df)}"
        )

    X = df[NUMERICAL_FEATURES + CATEGORICAL_FEATURES]
    # TARGET_FEATURE is a one-element list, so df[TARGET_FEATURE] is a
    # single-column DataFrame. Squeeze it to a 1-D Series: a column-vector y
    # makes sklearn emit DataConversionWarning and makes LinearRegression
    # return 2-D predictions.
    y = df[TARGET_FEATURE].squeeze(axis="columns")

    models = {
        "Decision Tree": DecisionTreeRegressor(random_state=42),
        "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
        "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
        "Linear Regression": LinearRegression()
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    best_model_name = None
    best_mse = float('inf')
    best_pipeline = None

    for name, model in models.items():
        # Build a fresh preprocessor per candidate. Pipeline.fit() fits its
        # steps in place, so a shared instance would be re-fitted by every
        # later candidate, including the one inside the saved pipeline.
        preprocessor = ColumnTransformer(
            transformers=[
                ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), CATEGORICAL_FEATURES)
            ],
            remainder='passthrough'
        )
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('regressor', model)])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        print(f"{name} - MSE: {mse:.2f}")
        if mse < best_mse:
            best_mse = mse
            best_model_name = name
            best_pipeline = pipeline

    if best_pipeline is None:
        # Without this guard a ``None`` object would be pickled as the model.
        raise RuntimeError("No candidate model produced a finite MSE; nothing to save")

    print(f"Best model: {best_model_name} with MSE {best_mse:.2f}")

    # Save best model. Write to a temporary name and rename so an interrupted
    # dump never leaves a truncated best_model.pkl with a fresh timestamp.
    tmp_path = "best_model.pkl.tmp"
    joblib.dump(best_pipeline, tmp_path)
    os.replace(tmp_path, "best_model.pkl")

def should_retrain_model(model_path="best_model.pkl", max_age_days=1):
    """Decide whether the persisted model is missing or too old.

    Args:
        model_path: Path of the serialized model file.
        max_age_days: Maximum allowed age of the file, in days, measured
            from its modification time.

    Returns:
        True if the file's modification time cannot be read (for example
        because the file does not exist) or the file is older than
        ``max_age_days``; False otherwise.
    """
    seconds_per_day = 60 * 60 * 24

    # Ask for the mtime directly instead of checking existence first: the
    # file may disappear between an exists() check and getmtime().
    try:
        modified_at = os.path.getmtime(model_path)
    except OSError:
        return True  # No readable model file: train one

    model_age_days = (time.time() - modified_at) / seconds_per_day
    return model_age_days > max_age_days

def estimate_eta(conversion_job: ConversionJob, output_file_size: Optional[int] = None) -> float:
    """
    Estimate the conversion time (ETA) for a given job.

    If the saved model is missing or stale (see ``should_retrain_model``) it
    is retrained first. The model is then loaded from ``best_model.pkl`` and
    asked for a prediction from the job's input size, output size and device
    profile.

    Args:
        conversion_job: Job providing ``id``, ``license_key``,
            ``input_file_size`` and ``device_profile``.
        output_file_size: Expected output size. When omitted, 80 % of the
            input size is used as a heuristic.

    Returns:
        The predicted duration, never negative (presumably seconds, matching
        the ``actual_duration`` training target — TODO confirm). If anything
        fails, the error is logged and ``input_file_size / 50000`` is
        returned instead, or ``0.0`` when the input size is unknown.
    """
    try:
        # Check if the model needs retraining
        if should_retrain_model():
            log_with_context(
                logger, 'info', 'ETA model requires training.',
                job_id=conversion_job.id,
                user_id=conversion_job.license_key
            )
            train_model()

        # NOTE: joblib.load unpickles the file; best_model.pkl must only ever
        # be written by this application.
        loaded_model = joblib.load("best_model.pkl")

        # Heuristic for output file size if not provided
        if output_file_size is None:
            output_file_size = conversion_job.input_file_size * 0.8

        single_row = pd.DataFrame({
            "input_file_size": [conversion_job.input_file_size],
            "output_file_size": [output_file_size],
            "device_profile": [conversion_job.device_profile]
        })

        prediction = loaded_model.predict(single_row)

        # Flatten before taking the first element: some regressors return a
        # (1, 1) array when trained on a column-vector target, and float() on
        # a non-scalar array is deprecated in NumPy. The result is a plain
        # Python float, which is also JSON-serializable.
        eta_float = float(prediction.ravel()[0])

        log_with_context(
            logger, 'info', 'Successfully estimated ETA',
            job_id=conversion_job.id,
            user_id=conversion_job.license_key,
            estimated_eta=eta_float
        )

        return max(0.0, eta_float)

    except Exception as e:
        log_with_context(
            logger, 'error', f'An error occurred during ETA estimation: {e}',
            job_id=conversion_job.id,
            user_id=conversion_job.license_key,
            error_type=type(e).__name__
        )
        # Fallback heuristic in case of any error. It must not raise itself:
        # a missing input size (None) is treated as 0 rather than crashing
        # inside the error handler.
        input_size = conversion_job.input_file_size or 0
        return max(0.0, input_size / 50000)

# Script entry point: running this module directly trains the model once.
if __name__ == "__main__":
    train_model()