## Train Random Forest x XGBoost Ensemble

### Setup

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import joblib
import os
import ast

# Visible confirmation that every import in this setup cell succeeded.
print("Libraries imported successfully.")

### Load the Processed Data

In [None]:
# Load the processed dataset from disk into `df`.
try:
    df = pd.read_csv('data/processed/dili_data_clean.csv')
except FileNotFoundError:
    print("Error: dili_data_clean.csv not found.")
    print("Please upload the file to your Colab session's file browser.")
    # Re-raise so execution stops here. Swallowing the error would let the
    # following cells either fail with a confusing NameError on `df` or,
    # worse, silently train on a stale `df` left over from an earlier run.
    raise
else:
    # Only report success when the read actually succeeded.
    print("Successfully loaded processed data.")
    print(f"Dataset shape: {df.shape}")

### Prepare Data for Modelling

In [None]:
# Build the feature matrix X and target vector y from the loaded DataFrame.

# Rows without a fingerprint cannot be turned into a feature vector.
df.dropna(subset=['fingerprint'], inplace=True)

# Fingerprints are parsed from their string (Python-literal) form.
# Values that are no longer strings are passed through unchanged so the cell
# can be re-run safely: after the first run the column already holds parsed
# objects, which ast.literal_eval would reject with a ValueError.
df['fingerprint'] = df['fingerprint'].apply(
    lambda fp: ast.literal_eval(fp) if isinstance(fp, str) else fp
)

# One row per sample; presumably every fingerprint has the same length so
# this yields a 2-D array -- TODO confirm against the preprocessing step.
X = np.array(df['fingerprint'].tolist())
y = df['dili_concern'].values

print("Data prepared for training.")
print(f"Feature shape: {X.shape}")
print(f"Target shape: {y.shape}")

### Train the Ensemble Models

In [None]:
# --- RandomForest ---------------------------------------------------------
# Fit a 200-tree forest on all of X / y (no hold-out split in this cell).
print("Training RandomForest model on the full dataset...")
rf_params = {
    'n_estimators': 200,
    'class_weight': 'balanced',  # reweight classes to offset imbalance
    'random_state': 42,          # fixed seed for reproducible forests
    'n_jobs': -1,                # use all available CPU cores
}
# fit() returns the fitted estimator, so construction and training chain.
rf_model = RandomForestClassifier(**rf_params).fit(X, y)
print("RandomForest training complete.")

# --- XGBoost --------------------------------------------------------------
# Fit a gradient-boosted classifier on all of X / y (no hold-out split here).
print("\nTraining XGBoost model on the full dataset...")

# Ratio of negative to positive samples, used to up-weight the positive
# class; falls back to 1 when there are no positive labels to avoid a
# division by zero.
neg_count = np.sum(y == 0)
pos_count = np.sum(y == 1)
if pos_count > 0:
    class_ratio = neg_count / pos_count
else:
    class_ratio = 1

# Using best params from tuning.
# NOTE(review): 'use_label_encoder' appears to be deprecated/ignored in
# recent XGBoost releases -- confirm against the installed version.
best_xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'use_label_encoder': False,
    'random_state': 42,
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.1,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'gamma': 0.2,
    'min_child_weight': 1,
    'scale_pos_weight': class_ratio,
}

xgb_model = xgb.XGBClassifier(**best_xgb_params)
xgb_model.fit(X, y)
print("XGBoost training complete.")

### Save the Ensemble Model

In [None]:
# Destination for the serialized ensemble.
MODEL_OUTPUT_DIR = 'models'
MODEL_PATH = os.path.join(MODEL_OUTPUT_DIR, 'ensemble_model.pkl')

# Create the output directory if needed; an existing one is not an error.
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

# Bundle both fitted estimators in one dict so they are written to a single
# file; consumers look the models up by these keys.
ensemble_models = dict(random_forest=rf_model, xgboost=xgb_model)
joblib.dump(ensemble_models, MODEL_PATH)

print(f"\nEnsemble model successfully saved to '{MODEL_PATH}'")
print("You can now download this file from the Colab file browser to use with the GUI.")