In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from datetime import datetime
import time
import os
import joblib
from tqdm.notebook import tqdm

def _save_feature_importance_plot(model, feature_cols, element, out_path):
    """Save a bar chart of ``model.feature_importances_`` sorted in descending order."""
    importances = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    positions = list(range(len(importances)))
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(positions, importances['importance'])
    ax.set_xticks(positions)
    ax.set_xticklabels(importances['feature'], rotation=45, ha='right')
    ax.set_title(f'Feature Importances for {element}')
    fig.tight_layout()
    fig.savefig(out_path)
    # Close explicitly: one figure is created per element inside the training loop.
    plt.close(fig)


def _save_auc_comparison_plot(elements, train_aucs, val_aucs, out_path):
    """Save a grouped bar chart comparing train and validation AUC per element."""
    x = np.arange(len(elements))
    width = 0.35

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(x - width / 2, train_aucs, width, label='Train AUC')
    ax.bar(x + width / 2, val_aucs, width, label='Validation AUC')
    ax.set_xlabel('Elements')
    ax.set_ylabel('AUC Score')
    ax.set_title('Gradient Boosting Model Performance by Element')
    ax.set_xticks(x)
    ax.set_xticklabels(elements)
    ax.legend()
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)


def main(
    data_path='/Users/ramiab/Desktop/Mineral-Predictions-Local/Training/data/preprocessed/rock_features.csv',
    model_dir='/Users/ramiab/Desktop/Mineral-Predictions-Local/Training/models/gradient_boost',
):
    """Train one GradientBoostingClassifier per element and save models, plots and metrics.

    The CSV at ``data_path`` is split 80/20 (``random_state=42``) into train and
    validation sets. For each element in AU, AG, CU, CO, NI a classifier is fitted
    on every column except the ``<element>_target`` columns and ``UNIQUE_ID``,
    scored with ROC AUC on both splits, and dumped with joblib. A feature-importance
    plot per element, an AUC comparison plot, and a metrics CSV are written to
    ``model_dir``.

    Args:
        data_path: Path of the preprocessed feature CSV to load.
        model_dir: Directory that receives all output files; created if missing.

    Returns:
        None. Results are written to disk and printed.

    Raises:
        KeyError: If any ``<element>_target`` column is absent from the CSV.
    """
    print("Loading data...")
    # Load the preprocessed rock samples data
    df = pd.read_csv(data_path)

    # Define target elements and their columns
    elements = ['AU', 'AG', 'CU', 'CO', 'NI']
    target_cols = [f'{element}_target' for element in elements]

    # Fail before any (minutes-long) training starts rather than midway through the loop.
    missing_targets = [col for col in target_cols if col not in df.columns]
    if missing_targets:
        raise KeyError(f"Missing target columns in {data_path}: {missing_targets}")

    # Define feature columns (all except targets and UNIQUE_ID)
    excluded = set(target_cols) | {'UNIQUE_ID'}
    feature_cols = [col for col in df.columns if col not in excluded]

    # Split data
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    X_train = train_df[feature_cols]
    X_val = val_df[feature_cols]

    # Create directory for saving models
    os.makedirs(model_dir, exist_ok=True)

    # One timestamp for the whole run so every artifact it writes shares the same suffix,
    # even when the run crosses a minute boundary.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M")

    # One record per element; written out as a labelled, tidy CSV at the end.
    history_records = []

    # Train separate model for each element
    print("\nTraining Gradient Boosting models for each element:")
    for element in elements:
        print(f"\n{'='*50}")
        print(f"Training model for {element}")
        print(f"{'='*50}")

        y_train = train_df[f'{element}_target']
        y_val = val_df[f'{element}_target']

        # Initialize and train model
        model = GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=5,
            min_samples_split=5,
            verbose=1,
            random_state=42
        )

        print(f"\nTraining {element} model...")
        model.fit(X_train, y_train)

        # Probability of the second class in model.classes_
        # (assumed to be the positive label of a binary target -- TODO confirm).
        train_preds = model.predict_proba(X_train)[:, 1]
        val_preds = model.predict_proba(X_val)[:, 1]

        # Calculate AUC scores
        train_auc = roc_auc_score(y_train, train_preds)
        val_auc = roc_auc_score(y_val, val_preds)

        print(f"\nResults for {element}:")
        print(f"Training AUC: {train_auc:.4f}")
        print(f"Validation AUC: {val_auc:.4f}")

        # Save the model
        model_path = os.path.join(model_dir, f'gradboost_{element}_{run_stamp}.joblib')
        joblib.dump(model, model_path)
        print(f"Model saved to: {model_path}")

        # Store metrics. Each element is fitted once, so the best validation AUC
        # is simply this run's validation AUC.
        history_records.append({
            'element': element,
            'train_auc': train_auc,
            'val_auc': val_auc,
            'best_val_auc': val_auc,
        })

        # Feature importance plot
        _save_feature_importance_plot(
            model,
            feature_cols,
            element,
            os.path.join(model_dir, f'feature_importance_{element}_{run_stamp}.png'),
        )

    # Plot final results
    _save_auc_comparison_plot(
        elements,
        [record['train_auc'] for record in history_records],
        [record['val_auc'] for record in history_records],
        os.path.join(model_dir, f'training_results_{run_stamp}.png'),
    )

    # Save training history as one labelled row per element. The 'element' column
    # carries the row label, so index=False no longer discards any information.
    history_path = os.path.join(model_dir, f'training_history_{run_stamp}.csv')
    history_df = pd.DataFrame(
        history_records,
        columns=['element', 'train_auc', 'val_auc', 'best_val_auc'],
    )
    history_df.to_csv(history_path, index=False)

    print("\nTraining complete! Summary of best results:")
    for record in history_records:
        print(f"{record['element']}:")
        print(f"  Best Validation AUC: {record['best_val_auc']:.4f}")

if __name__ == "__main__":
    # Time the full training run end to end.
    total_start_time = time.time()
    try:
        main()
    finally:
        # Report elapsed time even when main() raises or the cell is interrupted
        # (KeyboardInterrupt); the exception still propagates after this runs.
        total_time = time.time() - total_start_time
        print(f"\nTotal training time: {total_time/3600:.2f} hours")

Loading data...

Training Gradient Boosting models for each element:

Training model for AU

Training AU model...
      Iter       Train Loss   Remaining Time 
         1           0.2309            2.50m
         2           0.2241            2.48m
         3           0.2189            2.44m
         4           0.2142            2.33m
         5           0.2108            2.25m
         6           0.2079            2.19m
         7           0.2055            2.14m
         8           0.2026            2.12m
         9           0.2001            2.09m
        10           0.1986            2.06m
        20           0.1860            1.77m
        30           0.1792            1.44m
        40           0.1740            1.24m
        50           0.1707            1.05m
        60           0.1665           52.23s
        70           0.1635           40.57s
        80           0.1605           28.20s
        90           0.1586           14.10s
       100           0.1577   


KeyboardInterrupt

