# Phase 2: Model Building and Evaluation

**Objective**: Build and evaluate machine learning/deep learning models to identify potential archaeological sites.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import xgboost as xgb
# import tensorflow as tf # For Deep Learning models
# from tensorflow.keras.models import Sequential # Example
# from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout # Example
import matplotlib.pyplot as plt
import seaborn as sns

# Shared plot styling: seaborn grid theme and a default figure size for every plot
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 2. Load Processed Data

This notebook assumes the processed data (features and labels) is stored in the `../../data/processed/` directory. The actual loading mechanism depends on how the data was saved in Phase 1.

In [None]:
# Load the Phase 1 feature table from CSV and separate it into features (X)
# and labels (y).
PROCESSED_DATA_PATH = '../../data/processed/features_and_labels.csv' # Example path
LABEL_COLUMN = 'label_column'  # Replace with the actual label column name

# Bind X and y up front so later cells can safely test `X is not None`
# even when loading fails.
X, y = None, None

try:
    # Keep the try body to the single call that can actually raise.
    data_df = pd.read_csv(PROCESSED_DATA_PATH)
except FileNotFoundError:
    print(f"Error: Processed data file not found at {PROCESSED_DATA_PATH}.")
except Exception as e:
    # Notebook-level boundary: report parse/permission problems and carry on
    # with X and y left as None.
    print(f"An error occurred while loading processed data: {e}")
else:
    if LABEL_COLUMN in data_df.columns:
        X = data_df.drop(columns=LABEL_COLUMN)
        y = data_df[LABEL_COLUMN]
        print(f"Processed data loaded. Features shape: {X.shape}, Labels shape: {y.shape}")
    else:
        # A wrong placeholder name should be reported clearly, not as a KeyError.
        print(f"Error: Label column '{LABEL_COLUMN}' not found in {PROCESSED_DATA_PATH}.")

## 3. Data Splitting

Split the data into training and testing/validation sets.

In [None]:
# Template: hold out 25% of the samples as a test set (test_size=0.25) with a
# fixed random_state so the split is reproducible. Uncomment once real data is
# loaded.
# NOTE: the guard below needs X and y to be bound (even to None) by the
# data-loading cell; otherwise it raises NameError instead of skipping.
# if X is not None and y is not None:
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y) # Stratify so both splits keep the label proportions (useful for imbalanced classes)
#     print("Data split into training and testing sets.")
#     print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
#     print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")
# else:
#     print("Features (X) or labels (y) not loaded. Skipping data splitting.")
# Placeholder output while the split above is disabled.
print("Conceptual: Split data into X_train, X_test, y_train, y_test")

## 4. Model Selection and Training

Select appropriate models based on the data and problem type. The competition mentions Random Forest, XGBoost, CNNs, and Transformers.

### 4.1 Traditional Machine Learning Models

In [None]:
# Candidate classical models, keyed by display name. Every estimator gets the
# same fixed random_state so repeated runs are comparable.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    # `use_label_encoder` is deprecated in XGBoost and no longer a recognised
    # parameter in 2.x, where passing it only triggers an "unused parameter"
    # warning, so it is omitted here.
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
}

# Fitted estimators by name; populated once the training loop below is enabled.
trained_models = {}

# Template training loop: fits every candidate on the training split and
# records it in `trained_models`. Skips cleanly when X_train is not defined.
# if 'X_train' in locals():
#     for name, model in models.items():
#         print(f"Training {name}...")
#         model.fit(X_train, y_train)
#         trained_models[name] = model
#         print(f"{name} trained.")
# else:
#     print("Training data not available. Skipping model training.")
# Placeholder output while the loop above is disabled.
print("Conceptual: Train Random Forest, Gradient Boosting, XGBoost")

### 4.2 Deep Learning Models (Conceptual)

This section would involve defining, compiling, and training deep learning models (e.g., CNN for image data, potentially Transformers for sequence data or combined features).

In [None]:
# Conceptual example for a CNN (requires image-like input).
# To enable it, also uncomment the tensorflow/keras imports in the import cell
# and provide X_train_cnn / y_train_cnn; neither is created in this notebook.
# The network is two Conv2D+MaxPooling2D stages, then Flatten, a 128-unit
# Dense layer with 50% dropout, and a single sigmoid output unit.
# if 'X_train_cnn' in locals(): # Assuming X_train_cnn is preprocessed for CNN
#     cnn_model = Sequential([
#         Conv2D(32, (3, 3), activation='relu', input_shape=X_train_cnn.shape[1:]), # per-sample shape: everything after the first axis
#         MaxPooling2D((2, 2)),
#         Conv2D(64, (3, 3), activation='relu'),
#         MaxPooling2D((2, 2)),
#         Flatten(),
#         Dense(128, activation='relu'),
#         Dropout(0.5),
#         Dense(1, activation='sigmoid') # Assuming binary classification
#     ])
#     cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#     print("CNN model defined and compiled (conceptual).")
#     # Training and registration stay disabled one level deeper on purpose:
#     # cnn_model.fit(X_train_cnn, y_train_cnn, epochs=10, validation_split=0.2, batch_size=32)
#     # trained_models['CNN'] = cnn_model
# else:
#     print("CNN training data not available or not in correct format.")
# Placeholder output while the example above is disabled.
print("Conceptual: Define, compile, and train CNN/Transformer models")

## 5. Model Evaluation

Evaluate models on the test set using metrics like accuracy, precision, recall, F1-score, and ROC AUC. Pay attention to the competition's specific evaluation criteria.

In [None]:
# Per-model metric dictionaries keyed by model name; filled by the template
# loop below once it is enabled.
evaluation_results = {}

# Template evaluation loop. It assumes binary labels: probabilities of the
# second class (column 1) are thresholded at 0.5 to obtain hard predictions.
# if 'X_test' in locals() and trained_models:
#     for name, model in trained_models.items():
#         print(f"Evaluating {name}...")
#         if hasattr(model, 'predict_proba'):
#             y_pred_proba = model.predict_proba(X_test)[:, 1]
#             y_pred = (y_pred_proba > 0.5).astype(int) # Thresholding for binary classification
#             roc_auc = roc_auc_score(y_test, y_pred_proba)
#         else: # Models that do not expose predict_proba (e.g. SVC with default settings)
#             y_pred = model.predict(X_test)
#             roc_auc = None # Or calculate from decision_function if available

#         # zero_division=0 reports 0 instead of warning when a metric is undefined
#         accuracy = accuracy_score(y_test, y_pred)
#         precision = precision_score(y_test, y_pred, zero_division=0)
#         recall = recall_score(y_test, y_pred, zero_division=0)
#         f1 = f1_score(y_test, y_pred, zero_division=0)

#         evaluation_results[name] = {
#             'Accuracy': accuracy,
#             'Precision': precision,
#             'Recall': recall,
#             'F1-score': f1,
#             'ROC_AUC': roc_auc
#         }
#         print(f"Results for {name}: {evaluation_results[name]}")
#         print(classification_report(y_test, y_pred, zero_division=0))

#         # Confusion Matrix (one heatmap per model)
#         cm = confusion_matrix(y_test, y_pred)
#         sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
#         plt.title(f'Confusion Matrix - {name}')
#         plt.xlabel('Predicted')
#         plt.ylabel('Actual')
#         plt.show()
# else:
#     print("Test data or trained models not available for evaluation.")

# Placeholder output while the loop above is disabled.
print("Conceptual: Evaluate models using various metrics and plot confusion matrices.")

## 6. Hyperparameter Tuning (Conceptual)

Use techniques like GridSearchCV or RandomizedSearchCV to find optimal hyperparameters for the best-performing models.

In [None]:
# Example for RandomForest: exhaustive grid search scored by ROC AUC.
# The grid below has 3 * 4 * 3 = 36 combinations; with cv=3 that is 108 fits,
# run in parallel on all cores (n_jobs=-1).
# if 'X_train' in locals():
#     param_grid = {
#         'n_estimators': [100, 200, 300],
#         'max_depth': [None, 10, 20, 30],
#         'min_samples_split': [2, 5, 10]
#     }
#     grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='roc_auc', verbose=1, n_jobs=-1)
#     # The search itself is disabled one level deeper. The feature-importance
#     # and model-saving cells look for `best_rf_model` / 'RandomForest_tuned',
#     # which only exist after these lines run:
#     # grid_search_rf.fit(X_train, y_train)
#     # print(f"Best RF params: {grid_search_rf.best_params_}")
#     # best_rf_model = grid_search_rf.best_estimator_
#     # trained_models['RandomForest_tuned'] = best_rf_model 
# else:
#     print("Training data not available for hyperparameter tuning.")
# Placeholder output while the search above is disabled.
print("Conceptual: Perform hyperparameter tuning using GridSearchCV or RandomizedSearchCV.")

## 7. Feature Importance (for applicable models)

Analyze feature importances from models like RandomForest or XGBoost to understand which features are most influential.

In [None]:
# Template: plot the 20 largest feature importances. The tuned RandomForest is
# preferred; the untuned XGBoost model is the fallback. Both branches are
# identical apart from the model key and the plot title.
# `hasattr(X, 'columns')` ensures feature names are available (X must be a
# DataFrame-like object, and must be defined for this check to run).
# if 'RandomForest_tuned' in trained_models and hasattr(X, 'columns'):
#     importances = trained_models['RandomForest_tuned'].feature_importances_
#     feature_names = X.columns
#     feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
#     feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

#     plt.figure(figsize=(12, 8))
#     sns.barplot(x='importance', y='feature', data=feature_importance_df.head(20)) # Display top 20
#     plt.title('Top 20 Feature Importances (Random Forest)')
#     plt.show()
# elif 'XGBoost' in trained_models and hasattr(X, 'columns'):
#     importances = trained_models['XGBoost'].feature_importances_
#     feature_names = X.columns
#     feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
#     feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

#     plt.figure(figsize=(12, 8))
#     sns.barplot(x='importance', y='feature', data=feature_importance_df.head(20)) # Display top 20
#     plt.title('Top 20 Feature Importances (XGBoost)')
#     plt.show()
# else:
#     print("Tuned RandomForest/XGBoost model or feature names not available for feature importance analysis.")
# Placeholder output while the analysis above is disabled.
print("Conceptual: Analyze and plot feature importances.")

## 8. Saving the Best Model

Save the best performing model for later use in Phase 3 (Discovery Validation).

In [None]:
import joblib # For saving scikit-learn models
# Directory that receives the serialized model files.
MODEL_OUTPUT_PATH = '../../models/' # Example path, ensure this directory exists

# Ensure the directory exists.
# These two lines must be enabled together with the save branch below, which
# relies on `os` for path joining:
# import os
# os.makedirs(MODEL_OUTPUT_PATH, exist_ok=True)

# Template: persist the tuned RandomForest with joblib, or a Keras CNN with
# its own `.save` method.
# NOTE: the `elif` body is disabled one level deeper; when uncommenting this
# block, uncomment those lines as well (or drop the branch), otherwise the
# `elif` has no body and the cell will not parse.
# if 'best_rf_model' in locals(): # Example: saving the tuned RandomForest
#     joblib.dump(best_rf_model, os.path.join(MODEL_OUTPUT_PATH, 'best_random_forest_model.joblib'))
#     print(f"Best model saved to {os.path.join(MODEL_OUTPUT_PATH, 'best_random_forest_model.joblib')}")
# elif 'CNN' in trained_models: # Example for saving a Keras model
#     # trained_models['CNN'].save(os.path.join(MODEL_OUTPUT_PATH, 'best_cnn_model.h5'))
#     # print(f"Best CNN model saved to {os.path.join(MODEL_OUTPUT_PATH, 'best_cnn_model.h5')}")
# else:
#     print("No best model specified or available to save.")
# Placeholder output while the save logic above is disabled.
print("Conceptual: Save the best performing model using joblib or model-specific methods.")

## 9. Next Steps

- Use the trained model(s) to predict on new/unseen data to identify potential archaeological sites.
- Proceed to Phase 3 for validation of these potential sites using archaeological literature and expert review.