# Phase 3: Discovery Validation and Archaeological Evaluation

**Objective**: Validate potential archaeological sites identified by the models using archaeological literature and prepare for expert review.

## 1. Import Libraries

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import json
import joblib # For loading saved scikit-learn models
# import tensorflow as tf # If a TF model was saved

# For displaying maps or coordinates, if needed
# import folium 
# from shapely.geometry import Point

## 2. Load Trained Model and Potential Sites Data

- Load the best performing model saved from Phase 2.
- Load the data representing areas/points that the model predicted as potential archaeological sites. This might be a list of coordinates, polygons, or IDs.

In [None]:
# Locations of the Phase 2 artefacts consumed by this notebook.
MODEL_PATH = '../../models/best_random_forest_model.joblib' # Example path
POTENTIAL_SITES_DATA_PATH = '../../data/predicted/potential_sites.csv' # Example path

# Bind both names up front. The real loading calls below are still commented out,
# and later cells guard on `... is not None`; without these defaults those guards
# would raise NameError instead of taking the "data not available" branch.
# A failed load simply leaves the corresponding name at None.
loaded_model = None
potential_sites_df = None

# --- Trained model -----------------------------------------------------------
# SECURITY NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that were produced by this project's own Phase 2 run.
try:
    # loaded_model = joblib.load(MODEL_PATH)
    # print(f"Model loaded successfully from {MODEL_PATH}")
    print(f"Conceptual: Load model from {MODEL_PATH}")
except FileNotFoundError:
    print(f"Error: Model file not found at {MODEL_PATH}.")
except Exception as e:
    # Deliberately broad: a notebook cell should report the problem and keep going.
    print(f"An error occurred while loading the model: {e}")

# --- Model predictions (candidate sites) -------------------------------------
try:
    # potential_sites_df = pd.read_csv(POTENTIAL_SITES_DATA_PATH)
    # print(f"Potential sites data loaded. Shape: {potential_sites_df.shape}")
    # print(potential_sites_df.head())
    print(f"Conceptual: Load potential sites from {POTENTIAL_SITES_DATA_PATH}")
except FileNotFoundError:
    print(f"Error: Potential sites data file not found at {POTENTIAL_SITES_DATA_PATH}.")
except Exception as e:
    # Deliberately broad: a notebook cell should report the problem and keep going.
    print(f"An error occurred while loading potential sites data: {e}")

## 3. Load Archaeological Literature Data

This data will be used to cross-reference and provide supporting evidence for the model's findings.

In [None]:
# Location of the literature records used as supporting evidence.
ARCHAEOLOGICAL_LITERATURE_PATH = '../../data/raw/archaeological_lit.json' # Or processed version

# Default to None so that later cells can guard with `archaeological_lit is not None`
# rather than hitting a NameError when the load below fails. Resetting here also
# prevents a stale value from an earlier run of this cell surviving a failed reload.
archaeological_lit = None

try:
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(ARCHAEOLOGICAL_LITERATURE_PATH, 'r', encoding='utf-8') as f:
        archaeological_lit = json.load(f)
    print("Archaeological literature data loaded successfully.")
    # Potentially convert to a DataFrame or a more searchable structure if not done in Phase 1
    # if isinstance(archaeological_lit, list):
    #     archaeological_df = pd.DataFrame(archaeological_lit)
    #     print(f"Literature converted to DataFrame. Shape: {archaeological_df.shape}")
    #     # print(archaeological_df.head())
    # else:
    #     # Process dict or other structures as needed
    #     pass
except FileNotFoundError:
    print(f"Error: Archaeological literature file not found at {ARCHAEOLOGICAL_LITERATURE_PATH}.")
except Exception as e:
    # Deliberately broad (malformed JSON, decoding or permission errors): report the
    # problem and continue with archaeological_lit left at None.
    print(f"An error occurred while loading archaeological literature: {e}")

## 4. Validation Process

The core of this phase is to check each potential site against available evidence.

### 4.1 Extracting Information for Each Potential Site
- For each site, extract its coordinates, its model confidence score, and the relevant features that led to the prediction.

In [None]:
# Scaffold (commented out): walks the predicted-sites table row by row and prints
# each site's coordinates and model confidence score.
# NOTE(review): this loop needs `potential_sites_df` to hold the loaded table; the
# `pd.read_csv` call in the loading cell is currently commented out.
# NOTE(review): the column names 'latitude', 'longitude' and
# 'model_confidence_score' are assumptions about the CSV schema — confirm them
# against the file actually produced by Phase 2.
# if potential_sites_df is not None:
#     print("Iterating through potential sites (conceptual):")
#     for index, site in potential_sites_df.iterrows():
#         site_coords = (site.get('latitude'), site.get('longitude')) # Adjust column names
#         site_confidence = site.get('model_confidence_score')
#         print(f"Site {index}: Coords={site_coords}, Confidence={site_confidence}")
#         # Further processing for each site will go here
# else:
#     print("Potential sites data not available.")
# Placeholder output while the scaffold above is disabled.
print("Conceptual: Extract information for each potential site.")

### 4.2 Cross-Referencing with Archaeological Literature
- For each potential site, search the archaeological literature for mentions, similar findings, or contextual information (e.g., proximity to known sites, LiDAR tile IDs, DOIs).
- This might involve spatial joins (if literature has coordinates) or text-based searches.

In [None]:
# Accumulates one dict per candidate site; only the commented-out scaffold below
# appends to it, so it stays empty until that scaffold is enabled.
validated_sites_info = []

# Scaffold (commented out): returns the DOI of the first literature record whose
# coordinates equal `site_coords`, or None when nothing matches.
# def find_supporting_literature(site_coords, literature_data):
#     # This is a placeholder for a more sophisticated search.
#     # It could involve checking if site_coords fall within a known site's bounding box,
#     # or if a LiDAR Tile ID associated with the site is mentioned in literature.
#     # For simplicity, let's assume literature_data is a list of dicts with 'coordinates' and 'doi'.
#     if not literature_data or not isinstance(literature_data, list):
#         return None
#     for record in literature_data:
#         if 'coordinates' in record and 'doi' in record:
#             # Example: check for exact coordinate match (highly unlikely in real scenarios)
#             # A more realistic check would involve proximity search.
#             # NOTE(review): assumes record['coordinates'] is ordered (latitude, longitude)
#             # like site_coords — confirm against the literature file.
#             lit_coords = tuple(record['coordinates']) 
#             if lit_coords == site_coords: # This needs to be a proximity check
#                 return record['doi']
#     return None

# Scaffold (commented out): builds one summary dict per candidate site and records
# whether the literature scan found a supporting DOI.
# NOTE(review): needs `potential_sites_df` to hold the loaded predictions; that load
# is currently commented out in the loading cell.
# if potential_sites_df is not None and archaeological_lit is not None:
#     print("Cross-referencing with literature (conceptual):")
#     for index, site in potential_sites_df.iterrows():
#         site_coords = (site.get('latitude'), site.get('longitude')) # Adjust as needed
#         supporting_doi = find_supporting_literature(site_coords, archaeological_lit) # archaeological_lit might need preprocessing
#
#         site_info = {
#             'id': site.get('site_id', index),
#             'latitude': site_coords[0],
#             'longitude': site_coords[1],
#             'model_confidence': site.get('model_confidence_score'),
#             'supporting_doi': supporting_doi,
#             'validation_notes': ''
#         }
#         if supporting_doi:
#             site_info['validation_notes'] = f"Supported by literature: {supporting_doi}"
#             print(f"Site {site_coords} potentially supported by DOI: {supporting_doi}")
#         else:
#             site_info['validation_notes'] = "No direct support found in initial literature scan."
#             print(f"Site {site_coords} has no direct supporting literature in this scan.")
#         validated_sites_info.append(site_info)
# else:
#     print("Data for cross-referencing not available.")
# Placeholder output while the scaffold above is disabled.
print("Conceptual: Cross-reference with literature. This will involve spatial queries or text matching.")

### 4.3 Confidence Scoring based on Validation
- Assign a new confidence score or category based on the strength of evidence (e.g., 'High Confidence - Literature Supported', 'Medium Confidence - Model Predicted', 'Low Confidence - Needs Field Verification').

In [None]:
# Scaffold (commented out): labels every entry of `validated_sites_info` with a
# 'final_confidence' category — literature support wins, otherwise the model
# confidence is compared against an example threshold of 0.8.
# NOTE(review): 'model_confidence' is filled via `site.get(...)` in the
# cross-referencing scaffold and may therefore be None; `None > 0.8` raises
# TypeError, so add a None check before enabling this.
# if validated_sites_info:
#     for site_info in validated_sites_info:
#         if site_info['supporting_doi']:
#             site_info['final_confidence'] = 'High - Literature Supported'
#         elif site_info['model_confidence'] > 0.8: # Example threshold
#             site_info['final_confidence'] = 'Medium - Strong Model Prediction'
#         else:
#             site_info['final_confidence'] = 'Low - Model Suggestion'
#     print("Final confidence scores assigned (conceptual).")
#     # validated_df = pd.DataFrame(validated_sites_info)
#     # print(validated_df.head())
# else:
#     print("No validated site information to assign final confidence scores.")
# Placeholder output while the scaffold above is disabled.
print("Conceptual: Assign final confidence scores based on validation.")

## 5. Preparing Data for Expert Review

- Compile a list of validated sites with all relevant information: coordinates, model output, supporting literature (if any), derived features, and the assigned validation confidence.
- This list will be provided to archaeological experts as per the competition guidelines.

In [None]:
# Destination CSV for the candidate list handed to the expert reviewers.
EXPERT_REVIEW_OUTPUT_PATH = '../../data/processed/expert_review_candidates.csv'

# Scaffold (commented out): writes `validated_df` to the path above when it exists
# and is non-empty.
# NOTE(review): `validated_df` is only created by a commented-out line in the
# confidence-scoring cell, and the `to_csv` call below is itself still commented,
# so enabling the outer block alone prints the message without writing a file.
# NOTE(review): presumably the target directory must already exist before
# `to_csv` is enabled — confirm or create it first.
# if 'validated_df' in locals() and not validated_df.empty:
#     # validated_df.to_csv(EXPERT_REVIEW_OUTPUT_PATH, index=False)
#     print(f"Data for expert review prepared and conceptually saved to {EXPERT_REVIEW_OUTPUT_PATH}")
# else:
#     print("No validated data to prepare for expert review.")
# Placeholder output while the scaffold above is disabled.
print(f"Conceptual: Save data for expert review to {EXPERT_REVIEW_OUTPUT_PATH}")

## 6. Next Steps

- Document the findings from this validation phase.
- Proceed to Phase 4 for final documentation and submission preparation.