# Phase 1: Data Understanding and Preprocessing

**Objective**: Understand the structure of the provided datasets and prepare them for analysis.

## 1. Import Libraries

In [None]:
import json
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio
import seaborn as sns

# Plot styling shared by every figure in this notebook:
# seaborn's 'whitegrid' theme plus a default figure size of (10, 6).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 2. Define Data Paths

This notebook assumes the data is stored in the `../../data/raw/` directory relative to the notebook's location. Adjust the paths below as necessary.

In [None]:
# Define paths to data files (replace with actual file names).
# Every example input lives in the same raw-data directory, so the directory is
# named once here; relocating the data only requires changing RAW_DATA_DIR.
RAW_DATA_DIR = '../../data/raw'

LIDAR_PATH = f'{RAW_DATA_DIR}/lidar_data.tif'  # Example path
SATELLITE_IMAGE_PATH = f'{RAW_DATA_DIR}/satellite_image.tif'  # Example path
NDVI_DATA_PATH = f'{RAW_DATA_DIR}/ndvi_data.tif'  # Example path
GIS_DATA_PATH = f'{RAW_DATA_DIR}/gis_data.geojson'  # Example path
ARCHAEOLOGICAL_LITERATURE_PATH = f'{RAW_DATA_DIR}/archaeological_lit.json'  # Example path

## 3. Load and Inspect Data

In this section, we will load each dataset and perform initial inspections.

### 3.1 LiDAR Point Cloud Data

In [None]:
# Load band 1 of the LiDAR raster and its metadata into `lidar_data` / `lidar_meta`.
# Only the file access sits inside `try`, so the handlers report I/O problems
# and nothing else.
try:
    with rasterio.open(LIDAR_PATH) as src:
        lidar_data = src.read(1) # Read the first band
        lidar_meta = src.meta
except (FileNotFoundError, rasterio.errors.RasterioIOError) as e:
    # rasterio signals a missing or unopenable dataset with RasterioIOError
    # rather than FileNotFoundError, so both are handled here.
    print(f"Error: LiDAR file not found or unreadable at {LIDAR_PATH}. Please check the path. ({e})")
except Exception as e:
    print(f"An error occurred while loading LiDAR data: {e}")
else:
    print(f"LiDAR data loaded successfully. Shape: {lidar_data.shape}")
    print(f"LiDAR metadata: {lidar_meta}")
    # Display a sample of the LiDAR data
    # plt.imshow(lidar_data, cmap='terrain')
    # plt.title('LiDAR Data Sample')
    # plt.colorbar(label='Elevation')
    # plt.show()

**Observations & Next Steps (LiDAR):**
- Check data type, resolution, coordinate reference system (CRS).
- Visualize the data to understand its spatial distribution.
- Identify potential preprocessing needs: noise reduction, normalization.

### 3.2 Satellite Imagery

In [None]:
# Load every band of the satellite raster and its metadata into
# `satellite_image` / `satellite_meta`. Only the file access sits inside `try`.
try:
    with rasterio.open(SATELLITE_IMAGE_PATH) as src:
        satellite_image = src.read() # Read all bands
        satellite_meta = src.meta
except (FileNotFoundError, rasterio.errors.RasterioIOError) as e:
    # rasterio signals a missing or unopenable dataset with RasterioIOError
    # rather than FileNotFoundError, so both are handled here.
    print(f"Error: Satellite image file not found or unreadable at {SATELLITE_IMAGE_PATH}. ({e})")
except Exception as e:
    print(f"An error occurred while loading satellite image: {e}")
else:
    print(f"Satellite image loaded successfully. Shape: {satellite_image.shape}")
    print(f"Satellite metadata: {satellite_meta}")
    # To display a true/false color composite, you might need to select specific bands
    # For example, if RGB are bands 4, 3, 2 (1-indexed):
    # rgb_image = satellite_image[[3, 2, 1], :, :]
    # rgb_image_display = np.moveaxis(rgb_image, 0, -1) # Rearrange for plotting
    # plt.imshow(rgb_image_display)
    # plt.title('Satellite Image Sample')
    # plt.show()

**Observations & Next Steps (Satellite):**
- Check number of bands, data type, resolution, CRS.
- Visualize different band combinations.
- Preprocessing: cloud masking, atmospheric correction (if not already done).

### 3.3 NDVI Data

In [None]:
# Load band 1 of the NDVI raster and its metadata into `ndvi_data` / `ndvi_meta`.
# Only the file access sits inside `try`.
try:
    with rasterio.open(NDVI_DATA_PATH) as src:
        ndvi_data = src.read(1)
        ndvi_meta = src.meta
except (FileNotFoundError, rasterio.errors.RasterioIOError) as e:
    # rasterio signals a missing or unopenable dataset with RasterioIOError
    # rather than FileNotFoundError, so both are handled here.
    print(f"Error: NDVI file not found or unreadable at {NDVI_DATA_PATH}. ({e})")
except Exception as e:
    print(f"An error occurred while loading NDVI data: {e}")
else:
    print(f"NDVI data loaded successfully. Shape: {ndvi_data.shape}")
    print(f"NDVI metadata: {ndvi_meta}")
    # plt.imshow(ndvi_data, cmap='RdYlGn')
    # plt.title('NDVI Data Sample')
    # plt.colorbar(label='NDVI Value')
    # plt.show()

# If NDVI needs to be calculated from satellite bands (e.g., Red and NIR):
# def calculate_ndvi(red_band, nir_band):
#     # Cast to float32 so integer band data is not subtracted/divided in integer arithmetic
#     red = red_band.astype(np.float32)
#     nir = nir_band.astype(np.float32)
#     # Prevent division by zero
#     numerator = nir - red
#     denominator = nir + red
#     ndvi = np.divide(numerator, denominator, out=np.zeros_like(numerator, dtype=np.float32), where=denominator!=0)
#     return ndvi

# Assuming satellite_image has Red in band X and NIR in band Y (0-indexed)
# if 'satellite_image' in locals() and satellite_image.ndim == 3 and satellite_image.shape[0] >= 5: # Needs at least 5 bands because index 4 is read below
#     red_band_index = 3 # Example: band 4 (1-indexed), i.e. array index 3
#     nir_band_index = 4 # Example: band 5 (1-indexed), i.e. array index 4
#     calculated_ndvi = calculate_ndvi(satellite_image[red_band_index], satellite_image[nir_band_index])
#     print(f"Calculated NDVI shape: {calculated_ndvi.shape}")
#     # plt.imshow(calculated_ndvi, cmap='RdYlGn')
#     # plt.title('Calculated NDVI')
#     # plt.colorbar(label='NDVI Value')
#     # plt.show()
# else:
#     print("Satellite image not available or does not have enough bands to calculate NDVI.")

**Observations & Next Steps (NDVI):**
- Verify the range of NDVI values (typically -1 to +1).
- Check CRS and resolution, ensure consistency with other raster data.
- If NDVI is not provided directly, calculate it from satellite imagery (Red and NIR bands).

### 3.4 GIS Data

In [None]:
# Load the GIS vector layer into the GeoDataFrame `gis_data`.
# Only the read itself sits inside `try`.
try:
    gis_data = gpd.read_file(GIS_DATA_PATH)
except Exception as e:
    # The I/O engine behind geopandas (fiona or pyogrio) reports a missing file
    # with its own exception type instead of FileNotFoundError, so the path is
    # checked explicitly to decide which message applies.
    if isinstance(e, FileNotFoundError) or not Path(GIS_DATA_PATH).exists():
        print(f"Error: GIS file not found at {GIS_DATA_PATH}.")
    else:
        print(f"An error occurred while loading GIS data: {e}")
else:
    print(f"GIS data loaded successfully. Shape: {gis_data.shape}")
    print(f"GIS data CRS: {gis_data.crs}")
    # print(gis_data.head())
    # gis_data.plot(figsize=(10, 10), legend=True, column='land_use_type_example') # Replace with actual column
    # plt.title('GIS Data Sample')
    # plt.show()

**Observations & Next Steps (GIS):**
- Examine attribute table for relevant features (e.g., land use, topography, hydrography).
- Check geometry types (points, lines, polygons).
- Ensure CRS is consistent with other datasets. Reproject if necessary.

### 3.5 Archaeological Literature Data

In [None]:
# Parse the archaeological literature JSON file into `archaeological_lit`.
try:
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform's locale encoding, which can garble or reject non-ASCII text.
    with open(ARCHAEOLOGICAL_LITERATURE_PATH, 'r', encoding='utf-8') as f:
        archaeological_lit = json.load(f)
    print("Archaeological literature data loaded successfully.")
    # print(json.dumps(archaeological_lit, indent=2, ensure_ascii=False)[:500]) # Print a sample

    # If it's a list of records, convert to DataFrame for easier handling
    # if isinstance(archaeological_lit, list):
    #     archaeological_df = pd.DataFrame(archaeological_lit)
    #     print(f"Converted to DataFrame. Shape: {archaeological_df.shape}")
    #     print(archaeological_df.head())
    # elif isinstance(archaeological_lit, dict): # Or some other structure
    #     # Process dictionary as needed
    #     print("Data is a dictionary. Inspect its structure for relevant information.")

except FileNotFoundError:
    print(f"Error: Archaeological literature file not found at {ARCHAEOLOGICAL_LITERATURE_PATH}.")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {ARCHAEOLOGICAL_LITERATURE_PATH}.")
except Exception as e:
    # Any other failure (e.g. a decoding or permission error) is reported here.
    print(f"An error occurred: {e}")

**Observations & Next Steps (Archaeological Literature):**
- Understand the structure of the data (JSON fields, nested objects).
- Extract key information: site coordinates (if available), textual descriptions, DOI, LiDAR tile IDs, etc.
- This data will be crucial for training (if labeled sites are provided) and for validation.

## 4. Data Preprocessing Plan

Based on initial exploration, outline the necessary preprocessing steps:

### 4.1 Handling Missing Values
- **Strategy**: Identify missing values in each dataset. Decide on imputation (e.g., mean, median, mode, interpolation for rasters) or removal, depending on the extent and nature of missingness.

In [None]:
# Example for a pandas DataFrame (conceptual)
# if 'some_df' in locals():
#     print(some_df.isnull().sum())

### 4.2 Coordinate System Unification
- **Strategy**: Ensure all geospatial datasets (LiDAR, satellite, NDVI, GIS) share a common Coordinate Reference System (CRS). Reproject data if necessary. The target CRS should be suitable for the Amazon region.

In [None]:
# Example for GeoPandas GeoDataFrame
# if 'gis_data' in locals() and 'lidar_meta' in locals():
#     target_crs = lidar_meta['crs'] # Assuming LiDAR CRS is the target
#     if gis_data.crs != target_crs:
#         print(f"Reprojecting GIS data from {gis_data.crs} to {target_crs}")
#         # gis_data = gis_data.to_crs(target_crs)
#         # print(f"New GIS data CRS: {gis_data.crs}")

# For raster data with rasterio, reprojection is more complex and might involve `rasterio.warp.reproject`

### 4.3 Data Resampling/Alignment (for Rasters)
- **Strategy**: If raster datasets (LiDAR, satellite, NDVI) have different resolutions or pixel alignments, they may need to be resampled to a common grid for combined analysis. Choose an appropriate resampling method (e.g., nearest neighbor, bilinear).

### 4.4 Feature Extraction (Examples)
- **LiDAR**: Derive terrain features like slope, aspect, curvature, Digital Terrain Model (DTM), Canopy Height Model (CHM).
- **Satellite/NDVI**: Extract textural features, additional vegetation indices.
- **GIS**: Rasterize vector data (e.g., distance to rivers, land cover type) to align with raster grids.

In [None]:
# Placeholder for feature extraction code
# Example: Slope calculation from elevation data (conceptual)
# if 'lidar_data' in locals():
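#     # np.gradient returns the derivative along axis 0 (rows, y) first, then axis 1 (columns, x).
#     # Pass the pixel size as spacing so the slope is rise over ground distance, not rise per pixel.
#     # pixel_size = lidar_meta['transform'].a  # assumes square pixels - confirm from the metadata
#     # slope_y, slope_x = np.gradient(lidar_data.astype(np.float32), pixel_size)
#     # slope_rad = np.arctan(np.sqrt(slope_x**2 + slope_y**2))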
#     # slope_deg = np.degrees(slope_rad)
#     # print("Slope calculated (conceptual).")
#     pass

### 4.5 Data Normalization/Scaling
- **Strategy**: For machine learning models, numerical features often need to be scaled (e.g., MinMaxScaler, StandardScaler) to a common range.

## 5. Next Steps

- Proceed with implementing the preprocessing steps outlined above.
- Perform exploratory data analysis (EDA) on the cleaned and processed data to understand distributions, correlations, and identify potential patterns.
- Begin feature engineering based on insights from EDA and domain knowledge.