# Data Preparation and Feature Engineering

This notebook documents the process of cleaning the raw metadata and generating species-level features for the MicroArk project.

In [None]:
import pandas as pd
import os
from pathlib import Path

# All locations are derived from the project root so that no absolute,
# machine-specific path is baked into the notebook.
# NOTE: the relative hop assumes the kernel's working directory is
# src/notebooks/ (two levels below the project root).
project_root = Path("..", "..").resolve()

# Both the raw input and the generated feature table live in <root>/data.
data_dir = project_root / "data"
input_file = data_dir / "metalog_raw.csv"
output_file = data_dir / "species_features.csv"

# Echo the resolved locations so a wrong working directory is visible early.
print(f"Project Root: {project_root}")
print(f"Input File: {input_file}")

## 1. Load Data
Load the raw metadata CSV downloaded from Metalog.

In [None]:
# Load the raw metadata table into `df`.
# Fail fast when the input is missing: every later cell needs `df`, so merely
# printing a message here would only postpone the failure to an unrelated
# NameError further down the notebook.
if not input_file.exists():
    raise FileNotFoundError(f"Error: {input_file} not found.")

# low_memory=False makes pandas read the file in a single pass instead of in
# chunks, which avoids mixed-dtype columns on large CSV files.
df = pd.read_csv(input_file, low_memory=False)
print(f"Loaded DataFrame with shape: {df.shape}")
display(df.head())

## 2. Preprocessing & Column Mapping

We need to map specific columns to our target features:
- `host_species` <--- `host_tax_id` (Taxonomic ID is more reliable)
- `country` <--- `location` (the text before the first `:` in the location string, stripped of surrounding whitespace)
- `collection_year` <--- `collection_date` (Extracted year)

In [None]:
# Derive the analysis columns (host_species, country, collection_year) from the
# raw metadata columns and make sure the coordinate columns exist.

# Map host_species
# host_species is the grouping key for every later step, so its source column
# is mandatory: stop here instead of printing a message and failing later on a
# missing 'host_species' column.
if 'host_tax_id' not in df.columns:
    raise KeyError("Error: 'host_tax_id' column missing.")
df['host_species'] = df['host_tax_id']

# Map country: keep the text before the first ':' of the location string.
# The null mask is taken from the ORIGINAL column because astype(str) turns
# missing values into the literal string "nan", which would otherwise be
# counted as a real country instead of "Unknown".
if 'location' in df.columns:
    location = df['location']
    country = location.astype(str).str.split(':').str[0].str.strip()
    df['country'] = country.where(location.notna(), "Unknown")
else:
    df['country'] = "Unknown"

# Map collection_year: unparseable dates become NaT and therefore a NaN year.
if 'collection_date' in df.columns:
    df['collection_year'] = pd.to_datetime(df['collection_date'], errors='coerce').dt.year
else:
    df['collection_year'] = 0

# Ensure coordinates
# NOTE(review): a constant 0 placeholder gives every species a variance of 0
# for that coordinate when the column is absent - confirm this is intended.
for col in ['latitude', 'longitude']:
    if col not in df.columns:
        df[col] = 0

print("Columns mapped.")

## 3. Data Cleaning

1. Drop rows where `host_species` is NaN.
2. Filter out species that have fewer than 5 samples.

In [None]:
# Minimum number of samples a host species needs to be kept. Defined once so
# the filter and the progress message cannot drift apart.
MIN_SAMPLES_PER_SPECIES = 5

# Step 1: rows without a host species cannot be assigned to any group.
before_count = len(df)
df = df.dropna(subset=["host_species"])
print(f"Dropped {before_count - len(df)} rows with missing species info.")

# Step 2: keep only species that reach the minimum sample count.
species_counts = df["host_species"].value_counts()
valid_species = species_counts[species_counts >= MIN_SAMPLES_PER_SPECIES].index
# .copy() gives an independent frame rather than a slice of `df`.
df_filtered = df[df["host_species"].isin(valid_species)].copy()

print(f"Retained {len(valid_species)} species with >= {MIN_SAMPLES_PER_SPECIES} samples.")
print(f"Final sample count: {len(df_filtered)}")

## 4. Feature Creation

Group by `host_species` and calculate:
- `num_samples`: Total count
- `num_countries`: Unique countries
- `year_span`: Max year - Min year
- `lat_variance`: Sample variance of latitude (pandas default, `ddof=1`)
- `long_variance`: Sample variance of longitude (pandas default, `ddof=1`)

In [None]:
# Collapse the filtered samples into one feature row per host species.
grouped = df_filtered.groupby("host_species")
years = grouped["collection_year"]

# Each entry is a Series indexed by host_species; the DataFrame constructor
# aligns them on that shared index.
feature_columns = {
    "num_samples": grouped.size(),
    "num_countries": grouped["country"].nunique(),
    "year_span": years.max() - years.min(),
    "lat_variance": grouped["latitude"].var(),
    "long_variance": grouped["longitude"].var(),
}

# Undefined aggregates (NaN) are reported as 0, and host_species is moved from
# the index back into a regular column.
features = pd.DataFrame(feature_columns).fillna(0).reset_index()

display(features.head())

## 5. Save Results
Save the generated features to a CSV file.

In [None]:
# Persist the per-species feature table; index=False leaves the positional
# row index out of the CSV.
features.to_csv(path_or_buf=output_file, index=False)
print(f"Saved features to {output_file}")