In [None]:
import pandas as pd
import numpy as np

print("Starting the data integration process (Corrected Version)...")

# Load All Datasets
# All four sources are read from paths relative to the current working
# directory. Every later step depends on all four frames existing, so a
# missing file is reported and then re-raised instead of being swallowed.
try:
    df_usda = pd.read_csv('../Bee/data/processed/cleaned_usda_bee_data.csv')
    df_climate = pd.read_csv('../data/Climate Data/noaa_monthly_climate_1985_2024.csv')
    df_veg = pd.read_csv('../data/Vegetation Data/Vegetation_Data.csv')
    df_sightings = pd.read_csv('../Bee/data/processed/usgs_monthly_sighting_counts.csv')
except FileNotFoundError as e:
    print(f"Error: Could not find a file. {e}")
    print("Please double-check your file paths.")
    # Stop here: continuing would either fail later with an unrelated
    # NameError or, in a notebook, silently reuse frames left over from a
    # previous run.
    raise
else:
    print("All source files loaded successfully.")

# Process USDA Data
# Build the join keys shared with the other sources: an upper-cased 'state'
# column and a lower-case 'year' column.
df_usda['state'] = df_usda['State'].str.upper()
df_usda.rename(columns={'Year': 'year'}, inplace=True)

# Annual inventory: rows whose Period is 'MARKETING YEAR'.
is_marketing_year = df_usda['Period'] == 'MARKETING YEAR'
df_usda_inventory = df_usda.loc[
    is_marketing_year, ['year', 'state', 'inventory_colonies']
].copy()

# Annual loss & stressors: taken from the four quarterly periods.
quarterly_periods = ['JAN THRU MAR', 'APR THRU JUN', 'JUL THRU SEP', 'OCT THRU DEC']
stressor_cols = [
    'loss_pct',
    'stressor_disease_pct',
    'stressor_other_pct',
    'stressor_pesticides_pct',
    'stressor_pests_pct',
    'stressor_unknown_pct',
    'stressor_varroa_mites_pct',
]
df_usda_quarterly = df_usda.loc[df_usda['Period'].isin(quarterly_periods)]

# One row per (year, state): the mean of each metric over that group's
# quarterly rows.
df_usda_annual_metrics = (
    df_usda_quarterly
    .groupby(['year', 'state'], as_index=False)[stressor_cols]
    .mean()
)

# Left join keeps every inventory row even when no quarterly metrics exist
# for that (year, state).
df_usda_annual = df_usda_inventory.merge(
    df_usda_annual_metrics,
    how='left',
    on=['year', 'state'],
)
print("Processed USDA data (Combined annual inventory and averaged quarterly metrics).")

# Process Climate Data
# Upper-case the state name so it matches the 'state' join key used when the
# sources are merged.
df_climate['state'] = df_climate['State'].str.upper()

# Collapse the climate rows to one row per (year, state).
climate_by_year_state = df_climate.groupby(['year', 'state'])
df_climate_annual = pd.DataFrame({
    'annual_avg_temp': climate_by_year_state['tavg'].mean(),
    # min_count=1: a group whose 'pcp' values are all missing yields NaN
    # rather than 0, so "no data" is not mistaken for "no precipitation".
    'annual_total_precip': climate_by_year_state['pcp'].sum(min_count=1),
}).reset_index()
print("Processed and aggregated annual climate data.")

# Process Vegetation Data
# Parse dates; values that cannot be parsed become NaT and those rows are
# dropped, since the year is derived from the parsed date.
df_veg['Date'] = pd.to_datetime(df_veg['Date'], errors='coerce')
df_veg.dropna(subset=['Date'], inplace=True)

# Derive the (year, state) join keys.
df_veg['year'] = df_veg['Date'].dt.year
df_veg['state'] = df_veg['State'].str.upper()

# Mean NDVI per (year, state), exposed as 'annual_avg_ndvi'.
df_veg_annual = (
    df_veg
    .groupby(['year', 'state'], as_index=False)['Mean_NDVI']
    .mean()
    .rename(columns={'Mean_NDVI': 'annual_avg_ndvi'})
)
print("Processed and aggregated annual vegetation data.")

# Process USGS Sightings Data
# Upper-case 'stateProvince' into the shared 'state' join key.
df_sightings['state'] = df_sightings['stateProvince'].str.upper()

# Total sighting_count per (year, state), exposed as 'annual_total_sightings'.
df_sightings_annual = (
    df_sightings
    .groupby(['year', 'state'])['sighting_count']
    .sum()
    .rename('annual_total_sightings')
    .reset_index()
)
print("Processed and aggregated annual USGS sighting data.")

# Merge All Datasets
print("Merging all datasets into a master file...")
# The USDA table is the base; each other annual table is left-joined on
# (year, state), so a missing match leaves NaN rather than removing the row.
master_df = df_usda_annual
for annual_source in (df_climate_annual, df_veg_annual, df_sightings_annual):
    master_df = master_df.merge(annual_source, on=['year', 'state'], how='left')

# Final Cleanup and Save
# Keep only rows from 2015 onward that have a loss_pct value.
rows_to_keep = (master_df['year'] >= 2015) & master_df['loss_pct'].notna()
master_df = master_df.loc[rows_to_keep].copy()

master_df.to_csv('master_dataset_state_year.csv', index=False)

print("\n--- ✅ SUCCESS! ---")
print("New file 'master_dataset_state_year.csv' has been created.")
print("This file contains all data aggregated at the state-year level.")

# Quick sanity check of the result: schema/non-null counts, then first rows.
print("\n--- Master DataFrame Info ---")
master_df.info()

print("\n--- Master DataFrame Head ---")
print(master_df.head())

Starting the data integration process (Corrected Version)...
All source files loaded successfully.
Processed USDA data (Combined annual inventory and averaged quarterly metrics).
Processed and aggregated annual climate data.
Processed and aggregated annual vegetation data.
Processed and aggregated annual USGS sighting data.
Merging all datasets into a master file...

--- ✅ SUCCESS! ---
New file 'master_dataset_state_year.csv' has been created.
This file contains all data aggregated at the state-year level.

--- Master DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
Index: 378 entries, 1208 to 1585
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   year                       378 non-null    int64  
 1   state                      378 non-null    object 
 2   inventory_colonies         378 non-null    float64
 3   loss_pct                   378 non-null    float64
 4   stressor_dis