In [15]:
# import pandas as pd
# import requests
# import time
# import os

# # 1. Load your coordinates
# df_coords = pd.read_csv("/content/drive/MyDrive/Machine Learning 5th Semester Project/Practice/region_coordinates.csv") # Make sure this file exists!

# # 2. Settings for NASA POWER API
# BASE_URL = "https://power.larc.nasa.gov/api/temporal/daily/point"
# START_DATE = min_date  # YYYYMMDD (Change to match your data start)
# END_DATE = max_date    # YYYYMMDD (Change to match your data end)
# # Parameters: T2M (Temp), PRECTOTCORR (Rain), RH2M (Humidity)
# PARAMS = "T2M,T2M_MAX,T2M_MIN,PRECTOTCORR,RH2M"

# OUTPUT_FOLDER = "data/weather"
# os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# print(f"Starting download for {len(df_coords)} regions...")

# # 3. Loop through each region and download
# for index, row in df_coords.iterrows():
#     region_name = row['ID']
#     lat = row['Latitude']
#     lon = row['Longitude']

#     # Clean filename (remove spaces/slashes for windows compatibility)
#     safe_name = region_name.replace(" ", "_").replace("-", "_").replace("/", "_")

#     print(f"[{index+1}/{len(df_coords)}] Downloading: {region_name}...")

#     # API Request
#     request_url = f"{BASE_URL}?parameters={PARAMS}&community=AG&longitude={lon}&latitude={lat}&start={START_DATE}&end={END_DATE}&format=CSV"

#     response = requests.get(request_url)

#     if response.status_code == 200:
#         # Save to file
#         file_path = f"{OUTPUT_FOLDER}/weather_{safe_name}.csv"
#         with open(file_path, 'wb') as f:
#             f.write(response.content)
#     else:
#         print(f"Failed to download {region_name}")

#     # Be polite to the API (don't spam requests)
#     time.sleep(1)

# print("All downloads complete! Check the 'data/weather' folder.")

In [None]:
import pandas as pd
import numpy as np
import glob
import os

# ==========================================
# CONFIGURATION (PATHS)
# ==========================================
BASE_PATH = "D:/dengue_project/data"
DENGUE_FILE = f"{BASE_PATH}/philippines_dengue.csv"
SATELLITE_FILES = f"{BASE_PATH}/*Vegetation*.csv"
WEATHER_FILES = f"{BASE_PATH}/weather/*.csv"
OUTPUT_FILE = f"{BASE_PATH}/philippines_dengue_dataset_FINAL.csv"

# ==========================================
# PART 1: LOAD & CLEAN TARGET (DENGUE CASES)
# ==========================================
print("Step 1: Processing Dengue Cases...")
df_dengue = pd.read_csv(DENGUE_FILE)

# Remove metadata row if exists
if df_dengue.iloc[0]['cases'] == '#cases':
  df_dengue = df_dengue.iloc[1:].copy()

# Clean 'cases' column
df_dengue['cases'] = df_dengue['cases'].astype(str).str.replace(',', '', regex=True)
df_dengue['cases'] = pd.to_numeric(df_dengue['cases'], errors='coerce').fillna(0)

# Convert Date and Region
df_dengue['date'] = pd.to_datetime(df_dengue['date'])
df_dengue = df_dengue.rename(columns={'Region': 'Region_ID'})
df_dengue['Region_ID'] = df_dengue['Region_ID'].str.strip()

# Resample to Weekly Frequency per Region
df_dengue = df_dengue.set_index('date').groupby('Region_ID').resample('W-SUN')['cases'].sum().reset_index()

print(f"   > Dengue Data Loaded: {len(df_dengue)} rows (Unique Regions: {df_dengue['Region_ID'].nunique()})")

# ==========================================
# PART 2: PROCESS SATELLITE DATA (NDVI)
# ==========================================
print("Step 2: Processing Satellite Data...")
sat_files = glob.glob(SATELLITE_FILES)
if not sat_files:
    raise FileNotFoundError("Could not find Satellite CSV!")

df_sat = pd.read_csv(sat_files[0])
df_sat = df_sat.rename(columns={'ID': 'Region_ID', 'Date': 'date'})

# Find NDVI column and scale it
ndvi_col = [c for c in df_sat.columns if 'NDVI' in c][0]
df_sat['NDVI'] = df_sat[ndvi_col] * 0.0001
df_sat['date'] = pd.to_datetime(df_sat['date'])

# Interpolate to Weekly
df_sat_processed = pd.DataFrame()
for region in df_sat['Region_ID'].unique():
    subset = df_sat[df_sat['Region_ID'] == region].copy()
    subset = subset.set_index('date').resample('W-SUN').mean(numeric_only=True)
    subset['NDVI'] = subset['NDVI'].interpolate(method='linear')
    subset['Region_ID'] = region
    df_sat_processed = pd.concat([df_sat_processed, subset.reset_index()])

print(f"   > Satellite Data Processed: {len(df_sat_processed)} rows")

# ==========================================
# PART 3: PROCESS WEATHER DATA (FIXED MAPPING)
# ==========================================
print("Step 3: Processing Weather Data...")
weather_files = glob.glob(WEATHER_FILES)
df_weather_all = pd.DataFrame()

# *** THE FIX: Explicitly map the "Space" names to "Hyphen" names ***
# Key = The name appearing in your weather file processing
# Value = The exact name in the Dengue file
REGION_MAP = {
    'BARMM': 'BARMM',
    'CAR': 'CAR',
    'CARAGA': 'CARAGA',
    'NATIONAL CAPITAL REGION': 'NATIONAL CAPITAL REGION',
    'REGION III CENTRAL LUZON': 'REGION III-CENTRAL LUZON',
    'REGION IVB MIMAROPA': 'REGION IVB-MIMAROPA',
    'REGION IV-A CALABARZON': 'REGION IV-A-CALABARZON',
    'REGION IV-A-CALABARZON': 'REGION IV-A-CALABARZON', # Catch both cases
    'REGION IX ZAMBOANGA PENINSULA': 'REGION IX-ZAMBOANGA PENINSULA',
    'REGION VII CENTRAL VISAYAS': 'REGION VII-CENTRAL VISAYAS',
    'REGION VII EASTERN VISAYAS': 'REGION VII-EASTERN VISAYAS',
    'REGION VI WESTERN VISAYAS': 'REGION VI-WESTERN VISAYAS',
    'REGION V BICOL REGION': 'REGION V-BICOL REGION',
    'REGION XII SOCCSKSARGEN': 'REGION XII-SOCCSKSARGEN',
    'REGION XI DAVAO REGION': 'REGION XI-DAVAO REGION',
    'REGION X NORTHERN MINDANAO': 'REGION X-NORTHERN MINDANAO',
    'Region II CAGAYAN VALLEY': 'Region II-CAGAYAN VALLEY',
    'Region I ILOCOS REGION': 'Region I-ILOCOS REGION'
}

for file in weather_files:
    # 1. Clean the filename to get a "Raw Name"
    filename = os.path.basename(file)
    raw_name = filename.replace("weather_", "").replace(".csv", "").replace("_", " ").replace("- ", "-")

    # 2. Fix specific edge case found in your log
    if "CALABARZON" in raw_name and "IV-A" not in raw_name:
        raw_name = "REGION IV-A-CALABARZON"

    # 3. Look up the Correct Dengue Name
    # We try to find the raw_name in our map. If not found, we skip it to avoid errors.
    if raw_name in REGION_MAP:
        correct_name = REGION_MAP[raw_name]
    else:
        # Fallback: Try to find a partial match
        found = False
        for key, val in REGION_MAP.items():
            if key in raw_name or raw_name in key:
                correct_name = val
                found = True
                break
        if not found:
            print(f"WARNING: Could not map weather file '{filename}' (Raw: {raw_name})")
            continue

    # 4. Load Data
    try:
        df_w = pd.read_csv(file, skiprows=22, header=None, sep=',')
        df_w.columns = ['YEAR', 'MO', 'T2M', 'T2M_MAX', 'T2M_MIN', 'PRECTOTCORR', 'RH2M']
        df_w['date'] = pd.to_datetime(df_w['YEAR'].astype(str) + df_w['MO'].astype(str).str.zfill(3), format='%Y%j')

        df_w_weekly = df_w.set_index('date').resample('W-SUN').agg({
            'PRECTOTCORR': 'sum', 'T2M': 'mean', 'T2M_MAX': 'max', 'T2M_MIN': 'min', 'RH2M': 'mean'
        }).reset_index()

        df_w_weekly['Region_ID'] = correct_name
        df_weather_all = pd.concat([df_weather_all, df_w_weekly])

    except Exception as e:
        print(f"Error processing {filename}: {e}")

print(f"   > Weather Data Processed: {len(df_weather_all)} rows")

# ==========================================
# PART 4: THE GRAND MERGE
# ==========================================
print("Step 4: Merging Everything...")

# Merge Dengue + Weather
df_merged = pd.merge(df_dengue, df_weather_all, on=['date', 'Region_ID'], how='inner')

# Merge + Satellite
df_final = pd.merge(df_merged, df_sat_processed[['date', 'Region_ID', 'NDVI']], on=['date', 'Region_ID'], how='left')

# Drop rows where we lost data (Dates didn't overlap)
df_final = df_final.dropna()

print("="*40)
print(f"FINAL DATASET SHAPE: {df_final.shape}")
print("="*40)

# Save
df_final.to_csv(OUTPUT_FILE, index=False)
print(f"Saved to {OUTPUT_FILE}")
print(df_final.head())

Step 1: Processing Dengue Cases...
   > Dengue Data Loaded: 4454 rows (Unique Regions: 17)
Step 2: Processing Satellite Data...
   > Satellite Data Processed: 4454 rows
Step 3: Processing Weather Data...
   > Weather Data Processed: 4420 rows
Step 4: Merging Everything...
FINAL DATASET SHAPE: (4403, 9)
Saved to /content/drive/MyDrive/Machine Learning 5th Semester Project/dengue_project/data/philippines_dengue_dataset_FINAL.csv
  Region_ID       date  cases  PRECTOTCORR        T2M  T2M_MAX  T2M_MIN  \
0     BARMM 2016-01-24     30         2.31  25.470000    32.12    19.52   
1     BARMM 2016-01-31     36         1.61  24.745714    31.91    18.48   
2     BARMM 2016-02-07     27        11.54  26.907143    33.47    21.50   
3     BARMM 2016-02-14     27        14.04  26.217143    33.21    19.22   
4     BARMM 2016-02-21     27         6.90  25.560000    33.36    17.57   

        RH2M      NDVI  
0  80.087143  0.000053  
1  79.957143  0.000050  
2  79.484286  0.000048  
3  79.435714  0.00

In [None]:
df1 = pd.read_csv("dengue_project/data/philippines_dengue_dataset.csv")
df1['date'] = pd.to_datetime(df1['date'])
df1.head()

Unnamed: 0,Region_ID,date,cases,PRECTOTCORR,T2M,T2M_MAX,T2M_MIN,RH2M,NDVI
0,BARMM,2016-01-24,30,2.31,25.47,32.12,19.52,80.087143,5.3e-05
1,BARMM,2016-01-31,36,1.61,24.745714,31.91,18.48,79.957143,5e-05
2,BARMM,2016-02-07,27,11.54,26.907143,33.47,21.5,79.484286,4.8e-05
3,BARMM,2016-02-14,27,14.04,26.217143,33.21,19.22,79.435714,5e-05
4,BARMM,2016-02-21,27,6.9,25.56,33.36,17.57,78.058571,5.1e-05


In [30]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1295 entries, 0 to 1294
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Region_ID    1295 non-null   object        
 1   date         1295 non-null   datetime64[ns]
 2   cases        1295 non-null   int64         
 3   PRECTOTCORR  1295 non-null   float64       
 4   T2M          1295 non-null   float64       
 5   T2M_MAX      1295 non-null   float64       
 6   T2M_MIN      1295 non-null   float64       
 7   RH2M         1295 non-null   float64       
 8   NDVI         1295 non-null   float64       
dtypes: datetime64[ns](1), float64(6), int64(1), object(1)
memory usage: 91.2+ KB


In [28]:
len(df1)

1295