In [1]:
import pandas as pd
import requests
import time
import json
import os
from kafka import KafkaProducer

START_DATE = "2021-08-01"
END_DATE = "2025-01-30"
GEOGRAPHY_FILE = "../../data/source/thailand_geography.csv"


In [2]:
KAFKA_BROKER = 'localhost:9092' 
KAFKA_TOPIC = 'air_data'

producer = KafkaProducer(
    bootstrap_servers=[KAFKA_BROKER],
    value_serializer=lambda x: json.dumps(x).encode('utf-8')
)
print(f"‚úÖ Connected to Kafka Broker: {KAFKA_BROKER}")

‚úÖ Connected to Kafka Broker: localhost:9092


In [3]:
# ---------------------------------------------------------
# PREPARE LOCATIONS
# ---------------------------------------------------------
target_locations = []
try:
    df_geo = pd.read_csv(GEOGRAPHY_FILE)
    bangkok_districts = df_geo[df_geo['province'].str.contains("Bangkok|‡∏Å‡∏£‡∏∏‡∏á‡πÄ‡∏ó‡∏û", case=False, na=False)]

    
    for index, row in bangkok_districts.iterrows():
        target_locations.append({
            "name": row['district'], 
            "lat": row['latitude'],
            "lon": row['longitude']
        })

    print(f"üéØ Target: {len(target_locations)} districts")
except FileNotFoundError:
    print(f"‚ùå Error: ‡∏´‡∏≤‡πÑ‡∏ü‡∏•‡πå {GEOGRAPHY_FILE} ‡πÑ‡∏°‡πà‡πÄ‡∏à‡∏≠")

# ---------------------------------------------------------
# SCRAPING & SENDING LOOP
# ---------------------------------------------------------
print(f"üöÄ Starting Data Collection -> Kafka Topic: {KAFKA_TOPIC}")
total_sent = 0

for location in target_locations:
    dist_name = location['name']
    
    url = "https://air-quality-api.open-meteo.com/v1/air-quality"
    
    params = {
        "latitude": location['lat'],
        "longitude": location['lon'],
        "start_date": START_DATE,
        "end_date": END_DATE,
        "hourly": "pm2_5",
        "timezone": "Asia/Bangkok"
    }
    
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        
        # --- PROCESS: Hourly -> Daily Average ---
        hourly_data = data.get('hourly', {})
        times = hourly_data.get('time', [])
        values = hourly_data.get('pm2_5', [])
        
        temp_df = pd.DataFrame({
            'time': pd.to_datetime(times),
            'pm25': values
        })
        
        # Resample ‡πÄ‡∏õ‡πá‡∏ô‡∏£‡∏≤‡∏¢‡∏ß‡∏±‡∏ô (‡∏´‡∏≤‡∏Ñ‡πà‡∏≤‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢)
        temp_df.set_index('time', inplace=True)
        daily_df = temp_df.resample('D').mean().reset_index()
        
        # --- SEND TO KAFKA ---
        # ‡∏ß‡∏ô‡∏•‡∏π‡∏õ‡∏™‡πà‡∏á‡∏ó‡∏µ‡∏•‡∏∞‡∏ß‡∏±‡∏ô‡∏Ç‡∏≠‡∏á‡πÄ‡∏Ç‡∏ï‡∏£‡∏ô‡∏±‡πâ‡∏ô‡πÜ
        for _, row in daily_df.iterrows():
            
            # ‡πÄ‡∏ä‡πá‡∏Ñ‡∏ß‡πà‡∏≤‡∏Ñ‡πà‡∏≤ PM2.5 ‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà NaN (API ‡∏ö‡∏≤‡∏á‡∏ó‡∏µ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏´‡∏≤‡∏¢)
            pm25_val = row['pm25']
            if pd.isna(pm25_val):
                continue

            payload = {
                "district": dist_name,
                "date": row['time'].strftime('%Y-%m-%d'), # ‡πÅ‡∏õ‡∏•‡∏á‡πÄ‡∏õ‡πá‡∏ô string ‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà
                "avg_pm25_ug_m3": float(pm25_val),        # ‡πÅ‡∏õ‡∏•‡∏á‡πÄ‡∏õ‡πá‡∏ô float
                "latitude": location['lat'],
                "longitude": location['lon']
            }
            
            # ‡∏™‡πà‡∏á‡πÄ‡∏Ç‡πâ‡∏≤ Kafka
            producer.send(KAFKA_TOPIC, value=payload)
            total_sent += 1

        print(f"‚úì Sent data for {dist_name} , records: {len(daily_df)} , total sent: {total_sent}")
        
    except Exception as e:
        print(f"x Failed {dist_name}: {e}")
        
    # Be polite to API
    time.sleep(0.1)

# ---------------------------------------------------------
# 4. FINISH
# ---------------------------------------------------------
producer.flush() # ‡∏î‡∏±‡∏ô‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ó‡∏µ‡πà‡∏Ñ‡πâ‡∏≤‡∏á‡∏ó‡πà‡∏≠‡∏≠‡∏≠‡∏Å‡πÉ‡∏´‡πâ‡∏´‡∏°‡∏î
producer.close()

print("------------------------------------------------")
print(f"‚úÖ Completed! Sent total {total_sent} records to Kafka.")

üéØ Target: 169 districts
üöÄ Starting Data Collection -> Kafka Topic: air_data
‚úì Sent data for ‡∏û‡∏£‡∏∞‡∏ô‡∏Ñ‡∏£ , records: 1279 , total sent: 911
‚úì Sent data for ‡∏û‡∏£‡∏∞‡∏ô‡∏Ñ‡∏£ , records: 1279 , total sent: 1822
‚úì Sent data for ‡∏û‡∏£‡∏∞‡∏ô‡∏Ñ‡∏£ , records: 1279 , total sent: 2733
‚úì Sent data for ‡∏û‡∏£‡∏∞‡∏ô‡∏Ñ‡∏£ , records: 1279 , total sent: 3644
‚úì Sent data for ‡∏û‡∏£‡∏∞‡∏ô‡∏Ñ‡∏£ , records: 1279 , total sent: 4555
‚úì Sent data for ‡∏û‡∏£‡∏∞‡∏ô‡∏Ñ‡∏£ , records: 1279 , total sent: 5466
‚úì Sent data for ‡∏û‡∏£‡∏∞‡∏ô‡∏Ñ‡∏£ , records: 1279 , total sent: 6377
‚úì Sent data for ‡∏û‡∏£‡∏∞‡∏ô‡∏Ñ‡∏£ , records: 1279 , total sent: 7288
‚úì Sent data for ‡∏û‡∏£‡∏∞‡∏ô‡∏Ñ‡∏£ , records: 1279 , total sent: 8199
‚úì Sent data for ‡∏û‡∏£‡∏∞‡∏ô‡∏Ñ‡∏£ , records: 1279 , total sent: 9110
‚úì Sent data for ‡∏û‡∏£‡∏∞‡∏ô‡∏Ñ‡∏£ , records: 1279 , total sent: 10021
‚úì Sent data for ‡∏û‡∏£‡∏∞‡∏ô‡∏Ñ‡∏£ , records: 1279 , total sent: 10932
‚úì Sent data for ‡∏î‡∏∏‡∏™‡∏¥‡∏ï , records: 1279 , t