In [3]:
from kafka import KafkaConsumer
import json
import csv
import os
from datetime import datetime

# --- Kafka connection settings ---
KAFKA_TOPIC = 'air_data'
KAFKA_BROKER = 'localhost:9092'
GROUP_ID = 'air_group'

# Passed to the consumer as consumer_timeout_ms (milliseconds).
TIMEOUT_MS = 60_000

# --- CSV output settings ---
# Destination of the raw dump, relative to the notebook's working directory.
OUTPUT_FILE = '../../data/raw/bangkok_air_raw.csv'

# The writer loop flushes the file to disk once every FLUSH_INTERVAL rows.
FLUSH_INTERVAL = 50

# CSV column names, in the order they appear in the header row.
CSV_FIELDS = [
    "district", "date", "avg_pm25_ug_m3",
    "latitude", "longitude",
    "processing_time",
]

def _deserialize_json(payload):
    """Decode a raw Kafka message value from UTF-8 encoded JSON.

    Args:
        payload: The raw message value as bytes, or None/empty.

    Returns:
        The parsed JSON value, or None when the payload is empty or is not
        valid UTF-8 JSON. The consuming loop skips None values, so a single
        malformed message is dropped instead of raising from inside the
        consumer iterator and aborting the whole run.
    """
    if not payload:
        return None
    try:
        return json.loads(payload.decode('utf-8'))
    except ValueError as exc:
        # UnicodeDecodeError and json.JSONDecodeError both derive from ValueError.
        print(f"\nSkipping undecodable message: {exc}")
        return None


# Consumer for KAFKA_TOPIC. With no committed offset for GROUP_ID it starts
# from the earliest available message, and iteration stops once no message
# has arrived for TIMEOUT_MS milliseconds.
consumer = KafkaConsumer(
    KAFKA_TOPIC,
    bootstrap_servers=[KAFKA_BROKER],
    auto_offset_reset='earliest',
    enable_auto_commit=True,
    group_id=GROUP_ID,
    value_deserializer=_deserialize_json,
    consumer_timeout_ms=TIMEOUT_MS
)

In [4]:
print(f"üöÄ Starting PM2.5 Consumer...")
print(f"üìÇ Output File: {OUTPUT_FILE}")

# Create the output directory if needed. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
folder_path = os.path.dirname(OUTPUT_FILE)
if folder_path:
    os.makedirs(folder_path, exist_ok=True)

# The progress line is padded to this width so that, after the carriage
# return, a shorter update fully overwrites a longer previous one.
PROGRESS_WIDTH = 100

# Bound before the try block so the summary in `finally` can always read it.
count = 0

# ==========================================
# 2. MAIN LOOP
# ==========================================
try:
    # The `with` block closes (and therefore flushes) the CSV file on every
    # exit path, including an error while writing the header.
    # 'utf-8-sig' writes a UTF-8 byte-order mark at the start of the file.
    # NOTE(review): mode 'w' truncates any existing file, while the consumer
    # uses a group id with auto-commit and so may resume from previously
    # committed offsets on a re-run. Confirm that overwriting is intended.
    with open(OUTPUT_FILE, 'w', newline='', encoding='utf-8-sig') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=CSV_FIELDS)
        writer.writeheader()

        print(f"üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å...")
        print(f"üéß Listening for data (Auto-stop in {TIMEOUT_MS/1000}s)...")

        for message in consumer:
            raw_data = message.value

            # Empty payloads carry nothing to record.
            if raw_data is None:
                continue

            try:
                # Copy the expected fields (missing keys become None) and
                # stamp the row with the local time it was processed.
                record = {
                    "district": raw_data.get('district'),
                    "date": raw_data.get('date'),
                    "avg_pm25_ug_m3": raw_data.get('avg_pm25_ug_m3'),
                    "latitude": raw_data.get('latitude'),
                    "longitude": raw_data.get('longitude'),
                    "processing_time": datetime.now().isoformat()
                }

                writer.writerow(record)
                count += 1

                # Flush and report progress only every FLUSH_INTERVAL rows.
                if count % FLUSH_INTERVAL == 0:
                    csv_file.flush()
                    progress = f"‚úÖ Saved {count} records... (Last: {record['district']} - {record['date']})"
                    print(progress.ljust(PROGRESS_WIDTH), end='\r')

            except Exception as e:
                # Best effort: report the bad row and keep consuming.
                print(f"\n‚ö†Ô∏è Error processing row: {e}")

    # Reached only when the consumer iterator ends, i.e. no message arrived
    # within consumer_timeout_ms.
    print(f"\nüéâ No more messages. Timeout reached ({TIMEOUT_MS}ms).")

except KeyboardInterrupt:
    print(f"\nüõë Stopped by user.")

finally:
    # Cleanup: the file is already closed by the `with` block above.
    consumer.close()

    print("-" * 40)
    print(f"‚úÖ Finished! Total records saved: {count}")
    print(f"üìÇ File location: {os.path.abspath(OUTPUT_FILE)}")

üöÄ Starting PM2.5 Consumer...
üìÇ Output File: ../../data/raw/bangkok_air_raw.csv
üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å...
üéß Listening for data (Auto-stop in 60.0s)...
‚úÖ Saved 140250 records... (Last: ‡∏ö‡∏≤‡∏á‡∏ö‡∏≠‡∏ô - 2024-12-17)))))01-07)
üéâ No more messages. Timeout reached (60000ms).
----------------------------------------
‚úÖ Finished! Total records saved: 140294
üìÇ File location: /Users/pattaponsurinwarangkul/CEDT/Year2/DataSci/proj/DSDE-Traffy-Project/data/raw/bangkok_air_raw.csv
