In [1]:
from kafka import KafkaConsumer
import json
import csv
import os
import time
from datetime import datetime

In [6]:
# ==========================================
# CONFIGURATION
# ==========================================
KAFKA_TOPIC = 'traffy_data'
KAFKA_BROKER = 'localhost:9092'
GROUP_ID = 'traffy_raw_writer_group'
OUTPUT_FILE = '../../data/raw/bangkok_traffy_raw.csv'
FLUSH_INTERVAL = 100   # flush the CSV and print progress every N written rows
TIMEOUT_MS = 5000      # passed to the consumer as consumer_timeout_ms

# CSV header: the fields read from each incoming message, plus a locally
# generated processing_time column.
CSV_FIELDS = [
    "ticket_id", "type", "organization", "comment", "photo",
    "photo_after", "coords", "address", "subdistrict", "district",
    "province", "timestamp", "state", "star", "count_reopen",
    "last_activity", "processing_time"
]


def _decode_message(raw_bytes):
    """Decode one Kafka message value from UTF-8 JSON.

    Returns None for an empty value, and also for a payload that is not valid
    UTF-8 / JSON, so that a single malformed message is reported and skipped
    instead of raising out of the consumer iteration and ending the export.
    """
    if not raw_bytes:
        return None
    try:
        return json.loads(raw_bytes.decode('utf-8'))
    except ValueError as exc:
        # UnicodeDecodeError and json.JSONDecodeError are both ValueError subclasses.
        print(f"\nSkipping undecodable Kafka message: {exc}")
        return None


# ==========================================
# 1. SETUP CONSUMER & FILE
# ==========================================
consumer = KafkaConsumer(
    KAFKA_TOPIC,
    bootstrap_servers=[KAFKA_BROKER],
    auto_offset_reset='earliest',
    # Offsets are deliberately not committed: the export cell opens OUTPUT_FILE
    # in 'w' mode (truncating it) on every run, so each run has to re-read the
    # topic from the beginning rather than resume where the last run stopped.
    # NOTE(review): offsets already committed under GROUP_ID by earlier runs
    # presumably still take effect -- reset the group or change GROUP_ID if a
    # re-run comes back empty.
    enable_auto_commit=False,
    group_id=GROUP_ID,
    value_deserializer=_decode_message,
    consumer_timeout_ms=TIMEOUT_MS
)

In [7]:
print(f"üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏î‡∏¥‡∏ö (Raw Data) ‡∏•‡∏á‡πÑ‡∏ü‡∏•‡πå: {OUTPUT_FILE}")

# Make sure the output directory exists. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() first.
folder_path = os.path.dirname(OUTPUT_FILE)
if folder_path:
    os.makedirs(folder_path, exist_ok=True)

# Open the CSV for writing. Mode 'w' truncates any previous export;
# 'utf-8-sig' writes a BOM at the start of the file.
csv_file = open(OUTPUT_FILE, 'w', newline='', encoding='utf-8-sig')
writer = csv.DictWriter(csv_file, fieldnames=CSV_FIELDS)
writer.writeheader()

# Ticket IDs already handled in this run, used to drop duplicate messages.
seen_ticket_ids = set()

print(f"üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å...")

# ==========================================
# 2. MAIN LOOP
# ==========================================
# Consume messages until the consumer times out (or the user interrupts),
# writing one CSV row per distinct ticket_id that carries coords.
try:
    count = 0                  # rows actually written
    duplicate_count = 0        # messages whose ticket_id was already written
    missing_id_count = 0       # messages with no usable ticket_id
    missing_coords_count = 0   # messages skipped because coords is missing

    for message in consumer:
        raw_data = message.value
        if raw_data is None:
            continue

        # A payload that decoded to something other than a JSON object has no
        # .get(); count it as "missing id" instead of letting AttributeError
        # escape and end the whole loop.
        if not isinstance(raw_data, dict):
            missing_id_count += 1
            continue

        ticket_id = raw_data.get('ticket_id')
        if ticket_id is None:
            missing_id_count += 1
            continue

        try:
            # Inside the try so that an unhashable ticket_id (TypeError on the
            # set lookup) is reported per message rather than aborting the run.
            if ticket_id in seen_ticket_ids:
                duplicate_count += 1
                continue

            # Skip records without coordinates. The ID is NOT remembered here,
            # so a later copy of the same ticket that does carry coords is
            # still written instead of being dropped as a duplicate.
            if raw_data.get('coords') is None:
                missing_coords_count += 1
                print(f"\n‚ö†Ô∏è ‡∏Ç‡πâ‡∏≤‡∏°‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£‡∏ó‡∏µ‡πà‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏û‡∏¥‡∏Å‡∏±‡∏î coords (ticket_id: {ticket_id})")
                continue

            # Copy exactly the CSV columns from the message (absent keys become
            # None), then stamp the local time at which the row was processed.
            raw_record = {field: raw_data.get(field) for field in CSV_FIELDS}
            raw_record["processing_time"] = datetime.now().isoformat()

            writer.writerow(raw_record)

            # Remember the ID only once the row has really been written, so a
            # failed write does not mask a later redelivery of the same ticket.
            seen_ticket_ids.add(ticket_id)
            count += 1

            if count % FLUSH_INTERVAL == 0:
                csv_file.flush()
                print(f"‚úÖ Saved {count} (Duplicate: {duplicate_count}) (Missing: {missing_id_count})", end='\r')

        except Exception as e:
            # Best effort: report the bad record and keep consuming.
            print(f"\n‚ö†Ô∏è Error: {e}")

    # Reached when iteration ends without an exception.
    print(f"\nüéâ ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏´‡∏°‡∏î‡πÅ‡∏•‡πâ‡∏ß! (Timeout {TIMEOUT_MS}ms)")

except KeyboardInterrupt:
    print(f"\nüõë User Stopped.")

finally:
    # close() flushes buffered rows; the nested finally guarantees the consumer
    # is closed even if closing the file raises.
    try:
        csv_file.close()
    finally:
        consumer.close()

    print("-" * 40)
    print(f"üìä ‡∏™‡∏£‡∏∏‡∏õ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ó‡∏≥‡∏á‡∏≤‡∏ô:")
    print(f"   - ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏à‡∏£‡∏¥‡∏á: {count} ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£")
    print(f"   - ‡∏Ç‡πâ‡∏≤‡∏°‡∏ó‡∏µ‡πà‡∏ã‡πâ‡∏≥: {duplicate_count} ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£")
    print(f"   - Skipped (missing ticket_id): {missing_id_count}")
    print(f"   - Skipped (missing coords): {missing_coords_count}")
    print(f"üìÇ ‡πÑ‡∏ü‡∏•‡πå‡∏≠‡∏¢‡∏π‡πà‡∏ó‡∏µ‡πà: {OUTPUT_FILE}")

üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏î‡∏¥‡∏ö (Raw Data) ‡∏•‡∏á‡πÑ‡∏ü‡∏•‡πå: ../../data/raw/bangkok_traffy_raw.csv
üöÄ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å...
‚úÖ Saved 778200 (Duplicate: 8771) (Missing: 0)
üéâ ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏´‡∏°‡∏î‡πÅ‡∏•‡πâ‡∏ß! (Timeout 5000ms)
----------------------------------------
üìä ‡∏™‡∏£‡∏∏‡∏õ‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏ó‡∏≥‡∏á‡∏≤‡∏ô:
   - ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏à‡∏£‡∏¥‡∏á: 778255 ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£
   - ‡∏Ç‡πâ‡∏≤‡∏°‡∏ó‡∏µ‡πà‡∏ã‡πâ‡∏≥: 8771 ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£
üìÇ ‡πÑ‡∏ü‡∏•‡πå‡∏≠‡∏¢‡∏π‡πà‡∏ó‡∏µ‡πà: ../../data/raw/bangkok_traffy_raw.csv
