In [6]:
"""
GEOSPATIAL INTELLIGENCE PIPELINE: SECURITY INCIDENTS (PERSISTENT)
-----------------------------------------------------------------
Author: William Alfredo Canché Zacarías
Description: ETL Pipeline that scrapes news, merges with historical database,
resolves entities to coordinates, and publishes GeoJSON/HTML maps.
"""

import requests
import json
import random
import pandas as pd
import folium
import os
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from datetime import datetime

# ==========================================
# 1. CONFIGURATION
# ==========================================
# Search expression sent to the Google News RSS search endpoint.
SEARCH_QUERY = "(policiaca OR seguridad OR accidente OR ssp OR robo) AND Yucatan"
# CSV file that accumulates every article seen across runs.
DATABASE_FILE = "base_datos_seguridad_acumulada.csv"
# User-agent string passed to the Nominatim geocoder.
USER_AGENT_ID = "security_analyst_bot_v3.0"

# Keyword -> geocoder query. Each key is searched (case-insensitively) as a
# substring of an article's title + description; the value is the place name
# sent to the geocoder. Keys must carry the real accented spelling used in the
# news text, otherwise the substring match can never succeed.
PRIORITY_NEIGHBORHOODS = {
    "Juan Pablo": "Juan Pablo II, Merida, Yucatan",
    "Ciudad Caucel": "Ciudad Caucel, Merida, Yucatan",
    "Caucel": "Ciudad Caucel, Merida, Yucatan",
    "Altabrisa": "Altabrisa, Merida, Yucatan",
    "Centro": "Centro, Merida, Yucatan",
    "Periférico": "Anillo Periferico, Merida, Yucatan",
    "Francisco de Montejo": "Francisco de Montejo, Merida, Yucatan",
    "Las Américas": "Fraccionamiento Las Americas, Merida, Yucatan",
    "Cholul": "Cholul, Merida, Yucatan",
    "Kanasín": "Kanasin, Yucatan",
    "Plaza Grande": "Plaza Grande, Merida, Yucatan",
    "Los Héroes": "Fraccionamiento Los Heroes, Merida, Yucatan",
    "Vergel": "Vergel, Merida, Yucatan",
}

# Municipality names tried (in this order) when no priority neighborhood
# matches. Same matching rule as above, so accents matter here too.
TARGET_MUNICIPALITIES = [
    "Valladolid", "Tizimín", "Progreso", "Umán", "Tekax",
    "Ticul", "Chemax", "Motul", "Hunucmá", "Oxkutzcab",
    "Izamal", "Celestún", "Sisal",
]

# ==========================================
# 2. DATA INGESTION (Scraping)
# ==========================================
def fetch_live_data():
    """Download the current Google News RSS results for ``SEARCH_QUERY``.

    Returns:
        pandas.DataFrame: one row per feed item with the columns ``title``,
        ``description``, ``pub_date``, ``link`` and ``source``. On any
        failure (network error, non-2xx response, parser error) the error is
        printed and an empty frame with those same columns is returned, so
        the caller always sees a consistent schema.
    """
    columns = ['title', 'description', 'pub_date', 'link', 'source']
    url = "https://news.google.com/rss/search"
    # Let requests build and escape the query string: SEARCH_QUERY contains
    # spaces and parentheses that must not be pasted raw into the URL.
    params = {"q": SEARCH_QUERY, "hl": "es-419", "gl": "MX", "ceid": "MX:es-419"}
    print("üì° [ETL] Connecting to Google News Feed...")

    def tag_text(tag, default=""):
        # A missing child element is returned by BeautifulSoup as None.
        return tag.text if tag is not None else default

    try:
        response = requests.get(url, params=params, timeout=10)
        # Fail loudly on 4xx/5xx instead of parsing an error page as a feed.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'xml')

        data = []
        for item in soup.find_all('item'):
            if item.title is None:
                # The database is de-duplicated by title; an untitled item
                # cannot be tracked, so skip it rather than abort the batch.
                continue
            data.append({
                'title': item.title.text,
                'description': tag_text(item.description),
                'pub_date': tag_text(item.pubDate),
                'link': tag_text(item.link),
                'source': tag_text(item.source, "Google News"),
            })
        print(f"‚úÖ [ETL] Downloaded {len(data)} new articles.")
        return pd.DataFrame(data, columns=columns)
    except Exception as e:
        # Pipeline boundary: any ingestion failure degrades to "no new
        # articles" so the historical database can still be processed.
        print(f"‚ùå [ETL] Error: {e}")
        return pd.DataFrame(columns=columns)

# ==========================================
# 3. DATABASE MANAGEMENT
# ==========================================
def update_database(new_df):
    """Merge freshly scraped articles into the CSV database and persist it.

    Args:
        new_df: DataFrame of newly downloaded articles (may be empty).

    Returns:
        pandas.DataFrame: historical rows followed by the new rows, with
        duplicate titles removed and a fresh 0..n-1 index.
    """
    history_df = None
    if os.path.exists(DATABASE_FILE):
        print("üìÇ [DB] Loading historical database...")
        try:
            history_df = pd.read_csv(DATABASE_FILE)
        except pd.errors.EmptyDataError:
            # A file with no header at all (e.g. written by an earlier run
            # that had nothing to save) is treated as "no database yet".
            history_df = None

    if history_df is not None:
        # History goes FIRST so that keep='first' below retains the stored
        # row, which may already carry lat/lon/location_match, instead of
        # replacing it with the coordinate-less copy from the feed whenever
        # an article shows up again.
        full_df = pd.concat([history_df, new_df], ignore_index=True)
    else:
        print("‚ú® [DB] Creating new database file...")
        full_df = new_df

    # Delete duplicates based on title (guarded: an empty frame may have no
    # 'title' column at all).
    if 'title' in full_df.columns:
        full_df = full_df.drop_duplicates(subset='title', keep='first')
        full_df = full_df.reset_index(drop=True)

    if history_df is not None:
        print(f"üìà [DB] Database updated. Total records: {len(full_df)}")

    # Save new version. A frame without columns would produce a CSV that
    # pandas cannot read back, so nothing is written in that case.
    if len(full_df.columns) > 0:
        full_df.to_csv(DATABASE_FILE, index=False, encoding='utf-8-sig')
    return full_df

# ==========================================
# 4. GEOSPATIAL PROCESSING
# ==========================================
# Nominatim geocoder client, identified to the service by USER_AGENT_ID.
geolocator = Nominatim(user_agent=USER_AGENT_ID)
# Wrapper around geolocator.geocode that is configured with a minimum delay
# of 1 second between calls.
geocode_service = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Base coordinates already looked up during this process, keyed by the exact
# geocoder query. Many articles mention the same place, so each distinct
# query is sent to the geocoding service at most once per run.
_GEOCODE_CACHE = {}


def _lookup_base_coordinates(query):
    """Return ``(lat, lon)`` for *query*, or ``None`` if it cannot be resolved."""
    if query in _GEOCODE_CACHE:
        return _GEOCODE_CACHE[query]
    try:
        # Use the rate-limited wrapper rather than geolocator.geocode so the
        # configured minimum delay between requests is actually honoured.
        loc = geocode_service(query)
    except Exception as e:
        # Best effort: report the failure and leave the query uncached so a
        # later row can try again.
        print(f"[GEO] Geocoding failed for '{query}': {e}")
        return None
    coords = (loc.latitude, loc.longitude) if loc else None
    _GEOCODE_CACHE[query] = coords
    return coords


def _jitter(coords, spread):
    """Offset *coords* by a uniform random amount within +/- *spread* degrees."""
    lat, lon = coords
    return lat + random.uniform(-spread, spread), lon + random.uniform(-spread, spread)


def resolve_coordinates(row):
    """Resolve one article row to ``(lat, lon, location_match)``.

    Args:
        row: mapping-like record (pandas Series or dict) with at least
            ``title`` and ``description``; ``lat``, ``lon`` and
            ``location_match`` are used when present.

    Returns:
        tuple: the stored coordinates if the row already has them; otherwise
        randomly jittered coordinates of the first keyword found in the text
        (priority neighborhoods first, then municipalities) together with
        that keyword; ``(None, None, None)`` when nothing matches or the
        geocoder returns no result.
    """
    # If there are coordinates on database / save time
    lat, lon = row.get('lat'), row.get('lon')
    if pd.notnull(lat) and lat != 0 and pd.notnull(lon):
        # .get: older database files may lack the location_match column.
        return lat, lon, row.get('location_match')

    full_text = (str(row['title']) + " " + str(row['description'])).lower()

    # Search Logic (Colonias -> Municipios)
    for key, query in PRIORITY_NEIGHBORHOODS.items():
        if key.lower() in full_text:
            base = _lookup_base_coordinates(query + ", Mexico")
            if base is not None:
                # Small random offset; presumably so that several incidents
                # at the same place do not stack on one map point.
                return (*_jitter(base, 0.003), key)

    for muni in TARGET_MUNICIPALITIES:
        if muni.lower() in full_text:
            base = _lookup_base_coordinates(f"{muni}, Yucatan, Mexico")
            if base is not None:
                return (*_jitter(base, 0.005), muni)

    return None, None, None

# ==========================================
# 5. PIPELINE EXECUTION
# ==========================================
def _clean_property(value):
    """Return *value* as a JSON-safe string ('' for missing/NaN cells)."""
    return "" if pd.isnull(value) else str(value)


def _build_geojson(geo_df):
    """Build a GeoJSON FeatureCollection of points from geocoded rows."""
    features = [
        {
            "type": "Feature",
            # GeoJSON positions are [longitude, latitude].
            "geometry": {"type": "Point", "coordinates": [float(lon), float(lat)]},
            "properties": {
                "title": _clean_property(title),
                "source": _clean_property(source),
            },
        }
        for lon, lat, title, source in zip(
            geo_df['lon'], geo_df['lat'], geo_df['title'], geo_df['source']
        )
    ]
    return {"type": "FeatureCollection", "features": features}


def run_pipeline():
    """Run scrape -> database merge -> geocoding -> GeoJSON/HTML export.

    Side effects: rewrites ``DATABASE_FILE`` and writes
    ``security_incidents.geojson`` and ``index.html`` in the working
    directory.

    Returns:
        The folium map, or the string ``"No data."`` when the merged
        database is empty.
    """
    # 1. Obtain fresh data
    live_df = fetch_live_data()

    # 2. Update historical database
    master_df = update_database(live_df)

    if master_df.empty:
        return "No data."

    # 3. Add coordinates
    print("üìç [GEO] resolving coordinates for database...")

    # Check for lat/lon rows on new files
    if 'lat' not in master_df.columns:
        master_df['lat'] = None
    if 'lon' not in master_df.columns:
        master_df['lon'] = None

    coords = master_df.apply(lambda row: pd.Series(resolve_coordinates(row)), axis=1)
    master_df[['lat', 'lon', 'location_match']] = coords

    # Save with calculated coordinates to not waste API
    master_df.to_csv(DATABASE_FILE, index=False, encoding='utf-8-sig')

    # Filter: a point needs both coordinates to be mappable.
    geo_df = master_df.dropna(subset=['lat', 'lon'])
    print(f"üó∫Ô∏è [VIZ] Mapping {len(geo_df)} verified incidents.")

    # 4. Generate GeoJSON and Map
    geojson_data = _build_geojson(geo_df)

    with open("security_incidents.geojson", "w", encoding="utf-8") as f:
        json.dump(geojson_data, f, ensure_ascii=False)

    # Initial view centre/zoom; presumably the Merida area - confirm.
    m = folium.Map(location=[20.967, -89.623], zoom_start=10, tiles="CartoDB positron")
    if geojson_data["features"]:
        # Only attach the layer when there is something to show: the popup
        # is configured from feature properties, which an empty collection
        # does not have.
        folium.GeoJson(
            geojson_data,
            popup=folium.GeoJsonPopup(fields=["title", "source"]),
        ).add_to(m)
    m.save("index.html")

    return m

# Execute the whole pipeline when this cell/script runs. The result is either
# the map object or the string "No data." returned by run_pipeline.
m = run_pipeline()
# Bare expression: presumably left as the last statement so a notebook
# renders the returned object as the cell output.
m

üì° [ETL] Connecting to Google News Feed...
‚úÖ [ETL] Downloaded 100 new articles.
üìÇ [DB] Loading historical database...
üìà [DB] Database updated. Total records: 302
üìç [GEO] resolving coordinates for database...
üó∫Ô∏è [VIZ] Mapping 127 verified incidents.
