In [None]:
!pip install pandas
!pip install openpyxl
!pip install duckdb

## Fuentes de datos

### Estructurada

In [None]:
import pandas as pd
import os

# Directory that holds the relational (structured) sources
data_dir = "../data/relational"


def inspect_excel_workbook(path, label):
    """Print an overview of every sheet in an Excel workbook.

    Args:
        path: Location of the .xlsx file.
        label: Short name used in the error message when something fails.

    Returns:
        The ``{sheet name: DataFrame}`` dict produced by
        ``pd.read_excel(path, sheet_name=None)``, or None if the workbook
        could not be read.
    """
    sheets = None
    try:
        # sheet_name=None loads every sheet of the workbook at once
        sheets = pd.read_excel(path, sheet_name=None)
        print(f"Number of sheets: {len(sheets)}")
        print(f"Sheet names: {list(sheets.keys())}")
        print()

        for sheet_name, sheet_df in sheets.items():
            print(f"Sheet: {sheet_name}")
            print(f"Shape: {sheet_df.shape}")
            print(f"Columns: {list(sheet_df.columns)}")
            print("First 5 rows:")
            print(sheet_df.head())
            print("-" * 30)
            print()

    except Exception as e:
        # Exploration is best-effort: report the problem and keep the cell running
        print(f"Error reading {label} file: {e}")
    return sheets


print("=== EXAMINING XLSX FILES ===")
print()

# Catalogue workbook (code tables)
print("1. CATALOGOS.XLSX:")
print("-" * 50)
catalogos_file = os.path.join(data_dir, "240708 Catalogos.xlsx")
catalogos_sheets = inspect_excel_workbook(catalogos_file, "catalogos")

print("\n" + "="*60)
print()

# Descriptor workbook (column descriptions)
print("2. DESCRIPTORES.XLSX:")
print("-" * 50)
descriptores_file = os.path.join(data_dir, "240708 Descriptores_.xlsx")
descriptores_sheets = inspect_excel_workbook(descriptores_file, "descriptores")

print("\n" + "="*60)
print()

# Examine the CSV file structure
print("3. COVID19MEXICO2020.CSV STRUCTURE:")
print("-" * 50)
csv_file = os.path.join(data_dir, "COVID19MEXICO2020.csv")
try:
    # A 10-row read is enough to see the columns without loading the whole file
    df_csv = pd.read_csv(csv_file, nrows=10)
    print(f"CSV Shape (first 10 rows): {df_csv.shape}")
    print(f"CSV Columns: {list(df_csv.columns)}")
    print("First 5 rows of CSV:")
    print(df_csv.head())
    print()

    # The full read is only needed for the total shape and the inferred dtypes
    df_info = pd.read_csv(csv_file)
    print(f"Full CSV Shape: {df_info.shape}")
    print("Data types:")
    print(df_info.dtypes)

except Exception as e:
    print(f"Error reading CSV file: {e}")

print("\n" + "="*60)
print()

In [None]:
# Directory that holds the wide daily-series CSV files
graph_dir = "../data/graph"


def summarize_wide_series(df, id_col_count=3, name_col=2, sample_size=5):
    """Summarise a wide series frame: identifier columns followed by one column per date.

    Args:
        df: DataFrame whose first ``id_col_count`` columns identify the row and
            whose remaining columns are read as dates.
        id_col_count: Number of leading identifier columns.
        name_col: Position of the column holding the row's display name.
        sample_size: How many leading date columns to sample.

    Returns:
        dict with ``n_rows``, ``date_range`` (first and last date column, or
        None when there are no date columns), ``n_days``, ``sample_dates``,
        ``first_state`` (name in the first row, or None for an empty frame) and
        ``sample_values`` (``{date: value}`` for the first row).
    """
    date_cols = list(df.columns[id_col_count:])
    sample_dates = date_cols[:sample_size]

    if len(df) > 0:
        first_row = df.iloc[0]
        first_state = first_row.iloc[name_col]
        sample_values = {col: first_row[col] for col in sample_dates}
    else:
        # An empty file has no first row to sample
        first_state = None
        sample_values = {}

    return {
        "n_rows": len(df),
        "date_range": (date_cols[0], date_cols[-1]) if date_cols else None,
        "n_days": len(date_cols),
        "sample_dates": sample_dates,
        "first_state": first_state,
        "sample_values": sample_values,
    }


print("GRAPH DATA FILES:")
print("-" * 50)
graph_files = [
    "Casos_Diarios_Estado_Nacional_Confirmados_20230625.csv",
    "Casos_Diarios_Estado_Nacional_Defunciones_20230625.csv",
    "Casos_Diarios_Estado_Nacional_Negativos_20230625.csv",
    "Casos_Diarios_Estado_Nacional_Sospechosos_20230625.csv"
]

for file in graph_files:
    print(f"• {file}")

print()
print("=" * 60)
print()

# Analyze each file
for i, file in enumerate(graph_files, 1):
    print(f"{i}. {file.upper()}:")
    print("-" * 50)

    file_path = os.path.join(graph_dir, file)
    try:
        df = pd.read_csv(file_path)
        summary = summarize_wide_series(df)

        print(f"Shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        print()

        print("First 3 rows:")
        print(df.head(3))
        print()

        print("Data Analysis:")
        print(f"• Number of states: {summary['n_rows']}")
        if summary["date_range"] is not None:
            print(f"• Date range: {summary['date_range'][0]} to {summary['date_range'][1]}")
        else:
            print("• Date range: n/a (no date columns)")
        print(f"• Total days covered: {summary['n_days']}")
        print(f"• Population column: {df.columns[1]}")
        print(f"• State name column: {df.columns[2]}")
        print()

        print("Sample Statistics:")
        print(f"Sample dates: {summary['sample_dates']}")
        # Label the sample with the name actually found in the first row
        print(f"Sample values for first state ({summary['first_state']}):")
        for col, value in summary["sample_values"].items():
            print(f"  {col}: {value}")
        print()

        print("=" * 60)
        print()

    except Exception as e:
        # Exploration is best-effort: report and move on to the next file
        print(f"Error reading {file}: {e}")
        print()


In [None]:
# Directory that holds the descriptor and the Twitter TSV files
text_dir = "../data/text"


def count_lines(path, encoding="utf-8"):
    """Count the lines of a text file by streaming it, without loading it into memory.

    Args:
        path: File to read.
        encoding: Text encoding used to decode the file.

    Returns:
        Number of lines in the file.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the content is not valid in ``encoding``.
    """
    with open(path, "r", encoding=encoding) as handle:
        return sum(1 for _ in handle)


print("DATA FILES:")
print("-" * 50)
text_files = [
    "data_descriptor.txt",
    "mexico_1-003.tsv",
    "mexico_2-001.tsv",
    "mexico_3-004.tsv"
]

for file in text_files:
    print(f"• {file}")

print()
print("=" * 60)
print()

# Analyze the descriptor file
print("1. DATA DESCRIPTOR ANALYSIS:")
print("-" * 50)
print("The descriptor file explains the structure of the TSV files:")
print()
print("TSV FILES CONTAIN TWITTER DATA WITH THE FOLLOWING ATTRIBUTES:")
print("-" * 50)

# Read and display the descriptor content
descriptor_path = os.path.join(text_dir, "data_descriptor.txt")
try:
    # Explicit encoding keeps the output independent of the platform default;
    # undecodable bytes are replaced rather than aborting the exploration
    with open(descriptor_path, 'r', encoding='utf-8', errors='replace') as f:
        descriptor_content = f.read()
    print(descriptor_content)
except OSError as e:
    print(f"Error reading descriptor file: {e}")

print()
print("=" * 60)
print()

# Analyze TSV files (without loading them fully due to size)
print("2. TSV FILES ANALYSIS:")
print("-" * 50)

# Derived from text_files so the two lists cannot drift apart
tsv_files = [name for name in text_files if name.endswith(".tsv")]

for i, file in enumerate(tsv_files, 1):
    print(f"{i}. {file.upper()}:")
    print("-" * 30)

    file_path = os.path.join(text_dir, file)

    try:
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
    except OSError as e:
        print(f"• Could not access file: {e}")
        print()
        continue

    # Report the real reason when the row count cannot be computed
    try:
        rows_text = f"{count_lines(file_path):,}"
    except (OSError, UnicodeDecodeError) as e:
        rows_text = f"Unknown ({e})"

    print(f"• File size: {file_size_mb:.1f} MB")
    print(f"• Total rows: {rows_text}")
    print("• Data type: Twitter/X social media posts")
    print("• Time period: 2020-2023 (COVID-19 era)")
    print("• Language: Mixed (primarily Spanish and English)")
    print()

print("TSV FILES STRUCTURE SUMMARY:")
print("-" * 50)
print("Each TSV file contains Twitter data with these key columns:")
print("• tweet_id: Unique tweet identifier")
print("• date_time: When the tweet was posted")
print("• lang: Language code (en, es, etc.)")
print("• user_id: Author identifier")
print("• tweet_text_*: Tweet content and extracted entities")
print("• sentiment_label: -1 (negative), 0 (neutral), 1 (positive)")
print("• sentiment_conf: Confidence score for sentiment")
print("• geo_*: GPS coordinates and location data")
print("• place_*: Twitter place tags")
print("• user_loc_*: User location information")
print("• *_toponyms: Named entities and locations extracted from text")
print()


## Esquema canónico y mapeo de fuentes

**Idea:** diseñar un **esquema canónico** mínimo para consultas unificadas.

- **Dimensión Territorio**: `entidad_id`, `entidad_nombre`, (opc. `municipio_id`, `municipio_nombre`)
- **Dimensión Tiempo**: `fecha` (día), (opc. `semana`, `mes`)
- **Hechos Clínicos (casos)**: variables de los archivos anuales `COVID19MEXICO<año>.csv` (2020–2023), limpias y enlazadas con los catálogos.
- **Hechos Serie**: `(entidad_id, fecha, metrica, valor)` donde `metrica ∈ {confirmados, defunciones, negativos, sospechosos}`
- **Hechos Texto (vectorial)**: `(doc_id, embedding, metadatos)`

**Campos de enlace (join keys):**
- `ENTIDAD_RES` (relacional) ↔ `cve_ent` (series) ↔ `CLAVE_ENTIDAD` (catálogo ENTIDADES)

In [None]:
import duckdb
import pandas as pd
import os
from datetime import datetime
import numpy as np

# In-memory DuckDB database shared by every query in this cell
con = duckdb.connect(':memory:')

def _load_yearly_cases(relational_dir):
    """Read the yearly case CSVs found in ``relational_dir`` and stack them into one frame."""
    csv_files = [
        "COVID19MEXICO2020.csv",
        "COVID19MEXICO2021.csv",
        "COVID19MEXICO2022.csv",
        "COVID19MEXICO2023.csv"
    ]

    df_cases_list = []
    all_columns = set()

    for csv_file in csv_files:
        file_path = os.path.join(relational_dir, csv_file)
        if not os.path.exists(file_path):
            print(f"⚠ File not found: {csv_file}")
            continue
        try:
            # Forcing PAIS_ORIGEN to str avoids pandas' mixed-type DtypeWarning
            df_year = pd.read_csv(file_path, dtype={'PAIS_ORIGEN': str})
            # The year is the part of the file name between the prefix and the extension
            df_year['YEAR'] = csv_file.replace('COVID19MEXICO', '').replace('.csv', '')
            df_cases_list.append(df_year)
            all_columns.update(df_year.columns)
            print(f"✓ Loaded {csv_file}: {df_year.shape}")
            print(f"  Columns: {list(df_year.columns)}")
        except Exception as e:
            # One unreadable year should not discard the years that did load
            print(f"⚠ Error loading {csv_file}: {e}")

    if not df_cases_list:
        raise ValueError("No CSV files could be loaded")

    df_cases = pd.concat(df_cases_list, ignore_index=True)
    print(f"✓ Combined relational data: {df_cases.shape}")
    print(f"✓ Years included: {sorted(df_cases['YEAR'].unique())}")
    print(f"✓ All unique columns: {sorted(all_columns)}")

    # Columns the downstream SQL relies on
    key_columns = ['CLASIFICACION_FINAL', 'RESULTADO_LAB', 'FECHA_ACTUALIZACION', 'ENTIDAD_RES']
    missing_cols = [col for col in key_columns if col not in df_cases.columns]
    if missing_cols:
        print(f"⚠ Missing key columns: {missing_cols}")
    else:
        print(f"✓ All key columns present: {key_columns}")

    return df_cases


def _load_series_long(graph_dir):
    """Read the four wide daily-series CSVs and reshape each to one row per (state, date)."""
    series_files = {
        'confirmados': 'Casos_Diarios_Estado_Nacional_Confirmados_20230625.csv',
        'defunciones': 'Casos_Diarios_Estado_Nacional_Defunciones_20230625.csv',
        'negativos': 'Casos_Diarios_Estado_Nacional_Negativos_20230625.csv',
        'sospechosos': 'Casos_Diarios_Estado_Nacional_Sospechosos_20230625.csv'
    }
    id_vars = ['cve_ent', 'poblacion', 'nombre']

    series_long = {}
    for name, filename in series_files.items():
        df = pd.read_csv(os.path.join(graph_dir, filename))

        # Every column that is not an identifier is treated as a date column
        date_cols = [col for col in df.columns if col not in id_vars]
        df_long = pd.melt(
            df,
            id_vars=id_vars,
            value_vars=date_cols,
            var_name='fecha',
            value_name='valor'
        )
        df_long['metrica'] = name
        # Column headers are parsed as DD-MM-YYYY
        df_long['fecha'] = pd.to_datetime(df_long['fecha'], format='%d-%m-%Y')

        series_long[name] = df_long

    return series_long


def _load_twitter_sample(text_dir, nrows):
    """Return the first ``nrows`` rows of the first readable Twitter TSV, or None if none loads."""
    twitter_files = ["mexico_1-003.tsv", "mexico_2-001.tsv", "mexico_3-004.tsv"]

    # Try each file in turn so an unreadable first file does not disable the text modality
    for twitter_file in twitter_files:
        file_path = os.path.join(text_dir, twitter_file)
        try:
            df_twitter_sample = pd.read_csv(file_path, sep='\t', nrows=nrows)
            print(f"✓ Loaded text data sample: {df_twitter_sample.shape}")
            return df_twitter_sample
        except Exception as e:
            print(f"⚠ Could not load {twitter_file}: {e}")

    return None


def load_and_prepare_data(data_dir="../data", twitter_sample_rows=10000):
    """Load the relational, catalogue, series and text sources used by the pipeline.

    Args:
        data_dir: Root data directory containing the ``relational``, ``graph``
            and ``text`` sub-directories.
        twitter_sample_rows: Number of rows to read from the Twitter TSV.

    Returns:
        Tuple ``(df_cases, cats, series_long, df_twitter_sample)``:
        - df_cases: all yearly case files concatenated, with an added ``YEAR`` column.
        - cats: ``{sheet name: DataFrame}`` for the catalogue workbook.
        - series_long: ``{metric name: long-format DataFrame}`` with columns
          ``cve_ent``, ``poblacion``, ``nombre``, ``fecha``, ``valor``, ``metrica``.
        - df_twitter_sample: sample of tweets, or None when no TSV could be read.

    Raises:
        ValueError: If none of the yearly case CSVs could be loaded.
        Exception: Errors from reading the catalogue workbook or the series
            CSVs are not caught here and propagate to the caller.
    """
    relational_dir = os.path.join(data_dir, "relational")

    # Relational modality: yearly case files
    df_cases = _load_yearly_cases(relational_dir)

    # Catalogue workbook, every sheet
    catalogos_file = os.path.join(relational_dir, "240708 Catalogos.xlsx")
    cats = pd.read_excel(catalogos_file, sheet_name=None)

    # Graph modality: daily series in long format
    series_long = _load_series_long(os.path.join(data_dir, "graph"))
    print(f"✓ Loaded graph data: {len(series_long)} series")

    # Text modality: a sample of tweets is enough for the sentiment views
    df_twitter_sample = _load_twitter_sample(
        os.path.join(data_dir, "text"), twitter_sample_rows
    )

    return df_cases, cats, series_long, df_twitter_sample

# Load every data modality; report the failure and re-raise so the cell stops here
try:
    df_cases, cats, series_long, df_twitter_sample = load_and_prepare_data()
except Exception as e:
    print(f"✗ Error loading data: {e}")
    raise

# Register relational data
con.register('cases_raw', df_cases)

# Register graph data: one DuckDB relation per metric, named series_<metric>
for name, series_df in series_long.items():
    con.register(f'series_{name}', series_df)

# Register text data if available
if df_twitter_sample is not None:
    con.register('twitter_sample', df_twitter_sample)

# Prepare entity catalog with the canonical column names used by the SQL below
cat_ent = cats.get('Catálogo de ENTIDADES')
if cat_ent is not None:
    cat_ent = cat_ent.rename(columns={
        'CLAVE_ENTIDAD': 'entidad_id',
        'ENTIDAD_FEDERATIVA': 'entidad_nombre', 
        'ABREVIATURA': 'ent_abbr'
    })
    con.register('cat_entidades', cat_ent)
else:
    # Later queries join against cat_entidades; say why it will be missing
    print(
        "⚠ Sheet 'Catálogo de ENTIDADES' not found in the catalog workbook; "
        f"cat_entidades was not registered (available sheets: {list(cats.keys())})"
    )

# Create unified series view: the four per-metric relations stacked into one
con.execute("""
CREATE OR REPLACE VIEW series_all AS
SELECT 
    cve_ent AS entidad_id,
    nombre AS entidad_nombre, 
    poblacion,
    fecha,
    metrica,
    valor
FROM (
    SELECT cve_ent, poblacion, nombre, fecha, metrica, valor FROM series_confirmados
    UNION ALL
    SELECT cve_ent, poblacion, nombre, fecha, metrica, valor FROM series_defunciones
    UNION ALL
    SELECT cve_ent, poblacion, nombre, fecha, metrica, valor FROM series_negativos
    UNION ALL
    SELECT cve_ent, poblacion, nombre, fecha, metrica, valor FROM series_sospechosos
);
""")

# MULTI-MODAL ANALYSIS PIPELINE

print("\n" + "="*60)
print("COMPREHENSIVE COVID-19 MULTI-MODAL ANALYSIS PIPELINE")
print("="*60)

# 1. RELATIONAL-GRAPH INTEGRATION: Match by date and location
print("\n1. RELATIONAL-GRAPH INTEGRATION:")
print("-" * 40)

# Cases with a usable update date.
# TRY_CAST yields NULL instead of raising on malformed dates (e.g. '9999-99-99'),
# so one bad value cannot abort the view regardless of filter evaluation order.
con.execute("""
CREATE OR REPLACE VIEW cases_raw_processed AS
SELECT 
    *,
    TRY_CAST(FECHA_ACTUALIZACION AS DATE) AS case_date_clean
FROM cases_raw
WHERE TRY_CAST(FECHA_ACTUALIZACION AS DATE) IS NOT NULL
    AND EXTRACT(YEAR FROM TRY_CAST(FECHA_ACTUALIZACION AS DATE)) BETWEEN 2020 AND 2023;
""")

# Series rows with their date as a plain DATE
con.execute("""
CREATE OR REPLACE VIEW series_all_processed AS
SELECT 
    *,
    CAST(fecha AS DATE) as series_date_clean
FROM series_all
WHERE fecha IS NOT NULL;
""")

# Memory-efficient integration with chunked processing
print("Creating memory-efficient relational-graph integration...")

# Conservative resource settings for the large join below
con.execute("SET memory_limit='16GB'")
con.execute("SET max_temp_directory_size='100GB'")
con.execute("SET threads=2")
con.execute("SET preserve_insertion_order=false")

# Target table; filled batch by batch
con.execute("""
CREATE OR REPLACE TABLE relational_graph_integration (
    case_date DATE,
    entidad_id INTEGER,
    entidad_nombre VARCHAR,
    CLASIFICACION_FINAL INTEGER,
    RESULTADO_LAB INTEGER,
    series_date DATE,
    metrica VARCHAR,
    series_value DOUBLE,
    date_diff_days DOUBLE
);
""")

# Process by year and entity to reduce memory usage
years = [2020, 2021, 2022, 2023]
batch_size = 5  # entities joined per INSERT

for year in years:
    print(f"Processing year {year}...")

    # Entities present in this year; NULL keys are skipped because they cannot
    # match any series row
    entities = [
        row[0]
        for row in con.execute("""
            SELECT DISTINCT c.ENTIDAD_RES 
            FROM cases_raw_processed c
            WHERE c.ENTIDAD_RES IS NOT NULL
                AND EXTRACT(YEAR FROM c.case_date_clean) = ?
            ORDER BY c.ENTIDAD_RES
        """, [year]).fetchall()
    ]

    print(f"  Processing {len(entities)} entities for year {year}")

    for i in range(0, len(entities), batch_size):
        entity_batch = entities[i:i+batch_size]
        # One bound placeholder per entity, so values are never spliced into the SQL text
        placeholders = ", ".join("?" for _ in entity_batch)

        print(f"    Processing entities {i+1}-{min(i+batch_size, len(entities))} of {len(entities)}")

        # Each case is paired with every series row of its entity within +/-7 days.
        # The series join is an inner join: cases without a matching series row are
        # dropped, as the original LEFT JOIN plus NOT NULL filter did.
        # No ORDER BY: the table is unordered (preserve_insertion_order=false) and
        # sorting each batch would only cost memory.
        con.execute(f"""
        INSERT INTO relational_graph_integration
        SELECT 
            c.case_date_clean AS case_date,
            c.ENTIDAD_RES AS entidad_id,
            e.entidad_nombre,
            c.CLASIFICACION_FINAL,
            c.RESULTADO_LAB,
            s.series_date_clean AS series_date,
            s.metrica,
            s.valor AS series_value,
            ABS(julian(c.case_date_clean) - julian(s.series_date_clean)) AS date_diff_days
        FROM cases_raw_processed c
        LEFT JOIN cat_entidades e ON c.ENTIDAD_RES = e.entidad_id
        JOIN series_all_processed s ON c.ENTIDAD_RES = s.entidad_id 
            AND ABS(julian(c.case_date_clean) - julian(s.series_date_clean)) <= 7
        WHERE EXTRACT(YEAR FROM c.case_date_clean) = ?
            AND c.ENTIDAD_RES IN ({placeholders});
        """, [year, *entity_batch])

    print(f"Completed year {year}")

# Show final results
integration_count = con.execute('SELECT COUNT(*) FROM relational_graph_integration').fetchone()[0]
print(f"Total integration records: {integration_count:,}")

# 2. GRAPH-TEXT INTEGRATION: Correlate sentiment with case numbers
print("\n2. GRAPH-TEXT INTEGRATION (Sentiment-Case Correlation):")
print("-" * 50)

if df_twitter_sample is None:
    # No tweets were loaded, so there is nothing to correlate
    print("⚠ Twitter data not available for sentiment analysis")
else:
    # Tweets with a sentiment label, a readable category and a calendar date
    con.execute("""
    CREATE OR REPLACE VIEW twitter_processed AS
    SELECT 
        tweet_id,
        date_time,
        lang,
        sentiment_label,
        sentiment_conf,
        CASE 
            WHEN sentiment_label = 1 THEN 'positive'
            WHEN sentiment_label = 0 THEN 'neutral'
            WHEN sentiment_label = -1 THEN 'negative'
            ELSE 'unknown'
        END as sentiment_category,
        CAST(SUBSTR(date_time, 1, 10) AS DATE) as tweet_date
    FROM twitter_sample
    WHERE sentiment_label IS NOT NULL
        AND date_time IS NOT NULL;
    """)
    
    # Tweet count and mean confidence per day and sentiment category
    con.execute("""
    CREATE OR REPLACE VIEW daily_sentiment AS
    SELECT 
        tweet_date,
        sentiment_category,
        COUNT(*) as tweet_count,
        AVG(sentiment_conf) as avg_confidence
    FROM twitter_processed
    GROUP BY tweet_date, sentiment_category
    ORDER BY tweet_date, sentiment_category;
    """)
    
    # Series rows paired with every daily sentiment row within +/-3 days
    con.execute("""
    CREATE OR REPLACE VIEW sentiment_case_correlation AS
    SELECT 
        s.fecha,
        s.entidad_id,
        s.entidad_nombre,
        s.metrica,
        s.valor as case_count,
        ts.sentiment_category,
        ts.tweet_count as sentiment_tweets,
        ts.avg_confidence,
        ABS(julian(s.fecha) - julian(ts.tweet_date)) as date_diff_days
    FROM series_all s
    LEFT JOIN daily_sentiment ts ON ABS(julian(s.fecha) - julian(ts.tweet_date)) <= 3
    WHERE s.metrica IN ('confirmados', 'defunciones')
        AND ts.sentiment_category IS NOT NULL
    ORDER BY s.fecha, s.entidad_id, s.metrica;
    """)
    
    count_row = con.execute('SELECT COUNT(*) FROM sentiment_case_correlation').fetchone()
    sentiment_count = count_row[0]
    print(f"✓ Created sentiment-case correlation: {sentiment_count:,} records")
    
    # Five most recent rows, largest case counts first
    print("\nSample sentiment analysis:")
    sample_sentiment = con.execute("""
        SELECT fecha, entidad_nombre, metrica, case_count, 
               sentiment_category, sentiment_tweets, avg_confidence
        FROM sentiment_case_correlation 
        WHERE sentiment_category IS NOT NULL
        ORDER BY fecha DESC, case_count DESC
        LIMIT 5
    """).fetchall()
    
    for fecha, estado, metrica, casos, categoria, tweets, confianza in sample_sentiment:
        print(f"  {fecha} | {estado} | {metrica} | {casos} | {categoria} | {tweets} | {confianza:.2f}")

# 3. COMPREHENSIVE ANALYSIS VIEWS
print("\n3. COMPREHENSIVE ANALYSIS VIEWS:")
print("-" * 35)

# The views below read daily_sentiment, which only exists when the Twitter sample
# was loaded. Provide an empty stand-in with the same columns so the views still
# bind (and report NULL sentiment) when the text modality is unavailable.
if df_twitter_sample is None:
    con.execute("""
    CREATE OR REPLACE VIEW daily_sentiment AS
    SELECT
        CAST(NULL AS DATE) AS tweet_date,
        CAST(NULL AS VARCHAR) AS sentiment_category,
        CAST(NULL AS BIGINT) AS tweet_count,
        CAST(NULL AS DOUBLE) AS avg_confidence
    FROM (SELECT 1) AS no_rows
    WHERE 1 = 0;
    """)

# One row per (fecha, entidad) with the four metrics as columns.
# Sentiment is first collapsed to ONE row per series date (all tweets within
# +/-3 days) and only then joined. Joining the raw daily rows and grouping by
# their values repeated every state/day row once per matching sentiment day,
# which inflated every SUM computed downstream.
con.execute("""
CREATE OR REPLACE VIEW covid_comprehensive_analysis AS
WITH series_pivot AS (
    SELECT
        fecha,
        entidad_id,
        entidad_nombre,
        poblacion,
        MAX(CASE WHEN metrica = 'confirmados' THEN valor ELSE 0 END) AS casos_confirmados,
        MAX(CASE WHEN metrica = 'defunciones' THEN valor ELSE 0 END) AS defunciones,
        MAX(CASE WHEN metrica = 'negativos' THEN valor ELSE 0 END) AS casos_negativos,
        MAX(CASE WHEN metrica = 'sospechosos' THEN valor ELSE 0 END) AS casos_sospechosos
    FROM series_all
    GROUP BY fecha, entidad_id, entidad_nombre, poblacion
),
series_dates AS (
    SELECT DISTINCT fecha FROM series_all
),
sentiment_window AS (
    SELECT
        d.fecha,
        -- Tweet-weighted mean confidence over the window, per polarity
        SUM(ds.avg_confidence * ds.tweet_count)
                FILTER (WHERE ds.sentiment_category = 'negative')
            / NULLIF(SUM(ds.tweet_count)
                FILTER (WHERE ds.sentiment_category = 'negative'
                        AND ds.avg_confidence IS NOT NULL), 0)
            AS avg_negative_sentiment_conf,
        SUM(ds.avg_confidence * ds.tweet_count)
                FILTER (WHERE ds.sentiment_category = 'positive')
            / NULLIF(SUM(ds.tweet_count)
                FILTER (WHERE ds.sentiment_category = 'positive'
                        AND ds.avg_confidence IS NOT NULL), 0)
            AS avg_positive_sentiment_conf,
        -- Total tweets in the window, per polarity
        CAST(SUM(ds.tweet_count) FILTER (WHERE ds.sentiment_category = 'negative') AS BIGINT)
            AS negative_tweets,
        CAST(SUM(ds.tweet_count) FILTER (WHERE ds.sentiment_category = 'positive') AS BIGINT)
            AS positive_tweets
    FROM series_dates d
    JOIN daily_sentiment ds ON ABS(julian(d.fecha) - julian(ds.tweet_date)) <= 3
    GROUP BY d.fecha
)
SELECT
    p.fecha,
    p.entidad_id,
    p.entidad_nombre,
    p.poblacion,
    -- Case metrics
    p.casos_confirmados,
    p.defunciones,
    p.casos_negativos,
    p.casos_sospechosos,
    -- Calculated metrics (NULL when the population is 0)
    p.casos_confirmados * 100000.0 / NULLIF(p.poblacion, 0) AS tasa_confirmados_100k,
    p.defunciones * 100000.0 / NULLIF(p.poblacion, 0) AS tasa_defunciones_100k,
    -- Sentiment metrics (NULL when no tweets fall in the window)
    w.avg_negative_sentiment_conf,
    w.avg_positive_sentiment_conf,
    w.negative_tweets,
    w.positive_tweets
FROM series_pivot p
LEFT JOIN sentiment_window w ON p.fecha = w.fecha
ORDER BY p.fecha DESC, p.entidad_id;
""")

# Research-ready dataset: rows with at least one confirmed case or death,
# sentiment NULLs replaced by 0, plus derived rates
con.execute("""
CREATE OR REPLACE VIEW research_dataset AS
SELECT 
    fecha,
    entidad_id,
    entidad_nombre,
    poblacion,
    casos_confirmados,
    defunciones,
    casos_negativos,
    casos_sospechosos,
    tasa_confirmados_100k,
    tasa_defunciones_100k,
    -- Sentiment features
    COALESCE(avg_negative_sentiment_conf, 0) as negative_sentiment_confidence,
    COALESCE(avg_positive_sentiment_conf, 0) as positive_sentiment_confidence,
    COALESCE(negative_tweets, 0) as negative_tweet_count,
    COALESCE(positive_tweets, 0) as positive_tweet_count,
    -- Derived features
    CASE WHEN casos_confirmados > 0 THEN defunciones * 100.0 / casos_confirmados ELSE 0 END as case_fatality_rate,
    -- Positivity = confirmed / (confirmed + negative), as a percentage
    CASE WHEN (casos_confirmados + casos_negativos) > 0
         THEN casos_confirmados * 100.0 / (casos_confirmados + casos_negativos)
         ELSE 0 END as test_positivity_rate
FROM covid_comprehensive_analysis
WHERE casos_confirmados > 0 OR defunciones > 0;
""")

print("✓ Created comprehensive analysis views:")
print("  - covid_comprehensive_analysis: Multi-modal integrated data")
print("  - research_dataset: Clean dataset for research")

# 4. ANALYSIS INSIGHTS
print("\n4. ANALYSIS INSIGHTS:")
print("-" * 20)

# Redefine the Twitter views BEFORE running any report in this section.
# research_dataset reads daily_sentiment by name, so redefining it only after the
# top-10 query would make this section and the export summary read different
# definitions of the same view.
if df_twitter_sample is not None:
    # Tweets with parsed timestamps, restricted to tweets tagged as Mexico
    con.execute("""
    CREATE OR REPLACE VIEW twitter_processed AS
    SELECT 
        tweet_id,
        CAST(date_time AS TIMESTAMP) as tweet_timestamp,
        DATE(CAST(date_time AS TIMESTAMP)) as tweet_date,
        lang,
        sentiment_label,
        sentiment_conf,
        CASE 
            WHEN sentiment_label = -1 THEN 'negative'
            WHEN sentiment_label = 0 THEN 'neutral' 
            WHEN sentiment_label = 1 THEN 'positive'
            ELSE 'unknown'
        END as sentiment_category,
        -- Extract location information for better correlation
        COALESCE(geo_state, place_state, user_loc_state) as tweet_state,
        COALESCE(geo_country_code, place_country_code, user_loc_country_code) as tweet_country
    FROM twitter_sample
    WHERE sentiment_label IS NOT NULL 
        AND sentiment_conf IS NOT NULL
        AND date_time IS NOT NULL
        AND (geo_country_code = 'MX' OR place_country_code = 'MX' OR user_loc_country_code = 'MX');
    """)
    
    # Daily sentiment aggregation with location filtering
    con.execute("""
    CREATE OR REPLACE VIEW daily_sentiment AS
    SELECT 
        tweet_date,
        sentiment_category,
        COUNT(*) as tweet_count,
        AVG(sentiment_conf) as avg_confidence
    FROM twitter_processed
    WHERE tweet_country = 'MX'  -- Only Mexican tweets
    GROUP BY tweet_date, sentiment_category
    ORDER BY tweet_date, sentiment_category;
    """)
    
    # Series rows paired with every daily sentiment row within +/-3 days
    con.execute("""
    CREATE OR REPLACE VIEW sentiment_case_correlation AS
    SELECT 
        s.fecha,
        s.entidad_id,
        s.entidad_nombre,
        s.metrica,
        s.valor as case_count,
        ts.sentiment_category,
        ts.tweet_count as sentiment_tweets,
        ts.avg_confidence,
        ABS(julian(s.fecha) - julian(ts.tweet_date)) as date_diff_days
    FROM series_all s
    LEFT JOIN daily_sentiment ts ON ABS(julian(s.fecha) - julian(ts.tweet_date)) <= 3
    WHERE s.metrica IN ('confirmados', 'defunciones')
        AND ts.sentiment_category IS NOT NULL
    ORDER BY s.fecha, s.entidad_id, s.metrica;
    """)

# Top affected states
print("\nTop 10 states by total confirmed cases:")
top_states = con.execute("""
    SELECT entidad_nombre, SUM(casos_confirmados) as total_cases, 
           SUM(defunciones) as total_deaths,
           AVG(tasa_confirmados_100k) as avg_rate_100k
    FROM research_dataset 
    GROUP BY entidad_id, entidad_nombre
    ORDER BY total_cases DESC
    LIMIT 10
""").fetchall()

for i, row in enumerate(top_states, 1):
    print(f"  {i:2d}. {row[0]:<25} | Cases: {row[1]:>8,} | Deaths: {row[2]:>6,} | Rate: {row[3]:>6.1f}")

# Per-state averages of the sentiment/case pairing
if df_twitter_sample is not None:
    sentiment_corr = con.execute("""
        SELECT 
            entidad_nombre,
            metrica,
            COUNT(*) as count,
            AVG(case_count) as avg_cases,
            AVG(sentiment_tweets) as avg_tweets,
            AVG(avg_confidence) as avg_conf
        FROM sentiment_case_correlation 
        WHERE sentiment_category IS NOT NULL
        GROUP BY entidad_nombre, metrica
        ORDER BY entidad_nombre, metrica
    """).fetchall()

    print("\nSentiment vs. case counts by state and metric:")
    for row in sentiment_corr:
        print(f"  {row[0]:<12} | {row[1]:<8} | Count: {row[2]:>6} | Cases: {row[3]:>6.1f} | Tweets: {row[4]:>6.1f} | Conf: {row[5]:>5.2f}")

# 5. EXPORT FOR RESEARCH
print("\n5. RESEARCH DATASET EXPORT:")
print("-" * 30)

# Dataset statistics. SUM over zero rows is NULL, which would reach Python as
# None and break the thousands-separator formatting below; COALESCE keeps the
# totals numeric when research_dataset is empty.
dataset_stats = con.execute("""
    SELECT 
        COUNT(*) as total_records,
        COUNT(DISTINCT entidad_id) as unique_states,
        MIN(fecha) as start_date,
        MAX(fecha) as end_date,
        COALESCE(SUM(casos_confirmados), 0) as total_cases,
        COALESCE(SUM(defunciones), 0) as total_deaths
    FROM research_dataset
""").fetchone()

# Record and entity counts per source year, from the combined case data
year_stats = con.execute("""
    SELECT 
        YEAR,
        COUNT(*) as records,
        COUNT(DISTINCT ENTIDAD_RES) as unique_entities
    FROM cases_raw
    GROUP BY YEAR
    ORDER BY YEAR
""").fetchall()

total_records, unique_states, start_date, end_date, total_cases, total_deaths = dataset_stats

print("✓ Research dataset ready:")
print(f"  - Records: {total_records:,}")
print(f"  - States: {unique_states}")
print(f"  - Date range: {start_date} to {end_date}")
print(f"  - Total cases: {total_cases:,}")
print(f"  - Total deaths: {total_deaths:,}")

print("\n✓ Multi-year data distribution:")
for year, records, entities in year_stats:
    print(f"  - {year}: {records:,} records, {entities} entities")

print("\n" + "="*60)
print("MULTI-MODAL COVID-19 ANALYSIS PIPELINE COMPLETE")
print("="*60)
print("\nAvailable views for analysis:")
print("  - series_all: Unified time series data")
print("  - relational_graph_integration: Relational + Graph data")
print("  - sentiment_case_correlation: Graph + Text sentiment data")
print("  - covid_comprehensive_analysis: All modalities integrated")
print("  - research_dataset: Clean dataset for research")


### Problemática por mala toma de decisiones derivada de datos de baja calidad

Durante la pandemia de COVID-19 en México, decisiones críticas (como asignación de recursos hospitalarios, distribución de vacunas y diseño de campañas de prevención) pudieron verse afectadas por datos de baja calidad.

Dimensiones de calidad afectadas:

- **Completitud**: valores faltantes en `FECHA_DEF`, `UCI`, `INTUBADO`, lo que impide medir correctamente la mortalidad y severidad.

- **Consistencia**: `RESULTADO_LAB` vs `CLASIFICACION_FINAL` no siempre coinciden (ej. casos confirmados sin prueba positiva).

- **Exactitud**: errores de captura en `EDAD` (valores extremos como 999 años).

- **Oportunidad**: retraso en `FECHA_ACTUALIZACION` genera reportes desfasados que afectan la toma de decisiones en tiempo real.

- **Validez**: códigos de entidades (`ENTIDAD_RES`) no válidos que generan casos mal asignados a estados inexistentes.

Consecuencia:
Si los responsables asignan camas UCI en base a reportes incompletos o tardíos, se puede subestimar la demanda real, provocando saturación en hospitales de ciertos estados mientras otros quedan con recursos sin usar.

### Preguntas descriptivas (qué ha pasado, qué está pasando):

- ¿Cuál fue la evolución del número de casos confirmados por año y por estado (`YEAR`, `ENTIDAD_RES`, `CLASIFICACION_FINAL`)?

- ¿Qué proporción de casos confirmados resultaron en hospitalización (`TIPO_PACIENTE`)?

- ¿Cuál es la distribución de comorbilidades (`DIABETES`, `OBESIDAD`, `HIPERTENSION`) entre los casos positivos?

- ¿Cuál es la tasa de letalidad por entidad (`ENTIDAD_RES`) y grupo de edad (`EDAD`)?

- ¿Cuántos pacientes intubados (`INTUBADO`) hubo en cada año?

- ¿Qué sectores de salud (`SECTOR`) concentraron más casos confirmados y defunciones?

- ¿Cuál fue la distribución de casos en población indígena (`HABLA_LENGUA_INDIG`, `INDIGENA`)?

- ¿Cuántos casos positivos corresponden a mujeres embarazadas (`EMBARAZO`)?

- ¿Qué municipios (`MUNICIPIO_RES`) tuvieron mayor número de casos por entidad?

- ¿Cuál fue la proporción de resultados positivos en pruebas de laboratorio vs antígeno (`RESULTADO_LAB`, `RESULTADO_ANTIGENO`)?

### Preguntas predictivas (qué podría pasar, escenarios futuros):

- ¿Cuál es la probabilidad de que un paciente con ciertas comorbilidades (ej. `DIABETES`, `OBESIDAD`, `HIPERTENSION`) requiera UCI (`UCI`)?

- ¿Qué factores aumentan el riesgo de defunción (`FECHA_DEF`) en pacientes hospitalizados (`TIPO_PACIENTE`)?

- ¿Qué estados (`ENTIDAD_RES`) tienen mayor probabilidad de repunte en casos durante la siguiente temporada invernal?

- ¿Cuál es la probabilidad de hospitalización según la edad (`EDAD`) y sexo (`SEXO`)?

- ¿Qué características clínicas predicen la necesidad de intubación (`INTUBADO`)?

- ¿Cuál es la probabilidad de que un paciente migrante (`MIGRANTE`) sea hospitalizado?

- ¿Qué combinación de factores predice mayor mortalidad en mujeres embarazadas (`EMBARAZO` + comorbilidades como `DIABETES`, `OBESIDAD`, `HIPERTENSION`)?

- ¿Cuál es la probabilidad de positividad de una prueba de antígeno según entidad y fecha (`RESULTADO_ANTIGENO` + `FECHA_INGRESO`)?

- ¿Qué municipios tienen mayor riesgo de saturación hospitalaria en caso de un nuevo brote?

- ¿Qué relación hay entre los sentimientos expresados en redes sociales (datos de texto) y el repunte de casos (vista `sentiment_case_correlation`)?

### Objetos relevantes para la toma de decisiones

Campos clave del dataset relacional (estructurados):

- **Identificación y tiempo**: `ID_REGISTRO`, `FECHA_ACTUALIZACION`, `FECHA_INGRESO`, `FECHA_SINTOMAS`, `FECHA_DEF`, `YEAR`.

- **Ubicación**: `ENTIDAD_RES`, `MUNICIPIO_RES`, `ENTIDAD_UM`.

- **Datos demográficos**: `SEXO`, `EDAD`, `NACIONALIDAD`, `EMBARAZO`, `HABLA_LENGUA_INDIG`, `INDIGENA`.

- **Estado clínico**: `TIPO_PACIENTE`, `INTUBADO`, `NEUMONIA`, `UCI`.

- **Comorbilidades**: `DIABETES`, `OBESIDAD`, `HIPERTENSION`, `RENAL_CRONICA`, `ASMA`, `EPOC`, `CARDIOVASCULAR`, `INMUSUPR`.

- **Resultados de pruebas**: `RESULTADO_LAB`, `RESULTADO_ANTIGENO`, `CLASIFICACION_FINAL`.

- **Factores externos**: `MIGRANTE`, `PAIS_ORIGEN`, `SECTOR`.

Nodos del grafo (series temporales):

- Relaciones entre `ENTIDAD_RES` y evolución temporal de casos.

- Nodos representando cadenas de contagio o correlaciones por municipio.

Texto (no estructurado):

- Palabras clave en reportes/noticias con sentimientos (ej. “saturación”, “rebrote”, “vacunación”).

- Polaridad y subjetividad como features de predicción.