In [3]:
# Importar librerías
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

import pandas as pd
import numpy as np

from scipy.optimize import curve_fit
from scipy import stats

from sklearn.metrics import (
    r2_score, mean_absolute_error, mean_squared_error,
    confusion_matrix, accuracy_score, precision_score,
    recall_score, roc_auc_score, roc_curve, classification_report, f1_score, precision_recall_curve, average_precision_score, balanced_accuracy_score
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression

import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


In [7]:
Estocolmo = pd.read_csv('Datasets/estocolmofinal.csv')
Estocolmo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5315 entries, 0 to 5314
Data columns (total 79 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            5315 non-null   int64  
 1   listing_url                                   5315 non-null   object 
 2   scrape_id                                     5315 non-null   int64  
 3   last_scraped                                  5315 non-null   object 
 4   source                                        5315 non-null   object 
 5   name                                          5315 non-null   object 
 6   description                                   5315 non-null   object 
 7   neighborhood_overview                         5315 non-null   object 
 8   picture_url                                   5315 non-null   object 
 9   host_id                                       5315 non-null   i

In [8]:
# Tasas de cambio
# Tasas de cambio a EUR
exchange_rates = {
    'MXN': 0.052,  # México
    'SEK': 0.088,  # Suecia
    'EUR': 1.0     # Alemania y España
}


In [9]:
# Estandarizar price y estimated_revenue_l365d
Estocolmo['price_eur'] = Estocolmo['price'] * exchange_rates['SEK']
Estocolmo['estimated_revenue_eur'] = Estocolmo['estimated_revenue_l365d'] * exchange_rates['SEK']

In [10]:
Estocolmo['price_eur'].describe()

count     5315.000000
mean       156.997712
std        412.342051
min          9.240000
25%         88.000000
50%        116.248000
75%        156.684000
max      12426.040000
Name: price_eur, dtype: float64

In [11]:
Estocolmo['price'].describe()

count      5315.000000
mean       1784.064911
std        4685.705123
min         105.000000
25%        1000.000000
50%        1321.000000
75%        1780.500000
max      141205.000000
Name: price, dtype: float64

In [12]:
# Price_Cat Estomocol
umbral_precio_Estocolmo = Estocolmo['price_eur'].quantile(0.75) #1780
Estocolmo['price_cat'] = np.where(Estocolmo['price_eur'] >= umbral_precio_Estocolmo, 'High price', 'Low price')
Estocolmo['price_cat'].value_counts()


price_cat
Low price     3986
High price    1329
Name: count, dtype: int64

In [13]:
Estocolmo.drop(['price','estimated_revenue_l365d'], axis=1)
Estocolmo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5315 entries, 0 to 5314
Data columns (total 82 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            5315 non-null   int64  
 1   listing_url                                   5315 non-null   object 
 2   scrape_id                                     5315 non-null   int64  
 3   last_scraped                                  5315 non-null   object 
 4   source                                        5315 non-null   object 
 5   name                                          5315 non-null   object 
 6   description                                   5315 non-null   object 
 7   neighborhood_overview                         5315 non-null   object 
 8   picture_url                                   5315 non-null   object 
 9   host_id                                       5315 non-null   i

In [14]:
Mexico = pd.read_csv('Datasets/Mexico_City.csv')
Mexico.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26401 entries, 0 to 26400
Data columns (total 76 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            26401 non-null  float64
 1   listing_url                                   26401 non-null  object 
 2   scrape_id                                     26401 non-null  float64
 3   last_scraped                                  26401 non-null  object 
 4   source                                        26401 non-null  object 
 5   name                                          26401 non-null  object 
 6   description                                   26401 non-null  object 
 7   neighborhood_overview                         26401 non-null  object 
 8   picture_url                                   26401 non-null  object 
 9   host_id                                       26401 non-null 

In [15]:
Mexico['price_eur'] = Mexico['price'] * exchange_rates['MXN']
Mexico['estimated_revenue_eur'] = Mexico['estimated_revenue_l365d'] * exchange_rates['MXN']

In [16]:
Mexico['price_eur'].describe()

count    26401.000000
mean        54.871055
std         29.444398
min          3.432000
25%         30.264000
50%         50.076000
75%         70.278000
max        152.464000
Name: price_eur, dtype: float64

In [17]:
Mexico['price'].describe()

count    26401.000000
mean      1055.212587
std        566.238420
min         66.000000
25%        582.000000
50%        963.000000
75%       1351.500000
max       2932.000000
Name: price, dtype: float64

In [18]:
# Creación de price_cat
umbral_precio_Mexico = Mexico['price_eur'].quantile(0.75)
Mexico['price_cat'] = np.where(Mexico['price_eur'] >= umbral_precio_Mexico, 'High price', 'Low price')
Mexico['price_cat'].value_counts()

price_cat
Low price     19607
High price     6794
Name: count, dtype: int64

In [19]:
Mexico.drop(['price', 'estimated_revenue_l365d'], axis=1)
Mexico.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26401 entries, 0 to 26400
Data columns (total 79 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            26401 non-null  float64
 1   listing_url                                   26401 non-null  object 
 2   scrape_id                                     26401 non-null  float64
 3   last_scraped                                  26401 non-null  object 
 4   source                                        26401 non-null  object 
 5   name                                          26401 non-null  object 
 6   description                                   26401 non-null  object 
 7   neighborhood_overview                         26401 non-null  object 
 8   picture_url                                   26401 non-null  object 
 9   host_id                                       26401 non-null 

In [20]:
Berlin = pd.read_csv('Datasets/Berlin_86.csv')
Berlin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14187 entries, 0 to 14186
Data columns (total 87 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Unnamed: 0                                    14187 non-null  int64  
 1   listing_url                                   14187 non-null  object 
 2   last_scraped                                  14187 non-null  object 
 3   source                                        14187 non-null  object 
 4   name                                          14187 non-null  object 
 5   description                                   14187 non-null  object 
 6   neighborhood_overview                         14187 non-null  object 
 7   picture_url                                   14187 non-null  object 
 8   host_id                                       14187 non-null  int64  
 9   host_url                                      14187 non-null 

In [21]:
Berlin['price'].describe()

count    14187.000000
mean       138.829426
std        107.554381
min          5.000000
25%         90.000000
50%        115.000000
75%        173.672536
max       3923.000000
Name: price, dtype: float64

In [22]:
# Renombrar columnas
Berlin.rename(columns= {
    'price': 'price_eur', 
    'estimated_revenue_l365d': 'estimated_revenue_eur'}
    ,inplace= True)

In [23]:
Valencia = pd.read_csv('Datasets/valencia_trabajo.csv')
Valencia.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9009 entries, 0 to 9008
Data columns (total 79 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            9009 non-null   int64  
 1   listing_url                                   9009 non-null   object 
 2   scrape_id                                     9009 non-null   int64  
 3   last_scraped                                  9009 non-null   object 
 4   source                                        9009 non-null   object 
 5   name                                          9009 non-null   object 
 6   description                                   9009 non-null   object 
 7   neighborhood_overview                         9009 non-null   object 
 8   picture_url                                   9009 non-null   object 
 9   host_id                                       9009 non-null   i

In [24]:
Valencia['price'].describe()

count    9009.000000
mean      101.443613
std        44.644134
min         8.000000
25%        72.000000
50%       101.443613
75%       125.000000
max       234.000000
Name: price, dtype: float64

In [25]:
Valencia.rename(columns= {'price':'price_eur', 'estimated_revenue_l365d':'estimated_revenue_eur'}, inplace= True)

In [26]:
# price_cat
umbral_precio_Valencia = Valencia['price_eur'].quantile(0.75)
Valencia['price_cat'] = np.where(Valencia['price_eur'] >= umbral_precio_Valencia, 'High price', 'Low price')
Valencia['price_cat'].value_counts()

price_cat
Low price     6725
High price    2284
Name: count, dtype: int64

In [27]:
#Estocolmo.to_csv('Estocolmo_Final.csv')
#Mexico.to_csv('Mexico_Final.csv')
#Berlin.to_csv('Berlin_Final.csv')
#Valencia.to_csv('Valencia_Final.csv')

In [None]:
%%writefile Dashboard_P1.py
# Dashboard Final equipo — Proyecto Airbnb (By Raymundo Díaz + IA + Profe Freddy)
# Versión de prueba para dejar listos a los 4 países.

##########
# Importar librerías
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
from sklearn.metrics import (
    r2_score, mean_absolute_error, mean_squared_error,
    confusion_matrix, accuracy_score, precision_score,
    recall_score, roc_auc_score, roc_curve, classification_report, f1_score,
    precision_recall_curve, average_precision_score, balanced_accuracy_score
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

##########
# Configuración global
st.set_page_config(
    page_title="Airbnb (Data Web)",
    page_icon="assets/icon.jpg",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Paleta Airbnb
AIRBNB_RED   = "#FF5A5F"
AIRBNB_TEAL  = "#00A699"
AIRBNB_ORANGE= "#FC642D"
AIRBNB_GRAY  = "#BFBFBF"
AIRBNB_DARK_BG = "#0E1117"
AIRBNB_CARD   = "#151A22"
AIRBNB_BORDER = "#232A35"
CONT_GRADIENT = "Reds"

##########
# CSS Look & Feel Airbnb
st.markdown(f"""
<style>
.block-container {{ padding-top: 1.2rem; padding-bottom: 2rem; }}

/* Fondo degradado unificado */
html, body, [data-testid="stAppViewContainer"], section[data-testid="stSidebar"] {{
    background: radial-gradient(circle at 30% 30%, #131722 0%, #0E1117 100%) !important;
    color: white !important;
}}
section[data-testid="stSidebar"] {{
    border-right: 1px solid {AIRBNB_BORDER};
}}

/* Tarjetas KPI */
.air-card {{
    border: 1px solid {AIRBNB_BORDER};
    border-radius:16px; padding:1rem;
    background:{AIRBNB_CARD};
}}

/* Botones */
.stButton>button {{
    background:{AIRBNB_RED}; color:white; border-radius:12px; border:none;
    padding:.6rem 1rem; font-weight:600;
}}
.stButton>button:hover {{ opacity:.9 }}

/* Tablas */
.stDataFrame, .stTable {{ color: white !important; }}
</style>
""", unsafe_allow_html=True)

##########
# Plotly: plantilla Airbnb
AIRBNB_COLORWAY = ["#FF5A5F", "#00A699", "#FC642D", "#BFBFBF", "#767676"]
pio.templates["airbnb_dark"] = pio.templates["plotly_dark"]
pio.templates["airbnb_dark"].layout.colorway = AIRBNB_COLORWAY
px.defaults.template = "airbnb_dark"
px.defaults.color_continuous_scale = CONT_GRADIENT
px.defaults.height = 420

##########
# Multi-país
COUNTRY_FILES = {
    "Alemania": "Berlin_Final.csv",
    "Valencia": "Valencia_Final.csv",
    "Estocolmo": "Estocolmo_Final.csv",
    "Mexico": "Mexico_Final.csv",
}

COUNTRY_IMAGES = {
    "Alemania": ["assets/Berlin1.jpg", "assets/Berlin3.jpg", "assets/Berlin2.jpg"],
    "Valencia": ["assets/Valencia1.jpg", "assets/Valencia2.jpg", "assets/Valencia3.jpg"],
    "Estocolmo": ["assets/Estocolmo1.jpg", "assets/Estocolmo2.jpg", "assets/Estocolmo3.jpg"],
    "Mexico": ["assets/Mexico1.jpg", "assets/Mexico2.jpg", "assets/Mexico3.jpg"],
}

##########
# Normalización
BIN_TRUE = {"t","true","True",1,"1",True}
BIN_FALSE= {"f","false","False",0,"0",False}

def _normalize_binary(series):
    s = series.copy()
    return s.apply(lambda v: 1 if v in BIN_TRUE else (0 if v in BIN_FALSE else np.nan)).astype("float")

def _normalize_df(df_raw):
    df = df_raw.copy()

    # 1) Normaliza nombres de columnas (quitan espacios accidentales)
    df.columns = df.columns.str.strip()

    # 2) Drop de *cualquier* columna Unnamed
    df = df.loc[:, ~df.columns.str.contains(r"^Unnamed", na=False)]

    # 3) Drops opcionales que ya tenías
    df = df.drop(['latitude','longitude','first_review','last_review','host_since', 'price', 'estimated_revenue_l365d','source','id', 'scrape_id'],
                 axis=1, errors="ignore")

    # 4) Tipos
    if 'id' in df.columns:
        df['id'] = df['id'].astype(str)

    if 'host_id' in df.columns:
        df['host_id'] = df['host_id'].astype(str)

    # 5) Normaliza binarias
    for col in ['host_is_superhost','host_identity_verified','instant_bookable']:
        if col in df.columns:
            df[col] = _normalize_binary(df[col])

    # 6) A numérico (coerce => NaN si hay “90%”, etc.)
    for col in ['host_response_rate','host_acceptance_rate','price','estimated_revenue_l365d']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    return df

def _clean_xy(df_base, y_col, x_cols):
    """
    Devuelve X, y sin NaN/Inf y un conteo de filas filtradas.
    """
    work = df_base[x_cols + [y_col]].replace([np.inf, -np.inf], np.nan)
    before = len(work)
    work = work.dropna()
    after = len(work)
    X = work[x_cols].to_numpy(dtype=float)
    y = work[y_col].to_numpy(dtype=float)
    return X, y, before - after

@st.cache_data(show_spinner=False)
def load_country_df(country: str):
    path = COUNTRY_FILES[country]
    raw = pd.read_csv(path)
    df = _normalize_df(raw)
    Lista = [
        'host_is_superhost','host_identity_verified','host_response_time',
        'host_response_rate','host_acceptance_rate','host_total_listings_count',
        'host_verifications','room_type','property_type','price_cat'
    ]
    return df, Lista

# Carga inicial
df, Lista = load_country_df("Alemania")

##########
# Header
col_logo, col_title = st.columns([1,5], vertical_alignment="center")
with col_logo:
    st.image("assets/Logo3.jpg", width=90)
with col_title:
    st.markdown("""
        # Airbnb Data Analysis
        <span style="color:#767676">Listados, precios y comportamiento de oferta</span>
    """, unsafe_allow_html=True)

##########
# Sidebar
st.sidebar.image("assets/Logoo.jpg", use_container_width=True)
st.sidebar.caption("Análisis exploratorio y modelos")
st.sidebar.markdown("---")
modo_presentacion = st.sidebar.toggle("Modo presentación", value=False)
country = st.sidebar.selectbox("País", list(COUNTRY_FILES.keys()), index=0)
df, Lista = load_country_df(country)
View = st.sidebar.selectbox(
    label='Tipo de análisis',
    options=['Extracción de Características', 'Regresión Lineal', 'Regresión No Lineal', 'Regresión Logística', 'Comparar países'],
    index=0
)

##########################################################################################
# Vista 1 — Extracción de características
if View == "Extracción de Características":
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        st.metric("Filas", f"{len(df):,}")
        st.markdown('</div>', unsafe_allow_html=True)
    with col2:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        st.metric("Tipos de propiedad", df['property_type'].nunique() if 'property_type' in df.columns else "—")
        st.markdown('</div>', unsafe_allow_html=True)
    with col3:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        med_price = np.nanmean(df['price_eur']) if 'price_eur' in df.columns else np.nan
        st.metric("Media de precio", f"€{med_price:,.0f}" if np.isfinite(med_price) else "—")
        st.markdown('</div>', unsafe_allow_html=True)
    with col4:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        superhosts = int((df['host_is_superhost'] == 1).sum()) if 'host_is_superhost' in df.columns else 0
        st.metric("Superhosts", superhosts)
        st.markdown('</div>', unsafe_allow_html=True)

    st.markdown("---")

    Variable_Cat = st.sidebar.selectbox("Variable categórica a analizar", options=Lista)
    Tabla_frecuencias = df[Variable_Cat].value_counts(dropna=False).reset_index().head(10)
    Tabla_frecuencias.columns = ['categorias', 'frecuencia']

    st.title("Extracción de Características")
    st.caption('Se muestran máximo las 10 categorías con más frecuencia.')

    Contenedor_A, Contenedor_B = st.columns(2)
    with Contenedor_A:
        st.subheader("Distribución por categoría (Bar Plot)")
        fig_bar = px.bar(Tabla_frecuencias, x='categorias', y='frecuencia', color='categorias')
        st.plotly_chart(fig_bar, use_container_width=True)
    with Contenedor_B:
        st.subheader("Proporción por categoría (Pie Chart)")
        fig_pie = px.pie(Tabla_frecuencias, names='categorias', values='frecuencia')
        st.plotly_chart(fig_pie, use_container_width=True)

    Contenedor_C, Contenedor_D = st.columns(2)
    with Contenedor_C:
        st.subheader("Gráfico tipo anillo")
        fig_donut = px.pie(Tabla_frecuencias, names='categorias', values='frecuencia', hole=0.5)
        st.plotly_chart(fig_donut, use_container_width=True)
    with Contenedor_D:
        st.subheader("Tendencia acumulada (Área)")
        fig_area = px.area(Tabla_frecuencias.sort_values(by='frecuencia', ascending=False),
                           x='categorias', y='frecuencia')
        st.plotly_chart(fig_area, use_container_width=True)

    st.markdown("---")
    st.subheader("Análisis más profundo")

    if Variable_Cat in ['room_type', 'property_type', 'price_cat'] and 'price' in df.columns:
        st.write("**Relación entre categorías y precio (Boxplot):**")
        fig_box = px.box(df, x=Variable_Cat, y='price', color=Variable_Cat)
        st.plotly_chart(fig_box, use_container_width=True)
    else:
        st.write("**Heatmap de proporciones:**")
        heat_df = pd.crosstab(index=df[Variable_Cat], columns='count', normalize='columns') * 100
        fig_heat = px.imshow(heat_df, color_continuous_scale=CONT_GRADIENT, title="Proporción por categoría")
        st.plotly_chart(fig_heat, use_container_width=True)

    if not modo_presentacion:
        st.markdown("---")
        st.subheader("Tabla de frecuencias")
        st.dataframe(Tabla_frecuencias.style.background_gradient(cmap='Reds'), use_container_width=True)

    st.markdown(f"**Galería:** {country} — Airbnb")
    imgs = COUNTRY_IMAGES.get(country, [])
    gcols = st.columns(3)
    for i, path in enumerate(imgs[:3]):
        with gcols[i]:
            try:
                st.image(path, use_container_width=True)
            except Exception:
                st.write("🖼️ Imagen no encontrada")



##########################################################################################
# Vista 2 
if View == "Regresión Lineal":
    st.title("Regresión Lineal")

    # Variables numéricas disponibles
    numeric_df = df.select_dtypes(include=['float', 'float64', 'int', 'int64']).copy()
    Lista_num = list(numeric_df.columns)

    # Lineal simple
    st.subheader("Regresión lineal simple")
    colL, colR = st.columns(2)
    with colL:
        Variable_y = st.selectbox("Variable dependiente (Y)", options=Lista_num, key="rl_y")
    with colR:
        Variable_x = st.selectbox("Variable independiente (X)", options=Lista_num, key="rl_x")

    # Ajuste
    X, y, dropped = _clean_xy(numeric_df, Variable_y, [Variable_x])
    if dropped > 0 and not modo_presentacion:
        st.info(f"Se descartaron {dropped} filas con NaN/Inf para el ajuste.")

    if len(y) < 3:
        st.error("No hay suficientes filas válidas para ajustar el modelo.")
        st.stop()

    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)


    # Métricas
    r2 = r2_score(y, y_pred)
    coef_Deter_simple = model.score(X= X, y= y)
    coef_Correl_simple = np.sqrt(abs(coef_Deter_simple))

    # Coeficientes
    coef_df_simple = pd.DataFrame({
        "Variable": [Variable_x],
        "Coeficiente": [model.coef_[0]],
        "Intercepto": [model.intercept_],
        "R": [coef_Correl_simple],
        "R^2": [coef_Deter_simple]
    })

    if not modo_presentacion:
        st.dataframe(coef_df_simple, use_container_width=True)

    # Gráfica: dispersión + recta y_pred
    fig_scat = px.scatter(numeric_df, x=Variable_x, y=Variable_y, opacity=0.6, title="Dispersión y recta ajustada")
    # Línea predicha ordenando por X
    order_idx = np.argsort(X[:, 0])
    fig_scat.add_trace(go.Scatter(
        x=X[order_idx, 0], y=y_pred[order_idx],
        mode="lines", name="Predicción de Y"
    ))
    st.plotly_chart(fig_scat, use_container_width=True)

    # Residuales
    resid = y - y_pred
    fig_res = px.scatter(x=y_pred, y=resid, labels={"x":"Ŷ", "y":"Residual"},
                         title="Residuos vs Predicción (diagnóstico)")
    fig_res.add_hline(y=0, line_dash="dot")
    st.plotly_chart(fig_res, use_container_width=True)

    st.markdown("---")

    # Lineal múltiple
    st.subheader("Regresión lineal múltiple")
    col1, col2 = st.columns([1,2])
    with col1:
        Variable_y_M = st.selectbox("Variable dependiente (Y)", options=Lista_num, key="rlm_y")
    with col2:
        Variables_x_M = st.multiselect("Variables independientes (X)", options= Lista_num, key="rlm_xs")

    if len(Variables_x_M) >= 1:
        X_M, y_M, droppedM = _clean_xy(numeric_df, Variable_y_M, Variables_x_M)
        if droppedM > 0 and not modo_presentacion:
            st.info(f"Se descartaron {droppedM} filas con NaN/Inf para el ajuste múltiple.")
        if len(y_M) < max(3, len(Variables_x_M)+1):
            st.error("No hay suficientes filas válidas para el modelo múltiple.")
            st.stop()

        Model_M = LinearRegression()
        Model_M.fit(X_M, y_M)
        y_pred_M = Model_M.predict(X_M)

        # Métricas
        coef_Deter_multiple = Model_M.score(X=X_M, y=y_M)
        coef_Correl_multiple = np.sqrt(abs(coef_Deter_multiple))

        # Coeficientes
        coef_tab = pd.DataFrame({
            "Variable": ["Intercepto"] + Variables_x_M,
            "Coeficiente": [Model_M.intercept_] + list(Model_M.coef_)
        })
        if not modo_presentacion:
            st.dataframe(coef_tab, use_container_width=True)

        met_tab = pd.DataFrame({'R^2': [coef_Deter_multiple], 'R ': [coef_Correl_multiple]})
        st.dataframe(met_tab, use_container_width=True)

        # Gráfica: Real vs Predicho
        fig_pred = px.scatter(x=y_M, y=y_pred_M, labels={"x":"Y real ", "y": "Y predicciones"}, title="Comparación Y Real vs Y Predicciones")
        fig_pred.add_trace(go.Scatter(x=[y_M.min(), y_M.max()], y=[y_M.min(), y_M.max()], mode="lines", name="Línea ideal", line=dict(dash="dot")))
        st.plotly_chart(fig_pred, use_container_width=True)
    else:
        st.info("Selecciona al menos 1 variable para el modelo múltiple.")


##########################################################################################
# Vista 3
if View == "Regresión No Lineal":
    st.title("Regresión No Lineal")

    # Variables numéricas
    numeric_df = df.select_dtypes(include=['float','float64','int','int64']).copy()
    Lista_num = list(numeric_df.columns)

    contA, contB = st.columns(2)
    with contA:
        Variable_y = st.selectbox("Variable dependiente (Y)", options=Lista_num, key="rnl_y_cf")
    with contB:
        Variable_x = st.selectbox("Variable independiente (X)", options=[c for c in Lista_num if c != Variable_y], key="rnl_x_cf")

    # Modelos disponibles
    modelos = [
        "Función cuadrática (a*x**2 + b*x + c)",
        "Función exponencial (a*np.exp(-b*x)+c)",
        "Función potencia (a*x**b)",
        "Función cúbica (a*x**3 + b*x**2 + c*x + d)"
    ]
    Modelo = st.selectbox("Elige modelo no lineal", options=modelos, key="rnl_modelo_cf")

    # Datos
    df_nl = numeric_df[[Variable_x, Variable_y]].replace([np.inf, -np.inf], np.nan).dropna()
    if len(df_nl) < 3:
        st.error("Datos insuficientes tras limpiar NaN/Inf para ajustar el modelo no lineal.")
        st.stop()

    x = df_nl[Variable_x].to_numpy(dtype=float)
    y = df_nl[Variable_y].to_numpy(dtype=float)
    sort_idx = np.argsort(x)
    x_sorted = x[sort_idx]


    # Definiciones de funciones
    def func_cuad(x, a, b, c):
        return a*x**2 + b*x + c

    def func_cub(x, a, b, c, d):
        return a*x**3 + b*x**2 + c*x + d

    def func_exp(x, a, b, c):
        return a * np.exp(-b * x) + c

    def func_pot(x, a, b):
        return a * np.power(x, b)

    # Ajuste
    try:
        if Modelo == "Función cuadrática (a*x**2 + b*x + c)":
            pars, cov = curve_fit(func_cuad, x, y, maxfev=20000)
            y_pred = func_cuad(x, *pars)
            y_line = func_cuad(x_sorted, *pars)
            params_df = pd.DataFrame({"Parámetro": ["a", "b", "c"], "Valor": pars})

        elif Modelo == "Función cúbica (a*x**3 + b*x**2 + c*x + d)":
            pars, cov = curve_fit(func_cub, x, y, maxfev=30000)
            y_pred = func_cub(x, *pars)
            y_line = func_cub(x_sorted, *pars)
            params_df = pd.DataFrame({"Parámetro": ["a", "b", "c", "d"], "Valor": pars})

        elif Modelo == "Función exponencial (a*np.exp(-b*x)+c)":
            mask = np.isfinite(y)
            if np.sum(mask) < 3:
                st.error("No hay suficientes datos válidos para ajustar el modelo exponencial.")
                st.stop()
            pars, cov = curve_fit(func_exp, x, y, maxfev=30000)
            y_pred = func_exp(x, *pars)
            y_line = func_exp(x_sorted, *pars)
            params_df = pd.DataFrame({"Parámetro": ["a", "b", "c"], "Valor": pars})

        elif Modelo == "Función potencia (a*x**b)":
            # Requiere x>0 y y>0
            mask = (x > 0) & (y > 0) & np.isfinite(x) & np.isfinite(y)
            if mask.sum() < 3:
                st.error("Para la función potencia se requieren suficientes valores con x>0 e y>0.")
                st.stop()
            x_pos, y_pos = x[mask], y[mask]
            pars, cov = curve_fit(func_pot, x_pos, y_pos, maxfev=20000)
            # Predicciones seguras en todo el rango
            x_safe = np.clip(x, 1e-12, None)
            x_sorted_safe = np.clip(x_sorted, 1e-12, None)
            y_pred = func_pot(x_safe, *pars)
            y_line = func_pot(x_sorted_safe, *pars)
            params_df = pd.DataFrame({"Parámetro": ["a", "b"], "Valor": pars})

        else:
            st.warning("Selecciona un modelo válido.")
            st.stop()

        # Métricas
        r2 = r2_score(y, y_pred)
        r = np.sqrt(abs(r2))

        # Salidas
        st.markdown("**Parámetros estimados (curve_fit):**")
        if not modo_presentacion:
            st.dataframe(params_df, use_container_width=True)

        st.markdown("**Métricas del ajuste:**")
        st.dataframe(pd.DataFrame({"R^2":[r2], "R ":[r]}), use_container_width=True)

        # Gráfica: dispersión + curva predicha
        fig = px.scatter(x=x, y=y, labels={"x": Variable_x, "y": Variable_y},
                         opacity=0.6, title=f"{Modelo} — Dispersión y curva ajustada")
        fig.add_trace(go.Scatter(x=x_sorted, y=y_line, mode="lines", name="Ŷ (curva)", line=dict(width=2)))
        st.plotly_chart(fig, use_container_width=True)

        # Residuos
        resid = y - y_pred
        fig_resid = px.scatter(x=y_pred, y=resid, labels={"x":"Ŷ", "y":"Residual"},
                               title="Residuos vs Predicción")
        fig_resid.add_hline(y=0, line_dash="dot")
        st.plotly_chart(fig_resid, use_container_width=True)

    except RuntimeError as e:
        st.error(f"No convergió el ajuste: {e}. Prueba con otra X/Y o revisa outliers.")
    except Exception as e:
        st.error(f"Error durante el ajuste: {e}")


##########################################################################################
# Vista 4
if View == "Regresión Logística":
    st.title("Regresión Logística")

    # 1) Listas base: Y binaria y X numéricas
    numeric_df = df.select_dtypes(include=['float', 'float64', 'int', 'int64'])
    Lista_num  = list(numeric_df.columns)

    # Detectar dicotómicas (exactamente 2 valores distintos, ignorando NaN)
    dico_cols = []
    for col in df.columns:
        vals = df[col].dropna().unique()
        if len(vals) == 2:
            dico_cols.append(col)

    # Sidebar
    if len(dico_cols) == 0:
        st.warning("No se detectaron variables binarias en el dataset.")
        st.stop()

    Variable_y = st.sidebar.selectbox("Variable dependiente (Y, binaria)", options=dico_cols)
    Variables_x = st.sidebar.multiselect("Variables independientes (X, numéricas)", options=Lista_num)

    # Sliders
    test_size = st.sidebar.slider("Tamaño de prueba", 0.1, 0.5, 0.30, 0.05)
    thr = st.sidebar.slider("Umbral de clasificación", 0.05, 0.95, 0.50, 0.01)

    if len(Variables_x) == 0:
        st.info("Selecciona al menos una variable independiente (X).")
    else:
        # 2) Preparar X, y (sin modificar df original)
        # Trabajar sobre un df que contenga todas las columnas necesarias
        base = df[Variables_x + [Variable_y]].copy()

        # Mapear Y a {0,1} conservando nombres
        vals = base[Variable_y].dropna().unique().tolist()
        if len(vals) != 2:
            st.error(f"La variable '{Variable_y}' debe tener exactamente 2 clases. Encontradas: {vals}")
            st.stop()

        mapping = {vals[0]: 0, vals[1]: 1}
        base['__y__'] = base[Variable_y].map(mapping)

        # Quita NaN/Inf en X y NaN en Y
        base = base.replace([np.inf, -np.inf], np.nan).dropna(subset=Variables_x + ['__y__'])

        if base['__y__'].nunique() < 2:
            st.error("Tras limpiar datos, solo queda una clase en Y. Ajusta la selección de variables o revisa faltantes.")
            st.stop()

        X = base[Variables_x].astype(float).to_numpy()
        y = base['__y__'].to_numpy(dtype=int)
        clases = vals  # para etiquetar métricas

        # 3) Split + escalado        
        # Sidebar extra: manejo de desbalance y estrategia de umbral
        st.sidebar.markdown("### Manejo de desbalance")
        imb_method = st.sidebar.selectbox("Método", ["Ninguno",
                                                    "class_weight='balanced'",
                                                    "SMOTE (over-sampling)",
                                                    "Under-sampling"])

        st.sidebar.markdown("### Estrategia de umbral")
        thr_mode = st.sidebar.selectbox("Seleccionar umbral por…",
                                        ["Manual", "F1 óptimo", "Minimizar costo", "Maximizar recall con precisión mínima"])
        prec_min = None
        c_fp = None
        c_fn = None
        if thr_mode == "Manual":
            thr = st.sidebar.slider("Umbral de clasificación", 0.01, 0.99, thr, 0.01)
        elif thr_mode == "Maximizar recall con precisión mínima":
            prec_min = st.sidebar.slider("Precisión mínima requerida", 0.1, 0.99, 0.6, 0.01)
        elif thr_mode == "Minimizar costo":
            # Ajusta estos valores a tu caso: p. ej., FP=10,000; FN=80,000 (como has usado antes)
            c_fp = st.sidebar.number_input("Costo por FP", min_value=0, value=10000, step=1000)
            c_fn = st.sidebar.number_input("Costo por FN", min_value=0, value=80000, step=1000)

        # 3) Split + escalado
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )
        escalar = StandardScaler()
        X_train_s = escalar.fit_transform(X_train)
        X_test_s  = escalar.transform(X_test)

        # 3.1) Re-muestreo (solo sobre el set de entrenamiento ya escalado)
        if imb_method == "SMOTE (over-sampling)":
            sm = SMOTE(random_state=42)
            X_train_s, y_train = sm.fit_resample(X_train_s, y_train)
        elif imb_method == "Under-sampling":
            rus = RandomUnderSampler(random_state=42)
            X_train_s, y_train = rus.fit_resample(X_train_s, y_train)

        # 4) Modelo (class_weight según selección)
        if imb_method == "class_weight='balanced'":
            algoritmo = LogisticRegression(max_iter=1000, class_weight='balanced')
        else:
            algoritmo = LogisticRegression(max_iter=1000)

        algoritmo.fit(X_train_s, y_train)

        # 5) Probabilidades y selección de umbral
        y_proba = algoritmo.predict_proba(X_test_s)[:, 1]

        def pick_threshold_by_f1(y_true, y_score):
            p, r, th = precision_recall_curve(y_true, y_score)
            f1 = 2 * (p*r) / np.clip(p+r, 1e-12, None)
            # precision_recall_curve devuelve umbrales len-1 respecto a p/r
            best_idx = np.nanargmax(f1[:-1])
            return th[best_idx], f1[best_idx], p[best_idx], r[best_idx]

        def pick_threshold_by_cost(y_true, y_score, c_fp, c_fn):
            # Recorremos 1001 umbrales uniformes
            ths = np.linspace(0.0, 1.0, 1001)
            best_th, best_cost = 0.5, np.inf
            for t in ths:
                y_pred = (y_score >= t).astype(int)
                tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
                cost = fp * c_fp + fn * c_fn
                if cost < best_cost:
                    best_cost, best_th = cost, t
            return best_th, best_cost

        def pick_threshold_by_recall_with_prec_min(y_true, y_score, prec_min=0.6):
            p, r, th = precision_recall_curve(y_true, y_score)
            # p/r len N, th len N-1. Usamos índices de th.
            valid = np.where(p[:-1] >= prec_min)[0]
            if len(valid) == 0:
                return 0.5, 0.0, 0.0  # fallback
            # entre los que cumplen precisión mínima, elegimos el de mayor recall
            best_idx = valid[np.argmax(r[valid])]
            return th[best_idx], r[best_idx], p[best_idx]

        # Elegimos umbral según estrategia
        if thr_mode == "F1 óptimo":
            thr, best_f1, best_p, best_r = pick_threshold_by_f1(y_test, y_proba)
        elif thr_mode == "Minimizar costo":
            thr, best_cost = pick_threshold_by_cost(y_test, y_proba, c_fp, c_fn)
        elif thr_mode == "Maximizar recall con precisión mínima":
            thr, best_r, best_p = pick_threshold_by_recall_with_prec_min(y_test, y_proba, prec_min=prec_min)
        # Si es "Manual", ya viene de la sidebar

        y_pred = (y_proba >= thr).astype(int)

        # 6) Métricas ampliadas
        acc     = accuracy_score(y_test, y_pred)
        bacc    = balanced_accuracy_score(y_test, y_pred)
        prec_c0 = precision_score(y_test, y_pred, pos_label=0, zero_division=0)
        prec_c1 = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
        rec_c0  = recall_score(y_test, y_pred, pos_label=0, zero_division=0)
        rec_c1  = recall_score(y_test, y_pred, pos_label=1, zero_division=0)
        f1_min  = f1_score(y_test, y_pred, pos_label=1, zero_division=0)  # F1 de la minoritaria (etiqueta 1)
        auc     = roc_auc_score(y_test, y_proba)
        auprc   = average_precision_score(y_test, y_proba)  # área bajo curva Prec-Recall (clase 1)

        # Tabla de métricas
        met_rows = [
            ("Exactitud", acc),
            ("Balanced accuracy", bacc),
            (f"Precision ({clases[0]})", prec_c0),
            (f"Precision ({clases[1]})", prec_c1),
            (f"Sensibilidad ({clases[0]})", rec_c0),
            (f"Sensibilidad ({clases[1]})", rec_c1),
            (f"F1 ({clases[1]})", f1_min),
            ("ROC-AUC", auc)
        ]
        if thr_mode == "Minimizar costo":
            met_rows.append(("Costo total (FP/FN)", best_cost))

        met_tab = pd.DataFrame(met_rows, columns=["Métrica", "Valor"])
        st.subheader("Métricas")
        st.dataframe(met_tab, use_container_width=True)

        # Alertas útiles
        prev = y_test.mean()
        if prec_c1 == 1.0 and rec_c1 < 0.15:
            st.warning("La precisión de la clase minoritaria es 1.0 pero el recall es muy bajo. "
                    "Baja el umbral, usa class_weight='balanced' o aplica re-muestreo.")
        if acc > 0.9 and bacc < 0.65 and prev < 0.25:
            st.info("La exactitud es alta por el desbalance. Revisa balanced accuracy, AUPRC y F1 de la minoritaria.")

        # 7) Coeficientes y Odds Ratios (sin cambios)
        coef = algoritmo.coef_[0]
        intercepto = algoritmo.intercept_[0]
        coef_tab = pd.DataFrame({
            "Variable": ["Intercepto"] + Variables_x,
            "Coeficiente (log-odds)": [intercepto] + list(coef),
            "Odds Ratio (exp(coef))": [np.exp(intercepto)] + list(np.exp(coef))
        })
        if not modo_presentacion:
            st.subheader("Coeficientes del modelo")
            st.dataframe(coef_tab, use_container_width=True)

        # 8) Matriz de confusión (igual que ya tenías)
        matriz = confusion_matrix(y_test, y_pred, labels=[0, 1])
        labels_disp = [clases[0], clases[1]]
        fig_cm = go.Figure(data=go.Heatmap(
            z=matriz,
            x=[f"Pred {labels_disp[0]}", f"Pred {labels_disp[1]}"],
            y=[f"Real {labels_disp[0]}", f"Real {labels_disp[1]}"],
            colorscale="Oranges", showscale=True, hoverongaps=False
        ))
        ann = []
        tags = np.array([["TN","FP"],["FN","TP"]])
        for i in range(2):
            for j in range(2):
                ann.append(dict(
                    x=[f"Pred {labels_disp[0]}", f"Pred {labels_disp[1]}"][j],
                    y=[f"Real {labels_disp[0]}", f"Real {labels_disp[1]}"][i],
                    text=f"{tags[i,j]}: {matriz[i,j]}",
                    showarrow=False,
                    font=dict(color="white" if matriz[i,j] > matriz.max()/2 else "black")
                ))
        fig_cm.update_layout(title="Matriz de confusión", annotations=ann, width=520, height=520)
        st.plotly_chart(fig_cm, use_container_width=False)

        # 9) Curva ROC (igual)
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        fig_roc = go.Figure()
        fig_roc.add_trace(go.Scatter(x=fpr, y=tpr, mode="lines", name=f"ROC (AUC={auc:.3f})"))
        fig_roc.add_trace(go.Scatter(x=[0,1], y=[0,1], mode="lines", name="Aleatorio", line=dict(dash="dot")))
        fig_roc.update_layout(title="Curva ROC", xaxis_title="FPR", yaxis_title="TPR")
        st.plotly_chart(fig_roc, use_container_width=True)

        # 10) Curva Precisión-Recall y distribución de probabilidades
        p, r, th = precision_recall_curve(y_test, y_proba)
        fig_pr = go.Figure()
        fig_pr.add_trace(go.Scatter(x=r, y=p, mode="lines", name=f"PR (AP={auprc:.3f})"))
        fig_pr.update_layout(title="Curva Precisión-Recall (clase 1)",
                            xaxis_title="Recall", yaxis_title="Precisión")
        st.plotly_chart(fig_pr, use_container_width=True)

        fig_prob = px.strip(
            x=[labels_disp[i] for i in y_test], y=y_proba,
            labels={"x":"Clase real", "y":"Probabilidad P(Y=1)"},
            title="Distribución de probabilidades por clase real"
        )
        fig_prob.add_hline(y=thr, line_dash="dot", annotation_text=f"Umbral {thr:.2f}")
        st.plotly_chart(fig_prob, use_container_width=True)

        # Nota de mapeo (como ya tenías)
        st.caption(f"Mapeo interno (solo para el modelo): {clases[0]} → 0, {clases[1]} → 1. "
                f"Prevalencia clase 1 (test): {prev:.3f}")

# FOOTER
st.markdown("---")
st.markdown("""
<div style="text-align:center; opacity:0.8; font-size:0.9rem;">
© Proyecto para Gestión de Proyectos — Dashboard creado por <b>Los Guaranies</b> con ayuda de IA y profe Freddy/Malu.  
<br> Construido con Streamlit, Plotly y Python.
</div>
""", unsafe_allow_html=True)


Overwriting Dashboard_P1.py


In [30]:
%%writefile Dashboard_Comparativo.py
# Dashboard Final equipo integrando en las vistas a los 4 países - Proyecto Airbnb
# Versión final

##########
# Importar librerías
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
from sklearn.metrics import (
    r2_score, mean_absolute_error, mean_squared_error,
    confusion_matrix, accuracy_score, precision_score,
    recall_score, roc_auc_score, roc_curve, classification_report, f1_score,
    precision_recall_curve, average_precision_score, balanced_accuracy_score
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

##########
# Configuración global
st.set_page_config(
    page_title="Airbnb (Data Web)",
    page_icon="assets/icon.jpg",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Paleta Airbnb
AIRBNB_RED   = "#FF5A5F"
AIRBNB_TEAL  = "#00A699"
AIRBNB_ORANGE= "#FC642D"
AIRBNB_GRAY  = "#BFBFBF"
AIRBNB_DARK_BG = "#0E1117"
AIRBNB_CARD   = "#151A22"
AIRBNB_BORDER = "#232A35"
CONT_GRADIENT = "Reds"

##########
# CSS Look & Feel Airbnb
st.markdown(f"""
<style>
.block-container {{ padding-top: 1.2rem; padding-bottom: 2rem; }}
html, body, [data-testid="stAppViewContainer"], section[data-testid="stSidebar"] {{
    background: radial-gradient(circle at 30% 30%, #131722 0%, #0E1117 100%) !important;
    color: white !important;
}}
section[data-testid="stSidebar"] {{ border-right: 1px solid {AIRBNB_BORDER}; }}
.air-card {{
    border: 1px solid {AIRBNB_BORDER};
    border-radius:16px; padding:1rem;
    background:{AIRBNB_CARD};
}}
.stButton>button {{
    background:{AIRBNB_RED}; color:white; border-radius:12px; border:none;
    padding:.6rem 1rem; font-weight:600;
}}
.stButton>button:hover {{ opacity:.9 }}
.stDataFrame, .stTable {{ color: white !important; }}
</style>
""", unsafe_allow_html=True)

##########
# Plotly: plantilla Airbnb
AIRBNB_COLORWAY = ["#FF5A5F", "#00A699", "#FC642D", "#BFBFBF", "#767676"]
pio.templates["airbnb_dark"] = pio.templates["plotly_dark"]
pio.templates["airbnb_dark"].layout.colorway = AIRBNB_COLORWAY
px.defaults.template = "airbnb_dark"
px.defaults.color_continuous_scale = CONT_GRADIENT
px.defaults.height = 420

##########
# Multi-país
COUNTRY_FILES = {
    "Alemania": "Berlin_Final.csv",
    "Valencia": "Valencia_Final.csv",
    "Estocolmo": "Estocolmo_Final.csv",
    "Mexico": "Mexico_Final.csv",
}

COUNTRY_IMAGES = {
    "Alemania": ["assets/Berlin1.jpg", "assets/Berlin3.jpg", "assets/Berlin2.jpg"],
    "Valencia": ["assets/Valencia1.jpg", "assets/Valencia2.jpg", "assets/Valencia3.jpg"],
    "Estocolmo": ["assets/Estocolmo1.jpg", "assets/Estocolmo2.jpg", "assets/Estocolmo3.jpg"],
    "Mexico": ["assets/Mexico1.jpg", "assets/Mexico2.jpg", "assets/Mexico3.jpg"],
}

##########
# Normalización
BIN_TRUE = {"t","true","True",1,"1",True}
BIN_FALSE= {"f","false","False",0,"0",False}

def _normalize_binary(series):
    s = series.copy()
    return s.apply(lambda v: 1 if v in BIN_TRUE else (0 if v in BIN_FALSE else np.nan)).astype("float")

def _normalize_df(df_raw):
    df = df_raw.copy()
    df.columns = df.columns.str.strip()
    df = df.loc[:, ~df.columns.str.contains(r"^Unnamed", na=False)]
    df = df.drop(['latitude','longitude','first_review','last_review','host_since', 'price', 'estimated_revenue_l365d','source','id', 'scrape_id'],
                 axis=1, errors="ignore")
    if 'id' in df.columns:
        df['id'] = df['id'].astype(str)
    if 'host_id' in df.columns:
        df['host_id'] = df['host_id'].astype(str)
    for col in ['host_is_superhost','host_identity_verified','instant_bookable']:
        if col in df.columns:
            df[col] = _normalize_binary(df[col])
    for col in ['host_response_rate','host_acceptance_rate','price','estimated_revenue_l365d','price_eur']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

def _clean_xy(df_base, y_col, x_cols):
    work = df_base[x_cols + [y_col]].replace([np.inf, -np.inf], np.nan)
    before = len(work)
    work = work.dropna()
    after = len(work)
    X = work[x_cols].to_numpy(dtype=float)
    y = work[y_col].to_numpy(dtype=float)
    return X, y, before - after

@st.cache_data(show_spinner=False)
def load_country_df(country: str):
    path = COUNTRY_FILES[country]
    raw = pd.read_csv(path)
    df = _normalize_df(raw)
    Lista = [
        'host_is_superhost','host_identity_verified','host_response_time',
        'host_response_rate','host_acceptance_rate','host_total_listings_count',
        'host_verifications','room_type','property_type','price_cat'
    ]
    return df, Lista

def kpis_block(df, country):
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        st.metric(f"{country} · Filas", f"{len(df):,}")
        st.markdown('</div>', unsafe_allow_html=True)
    with col2:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        st.metric(f"{country} · Tipos de propiedad", df['property_type'].nunique() if 'property_type' in df.columns else "—")
        st.markdown('</div>', unsafe_allow_html=True)
    with col3:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        med_price = np.nanmean(df['price_eur']) if 'price_eur' in df.columns else np.nan
        st.metric(f"{country} · Media precio", f"€{med_price:,.0f}" if np.isfinite(med_price) else "—")
        st.markdown('</div>', unsafe_allow_html=True)
    with col4:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        superhosts = int((df['host_is_superhost'] == 1).sum()) if 'host_is_superhost' in df.columns else 0
        st.metric(f"{country} · Superhosts", superhosts)
        st.markdown('</div>', unsafe_allow_html=True)

def extraction_charts(df, var_cat: str):
    tabla = df[var_cat].value_counts(dropna=False).reset_index().head(10)
    tabla.columns = ['categorias','frecuencia']
    fig_bar = px.bar(tabla, x='categorias', y='frecuencia', color='categorias', title="Distribución por categoría")
    fig_pie = px.pie(tabla, names='categorias', values='frecuencia', title="Proporción por categoría")
    fig_donut = px.pie(tabla, names='categorias', values='frecuencia', hole=0.5, title="Gráfico tipo anillo")
    fig_area = px.area(tabla.sort_values('frecuencia', ascending=False),
                       x='categorias', y='frecuencia', title="Tendencia acumulada (Área)")
    # Detalle: box/heatmap según exista price
    detail_fig = None
    if var_cat in ['room_type','property_type','price_cat'] and 'price' in df.columns:
        detail_fig = px.box(df, x=var_cat, y='price', color=var_cat, title="Relación categorías vs precio (Boxplot)")
    else:
        heat_df = pd.crosstab(index=df[var_cat], columns='count', normalize='columns') * 100
        detail_fig = px.imshow(heat_df, color_continuous_scale=CONT_GRADIENT, title="Proporción por categoría (Heatmap)")
    return tabla, fig_bar, fig_pie, fig_donut, fig_area, detail_fig

def gallery_block(country):
    st.markdown(f"**Galería:** {country} — Airbnb")
    imgs = COUNTRY_IMAGES.get(country, [])
    gcols = st.columns(3)
    for i, path in enumerate(imgs[:3]):
        with gcols[i]:
            try:
                st.image(path, use_container_width=True)
            except Exception:
                st.write("🖼️ Imagen no encontrada")

def get_common_lists(dfs_dict):
    # Intersección de columnas numéricas y binarias para logística / extracción
    num_sets = []
    bin_sets = []
    cat_sets = []
    for _, df in dfs_dict.items():
        num_cols = set(df.select_dtypes(include=['float','float64','int','int64']).columns.tolist())
        # binarias: exactamente 2 valores (ignorando NaN)
        bin_cols = set([c for c in df.columns if df[c].dropna().nunique()==2])
        # categóricas candidatas (object o categóricas + algunas conocidas)
        cat_cols = set([c for c in df.columns if df[c].dtype=='object' or df[c].dtype.name=='category'])
        # agrega columnas 'conocidas' aunque sean numéricas codificadas
        cat_cols |= set([c for c in ['room_type','property_type','price_cat','host_response_time'] if c in df.columns])
        num_sets.append(num_cols)
        bin_sets.append(bin_cols)
        cat_sets.append(cat_cols)
    common_num = set.intersection(*num_sets) if num_sets else set()
    common_bin = set.intersection(*bin_sets) if bin_sets else set()
    common_cat = set.intersection(*cat_sets) if cat_sets else set()
    # excluir target obvios de num si molestan
    return sorted(list(common_num)), sorted(list(common_bin)), sorted(list(common_cat))

def run_logistic_block(df, y_col, x_cols, thr_mode="Manual", thr=0.5, c_fp=10000, c_fn=80000, prec_min=0.6, test_size=0.30, imb_method="Ninguno"):
    base = df[x_cols + [y_col]].copy()
    vals = base[y_col].dropna().unique().tolist()
    if len(vals) != 2:
        return None
    mapping = {vals[0]:0, vals[1]:1}
    base['__y__'] = base[y_col].map(mapping)
    base = base.replace([np.inf,-np.inf], np.nan).dropna(subset=x_cols + ['__y__'])
    if base['__y__'].nunique() < 2:
        return None
    X = base[x_cols].astype(float).to_numpy()
    y = base['__y__'].to_numpy(dtype=int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s  = scaler.transform(X_test)
    if imb_method == "SMOTE (over-sampling)":
        sm = SMOTE(random_state=42); X_train_s, y_train = sm.fit_resample(X_train_s, y_train)
    elif imb_method == "Under-sampling":
        rus = RandomUnderSampler(random_state=42); X_train_s, y_train = rus.fit_resample(X_train_s, y_train)
    if imb_method == "class_weight='balanced'":
        clf = LogisticRegression(max_iter=1000, class_weight='balanced')
    else:
        clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_s, y_train)
    y_proba = clf.predict_proba(X_test_s)[:,1]

    def pick_threshold_by_f1(y_true, y_score):
        p, r, th = precision_recall_curve(y_true, y_score)
        f1 = 2*(p*r)/np.clip(p+r, 1e-12, None)
        best_idx = np.nanargmax(f1[:-1])
        return th[best_idx]
    def pick_threshold_by_cost(y_true, y_score, c_fp, c_fn):
        ths = np.linspace(0.0,1.0,1001)
        best_th, best_cost = 0.5, np.inf
        for t in ths:
            y_pred = (y_score>=t).astype(int)
            tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
            cost = fp*c_fp + fn*c_fn
            if cost < best_cost:
                best_cost, best_th = cost, t
        return best_th
    def pick_threshold_by_recall_with_prec_min(y_true, y_score, prec_min=0.6):
        p, r, th = precision_recall_curve(y_true, y_score)
        valid = np.where(p[:-1] >= prec_min)[0]
        if len(valid)==0: return 0.5
        best_idx = valid[np.argmax(r[valid])]
        return th[best_idx]

    if thr_mode=="F1 óptimo":
        thr = pick_threshold_by_f1(y_test, y_proba)
    elif thr_mode=="Minimizar costo":
        thr = pick_threshold_by_cost(y_test, y_proba, c_fp, c_fn)
    elif thr_mode=="Maximizar recall con precisión mínima":
        thr = pick_threshold_by_recall_with_prec_min(y_test, y_proba, prec_min=prec_min)
    # Manual: se respeta valor de thr

    y_pred = (y_proba>=thr).astype(int)
    acc   = accuracy_score(y_test, y_pred)
    bacc  = balanced_accuracy_score(y_test, y_pred)
    prec1 = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
    rec1  = recall_score(y_test, y_pred, pos_label=1, zero_division=0)
    f1m   = f1_score(y_test, y_pred, pos_label=1, zero_division=0)
    auc   = roc_auc_score(y_test, y_proba)
    auprc = average_precision_score(y_test, y_proba)
    cm    = confusion_matrix(y_test, y_pred, labels=[0,1])

    # Figuras
    labels_disp = [list(mapping.keys())[0], list(mapping.keys())[1]]
    fig_cm = go.Figure(data=go.Heatmap(
        z=cm,
        x=[f"Pred {labels_disp[0]}", f"Pred {labels_disp[1]}"],
        y=[f"Real {labels_disp[0]}", f"Real {labels_disp[1]}"],
        colorscale="Oranges", showscale=True, hoverongaps=False
    ))
    ann = []
    tags = np.array([["TN","FP"],["FN","TP"]])
    for i in range(2):
        for j in range(2):
            ann.append(dict(
                x=[f"Pred {labels_disp[0]}", f"Pred {labels_disp[1]}"][j],
                y=[f"Real {labels_disp[0]}", f"Real {labels_disp[1]}"][i],
                text=f"{tags[i,j]}: {cm[i,j]}",
                showarrow=False,
                font=dict(color="white" if cm[i,j] > cm.max()/2 else "black")
            ))
    fig_cm.update_layout(title=f"Matriz de confusión · umbral={thr:.2f}", annotations=ann, width=520, height=520)

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    fig_roc = go.Figure()
    fig_roc.add_trace(go.Scatter(x=fpr, y=tpr, mode="lines", name=f"ROC (AUC={auc:.3f})"))
    fig_roc.add_trace(go.Scatter(x=[0,1], y=[0,1], mode="lines", name="Aleatorio", line=dict(dash="dot")))
    fig_roc.update_layout(title="Curva ROC", xaxis_title="FPR", yaxis_title="TPR")

    p, r, th = precision_recall_curve(y_test, y_proba)
    fig_pr = go.Figure()
    fig_pr.add_trace(go.Scatter(x=r, y=p, mode="lines", name=f"PR (AP={auprc:.3f})"))
    fig_pr.update_layout(title="Curva Precisión-Recall (clase 1)", xaxis_title="Recall", yaxis_title="Precisión")

    fig_prob = px.strip(
        x=[labels_disp[i] for i in y_test], y=y_proba,
        labels={"x":"Clase real", "y":"Probabilidad P(Y=1)"},
        title="Distribución de probabilidades por clase real"
    )
    fig_prob.add_hline(y=thr, line_dash="dot", annotation_text=f"Umbral {thr:.2f}")

    met_tab = pd.DataFrame({
        "Métrica": ["Exactitud","Balanced accuracy","Precisión (1)","Recall (1)","F1 (1)","ROC-AUC","AP (PR)"],
        "Valor":   [acc, bacc, prec1, rec1, f1m, auc, auprc]
    })
    return dict(
        metrics=met_tab, cm_fig=fig_cm, roc_fig=fig_roc, pr_fig=fig_pr, prob_fig=fig_prob,
        thr=thr, mapping=mapping
    )

# Carga inicial
df, Lista = load_country_df("Alemania")

##########
# Header
col_logo, col_title = st.columns([1,5], vertical_alignment="center")
with col_logo:
    st.image("assets/Logo3.jpg", width=90)
with col_title:
    st.markdown("""
        # Airbnb Data Analysis
        <span style="color:#767676">Listados, precios y comportamiento de oferta</span>
    """, unsafe_allow_html=True)

##########
# Sidebar
st.sidebar.image("assets/Logoo.jpg", use_container_width=True)
st.sidebar.caption("Análisis exploratorio y modelos")
st.sidebar.markdown("---")
modo_presentacion = st.sidebar.toggle("Modo presentación", value=False)
country = st.sidebar.selectbox("País", list(COUNTRY_FILES.keys()), index=0)
df, Lista = load_country_df(country)
View = st.sidebar.selectbox(
    label='Tipo de análisis',
    options=['Extracción de Características', 'Regresión Lineal', 'Regresión No Lineal', 'Regresión Logística', 'Comparar países'],
    index=0
)

##########################################################################################
# Vista 1 — Extracción de características
if View == "Extracción de Características":
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        st.metric("Filas", f"{len(df):,}")
        st.markdown('</div>', unsafe_allow_html=True)
    with col2:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        st.metric("Tipos de propiedad", df['property_type'].nunique() if 'property_type' in df.columns else "—")
        st.markdown('</div>', unsafe_allow_html=True)
    with col3:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        med_price = np.nanmean(df['price_eur']) if 'price_eur' in df.columns else np.nan
        st.metric("Media de precio", f"€{med_price:,.0f}" if np.isfinite(med_price) else "—")
        st.markdown('</div>', unsafe_allow_html=True)
    with col4:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        superhosts = int((df['host_is_superhost'] == 1).sum()) if 'host_is_superhost' in df.columns else 0
        st.metric("Superhosts", superhosts)
        st.markdown('</div>', unsafe_allow_html=True)

    st.markdown("---")

    Variable_Cat = st.sidebar.selectbox("Variable categórica a analizar", options=Lista)
    Tabla_frecuencias = df[Variable_Cat].value_counts(dropna=False).reset_index().head(10)
    Tabla_frecuencias.columns = ['categorias', 'frecuencia']

    st.title("Extracción de Características")
    st.caption('Se muestran máximo las 10 categorías con más frecuencia.')

    Contenedor_A, Contenedor_B = st.columns(2)
    with Contenedor_A:
        st.subheader("Distribución por categoría (Bar Plot)")
        fig_bar = px.bar(Tabla_frecuencias, x='categorias', y='frecuencia', color='categorias')
        st.plotly_chart(fig_bar, use_container_width=True)
    with Contenedor_B:
        st.subheader("Proporción por categoría (Pie Chart)")
        fig_pie = px.pie(Tabla_frecuencias, names='categorias', values='frecuencia')
        st.plotly_chart(fig_pie, use_container_width=True)

    Contenedor_C, Contenedor_D = st.columns(2)
    with Contenedor_C:
        st.subheader("Gráfico tipo anillo")
        fig_donut = px.pie(Tabla_frecuencias, names='categorias', values='frecuencia', hole=0.5)
        st.plotly_chart(fig_donut, use_container_width=True)
    with Contenedor_D:
        st.subheader("Tendencia acumulada (Área)")
        fig_area = px.area(Tabla_frecuencias.sort_values(by='frecuencia', ascending=False),
                           x='categorias', y='frecuencia')
        st.plotly_chart(fig_area, use_container_width=True)

    st.markdown("---")
    st.subheader("Análisis más profundo")

    if Variable_Cat in ['room_type', 'property_type', 'price_cat'] and 'price' in df.columns:
        st.write("**Relación entre categorías y precio (Boxplot):**")
        fig_box = px.box(df, x=Variable_Cat, y='price', color=Variable_Cat)
        st.plotly_chart(fig_box, use_container_width=True)
    else:
        st.write("**Heatmap de proporciones:**")
        heat_df = pd.crosstab(index=df[Variable_Cat], columns='count', normalize='columns') * 100
        fig_heat = px.imshow(heat_df, color_continuous_scale=CONT_GRADIENT, title="Proporción por categoría")
        st.plotly_chart(fig_heat, use_container_width=True)

    if not modo_presentacion:
        st.markdown("---")
        st.subheader("Tabla de frecuencias")
        st.dataframe(Tabla_frecuencias.style.background_gradient(cmap='Reds'), use_container_width=True)

    gallery_block(country)

##########################################################################################
# Vista 2 — Regresión Lineal
if View == "Regresión Lineal":
    st.title("Regresión Lineal")

    numeric_df = df.select_dtypes(include=['float', 'float64', 'int', 'int64']).copy()
    Lista_num = list(numeric_df.columns)

    st.subheader("Regresión lineal simple")
    colL, colR = st.columns(2)
    with colL:
        Variable_y = st.selectbox("Variable dependiente (Y)", options=Lista_num, key="rl_y")
    with colR:
        Variable_x = st.selectbox("Variable independiente (X)", options=Lista_num, key="rl_x")

    X, y, dropped = _clean_xy(numeric_df, Variable_y, [Variable_x])
    if dropped > 0 and not modo_presentacion:
        st.info(f"Se descartaron {dropped} filas con NaN/Inf para el ajuste.")
    if len(y) < 3:
        st.error("No hay suficientes filas válidas para ajustar el modelo.")
        st.stop()

    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)

    r2 = r2_score(y, y_pred)
    coef_Deter_simple = model.score(X= X, y= y)
    coef_Correl_simple = np.sqrt(abs(coef_Deter_simple))

    coef_df_simple = pd.DataFrame({
        "Variable": [Variable_x],
        "Coeficiente": [model.coef_[0]],
        "Intercepto": [model.intercept_],
        "R": [coef_Correl_simple],
        "R^2": [coef_Deter_simple]
    })
    if not modo_presentacion:
        st.dataframe(coef_df_simple, use_container_width=True)

    fig_scat = px.scatter(numeric_df, x=Variable_x, y=Variable_y, opacity=0.6, title="Dispersión y recta ajustada")
    order_idx = np.argsort(X[:, 0])
    fig_scat.add_trace(go.Scatter(
        x=X[order_idx, 0], y=y_pred[order_idx],
        mode="lines", name="Predicción de Y"
    ))
    st.plotly_chart(fig_scat, use_container_width=True)

    resid = y - y_pred
    fig_res = px.scatter(x=y_pred, y=resid, labels={"x":"Ŷ", "y":"Residual"},
                         title="Residuos vs Predicción (diagnóstico)")
    fig_res.add_hline(y=0, line_dash="dot")
    st.plotly_chart(fig_res, use_container_width=True)

    st.markdown("---")
    st.subheader("Regresión lineal múltiple")
    col1, col2 = st.columns([1,2])
    with col1:
        Variable_y_M = st.selectbox("Variable dependiente (Y)", options=Lista_num, key="rlm_y")
    with col2:
        Variables_x_M = st.multiselect("Variables independientes (X)", options= Lista_num, key="rlm_xs")

    if len(Variables_x_M) >= 1:
        X_M, y_M, droppedM = _clean_xy(numeric_df, Variable_y_M, Variables_x_M)
        if droppedM > 0 and not modo_presentacion:
            st.info(f"Se descartaron {droppedM} filas con NaN/Inf para el ajuste múltiple.")
        if len(y_M) < max(3, len(Variables_x_M)+1):
            st.error("No hay suficientes filas válidas para el modelo múltiple.")
            st.stop()

        Model_M = LinearRegression()
        Model_M.fit(X_M, y_M)
        y_pred_M = Model_M.predict(X_M)

        coef_Deter_multiple = Model_M.score(X=X_M, y=y_M)
        coef_Correl_multiple = np.sqrt(abs(coef_Deter_multiple))

        coef_tab = pd.DataFrame({
            "Variable": ["Intercepto"] + Variables_x_M,
            "Coeficiente": [Model_M.intercept_] + list(Model_M.coef_)
        })
        if not modo_presentacion:
            st.dataframe(coef_tab, use_container_width=True)

        met_tab = pd.DataFrame({'R^2': [coef_Deter_multiple], 'R ': [coef_Correl_multiple]})
        st.dataframe(met_tab, use_container_width=True)

        fig_pred = px.scatter(x=y_M, y=y_pred_M, labels={"x":"Y real ", "y": "Y predicciones"}, title="Comparación Y Real vs Y Predicciones")
        fig_pred.add_trace(go.Scatter(x=[y_M.min(), y_M.max()], y=[y_M.min(), y_M.max()], mode="lines", name="Línea ideal", line=dict(dash="dot")))
        st.plotly_chart(fig_pred, use_container_width=True)
    else:
        st.info("Selecciona al menos 1 variable para el modelo múltiple.")

##########################################################################################
# Vista 3 — Regresión No Lineal
if View == "Regresión No Lineal":
    st.title("Regresión No Lineal")

    numeric_df = df.select_dtypes(include=['float','float64','int','int64']).copy()
    Lista_num = list(numeric_df.columns)

    contA, contB = st.columns(2)
    with contA:
        Variable_y = st.selectbox("Variable dependiente (Y)", options=Lista_num, key="rnl_y_cf")
    with contB:
        Variable_x = st.selectbox("Variable independiente (X)", options=[c for c in Lista_num if c != Variable_y], key="rnl_x_cf")

    modelos = [
        "Función cuadrática (a*x**2 + b*x + c)",
        "Función exponencial (a*np.exp(-b*x)+c)",
        "Función potencia (a*x**b)",
        "Función cúbica (a*x**3 + b*x**2 + c*x + d)"
    ]
    Modelo = st.selectbox("Elige modelo no lineal", options=modelos, key="rnl_modelo_cf")

    df_nl = numeric_df[[Variable_x, Variable_y]].replace([np.inf, -np.inf], np.nan).dropna()
    if len(df_nl) < 3:
        st.error("Datos insuficientes tras limpiar NaN/Inf para ajustar el modelo no lineal.")
        st.stop()

    x = df_nl[Variable_x].to_numpy(dtype=float)
    y = df_nl[Variable_y].to_numpy(dtype=float)
    sort_idx = np.argsort(x)
    x_sorted = x[sort_idx]

    def func_cuad(x, a, b, c): return a*x**2 + b*x + c
    def func_cub(x, a, b, c, d): return a*x**3 + b*x**2 + c*x + d
    def func_exp(x, a, b, c): return a * np.exp(-b * x) + c
    def func_pot(x, a, b): return a * np.power(x, b)

    try:
        if Modelo == "Función cuadrática (a*x**2 + b*x + c)":
            pars, cov = curve_fit(func_cuad, x, y, maxfev=20000)
            y_pred = func_cuad(x, *pars); y_line = func_cuad(x_sorted, *pars)
            params_df = pd.DataFrame({"Parámetro": ["a", "b", "c"], "Valor": pars})
        elif Modelo == "Función cúbica (a*x**3 + b*x**2 + c*x + d)":
            pars, cov = curve_fit(func_cub, x, y, maxfev=30000)
            y_pred = func_cub(x, *pars); y_line = func_cub(x_sorted, *pars)
            params_df = pd.DataFrame({"Parámetro": ["a", "b", "c", "d"], "Valor": pars})
        elif Modelo == "Función exponencial (a*np.exp(-b*x)+c)":
            mask = np.isfinite(y)
            if np.sum(mask) < 3: st.error("No hay suficientes datos válidos para el modelo exponencial."); st.stop()
            pars, cov = curve_fit(func_exp, x, y, maxfev=30000)
            y_pred = func_exp(x, *pars); y_line = func_exp(x_sorted, *pars)
            params_df = pd.DataFrame({"Parámetro": ["a", "b", "c"], "Valor": pars})
        elif Modelo == "Función potencia (a*x**b)":
            mask = (x > 0) & (y > 0) & np.isfinite(x) & np.isfinite(y)
            if mask.sum() < 3: st.error("Para potencia se requieren suficientes valores con x>0 e y>0."); st.stop()
            x_pos, y_pos = x[mask], y[mask]
            pars, cov = curve_fit(func_pot, x_pos, y_pos, maxfev=20000)
            x_safe = np.clip(x, 1e-12, None); x_sorted_safe = np.clip(x_sorted, 1e-12, None)
            y_pred = func_pot(x_safe, *pars); y_line = func_pot(x_sorted_safe, *pars)
            params_df = pd.DataFrame({"Parámetro": ["a", "b"], "Valor": pars})
        else:
            st.warning("Selecciona un modelo válido."); st.stop()

        r2 = r2_score(y, y_pred); r = np.sqrt(abs(r2))

        st.markdown("**Parámetros estimados (curve_fit):**")
        if not modo_presentacion: st.dataframe(params_df, use_container_width=True)

        st.markdown("**Métricas del ajuste:**")
        st.dataframe(pd.DataFrame({"R^2":[r2], "R ":[r]}), use_container_width=True)

        fig = px.scatter(x=x, y=y, labels={"x": Variable_x, "y": Variable_y},
                         opacity=0.6, title=f"{Modelo} — Dispersión y curva ajustada")
        fig.add_trace(go.Scatter(x=x_sorted, y=y_line, mode="lines", name="Ŷ (curva)", line=dict(width=2)))
        st.plotly_chart(fig, use_container_width=True)

        resid = y - y_pred
        fig_resid = px.scatter(x=y_pred, y=resid, labels={"x":"Ŷ", "y":"Residual"},
                               title="Residuos vs Predicción")
        fig_resid.add_hline(y=0, line_dash="dot")
        st.plotly_chart(fig_resid, use_container_width=True)

    except RuntimeError as e:
        st.error(f"No convergió el ajuste: {e}.")
    except Exception as e:
        st.error(f"Error durante el ajuste: {e}")

##########################################################################################
# Vista 4 — Regresión Logística
if View == "Regresión Logística":
    st.title("Regresión Logística")

    numeric_df = df.select_dtypes(include=['float', 'float64', 'int', 'int64'])
    Lista_num  = list(numeric_df.columns)

    dico_cols = []
    for col in df.columns:
        vals = df[col].dropna().unique()
        if len(vals) == 2:
            dico_cols.append(col)

    if len(dico_cols) == 0:
        st.warning("No se detectaron variables binarias en el dataset."); st.stop()

    Variable_y = st.sidebar.selectbox("Variable dependiente (Y, binaria)", options=dico_cols)
    Variables_x = st.sidebar.multiselect("Variables independientes (X, numéricas)", options=Lista_num)

    test_size = st.sidebar.slider("Tamaño de prueba", 0.1, 0.5, 0.30, 0.05)
    thr = st.sidebar.slider("Umbral de clasificación", 0.05, 0.95, 0.50, 0.01)

    if len(Variables_x) == 0:
        st.info("Selecciona al menos una variable independiente (X).")
    else:
        base = df[Variables_x + [Variable_y]].copy()
        vals = base[Variable_y].dropna().unique().tolist()
        if len(vals) != 2:
            st.error(f"La variable '{Variable_y}' debe tener exactamente 2 clases. Encontradas: {vals}")
            st.stop()

        mapping = {vals[0]: 0, vals[1]: 1}
        base['__y__'] = base[Variable_y].map(mapping)
        base = base.replace([np.inf, -np.inf], np.nan).dropna(subset=Variables_x + ['__y__'])
        if base['__y__'].nunique() < 2:
            st.error("Tras limpiar datos, solo queda una clase en Y."); st.stop()

        X = base[Variables_x].astype(float).to_numpy()
        y = base['__y__'].to_numpy(dtype=int)
        clases = vals

        st.sidebar.markdown("### Manejo de desbalance")
        imb_method = st.sidebar.selectbox("Método", ["Ninguno","class_weight='balanced'","SMOTE (over-sampling)","Under-sampling"])

        st.sidebar.markdown("### Estrategia de umbral")
        thr_mode = st.sidebar.selectbox("Seleccionar umbral por…", ["Manual", "F1 óptimo", "Minimizar costo", "Maximizar recall con precisión mínima"])
        prec_min = None; c_fp = None; c_fn = None
        if thr_mode == "Manual":
            thr = st.sidebar.slider("Umbral de clasificación", 0.01, 0.99, thr, 0.01)
        elif thr_mode == "Maximizar recall con precisión mínima":
            prec_min = st.sidebar.slider("Precisión mínima requerida", 0.1, 0.99, 0.6, 0.01)
        elif thr_mode == "Minimizar costo":
            c_fp = st.sidebar.number_input("Costo por FP", min_value=0, value=10000, step=1000)
            c_fn = st.sidebar.number_input("Costo por FN", min_value=0, value=80000, step=1000)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)
        escalar = StandardScaler()
        X_train_s = escalar.fit_transform(X_train)
        X_test_s  = escalar.transform(X_test)

        if imb_method == "SMOTE (over-sampling)":
            sm = SMOTE(random_state=42); X_train_s, y_train = sm.fit_resample(X_train_s, y_train)
        elif imb_method == "Under-sampling":
            rus = RandomUnderSampler(random_state=42); X_train_s, y_train = rus.fit_resample(X_train_s, y_train)

        if imb_method == "class_weight='balanced'":
            algoritmo = LogisticRegression(max_iter=1000, class_weight='balanced')
        else:
            algoritmo = LogisticRegression(max_iter=1000)
        algoritmo.fit(X_train_s, y_train)

        y_proba = algoritmo.predict_proba(X_test_s)[:, 1]

        def pick_threshold_by_f1(y_true, y_score):
            p, r, th = precision_recall_curve(y_true, y_score)
            f1 = 2 * (p*r) / np.clip(p+r, 1e-12, None); best_idx = np.nanargmax(f1[:-1])
            return th[best_idx], f1[best_idx], p[best_idx], r[best_idx]

        def pick_threshold_by_cost(y_true, y_score, c_fp, c_fn):
            ths = np.linspace(0.0, 1.0, 1001); best_th, best_cost = 0.5, np.inf
            for t in ths:
                y_pred = (y_score >= t).astype(int)
                tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
                cost = fp * c_fp + fn * c_fn
                if cost < best_cost: best_cost, best_th = cost, t
            return best_th, best_cost

        def pick_threshold_by_recall_with_prec_min(y_true, y_score, prec_min=0.6):
            p, r, th = precision_recall_curve(y_true, y_score)
            valid = np.where(p[:-1] >= prec_min)[0]
            if len(valid) == 0: return 0.5, 0.0, 0.0
            best_idx = valid[np.argmax(r[valid])]; return th[best_idx], r[best_idx], p[best_idx]

        if thr_mode == "F1 óptimo":
            thr, best_f1, best_p, best_r = pick_threshold_by_f1(y_test, y_proba)
        elif thr_mode == "Minimizar costo":
            thr, best_cost = pick_threshold_by_cost(y_test, y_proba, c_fp, c_fn)
        elif thr_mode == "Maximizar recall con precisión mínima":
            thr, best_r, best_p = pick_threshold_by_recall_with_prec_min(y_test, y_proba, prec_min=prec_min)

        y_pred = (y_proba >= thr).astype(int)

        acc     = accuracy_score(y_test, y_pred)
        bacc    = balanced_accuracy_score(y_test, y_pred)
        prec_c0 = precision_score(y_test, y_pred, pos_label=0, zero_division=0)
        prec_c1 = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
        rec_c0  = recall_score(y_test, y_pred, pos_label=0, zero_division=0)
        rec_c1  = recall_score(y_test, y_pred, pos_label=1, zero_division=0)
        f1_min  = f1_score(y_test, y_pred, pos_label=1, zero_division=0)
        auc     = roc_auc_score(y_test, y_proba)
        auprc   = average_precision_score(y_test, y_proba)

        met_rows = [
            ("Exactitud", acc),
            ("Balanced accuracy", bacc),
            (f"Precision ({clases[0]})", prec_c0),
            (f"Precision ({clases[1]})", prec_c1),
            (f"Sensibilidad ({clases[0]})", rec_c0),
            (f"Sensibilidad ({clases[1]})", rec_c1),
            (f"F1 ({clases[1]})", f1_min),
            ("ROC-AUC", auc)
        ]
        if thr_mode == "Minimizar costo":
            met_rows.append(("Costo total (FP/FN)", best_cost))

        met_tab = pd.DataFrame(met_rows, columns=["Métrica", "Valor"])
        st.subheader("Métricas")
        st.dataframe(met_tab, use_container_width=True)

        prev = y_test.mean()
        if prec_c1 == 1.0 and rec_c1 < 0.15:
            st.warning("La precisión de la clase minoritaria es 1.0 pero el recall es muy bajo. Ajusta umbral o balanceo.")
        if acc > 0.9 and bacc < 0.65 and prev < 0.25:
            st.info("La exactitud es alta por el desbalance. Revisa balanced accuracy, AUPRC y F1 de la minoritaria.")

        coef = algoritmo.coef_[0]; intercepto = algoritmo.intercept_[0]
        coef_tab = pd.DataFrame({
            "Variable": ["Intercepto"] + Variables_x,
            "Coeficiente (log-odds)": [intercepto] + list(coef),
            "Odds Ratio (exp(coef))": [np.exp(intercepto)] + list(np.exp(coef))
        })
        if not modo_presentacion:
            st.subheader("Coeficientes del modelo")
            st.dataframe(coef_tab, use_container_width=True)

        matriz = confusion_matrix(y_test, y_pred, labels=[0, 1])
        labels_disp = [clases[0], clases[1]]
        fig_cm = go.Figure(data=go.Heatmap(
            z=matriz,
            x=[f"Pred {labels_disp[0]}", f"Pred {labels_disp[1]}"],
            y=[f"Real {labels_disp[0]}", f"Real {labels_disp[1]}"],
            colorscale="Oranges", showscale=True, hoverongaps=False
        ))
        ann = []
        tags = np.array([["TN","FP"],["FN","TP"]])
        for i in range(2):
            for j in range(2):
                ann.append(dict(
                    x=[f"Pred {labels_disp[0]}", f"Pred {labels_disp[1]}"][j],
                    y=[f"Real {labels_disp[0]}", f"Real {labels_disp[1]}"][i],
                    text=f"{tags[i,j]}: {matriz[i,j]}",
                    showarrow=False,
                    font=dict(color="white" if matriz[i,j] > matriz.max()/2 else "black")
                ))
        fig_cm.update_layout(title=f"Matriz de confusión (umbral={thr:.2f})", annotations=ann, width=520, height=520)
        st.plotly_chart(fig_cm, use_container_width=False)

        fpr, tpr, _ = roc_curve(y_test, y_proba)
        fig_roc = go.Figure()
        fig_roc.add_trace(go.Scatter(x=fpr, y=tpr, mode="lines", name=f"ROC (AUC={auc:.3f})"))
        fig_roc.add_trace(go.Scatter(x=[0,1], y=[0,1], mode="lines", name="Aleatorio", line=dict(dash="dot")))
        fig_roc.update_layout(title="Curva ROC", xaxis_title="FPR", yaxis_title="TPR")
        st.plotly_chart(fig_roc, use_container_width=True)

        p, r, th = precision_recall_curve(y_test, y_proba)
        fig_pr = go.Figure()
        fig_pr.add_trace(go.Scatter(x=r, y=p, mode="lines", name=f"PR (AP={auprc:.3f})"))
        fig_pr.update_layout(title="Curva Precisión-Recall (clase 1)", xaxis_title="Recall", yaxis_title="Precisión")
        st.plotly_chart(fig_pr, use_container_width=True)

        fig_prob = px.strip(
            x=[labels_disp[i] for i in y_test], y=y_proba,
            labels={"x":"Clase real", "y":"Probabilidad P(Y=1)"},
            title="Distribución de probabilidades por clase real"
        )
        fig_prob.add_hline(y=thr, line_dash="dot", annotation_text=f"Umbral {thr:.2f}")
        st.plotly_chart(fig_prob, use_container_width=True)

        st.caption(f"Mapeo interno (solo para el modelo): {clases[0]} → 0, {clases[1]} → 1. Prevalencia clase 1 (test): {prev:.3f}")

##########################################################################################
# Vista 5 — COMPARAR PAÍSES (Nueva)
if View == "Comparar países":
    st.title("Comparación de países (Alemania · Valencia · Estocolmo · México)")
    st.caption("Misma métrica y visual por país, en una sola vista.")

    # Cargar todos los países
    dfs = {}
    listas_cat = {}
    for c in COUNTRY_FILES.keys():
        dfi, Li = load_country_df(c)
        dfs[c] = dfi
        listas_cat[c] = set(Li).intersection(set(dfi.columns))

    common_num, common_bin, common_cat = get_common_lists(dfs)

    # Sub-vistas
    subview = st.radio("Sub-vista", ["Extracción comparada", "Regresión logística comparada"], horizontal=True)

    if subview == "Extracción comparada":
        if len(common_cat)==0:
            st.error("No hay columnas categóricas en común en los 4 datasets.")
            st.stop()
        var_cat = st.selectbox("Variable categórica común", options=sorted(common_cat), index=sorted(common_cat).index("room_type") if "room_type" in common_cat else 0)

        # KPI's por país (fila completa)
        st.markdown("### KPI's por país")
        for c in COUNTRY_FILES.keys():
            with st.container():
                kpis_block(dfs[c], c)

        st.markdown("---")
        st.markdown("### Extracción (4× gráficas por país)")
        # Grilla 2x2 por país con (bar, pie, donut, área) + detalle (box/heatmap) + galería
        for c in COUNTRY_FILES.keys():
            st.subheader(f"{c}")
            tabla, fig_bar, fig_pie, fig_donut, fig_area, detail_fig = extraction_charts(dfs[c], var_cat)
            colA, colB = st.columns(2)
            with colA: st.plotly_chart(fig_bar, use_container_width=True)
            with colB: st.plotly_chart(fig_pie, use_container_width=True)
            colC, colD = st.columns(2)
            with colC: st.plotly_chart(fig_donut, use_container_width=True)
            with colD: st.plotly_chart(fig_area, use_container_width=True)

            st.plotly_chart(detail_fig, use_container_width=True)

            if not modo_presentacion:
                with st.expander(f"Tabla de frecuencias · {c}"):
                    st.dataframe(tabla.style.background_gradient(cmap='Reds'), use_container_width=True)

            with st.expander(f"Galería · {c}"):
                gallery_block(c)

            st.markdown("---")

    else:
        # Logística comparada
        if len(common_bin)==0:
            st.error("No hay variables binarias en común en los 4 datasets.")
            st.stop()
        if len(common_num)==0:
            st.error("No hay variables numéricas en común en los 4 datasets.")
            st.stop()

        st.markdown("### Parámetros comunes")
        y_col = st.selectbox("Variable Y (binaria, común)", options=common_bin)
        x_cols = st.multiselect("Variables X (numéricas, comunes)", options=common_num, default=[c for c in common_num if c not in [y_col]][:3])
        test_size = st.slider("Tamaño de prueba", 0.1, 0.5, 0.30, 0.05)

        colU, colV, colW = st.columns(3)
        with colU:
            imb_method = st.selectbox("Manejo de desbalance", ["Ninguno","class_weight='balanced'","SMOTE (over-sampling)","Under-sampling"])
        with colV:
            thr_mode = st.selectbox("Umbral por", ["Manual","F1 óptimo","Minimizar costo","Maximizar recall con precisión mínima"])
        with colW:
            thr_manual = st.slider("Umbral (si Manual)", 0.01, 0.99, 0.50, 0.01)

        colX, colY = st.columns(2)
        with colX:
            c_fp = st.number_input("Costo por FP (si Minimizar costo)", min_value=0, value=10000, step=1000)
        with colY:
            c_fn = st.number_input("Costo por FN (si Minimizar costo)", min_value=0, value=80000, step=1000)

        prec_min = st.slider("Precisión mínima (si Máx. recall)", 0.1, 0.99, 0.60, 0.01)

        if len(x_cols)==0:
            st.info("Selecciona al menos 1 X para correr comparación.")
            st.stop()

        # Ejecutar por país
        results = {}
        for c in COUNTRY_FILES.keys():
            res = run_logistic_block(
                dfs[c], y_col, x_cols,
                thr_mode=thr_mode,
                thr=thr_manual,
                c_fp=c_fp, c_fn=c_fn,
                prec_min=prec_min,
                test_size=test_size,
                imb_method=imb_method
            )
            if res is not None:
                results[c] = res

        if len(results)==0:
            st.error("No se pudo entrenar el modelo en ninguno de los países (revisa datos y clases).")
            st.stop()

        st.markdown("### Métricas comparadas")
        # Tabla apilada por país
        tabs = st.tabs(list(results.keys()))
        for tab, (c, res) in zip(tabs, results.items()):
            with tab:
                st.dataframe(res["metrics"], use_container_width=True)

        st.markdown("### Matrices de confusión por país")
        # Grilla 2x2
        countries = list(results.keys())
        rows = [countries[:2], countries[2:4]]
        for row in rows:
            cols = st.columns(len(row))
            for i, c in enumerate(row):
                with cols[i]:
                    st.markdown(f"**{c}**")
                    st.plotly_chart(results[c]["cm_fig"], use_container_width=True)

        with st.expander("Curvas ROC por país"):
            cols = st.columns(2)
            items = list(results.items())
            for i, (c, res) in enumerate(items):
                with cols[i%2]:
                    st.markdown(f"**{c}**"); st.plotly_chart(res["roc_fig"], use_container_width=True)

        with st.expander("Curvas Precisión-Recall por país"):
            cols = st.columns(2)
            items = list(results.items())
            for i, (c, res) in enumerate(items):
                with cols[i%2]:
                    st.markdown(f"**{c}**"); st.plotly_chart(res["pr_fig"], use_container_width=True)

        with st.expander("Distribución de probabilidades por país"):
            cols = st.columns(2)
            items = list(results.items())
            for i, (c, res) in enumerate(items):
                with cols[i%2]:
                    st.markdown(f"**{c}**"); st.plotly_chart(res["prob_fig"], use_container_width=True)

# FOOTER
st.markdown("---")
st.markdown("""
<div style="text-align:center; opacity:0.8; font-size:0.9rem;">
© Proyecto para Gestión de Proyectos — Dashboard creado por <b>Los Guaranies</b> con ayuda de IA y profe Freddy/Malu.  
<br> Construido con Streamlit, Plotly y Python.
</div>
""", unsafe_allow_html=True)



Writing Dashboard_Comparativo.py


In [1]:
%%writefile Dashboard_Final.py
# Dashboard Final equipo integrando en las vistas a los 4 países - Proyecto Airbnb
# Versión final

##########
# Importar librerías
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
from sklearn.metrics import (
    r2_score, mean_absolute_error, mean_squared_error,
    confusion_matrix, accuracy_score, precision_score,
    recall_score, roc_auc_score, roc_curve, classification_report, f1_score,
    precision_recall_curve, average_precision_score, balanced_accuracy_score
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

##########
# Configuración global
st.set_page_config(
    page_title="Airbnb (Data Web)",
    page_icon="assets/icon.jpg",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Paleta Airbnb
AIRBNB_RED   = "#FF5A5F"
AIRBNB_TEAL  = "#00A699"
AIRBNB_ORANGE= "#FC642D"
AIRBNB_GRAY  = "#BFBFBF"
AIRBNB_DARK_BG = "#0E1117"
AIRBNB_CARD   = "#151A22"
AIRBNB_BORDER = "#232A35"
CONT_GRADIENT = "Reds"

##########
# CSS Look & Feel Airbnb
st.markdown(f"""
<style>
.block-container {{ padding-top: 1.2rem; padding-bottom: 2rem; }}
html, body, [data-testid="stAppViewContainer"], section[data-testid="stSidebar"] {{
    background: radial-gradient(circle at 30% 30%, #131722 0%, #0E1117 100%) !important;
    color: white !important;
}}
section[data-testid="stSidebar"] {{ border-right: 1px solid {AIRBNB_BORDER}; }}
.air-card {{
    border: 1px solid {AIRBNB_BORDER};
    border-radius:16px; padding:1rem;
    background:{AIRBNB_CARD};
}}
.stButton>button {{
    background:{AIRBNB_RED}; color:white; border-radius:12px; border:none;
    padding:.6rem 1rem; font-weight:600;
}}
.stButton>button:hover {{ opacity:.9 }}
.stDataFrame, .stTable {{ color: white !important; }}
</style>
""", unsafe_allow_html=True)

##########
# Plotly: plantilla Airbnb
AIRBNB_COLORWAY = ["#FF5A5F", "#00A699", "#FC642D", "#BFBFBF", "#767676"]
pio.templates["airbnb_dark"] = pio.templates["plotly_dark"]
pio.templates["airbnb_dark"].layout.colorway = AIRBNB_COLORWAY
px.defaults.template = "airbnb_dark"
px.defaults.color_continuous_scale = CONT_GRADIENT
px.defaults.height = 420

##########
# Multi-país
COUNTRY_FILES = {
    "Alemania": "Berlin_Final.csv",
    "Valencia": "Valencia_Final.csv",
    "Estocolmo": "Estocolmo_Final.csv",
    "Mexico": "Mexico_Final.csv",
}

COUNTRY_IMAGES = {
    "Alemania": ["assets/Berlin1.jpg", "assets/Berlin3.jpg", "assets/Berlin2.jpg"],
    "Valencia": ["assets/Valencia1.jpg", "assets/Valencia2.jpg", "assets/Valencia3.jpg"],
    "Estocolmo": ["assets/Estocolmo1.jpg", "assets/Estocolmo2.jpg", "assets/Estocolmo3.jpg"],
    "Mexico": ["assets/Mexico1.jpg", "assets/Mexico2.jpg", "assets/Mexico3.jpg"],
}

##########
# Normalización
BIN_TRUE = {"t","true","True",1,"1",True}
BIN_FALSE= {"f","false","False",0,"0",False}

def _normalize_binary(series):
    s = series.copy()
    return s.apply(lambda v: 1 if v in BIN_TRUE else (0 if v in BIN_FALSE else np.nan)).astype("float")

def _normalize_df(df_raw):
    df = df_raw.copy()
    df.columns = df.columns.str.strip()
    df = df.loc[:, ~df.columns.str.contains(r"^Unnamed", na=False)]
    df = df.drop(['latitude','longitude','first_review','last_review','host_since', 'price', 'estimated_revenue_l365d','source','id', 'scrape_id'],
                 axis=1, errors="ignore")
    if 'id' in df.columns:
        df['id'] = df['id'].astype(str)
    if 'host_id' in df.columns:
        df['host_id'] = df['host_id'].astype(str)
    for col in ['host_is_superhost','host_identity_verified','instant_bookable']:
        if col in df.columns:
            df[col] = _normalize_binary(df[col])
    for col in ['host_response_rate','host_acceptance_rate','price','estimated_revenue_l365d','price_eur']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

def _clean_xy(df_base, y_col, x_cols):
    work = df_base[x_cols + [y_col]].replace([np.inf, -np.inf], np.nan)
    before = len(work)
    work = work.dropna()
    after = len(work)
    X = work[x_cols].to_numpy(dtype=float)
    y = work[y_col].to_numpy(dtype=float)
    return X, y, before - after

@st.cache_data(show_spinner=False)
def load_country_df(country: str):
    path = COUNTRY_FILES[country]
    raw = pd.read_csv(path)
    df = _normalize_df(raw)
    Lista = [
        'host_is_superhost','host_identity_verified','host_response_time',
        'host_response_rate','host_acceptance_rate','host_total_listings_count',
        'host_verifications','room_type','property_type','price_cat'
    ]
    return df, Lista

def kpis_block(df, country):
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        st.metric(f"{country} · Filas", f"{len(df):,}")
        st.markdown('</div>', unsafe_allow_html=True)
    with col2:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        st.metric(f"{country} · Tipos de propiedad", df['property_type'].nunique() if 'property_type' in df.columns else "—")
        st.markdown('</div>', unsafe_allow_html=True)
    with col3:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        med_price = np.nanmean(df['price_eur']) if 'price_eur' in df.columns else np.nan
        st.metric(f"{country} · Media precio", f"€{med_price:,.0f}" if np.isfinite(med_price) else "—")
        st.markdown('</div>', unsafe_allow_html=True)
    with col4:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        superhosts = int((df['host_is_superhost'] == 1).sum()) if 'host_is_superhost' in df.columns else 0
        st.metric(f"{country} · Superhosts", superhosts)
        st.markdown('</div>', unsafe_allow_html=True)

def extraction_tables_and_figs(df, var_cat: str):
    tabla = df[var_cat].value_counts(dropna=False).reset_index().head(10)
    tabla.columns = ['categorias','frecuencia']
    fig_bar = px.bar(tabla, x='categorias', y='frecuencia', color='categorias', title=None)
    fig_pie = px.pie(tabla, names='categorias', values='frecuencia', title=None)
    fig_donut = px.pie(tabla, names='categorias', values='frecuencia', hole=0.5, title=None)
    fig_area = px.area(tabla.sort_values('frecuencia', ascending=False), x='categorias', y='frecuencia', title=None)
    heat_df = pd.crosstab(index=df[var_cat], columns='count', normalize='columns') * 100
    fig_heat = px.imshow(heat_df, color_continuous_scale=CONT_GRADIENT, title=None)
    return tabla, fig_bar, fig_pie, fig_donut, fig_area, fig_heat
    
def gallery_block(country):
    st.markdown(f"**Galería:** {country} — Airbnb")
    imgs = COUNTRY_IMAGES.get(country, [])
    gcols = st.columns(3)
    for i, path in enumerate(imgs[:3]):
        with gcols[i]:
            try:
                st.image(path, use_container_width=True)
            except Exception:
                st.write("🖼️ Imagen no encontrada")

def get_common_lists(dfs_dict):
    num_sets, bin_sets, cat_sets = [], [], []
    for _, df in dfs_dict.items():
        num_cols = set(df.select_dtypes(include=['float','float64','int','int64']).columns.tolist())
        bin_cols = set([c for c in df.columns if df[c].dropna().nunique()==2])
        cat_cols = set([c for c in df.columns if df[c].dtype=='object' or df[c].dtype.name=='category'])
        cat_cols |= set([c for c in ['room_type','property_type','price_cat','host_response_time'] if c in df.columns])
        num_sets.append(num_cols); bin_sets.append(bin_cols); cat_sets.append(cat_cols)
    common_num = set.intersection(*num_sets) if num_sets else set()
    common_bin = set.intersection(*bin_sets) if bin_sets else set()
    common_cat = set.intersection(*cat_sets) if cat_sets else set()
    return sorted(list(common_num)), sorted(list(common_bin)), sorted(list(common_cat))

def run_logistic_block(df, y_col, x_cols, thr_mode="Manual", thr=0.5, c_fp=10000, c_fn=80000,
                       prec_min=0.6, test_size=0.30, imb_method="Ninguno"):
    # Asegura que y_col sea un string escalar
    if not isinstance(y_col, str):
        if isinstance(y_col, (list, tuple, np.ndarray)) and len(y_col) > 0:
            y_col = y_col[0]
        else:
            y_col = str(y_col)

    # Evita que Y esté en X (por si el usuario la seleccionó por error)
    x_cols = [x for x in x_cols if x != y_col]

    # Si por esta limpieza te quedas sin X, aborta con None
    if len(x_cols) == 0:
        return None

    # Construye base y fuerza Series para Y (no DataFrame)
    base = df[x_cols + [y_col]].copy()
    y_obj = base[y_col]
    if isinstance(y_obj, pd.DataFrame):
        # Si por alguna razón llega como DF (p.ej. columnas duplicadas), toma la primera
        y_obj = y_obj.iloc[:, 0]

    # Obtén clases de Y (exactamente dos)
    vals = pd.Series(y_obj).dropna().astype(object).unique().tolist()
    if len(vals) != 2:
        return None

    # Mapeo a {0,1} preservando el orden de aparición
    mapping = {vals[0]: 0, vals[1]: 1}
    base['__y__'] = y_obj.map(mapping)

    # Limpieza de NaN/Inf
    base = base.replace([np.inf, -np.inf], np.nan).dropna(subset=x_cols + ['__y__'])
    if base['__y__'].nunique() < 2:
        return None

    X = base[x_cols].astype(float).to_numpy()
    y = base['__y__'].to_numpy(dtype=int)

    # Split + escala
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s  = scaler.transform(X_test)

    # Re-muestreo
    if imb_method == "SMOTE (over-sampling)":
        sm = SMOTE(random_state=42)
        X_train_s, y_train = sm.fit_resample(X_train_s, y_train)
    elif imb_method == "Under-sampling":
        rus = RandomUnderSampler(random_state=42)
        X_train_s, y_train = rus.fit_resample(X_train_s, y_train)

    # Modelo
    if imb_method == "class_weight='balanced'":
        clf = LogisticRegression(max_iter=1000, class_weight='balanced')
    else:
        clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train_s, y_train)

    # Probabilidades
    y_proba = clf.predict_proba(X_test_s)[:, 1]

    # Selectores de umbral
    def pick_threshold_by_f1(y_true, y_score):
        p, r, th = precision_recall_curve(y_true, y_score)
        f1 = 2 * (p * r) / np.clip(p + r, 1e-12, None)
        best_idx = np.nanargmax(f1[:-1])
        return th[best_idx]

    def pick_threshold_by_cost(y_true, y_score, c_fp, c_fn):
        ths = np.linspace(0.0, 1.0, 1001)
        best_th, best_cost = 0.5, np.inf
        for t in ths:
            y_pred = (y_score >= t).astype(int)
            tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
            cost = fp * c_fp + fn * c_fn
            if cost < best_cost:
                best_cost, best_th = cost, t
        return best_th

    def pick_threshold_by_recall_with_prec_min(y_true, y_score, prec_min=0.6):
        p, r, th = precision_recall_curve(y_true, y_score)
        valid = np.where(p[:-1] >= prec_min)[0]
        if len(valid) == 0:
            return 0.5
        best_idx = valid[np.argmax(r[valid])]
        return th[best_idx]

    # Determina umbral
    if thr_mode == "F1 óptimo":
        thr = pick_threshold_by_f1(y_test, y_proba)
    elif thr_mode == "Minimizar costo":
        thr = pick_threshold_by_cost(y_test, y_proba, c_fp, c_fn)
    elif thr_mode == "Maximizar recall con precisión mínima":
        thr = pick_threshold_by_recall_with_prec_min(y_test, y_proba, prec_min=prec_min)
    # si es Manual, se respeta el thr que viene

    y_pred = (y_proba >= thr).astype(int)

    # Métricas
    acc   = accuracy_score(y_test, y_pred)
    bacc  = balanced_accuracy_score(y_test, y_pred)
    prec1 = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
    rec1  = recall_score(y_test, y_pred, pos_label=1, zero_division=0)
    f1m   = f1_score(y_test, y_pred, pos_label=1, zero_division=0)
    auc   = roc_auc_score(y_test, y_proba)
    auprc = average_precision_score(y_test, y_proba)
    cm    = confusion_matrix(y_test, y_pred, labels=[0, 1])

    # Figuras
    labels_disp = [list(mapping.keys())[0], list(mapping.keys())[1]]
    fig_cm = go.Figure(data=go.Heatmap(
        z=cm,
        x=[f"Pred {labels_disp[0]}", f"Pred {labels_disp[1]}"],
        y=[f"Real {labels_disp[0]}", f"Real {labels_disp[1]}"],
        colorscale="Oranges", showscale=True, hoverongaps=False
    ))
    ann = []
    tags = np.array([["TN","FP"],["FN","TP"]])
    for i in range(2):
        for j in range(2):
            ann.append(dict(
                x=[f"Pred {labels_disp[0]}", f"Pred {labels_disp[1]}"][j],
                y=[f"Real {labels_disp[0]}", f"Real {labels_disp[1]}"][i],
                text=f"{tags[i,j]}: {cm[i,j]}",
                showarrow=False,
                font=dict(color="white" if cm[i,j] > cm.max()/2 else "black")
            ))
    fig_cm.update_layout(title=f"Matriz de confusión · umbral={thr:.2f}", annotations=ann, width=520, height=520)

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    fig_roc = go.Figure()
    fig_roc.add_trace(go.Scatter(x=fpr, y=tpr, mode="lines", name=f"ROC (AUC={auc:.3f})"))
    fig_roc.add_trace(go.Scatter(x=[0,1], y=[0,1], mode="lines", name="Aleatorio", line=dict(dash="dot")))
    fig_roc.update_layout(title="Curva ROC", xaxis_title="FPR", yaxis_title="TPR")

    p, r, th = precision_recall_curve(y_test, y_proba)
    fig_pr = go.Figure()
    fig_pr.add_trace(go.Scatter(x=r, y=p, mode="lines", name=f"PR (AP={auprc:.3f})"))
    fig_pr.update_layout(title="Curva Precisión-Recall (clase 1)", xaxis_title="Recall", yaxis_title="Precisión")

    met_tab = pd.DataFrame({
        "Métrica": ["Exactitud","Balanced accuracy","Precisión (1)","Recall (1)","F1 (1)","ROC-AUC","AP (PR)"],
        "Valor":   [acc, bacc, prec1, rec1, f1m, auc, auprc]
    })

    return dict(
        metrics=met_tab, cm_fig=fig_cm, roc_fig=fig_roc, pr_fig=fig_pr,
        thr=thr, mapping=mapping
    )


# Carga inicial
df, Lista = load_country_df("Alemania")

##########
# Header
col_logo, col_title = st.columns([1,5], vertical_alignment="center")
with col_logo:
    st.image("assets/Logo3.jpg", width=90)
with col_title:
    st.markdown("""
        # Airbnb Data Analysis
        <span style="color:#767676">Listados, precios y comportamiento de oferta</span>
    """, unsafe_allow_html=True)

##########
# Sidebar
st.sidebar.image("assets/Logoo.jpg", use_container_width=True)
st.sidebar.caption("Análisis exploratorio y modelos")
st.sidebar.markdown("---")
modo_presentacion = st.sidebar.toggle("Modo presentación", value=False)
country = st.sidebar.selectbox("País", list(COUNTRY_FILES.keys()), index=0)
df, Lista = load_country_df(country)
View = st.sidebar.selectbox(
    label='Tipo de análisis',
    options=['Extracción de Características', 'Regresión Lineal', 'Regresión No Lineal', 'Regresión Logística', 'Comparar países'],
    index=0
)

##########################################################################################
# Vista 1 — Extracción de características (individual)
if View == "Extracción de Características":
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        st.metric("Filas", f"{len(df):,}")
        st.markdown('</div>', unsafe_allow_html=True)
    with col2:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        st.metric("Tipos de propiedad", df['property_type'].nunique() if 'property_type' in df.columns else "—")
        st.markdown('</div>', unsafe_allow_html=True)
    with col3:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        med_price = np.nanmean(df['price_eur']) if 'price_eur' in df.columns else np.nan
        st.metric("Media de precio", f"€{med_price:,.0f}" if np.isfinite(med_price) else "—")
        st.markdown('</div>', unsafe_allow_html=True)
    with col4:
        st.markdown('<div class="air-card">', unsafe_allow_html=True)
        superhosts = int((df['host_is_superhost'] == 1).sum()) if 'host_is_superhost' in df.columns else 0
        st.metric("Superhosts", superhosts)
        st.markdown('</div>', unsafe_allow_html=True)

    st.markdown("---")

    Variable_Cat = st.sidebar.selectbox("Variable categórica a analizar", options=Lista)
    Tabla_frecuencias = df[Variable_Cat].value_counts(dropna=False).reset_index().head(10)
    Tabla_frecuencias.columns = ['categorias', 'frecuencia']

    st.title("Extracción de Características")
    st.caption('Se muestran máximo las 10 categorías con más frecuencia.')

    Contenedor_A, Contenedor_B = st.columns(2)
    with Contenedor_A:
        st.subheader("Distribución por categoría (Bar Plot)")
        fig_bar = px.bar(Tabla_frecuencias, x='categorias', y='frecuencia', color='categorias')
        st.plotly_chart(fig_bar, use_container_width=True)
    with Contenedor_B:
        st.subheader("Proporción por categoría (Pie Chart)")
        fig_pie = px.pie(Tabla_frecuencias, names='categorias', values='frecuencia')
        st.plotly_chart(fig_pie, use_container_width=True)

    Contenedor_C, Contenedor_D = st.columns(2)
    with Contenedor_C:
        st.subheader("Gráfico tipo anillo")
        fig_donut = px.pie(Tabla_frecuencias, names='categorias', values='frecuencia', hole=0.5)
        st.plotly_chart(fig_donut, use_container_width=True)
    with Contenedor_D:
        st.subheader("Tendencia acumulada (Área)")
        fig_area = px.area(Tabla_frecuencias.sort_values(by='frecuencia', ascending=False),
                           x='categorias', y='frecuencia')
        st.plotly_chart(fig_area, use_container_width=True)

    st.markdown("---")
    st.subheader("Proporción por categoría (Heatmap)")
    heat_df = pd.crosstab(index=df[Variable_Cat], columns='count', normalize='columns') * 100
    fig_heat = px.imshow(heat_df, color_continuous_scale=CONT_GRADIENT, title="Proporción por categoría")
    st.plotly_chart(fig_heat, use_container_width=True)

    if not modo_presentacion:
        st.markdown("---")
        st.subheader("Tabla de frecuencias")
        st.dataframe(Tabla_frecuencias.style.background_gradient(cmap='Reds'), use_container_width=True)
        
    gallery_block(country)

##########################################################################################
# Vista 2 — Regresión Lineal (individual)
if View == "Regresión Lineal":
    st.title("Regresión Lineal")

    numeric_df = df.select_dtypes(include=['float', 'float64', 'int', 'int64']).copy()
    Lista_num = list(numeric_df.columns)

    st.subheader("Regresión lineal simple")
    colL, colR = st.columns(2)
    with colL:
        Variable_y = st.selectbox("Variable dependiente (Y)", options=Lista_num, key="rl_y")
    with colR:
        Variable_x = st.selectbox("Variable independiente (X)", options=Lista_num, key="rl_x")

    X, y, dropped = _clean_xy(numeric_df, Variable_y, [Variable_x])
    if dropped > 0 and not modo_presentacion:
        st.info(f"Se descartaron {dropped} filas con NaN/Inf para el ajuste.")
    if len(y) < 3:
        st.error("No hay suficientes filas válidas para ajustar el modelo.")
        st.stop()

    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)

    coef_Deter_simple = model.score(X= X, y= y)
    coef_Correl_simple = np.sqrt(abs(coef_Deter_simple))

    coef_df_simple = pd.DataFrame({
        "Variable": [Variable_x],
        "Coeficiente": [model.coef_[0]],
        "Intercepto": [model.intercept_],
        "R": [coef_Correl_simple],
        "R^2": [coef_Deter_simple]
    })
    if not modo_presentacion:
        st.dataframe(coef_df_simple, use_container_width=True)

    fig_scat = px.scatter(numeric_df, x=Variable_x, y=Variable_y, opacity=0.6, title="Dispersión y recta ajustada")
    order_idx = np.argsort(X[:, 0])
    fig_scat.add_trace(go.Scatter(x=X[order_idx, 0], y=y_pred[order_idx], mode="lines", name="Predicción de Y"))
    st.plotly_chart(fig_scat, use_container_width=True)

    resid = y - y_pred
    fig_res = px.scatter(x=y_pred, y=resid, labels={"x":"Ŷ", "y":"Residual"}, title="Residuos vs Predicción (diagnóstico)")
    fig_res.add_hline(y=0, line_dash="dot")
    st.plotly_chart(fig_res, use_container_width=True)

    st.markdown("---")
    st.subheader("Regresión lineal múltiple")
    col1, col2 = st.columns([1,2])
    with col1:
        Variable_y_M = st.selectbox("Variable dependiente (Y)", options=Lista_num, key="rlm_y")
    with col2:
        Variables_x_M = st.multiselect("Variables independientes (X)", options= Lista_num, key="rlm_xs")

    if len(Variables_x_M) >= 1:
        X_M, y_M, droppedM = _clean_xy(numeric_df, Variable_y_M, Variables_x_M)
        if droppedM > 0 and not modo_presentacion:
            st.info(f"Se descartaron {droppedM} filas con NaN/Inf para el ajuste múltiple.")
        if len(y_M) < max(3, len(Variables_x_M)+1):
            st.error("No hay suficientes filas válidas para el modelo múltiple.")
            st.stop()

        Model_M = LinearRegression()
        Model_M.fit(X_M, y_M)
        y_pred_M = Model_M.predict(X_M)

        coef_Deter_multiple = Model_M.score(X=X_M, y=y_M)
        coef_Correl_multiple = np.sqrt(abs(coef_Deter_multiple))

        coef_tab = pd.DataFrame({
            "Variable": ["Intercepto"] + Variables_x_M,
            "Coeficiente": [Model_M.intercept_] + list(Model_M.coef_)
        })
        if not modo_presentacion:
            st.dataframe(coef_tab, use_container_width=True)

        met_tab = pd.DataFrame({'R^2': [coef_Deter_multiple], 'R ': [coef_Correl_multiple]})
        st.dataframe(met_tab, use_container_width=True)

        fig_pred = px.scatter(x=y_M, y=y_pred_M, labels={"x":"Y real ", "y": "Y predicciones"}, title="Comparación Y Real vs Y Predicciones")
        fig_pred.add_trace(go.Scatter(x=[y_M.min(), y_M.max()], y=[y_M.min(), y_M.max()], mode="lines", name="Línea ideal", line=dict(dash="dot")))
        st.plotly_chart(fig_pred, use_container_width=True)
    else:
        st.info("Selecciona al menos 1 variable para el modelo múltiple.")

##########################################################################################
# Vista 3 — Regresión No Lineal (individual)
if View == "Regresión No Lineal":
    st.title("Regresión No Lineal")

    numeric_df = df.select_dtypes(include=['float','float64','int','int64']).copy()
    Lista_num = list(numeric_df.columns)

    contA, contB = st.columns(2)
    with contA:
        Variable_y = st.selectbox("Variable dependiente (Y)", options=Lista_num, key="rnl_y_cf")
    with contB:
        Variable_x = st.selectbox("Variable independiente (X)", options=[c for c in Lista_num if c != Variable_y], key="rnl_x_cf")

    modelos = [
        "Función cuadrática (a*x**2 + b*x + c)",
        "Función exponencial (a*np.exp(-b*x)+c)",
        "Función potencia (a*x**b)",
        "Función cúbica (a*x**3 + b*x**2 + c*x + d)"
    ]
    Modelo = st.selectbox("Elige modelo no lineal", options=modelos, key="rnl_modelo_cf")

    df_nl = numeric_df[[Variable_x, Variable_y]].replace([np.inf, -np.inf], np.nan).dropna()
    if len(df_nl) < 3:
        st.error("Datos insuficientes tras limpiar NaN/Inf para ajustar el modelo no lineal.")
        st.stop()

    x = df_nl[Variable_x].to_numpy(dtype=float)
    y = df_nl[Variable_y].to_numpy(dtype=float)
    sort_idx = np.argsort(x)
    x_sorted = x[sort_idx]

    def func_cuad(x, a, b, c): return a*x**2 + b*x + c
    def func_cub(x, a, b, c, d): return a*x**3 + b*x**2 + c*x + d
    def func_exp(x, a, b, c): return a * np.exp(-b * x) + c
    def func_pot(x, a, b): return a * np.power(x, b)

    try:
        if Modelo == "Función cuadrática (a*x**2 + b*x + c)":
            pars, cov = curve_fit(func_cuad, x, y, maxfev=20000)
            y_pred = func_cuad(x, *pars); y_line = func_cuad(x_sorted, *pars)
            params_df = pd.DataFrame({"Parámetro": ["a", "b", "c"], "Valor": pars})
        elif Modelo == "Función cúbica (a*x**3 + b*x**2 + c*x + d)":
            pars, cov = curve_fit(func_cub, x, y, maxfev=30000)
            y_pred = func_cub(x, *pars); y_line = func_cub(x_sorted, *pars)
            params_df = pd.DataFrame({"Parámetro": ["a", "b", "c", "d"], "Valor": pars})
        elif Modelo == "Función exponencial (a*np.exp(-b*x)+c)":
            mask = np.isfinite(y)
            if np.sum(mask) < 3: st.error("No hay suficientes datos válidos para el modelo exponencial."); st.stop()
            pars, cov = curve_fit(func_exp, x, y, maxfev=30000)
            y_pred = func_exp(x, *pars); y_line = func_exp(x_sorted, *pars)
            params_df = pd.DataFrame({"Parámetro": ["a", "b", "c"], "Valor": pars})
        elif Modelo == "Función potencia (a*x**b)":
            mask = (x > 0) & (y > 0) & np.isfinite(x) & np.isfinite(y)
            if mask.sum() < 3: st.error("Para potencia se requieren suficientes valores con x>0 e y>0."); st.stop()
            x_pos, y_pos = x[mask], y[mask]
            pars, cov = curve_fit(func_pot, x_pos, y_pos, maxfev=20000)
            x_safe = np.clip(x, 1e-12, None); x_sorted_safe = np.clip(x_sorted, 1e-12, None)
            y_pred = func_pot(x_safe, *pars); y_line = func_pot(x_sorted_safe, *pars)
            params_df = pd.DataFrame({"Parámetro": ["a", "b"], "Valor": pars})
        else:
            st.warning("Selecciona un modelo válido."); st.stop()

        r2 = r2_score(y, y_pred); r = np.sqrt(abs(r2))

        st.markdown("**Parámetros estimados (curve_fit):**")
        if not modo_presentacion: st.dataframe(params_df, use_container_width=True)

        st.markdown("**Métricas del ajuste:**")
        st.dataframe(pd.DataFrame({"R^2":[r2], "R ":[r]}), use_container_width=True)

        fig = px.scatter(x=x, y=y, labels={"x": Variable_x, "y": Variable_y},
                         opacity=0.6, title=f"{Modelo} — Dispersión y curva ajustada")
        fig.add_trace(go.Scatter(x=x_sorted, y=y_line, mode="lines", name="Ŷ (curva)", line=dict(width=2)))
        st.plotly_chart(fig, use_container_width=True)

        resid = y - y_pred
        fig_resid = px.scatter(x=y_pred, y=resid, labels={"x":"Ŷ", "y":"Residual"},
                               title="Residuos vs Predicción")
        fig_resid.add_hline(y=0, line_dash="dot")
        st.plotly_chart(fig_resid, use_container_width=True)

    except RuntimeError as e:
        st.error(f"No convergió el ajuste: {e}.")
    except Exception as e:
        st.error(f"Error durante el ajuste: {e}")

##########################################################################################
# Vista 4 — Regresión Logística (individual)
if View == "Regresión Logística":
    st.title("Regresión Logística")

    numeric_df = df.select_dtypes(include=['float', 'float64', 'int', 'int64'])
    Lista_num  = list(numeric_df.columns)

    dico_cols = []
    for col in df.columns:
        vals = df[col].dropna().unique()
        if len(vals) == 2:
            dico_cols.append(col)

    if len(dico_cols) == 0:
        st.warning("No se detectaron variables binarias en el dataset."); st.stop()

    Variable_y = st.sidebar.selectbox("Variable dependiente (Y, binaria)", options=dico_cols)
    Variables_x = st.sidebar.multiselect("Variables independientes (X, numéricas)", options=Lista_num)

    test_size = st.sidebar.slider("Tamaño de prueba", 0.1, 0.5, 0.30, 0.05)
    thr = st.sidebar.slider("Umbral de clasificación", 0.05, 0.95, 0.50, 0.01)

    if len(Variables_x) == 0:
        st.info("Selecciona al menos una variable independiente (X).")
    else:
        base = df[Variables_x + [Variable_y]].copy()
        vals = base[Variable_y].dropna().unique().tolist()
        if len(vals) != 2:
            st.error(f"La variable '{Variable_y}' debe tener exactamente 2 clases. Encontradas: {vals}")
            st.stop()

        mapping = {vals[0]: 0, vals[1]: 1}
        base['__y__'] = base[Variable_y].map(mapping)
        base = base.replace([np.inf, -np.inf], np.nan).dropna(subset=Variables_x + ['__y__'])
        if base['__y__'].nunique() < 2:
            st.error("Tras limpiar datos, solo queda una clase en Y."); st.stop()

        X = base[Variables_x].astype(float).to_numpy()
        y = base['__y__'].to_numpy(dtype=int)
        clases = vals

        st.sidebar.markdown("### Manejo de desbalance")
        imb_method = st.sidebar.selectbox("Método", ["Ninguno","class_weight='balanced'","SMOTE (over-sampling)","Under-sampling"])

        st.sidebar.markdown("### Estrategia de umbral")
        thr_mode = st.sidebar.selectbox("Seleccionar umbral por…", ["Manual", "F1 óptimo", "Minimizar costo", "Maximizar recall con precisión mínima"])
        prec_min = None; c_fp = None; c_fn = None
        if thr_mode == "Manual":
            thr = st.sidebar.slider("Umbral de clasificación", 0.01, 0.99, thr, 0.01)
        elif thr_mode == "Maximizar recall con precisión mínima":
            prec_min = st.sidebar.slider("Precisión mínima requerida", 0.1, 0.99, 0.6, 0.01)
        elif thr_mode == "Minimizar costo":
            c_fp = st.sidebar.number_input("Costo por FP", min_value=0, value=10000, step=1000)
            c_fn = st.sidebar.number_input("Costo por FN", min_value=0, value=80000, step=1000)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)
        escalar = StandardScaler()
        X_train_s = escalar.fit_transform(X_train)
        X_test_s  = escalar.transform(X_test)

        if imb_method == "SMOTE (over-sampling)":
            sm = SMOTE(random_state=42); X_train_s, y_train = sm.fit_resample(X_train_s, y_train)
        elif imb_method == "Under-sampling":
            rus = RandomUnderSampler(random_state=42); X_train_s, y_train = rus.fit_resample(X_train_s, y_train)

        if imb_method == "class_weight='balanced'":
            algoritmo = LogisticRegression(max_iter=1000, class_weight='balanced')
        else:
            algoritmo = LogisticRegression(max_iter=1000)
        algoritmo.fit(X_train_s, y_train)

        y_proba = algoritmo.predict_proba(X_test_s)[:, 1]

        def pick_threshold_by_f1(y_true, y_score):
            p, r, th = precision_recall_curve(y_true, y_score)
            f1 = 2 * (p*r) / np.clip(p+r, 1e-12, None); best_idx = np.nanargmax(f1[:-1])
            return th[best_idx], f1[best_idx], p[best_idx], r[best_idx]

        def pick_threshold_by_cost(y_true, y_score, c_fp, c_fn):
            ths = np.linspace(0.0, 1.0, 1001); best_th, best_cost = 0.5, np.inf
            for t in ths:
                y_pred = (y_score >= t).astype(int)
                tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
                cost = fp * c_fp + fn * c_fn
                if cost < best_cost: best_cost, best_th = cost, t
            return best_th, best_cost

        def pick_threshold_by_recall_with_prec_min(y_true, y_score, prec_min=0.6):
            p, r, th = precision_recall_curve(y_true, y_score)
            valid = np.where(p[:-1] >= prec_min)[0]
            if len(valid) == 0: return 0.5, 0.0, 0.0
            best_idx = valid[np.argmax(r[valid])]; return th[best_idx], r[best_idx], p[best_idx]

        if thr_mode == "F1 óptimo":
            thr, best_f1, best_p, best_r = pick_threshold_by_f1(y_test, y_proba)
        elif thr_mode == "Minimizar costo":
            thr, best_cost = pick_threshold_by_cost(y_test, y_proba, c_fp, c_fn)
        elif thr_mode == "Maximizar recall con precisión mínima":
            thr, best_r, best_p = pick_threshold_by_recall_with_prec_min(y_test, y_proba, prec_min=prec_min)

        y_pred = (y_proba >= thr).astype(int)

        acc     = accuracy_score(y_test, y_pred)
        bacc    = balanced_accuracy_score(y_test, y_pred)
        prec_c0 = precision_score(y_test, y_pred, pos_label=0, zero_division=0)
        prec_c1 = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
        rec_c0  = recall_score(y_test, y_pred, pos_label=0, zero_division=0)
        rec_c1  = recall_score(y_test, y_pred, pos_label=1, zero_division=0)
        f1_min  = f1_score(y_test, y_pred, pos_label=1, zero_division=0)
        auc     = roc_auc_score(y_test, y_proba)
        auprc   = average_precision_score(y_test, y_proba)

        met_rows = [
            ("Exactitud", acc),
            ("Balanced accuracy", bacc),
            (f"Precision ({clases[0]})", prec_c0),
            (f"Precision ({clases[1]})", prec_c1),
            (f"Sensibilidad ({clases[0]})", rec_c0),
            (f"Sensibilidad ({clases[1]})", rec_c1),
            (f"F1 ({clases[1]})", f1_min),
            ("ROC-AUC", auc)
        ]
        if thr_mode == "Minimizar costo":
            met_rows.append(("Costo total (FP/FN)", best_cost))

        met_tab = pd.DataFrame(met_rows, columns=["Métrica", "Valor"])
        st.subheader("Métricas")
        st.dataframe(met_tab, use_container_width=True)

        prev = y_test.mean()
        if prec_c1 == 1.0 and rec_c1 < 0.15:
            st.warning("La precisión de la clase minoritaria es 1.0 pero el recall es muy bajo. Ajusta umbral o balanceo.")
        if acc > 0.9 and bacc < 0.65 and prev < 0.25:
            st.info("La exactitud es alta por el desbalance. Revisa balanced accuracy, AUPRC y F1 de la minoritaria.")

        coef = algoritmo.coef_[0]; intercepto = algoritmo.intercept_[0]
        coef_tab = pd.DataFrame({
            "Variable": ["Intercepto"] + Variables_x,
            "Coeficiente (log-odds)": [intercepto] + list(coef),
            "Odds Ratio (exp(coef))": [np.exp(intercepto)] + list(np.exp(coef))
        })
        if not modo_presentacion:
            st.subheader("Coeficientes del modelo")
            st.dataframe(coef_tab, use_container_width=True)

        matriz = confusion_matrix(y_test, y_pred, labels=[0, 1])
        labels_disp = [clases[0], clases[1]]
        fig_cm = go.Figure(data=go.Heatmap(
            z=matriz,
            x=[f"Pred {labels_disp[0]}", f"Pred {labels_disp[1]}"],
            y=[f"Real {labels_disp[0]}", f"Real {labels_disp[1]}"],
            colorscale="Oranges", showscale=True, hoverongaps=False
        ))
        ann = []; tags = np.array([["TN","FP"],["FN","TP"]])
        for i in range(2):
            for j in range(2):
                ann.append(dict(
                    x=[f"Pred {labels_disp[0]}", f"Pred {labels_disp[1]}"][j],
                    y=[f"Real {labels_disp[0]}", f"Real {labels_disp[1]}"][i],
                    text=f"{tags[i,j]}: {matriz[i,j]}",
                    showarrow=False,
                    font=dict(color="white" if matriz[i,j] > matriz.max()/2 else "black")
                ))
        fig_cm.update_layout(title=f"Matriz de confusión (umbral={thr:.2f})", annotations=ann, width=520, height=520)
        st.plotly_chart(fig_cm, use_container_width=False)

        fpr, tpr, _ = roc_curve(y_test, y_proba)
        fig_roc = go.Figure()
        fig_roc.add_trace(go.Scatter(x=fpr, y=tpr, mode="lines", name=f"ROC (AUC={auc:.3f})"))
        fig_roc.add_trace(go.Scatter(x=[0,1], y=[0,1], mode="lines", name="Aleatorio", line=dict(dash="dot")))
        fig_roc.update_layout(title="Curva ROC", xaxis_title="FPR", yaxis_title="TPR")
        st.plotly_chart(fig_roc, use_container_width=True)

        p, r, th = precision_recall_curve(y_test, y_proba)
        fig_pr = go.Figure()
        fig_pr.add_trace(go.Scatter(x=r, y=p, mode="lines", name=f"PR (AP={auprc:.3f})"))
        fig_pr.update_layout(title="Curva Precisión-Recall (clase 1)", xaxis_title="Recall", yaxis_title="Precisión")
        st.plotly_chart(fig_pr, use_container_width=True)

        fig_prob = px.strip(
            x=[labels_disp[i] for i in y_test], y=y_proba,
            labels={"x":"Clase real", "y":"Probabilidad P(Y=1)"},
            title="Distribución de probabilidades por clase real"
        )
        fig_prob.add_hline(y=thr, line_dash="dot", annotation_text=f"Umbral {thr:.2f}")
        st.plotly_chart(fig_prob, use_container_width=True)

        st.caption(f"Mapeo interno (solo para el modelo): {clases[0]} → 0, {clases[1]} → 1. Prevalencia clase 1 (test): {prev:.3f}")

##########################################################################################
# Vista 5 — COMPARAR PAÍSES (Mejorada)
if View == "Comparar países":
    st.title("Comparación de países (Alemania · Valencia · Estocolmo · México)")
    st.caption("Misma métrica y visual por país, en una sola vista.")

    # Cargar todos los países
    dfs = {}
    for c in COUNTRY_FILES.keys():
        dfi, _ = load_country_df(c)
        dfs[c] = dfi

    common_num, common_bin, common_cat = get_common_lists(dfs)

    # Sub-vistas actualizadas
    subview = st.radio(
        "Sub-vista",
        ["Extracción comparada", "Regresión logística comparada", "Regresión lineal comparada", "Regresión no lineal comparada"],
        horizontal=True
    )

    # ============= EXTRACCIÓN COMPARADA (con KPIs + filas por tipo de gráfico + heatmaps + tablas) =============
    if subview == "Extracción comparada":
        if len(common_cat)==0:
            st.error("No hay columnas categóricas en común en los 4 datasets.")
            st.stop()
        var_cat = st.selectbox("Variable categórica común", options=sorted(common_cat), index=sorted(common_cat).index("room_type") if "room_type" in common_cat else 0)

        # KPIs por país (en una sola fila)
        #st.markdown("### KPI's por país")
        #cols = st.columns(4)
        #for i, c in enumerate(COUNTRY_FILES.keys()):
        #    with cols[i]:
        #        kpis_block(dfs[c], c)

        # KPI's por país (fila completa)
        st.markdown("### KPI's por país")
        for c in COUNTRY_FILES.keys():
            with st.container():
                kpis_block(dfs[c], c)

        # Preparar tablas y figuras por país (una sola vez)
        cache = {}
        for c in COUNTRY_FILES.keys():
            # devuelve: tabla, fig_bar, fig_pie, fig_donut, fig_area, fig_heat
            cache[c] = extraction_tables_and_figs(dfs[c], var_cat)
        
        countries = list(COUNTRY_FILES.keys())

        def _grid_2x2(fig_key_idx: int, titulo: str):
            st.subheader(titulo)
            rows = [countries[:2], countries[2:4]]
            for row in rows:
                row_cols = st.columns(2)
                for j, cc in enumerate(row):
                    with row_cols[j]:
                        st.markdown(f"**{cc}**")
                        st.plotly_chart(cache[cc][fig_key_idx], use_container_width=True)

        # 2x2 por tipo de gráfico (mismo tipo por fila)
        _grid_2x2(fig_key_idx=1, titulo="Distribución por categoría (Bar) — Comparación directa")
        _grid_2x2(fig_key_idx=2, titulo="Proporción por categoría (Pie) — Comparación directa")
        _grid_2x2(fig_key_idx=3, titulo="Gráfico tipo anillo (Donut) — Comparación directa")
        _grid_2x2(fig_key_idx=4, titulo="Tendencia acumulada (Área) — Comparación directa")
        _grid_2x2(fig_key_idx=5, titulo="Proporción por categoría (Heatmap) — Comparación directa")

        st.markdown("---")
        st.subheader("Tablas de frecuencias (Top 10) por país")
        tabs = st.tabs(countries)
        for t, c in zip(tabs, countries):
            with t:
                tabla = cache[c][0]
                st.dataframe(tabla.style.background_gradient(cmap='Reds'), use_container_width=True)
        
        # Imágenes por país (grid 2x2) — ahora muestra las 3 imágenes por ciudad
        st.markdown("### Imágenes por país")
        rows_img = [countries[:2], countries[2:4]]
        for row in rows_img:
            row_cols = st.columns(2)
            for j, cc in enumerate(row):
                with row_cols[j]:
                    st.markdown(f"**{cc}**")
                    img_list = COUNTRY_IMAGES.get(cc, [])
                    if len(img_list) > 0:
                        # Muestra hasta 3 imágenes en una fila
                        img_cols = st.columns(len(img_list))
                        for k, img_path in enumerate(img_list[:3]):
                            with img_cols[k]:
                                try:
                                    st.image(img_path, use_container_width=True)
                                except Exception:
                                    st.write("🖼️ Imagen no encontrada")
                    else:
                        st.write("Sin imágenes registradas.")


    # ============= LOGÍSTICA COMPARADA (igual que antes) =============
    elif subview == "Regresión logística comparada":
        if len(common_bin)==0:
            st.error("No hay variables binarias en común en los 4 datasets.")
            st.stop()
        if len(common_num)==0:
            st.error("No hay variables numéricas en común en los 4 datasets.")
            st.stop()

        st.markdown("### Parámetros comunes")
        y_col = st.selectbox("Variable Y (binaria, común)", options=common_bin)
        x_cols = st.multiselect("Variables X (numéricas, comunes)", options=common_num, default=[c for c in common_num if c not in [y_col]][:3])
        test_size = st.slider("Tamaño de prueba", 0.1, 0.5, 0.30, 0.05)

        colU, colV, colW = st.columns(3)
        with colU:
            imb_method = st.selectbox("Manejo de desbalance", ["Ninguno","class_weight='balanced'","SMOTE (over-sampling)","Under-sampling"])
        with colV:
            thr_mode = st.selectbox("Umbral por", ["Manual","F1 óptimo","Minimizar costo","Maximizar recall con precisión mínima"])
        with colW:
            thr_manual = st.slider("Umbral (si Manual)", 0.01, 0.99, 0.50, 0.01)

        colX, colY = st.columns(2)
        with colX:
            c_fp = st.number_input("Costo por FP (si Minimizar costo)", min_value=0, value=10000, step=1000)
        with colY:
            c_fn = st.number_input("Costo por FN (si Minimizar costo)", min_value=0, value=80000, step=1000)

        prec_min = st.slider("Precisión mínima (si Máx. recall)", 0.1, 0.99, 0.60, 0.01)

        if len(x_cols)==0:
            st.info("Selecciona al menos 1 X para correr comparación.")
            st.stop()

        # Ejecutar por país
        results = {}
        for c in COUNTRY_FILES.keys():
            res = run_logistic_block(
                dfs[c], y_col, x_cols,
                thr_mode=thr_mode,
                thr=thr_manual,
                c_fp=c_fp, c_fn=c_fn,
                prec_min=prec_min,
                test_size=test_size,
                imb_method=imb_method
            )
            if res is not None:
                results[c] = res

        if len(results)==0:
            st.error("No se pudo entrenar el modelo en ninguno de los países (revisa datos y clases).")
            st.stop()

        st.markdown("### Métricas comparadas")
        # Tabla apilada por país
        tabs = st.tabs(list(results.keys()))
        for tab, (c, res) in zip(tabs, results.items()):
            with tab:
                st.dataframe(res["metrics"], use_container_width=True)

        st.markdown("### Matrices de confusión por país")
        # Grilla 2x2
        countries = list(results.keys())
        rows = [countries[:2], countries[2:4]]
        for row in rows:
            cols = st.columns(len(row))
            for i, c in enumerate(row):
                with cols[i]:
                    st.markdown(f"**{c}**")
                    st.plotly_chart(results[c]["cm_fig"], use_container_width=True)

        with st.expander("Curvas ROC por país"):
            cols = st.columns(2)
            items = list(results.items())
            for i, (c, res) in enumerate(items):
                with cols[i%2]:
                    st.markdown(f"**{c}**"); st.plotly_chart(res["roc_fig"], use_container_width=True)

        with st.expander("Curvas Precisión-Recall por país"):
            cols = st.columns(2)
            items = list(results.items())
            for i, (c, res) in enumerate(items):
                with cols[i%2]:
                    st.markdown(f"**{c}**"); st.plotly_chart(res["pr_fig"], use_container_width=True)



    # ============= LINEAL COMPARADA (Simple y Múltiple) =============
    elif subview == "Regresión lineal comparada":
        if len(common_num)==0:
            st.error("No hay variables numéricas en común en los 4 datasets."); st.stop()

        tab_simple, tab_multiple = st.tabs(["Lineal Simple", "Lineal Múltiple"])

        # --- SIMPLE ---
        with tab_simple:
            colA, colB = st.columns(2)
            with colA:
                y_lin = st.selectbox("Y (numérica común)", options=common_num, key="cmp_rl_y")
            with colB:
                x_lin = st.selectbox("X (numérica común)", options=[c for c in common_num if c!=y_lin], key="cmp_rl_x")

            st.markdown("### Dispersión + recta por país")
            cols = st.columns(4)
            for i, c in enumerate(COUNTRY_FILES.keys()):
                dfi = dfs[c].select_dtypes(include=['float','float64','int','int64'])
                if y_lin not in dfi.columns or x_lin not in dfi.columns:
                    continue
                X, y, dropped = _clean_xy(dfi, y_lin, [x_lin])
                if len(y) < 3:
                    continue
                model = LinearRegression().fit(X, y)
                y_pred = model.predict(X)
                r2 = model.score(X, y); r = float(np.sqrt(abs(r2)))
                fig = px.scatter(dfi, x=x_lin, y=y_lin, opacity=0.6, title=None)
                order_idx = np.argsort(X[:,0])
                fig.add_trace(go.Scatter(x=X[order_idx,0], y=y_pred[order_idx], mode="lines", name="Ŷ"))
                with cols[i]:
                    st.markdown(f"**{c}**  \nR²: `{r2:.2f}` · R: `{r:.2f}`")
                    st.plotly_chart(fig, use_container_width=True)

        # --- MÚLTIPLE ---
        with tab_multiple:
            colY, colXs = st.columns([1,2])
            with colY:
                y_linM = st.selectbox("Y (numérica común)", options=common_num, key="cmp_rlm_y")
            with colXs:
                xs_linM = st.multiselect("X (numéricas comunes)", options=[c for c in common_num if c!=y_linM], default=[c for c in common_num if c!=y_linM][:3], key="cmp_rlm_xs")

            if len(xs_linM)==0:
                st.info("Selecciona al menos 1 X común.")
            else:
                tabs = st.tabs(list(COUNTRY_FILES.keys()))
                for t, c in zip(tabs, COUNTRY_FILES.keys()):
                    with t:
                        dfi = dfs[c].select_dtypes(include=['float','float64','int','int64'])
                        if any(col not in dfi.columns for col in [y_linM] + xs_linM):
                            st.warning("Columna(s) faltante(s) en este país.")
                            continue
                        X, y, dropped = _clean_xy(dfi, y_linM, xs_linM)
                        if len(y) < max(3, len(xs_linM)+1):
                            st.warning("Datos insuficientes tras limpieza.")
                            continue
                        m = LinearRegression().fit(X, y)
                        y_pred = m.predict(X)
                        r2 = m.score(X, y); r = float(np.sqrt(abs(r2)))
                        coef_tab = pd.DataFrame({"Variable": ["Intercepto"] + xs_linM,
                                                 "Coeficiente": [m.intercept_] + list(m.coef_)})
                        st.markdown(f"**{c}** · R²: `{r2:.2f}` · R: `{r:.2f}`")
                        st.dataframe(coef_tab, use_container_width=True)

    # ============= NO LINEAL COMPARADA =============
    elif subview == "Regresión no lineal comparada":
        if len(common_num)==0:
            st.error("No hay variables numéricas en común en los 4 datasets."); st.stop()

        colA, colB = st.columns(2)
        with colA:
            y_nl = st.selectbox("Y (numérica común)", options=common_num, key="cmp_rnl_y")
        with colB:
            x_nl = st.selectbox("X (numérica común)", options=[c for c in common_num if c!=y_nl], key="cmp_rnl_x")

        modelos = [
            "Función cuadrática (a*x**2 + b*x + c)",
            "Función exponencial (a*np.exp(-b*x)+c)",
            "Función potencia (a*x**b)",
            "Función cúbica (a*x**3 + b*x**2 + c*x + d)"
        ]
        modelo_sel = st.selectbox("Modelo no lineal", options=modelos, key="cmp_rnl_model")

        def func_cuad(x, a, b, c): return a*x**2 + b*x + c
        def func_cub(x, a, b, c, d): return a*x**3 + b*x**2 + c*x + d
        def func_exp(x, a, b, c): return a * np.exp(-b * x) + c
        def func_pot(x, a, b): return a * np.power(x, b)

        cols = st.columns(4)
        for i, c in enumerate(COUNTRY_FILES.keys()):
            dfi = dfs[c].select_dtypes(include=['float','float64','int','int64'])
            if any(col not in dfi.columns for col in [x_nl, y_nl]):
                with cols[i]: st.warning("Columnas no disponibles."); continue
            df_nl = dfi[[x_nl, y_nl]].replace([np.inf,-np.inf], np.nan).dropna()
            if len(df_nl) < 3:
                with cols[i]: st.warning("Datos insuficientes."); continue
            x = df_nl[x_nl].to_numpy(dtype=float)
            y = df_nl[y_nl].to_numpy(dtype=float)
            sort_idx = np.argsort(x); xs = x[sort_idx]

            try:
                if modelo_sel.startswith("Función cuadrática"):
                    pars, _ = curve_fit(func_cuad, x, y, maxfev=20000)
                    y_pred = func_cuad(x, *pars); y_line = func_cuad(xs, *pars)
                elif modelo_sel.startswith("Función cúbica"):
                    pars, _ = curve_fit(func_cub, x, y, maxfev=30000)
                    y_pred = func_cub(x, *pars); y_line = func_cub(xs, *pars)
                elif modelo_sel.startswith("Función exponencial"):
                    pars, _ = curve_fit(func_exp, x, y, maxfev=30000)
                    y_pred = func_exp(x, *pars); y_line = func_exp(xs, *pars)
                else:
                    mask = (x>0) & (y>0) & np.isfinite(x) & np.isfinite(y)
                    if mask.sum() < 3:
                        with cols[i]: st.warning("x>0 e y>0 insuficientes para potencia."); continue
                    xp, yp = x[mask], y[mask]
                    pars, _ = curve_fit(func_pot, xp, yp, maxfev=20000)
                    xs_safe = np.clip(xs, 1e-12, None); x_safe = np.clip(x, 1e-12, None)
                    y_pred = func_pot(x_safe, *pars); y_line = func_pot(xs_safe, *pars)

                r2 = r2_score(y, y_pred); r = float(np.sqrt(abs(r2)))
                fig = px.scatter(x=x, y=y, labels={"x": x_nl, "y": y_nl}, opacity=0.6, title=None)
                fig.add_trace(go.Scatter(x=xs, y=y_line, mode="lines", name="Ŷ", line=dict(width=2)))
                with cols[i]:
                    st.markdown(f"**{c}**  \nR²: `{r2:.3f}` · R: `{r:.3f}`")
                    st.plotly_chart(fig, use_container_width=True)
            except RuntimeError as e:
                with cols[i]: st.error(f"No convergió: {e}")
            except Exception as e:
                with cols[i]: st.error(f"Error: {e}")

# FOOTER
st.markdown("---")
st.markdown("""
<div style="text-align:center; opacity:0.8; font-size:0.9rem;">
© Proyecto para Gestión de Proyectos — Dashboard creado por <b>Los Guaranies</b> con ayuda de IA y profe Freddy/Malu.  
<br> Construido con Streamlit, Plotly y Python.
</div>
""", unsafe_allow_html=True)



Overwriting Dashboard_Final.py
