In [6]:
!pip install streamlit

Defaulting to user installation because normal site-packages is not writeable


In [9]:
pip install streamlit-plotly-events


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [10]:
%%writefile despliegue.py
import streamlit as st
import plotly.express as px
import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats

# ===================== CUSTOMIZACIÓN DEL DASHBOARD =====================
import plotly.express as px
import plotly.graph_objects as go
import matplotlib as mpl
import seaborn as sns

# Colour sequence shared by Plotly Express and seaborn.
charts_palette = ["#4458A6", "#7B4BA3", "#2F3B66", "#9768D1", "#4A5176"]

# Plotly Express defaults: dark template, shared palette, no fixed figure size.
px.defaults.color_discrete_sequence = charts_palette
px.defaults.template = "plotly_dark"
px.defaults.height = None
px.defaults.width = None

# Matplotlib typography, followed by the seaborn palette.
mpl_overrides = {
    "font.family": "Times New Roman",
    "axes.titlesize": 18,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
}
mpl.rcParams.update(mpl_overrides)
sns.set_palette(charts_palette)

def style_plotly(fig: go.Figure) -> go.Figure:
    """Apply the dashboard's shared look to a Plotly figure and return it.

    Sets the Times New Roman fonts, transparent paper/plot backgrounds and
    tight margins, then sets the marker outline width of every trace to 0.

    Args:
        fig: Figure to restyle; it is updated in place.

    Returns:
        The same ``fig`` object that was passed in.
    """
    typeface = "Times New Roman"
    transparent = "rgba(0,0,0,0)"
    layout_overrides = {
        "font": {"family": typeface, "size": 16},
        "title_font": {"family": typeface, "size": 22},
        "paper_bgcolor": transparent,
        "plot_bgcolor": transparent,
        "margin": {"l": 10, "r": 10, "t": 40, "b": 10},
    }
    fig.update_layout(**layout_overrides)
    # Optional: drop the marker outline on bars/scatters for a cleaner look.
    fig.update_traces(marker_line_width=0)
    return fig

# Palette passed explicitly to individual charts; starts as an alias of
# charts_palette (it is reassigned further down in this file).
palette = charts_palette
# Three-colour alternative palette.
palette2 = ["#4458A6", "#2F3B66", "#8F2D56"]

# Helper that renders any figure with the unified dashboard style
def show(fig: go.Figure):
    """Style ``fig`` with ``style_plotly`` and render it at full container width."""
    st.plotly_chart(style_plotly(fig), use_container_width=True)

######################################################
# Data loading
######################################################
# st.cache_data (not st.cache_resource) is used on purpose: the script mutates
# the returned frame in place, and cache_data hands every rerun its own copy
# instead of one object shared by all reruns and sessions.
@st.cache_data
def load_data():
    """Read the listings CSV and return it as a DataFrame.

    Returns:
        pandas.DataFrame: contents of ``Listings_sin_atipicos.csv``, read from
        the current working directory.
    """
    df = pd.read_csv("Listings_sin_atipicos.csv")
    return df

# Load the listings at script start; the cleaning steps and views below read from `df`.
df = load_data()
# Disabled clean-up step, kept for reference (would drop index/id columns):
#columns_to_drop = ['Unnamed: 0', 'Unnamed: 0.1', 'id', 'scrape_id', 'host_id']
#df.drop(columns_to_drop, axis=1, inplace=True)


# ================== HELPERS GLOBALES ==================
def vc_df(series, name_col):
    """Build a frequency table with standard column names.

    Args:
        series: pandas Series whose values are counted (missing values included).
        name_col: name given to the column that holds the distinct values.

    Returns:
        DataFrame with columns ``[name_col, 'count']``; values are rendered as
        strings and missing values appear as ``'Unknown'``.
    """
    freq = series.value_counts(dropna=False).reset_index()
    freq.columns = [name_col, 'count']
    present = freq[name_col].notna()
    as_objects = freq[name_col].astype(object)
    freq[name_col] = as_objects.where(present, 'Unknown').astype(str)
    return freq

def to_num_money(series):
    """Convert price strings such as '$1,234' to floats.

    Values are cast to ``str``, stripped, and cleared of ``$`` and ``,`` before
    parsing; anything that still fails to parse becomes NaN.
    """
    cleaned = series.astype(str).str.strip()
    for symbol in ('$', ','):
        cleaned = cleaned.str.replace(symbol, '', regex=False)
    return pd.to_numeric(cleaned, errors='coerce')

def to_num_pct(series):
    """Convert percentage strings such as '95%' to floats.

    Values are cast to ``str`` and stripped, the ``%`` sign is removed and a
    decimal comma is turned into a dot; unparseable values become NaN.
    """
    text = series.astype(str).str.strip()
    without_sign = text.str.replace('%', '', regex=False)
    dotted = without_sign.str.replace(',', '.', regex=False)
    return pd.to_numeric(dotted, errors='coerce')

# NOTE(review): this replaces the earlier `palette = charts_palette`, so charts that
# pass `palette` explicitly do not use the dashboard palette — confirm which is intended.
palette = ['#8F2D56', '#218380', '#FBB13C', '#73D2DE']

# ================== CLEAN-UP / AUXILIARY COLUMNS ==================
# Coerce the columns used by the binned charts to numeric; unparseable values become NaN.
for col in ['availability_365', 'review_scores_value', 'bathrooms', 'beds']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

if 'host_acceptance_rate' in df.columns:
    df['host_acceptance_rate'] = to_num_pct(df['host_acceptance_rate'])

# `_price_num` feeds the price bins, the sidebar slider, the price filter and the
# histogram. Derive it from `price` when the CSV does not already provide it;
# otherwise every price feature is silently disabled.
if '_price_num' not in df.columns and 'price' in df.columns:
    df['_price_num'] = to_num_money(df['price'])

# Boolean superhost flag; values outside the mapping become NaN.
if 'host_is_superhost' in df.columns:
    df['_superhost'] = (
        df['host_is_superhost'].astype(str).str.lower()
        .map({'t': True, 'true': True, '1': True, 'f': False, 'false': False, '0': False})
    )
else:
    df['_superhost'] = np.nan

# ---- Price ranges ----
if '_price_num' in df.columns and df['_price_num'].notna().sum() > 0:
    try:
        # Quintile bins; duplicated edges are dropped.
        qbins = pd.qcut(df['_price_num'], q=5, duplicates='drop')
    except Exception:
        # Best-effort fallback: five equal-width bins over the observed range.
        qbins = pd.cut(
            df['_price_num'],
            bins=np.linspace(df['_price_num'].min(), df['_price_num'].max(), 6),
            include_lowest=True
        )
    df['_price_bin'] = qbins

    def bin_label(iv):
        """Format an interval as 'left – right' with integer, comma-grouped bounds."""
        if pd.isna(iv):
            return 'Unknown'
        L = int(np.floor(iv.left)) if np.isfinite(iv.left) else iv.left
        R = int(np.ceil(iv.right)) if np.isfinite(iv.right) else iv.right
        return f"{L:,} – {R:,}"
    uniq_bins = sorted(df['_price_bin'].dropna().unique(), key=lambda x: x.left)
    price_labels_map = {iv: bin_label(iv) for iv in uniq_bins}

    df['_price_bin_str'] = df['_price_bin'].map(lambda iv: price_labels_map.get(iv, 'Unknown')).astype(str)
else:
    df['_price_bin_str'] = 'Unknown'

# ---- Review scores to Likert-style categories ----
if 'review_scores_value' in df.columns and df['review_scores_value'].notna().sum() > 0:
    rmin = float(np.nanmin(df['review_scores_value']))
    rmax = float(np.nanmax(df['review_scores_value']))
    # Five equal-width bins between max(3.8, min score) and min(5.1, max(max score, 5.0)).
    bins_r = np.linspace(max(3.8, rmin), min(5.1, max(rmax, 5.0)), 6)
    labels_r = ['Malo (3.8–4.1)', 'Regular (4.1–4.3)', 'Bueno (4.4–4.6)',
                'Muy Bueno (4.7–4.9)', 'Excelente (5)']
    df['_rev_cat'] = pd.cut(df['review_scores_value'], bins=bins_r,
                            labels=labels_r, include_lowest=True)
else:
    df['_rev_cat'] = np.nan

# ================== SIDEBAR: MENU + FILTERS (collapsed) + APPLY TO dff ==================
st.sidebar.title("Airbnb Mexico City")

# Main menu
View = st.sidebar.selectbox(
    "Tipo de Análisis",
    ["Extracción de Características", "Regresión Lineal",
     "Regresión No Lineal", "Regresión Logística"],
    key="menu_tipo_analisis"
)

# Sub-menu, only offered for the feature-extraction view
Variable_Cat = None
if View == "Extracción de Características":
    Variable_Cat = st.sidebar.selectbox(
        "Características",
        ["Host info", "Property type", "Overall"],
        key="menu_caracteristicas"
    )

# --- Top-5 most frequent values, offered as filter options ---
if 'property_type' in df.columns:
    top5_prop = df['property_type'].astype(str).value_counts().head(5).index.tolist()
else:
    top5_prop = []

if 'neighbourhood_cleansed' in df.columns:
    top5_neigh = df['neighbourhood_cleansed'].astype(str).value_counts().head(5).index.tolist()
else:
    top5_neigh = []

# --- Filter panel (collapsed by default) ---
with st.sidebar.expander("Filtros", expanded=False):

    # Superhost (multiselect)
    sup_options = ['True', 'False']
    sup_sel = st.multiselect(
        "Host es Superhost",
        sup_options,
        default=sup_options,
        key="filtro_superhost"
    )

    # Price (range slider)
    if '_price_num' in df.columns and df['_price_num'].notna().any():
        min_price = int(np.floor(df['_price_num'].min()))
        max_price = int(np.ceil(df['_price_num'].max()))
        price_range = st.slider(
            "Rango de Precio (moneda original)",
            min_value=min_price,
            max_value=max_price,
            value=(min_price, max_price),
            step=max(1, (max_price - min_price) // 100),
            key="price_slider"
        )
    else:
        price_range = None
        st.info("No hay precios numéricos disponibles para el slider.")

    # Property type (checkbox + multiselect)
    use_property_filter = st.checkbox("Filtrar por Property Type (Top-5)", value=False, key="chk_property")
    if 'property_type' in df.columns:
        pt_options = top5_prop if len(top5_prop) > 0 else sorted(
            df['property_type'].astype(str).unique().tolist()
        )
    else:
        pt_options = []
    if use_property_filter:
        pt_sel = st.multiselect(
            "Selecciona Property Type",
            pt_options,
            default=pt_options,
            key="filtro_property"
        )
    else:
        pt_sel = pt_options  # filter disabled: the selection is not applied

    # Neighbourhood (checkbox + multiselect)
    use_neigh_filter = st.checkbox("Filtrar por Neighbourhood (Top-5)", value=False, key="chk_neigh")
    if 'neighbourhood_cleansed' in df.columns:
        nb_options = top5_neigh if len(top5_neigh) > 0 else sorted(
            df['neighbourhood_cleansed'].astype(str).unique().tolist()
        )
    else:
        nb_options = []
    if use_neigh_filter:
        nb_sel = st.multiselect(
            "Selecciona Neighbourhood",
            nb_options,
            default=nb_options,
            key="filtro_neighbourhood"
        )
    else:
        nb_sel = nb_options  # filter disabled: the selection is not applied

    # Review scores (checkbox + multiselect)
    use_review_filter = st.checkbox("Filtrar por Review Scores Value", value=False, key="chk_review")
    rv_options_all = ['Malo (3.8–4.1)', 'Regular (4.1–4.3)',
                      'Bueno (4.4–4.6)', 'Muy Bueno (4.7–4.9)', 'Excelente (5)']
    # Keep only the categories present in the data, in Likert order (worst to best).
    rv_present = set(df['_rev_cat'].astype(str).unique()) if '_rev_cat' in df.columns else set()
    rv_existing = [x for x in rv_options_all if x in rv_present]
    if use_review_filter:
        rv_sel = st.multiselect(
            "Selecciona Review Scores",
            rv_existing or ['(sin datos)'],
            default=rv_existing or [],
            key="filtro_review"
        )
    else:
        rv_sel = rv_existing  # filter disabled: the selection is not applied

# ================== APPLY FILTERS TO A COPY ==================
# All active filters are combined into one boolean mask and `df` is copied once,
# so `dff` is an independent frame and `df` itself is never modified here.
keep = pd.Series(True, index=df.index)

# Superhost: only applied when the selection is a strict, non-empty subset.
if sup_sel and set(sup_sel) != {'True', 'False'} and '_superhost' in df.columns:
    sel_bools = [s.lower() == 'true' for s in sup_sel]
    keep &= df['_superhost'].isin(sel_bools)

# Price (slider), inclusive on both ends; rows without a numeric price are dropped.
if price_range is not None and '_price_num' in df.columns:
    keep &= df['_price_num'].between(price_range[0], price_range[1])

# Property type: applied only when its checkbox is ticked.
if use_property_filter and pt_options and 'property_type' in df.columns:
    keep &= df['property_type'].astype(str).isin(pt_sel)

# Neighbourhood: applied only when its checkbox is ticked.
if use_neigh_filter and nb_options and 'neighbourhood_cleansed' in df.columns:
    keep &= df['neighbourhood_cleansed'].astype(str).isin(nb_sel)

# Review scores: applied only when its checkbox is ticked and categories exist.
if use_review_filter and rv_existing and '_rev_cat' in df.columns:
    keep &= df['_rev_cat'].astype(str).isin(rv_sel)

dff = df[keep].copy()

# ======================
# CHART HELPERS SHARED BY THE FEATURE-EXTRACTION VIEWS
# ======================
def _freq_table(values, name):
    """Count each distinct value of ``values`` (NaN included).

    Returns a DataFrame with columns ``[name, 'count']`` in the order
    produced by ``Series.value_counts``.
    """
    return values.value_counts(dropna=False).rename_axis(name).reset_index(name='count')


def _bar_panel(table, column, title, colors, tick_angle=None, text_outside=False, x_title=None):
    """Draw a frequency bar chart of ``table[column]`` vs ``table['count']`` via ``show``."""
    fig = px.bar(
        table,
        x=column, y='count',
        title=title,
        text='count',
        color_discrete_sequence=colors
    )
    if text_outside:
        fig.update_traces(textposition='outside')
    fig.update_layout(xaxis_title=x_title, yaxis_title='Frecuencia')
    if tick_angle is not None:
        fig.update_xaxes(tickangle=tick_angle)
    show(fig)


def _pie_panel(table, column, title, colors, donut=True):
    """Draw a pie of ``table['count']`` by ``table[column]`` via ``show``.

    With ``donut=True`` the pie has a 0.25 hole and percent+label text inside
    the slices; with ``donut=False`` Plotly's default pie is used.
    """
    fig = px.pie(
        table,
        names=column, values='count',
        title=title,
        hole=0.25 if donut else None,
        color_discrete_sequence=colors
    )
    if donut:
        fig.update_traces(textposition='inside', textinfo='percent+label')
    show(fig)


def _flag_pie_panel(data, column, heading, title):
    """Sub-header plus two-colour donut for ``column``, or a warning if it is missing."""
    st.subheader(heading)
    if column in data.columns:
        _pie_panel(_freq_table(data[column], column), column, title, palette[:2])
    else:
        st.warning(f"No existe la columna '{column}'.")


# ======================
# OPTION 1: HOST INFO
# ======================
if Variable_Cat == "Host info":
    st.subheader("Información del Anfitrión")
    # ====== Row 1 ======
    c1, c2 = st.columns(2)

    # Host response time (bar)
    with c1:
        st.subheader("Host Response Time")
        if 'host_response_time' in dff.columns:
            tabla = _freq_table(dff['host_response_time'], 'host_response_time')
            # Only categories with more than 100 listings are plotted.
            filtro = tabla[tabla['count'] > 100]
            _bar_panel(
                filtro.sort_values('count', ascending=False),
                'host_response_time', 'Host Response Time', palette,
                tick_angle=45
            )
        else:
            st.warning("No existe la columna 'host_response_time'.")

    # Host is superhost (pie)
    with c2:
        _flag_pie_panel(dff, 'host_is_superhost', "Host is Superhost", 'Host is Superhost')

    # ====== Row 2 ======
    c3, c4 = st.columns(2)

    # Host identity verified (pie)
    with c3:
        _flag_pie_panel(dff, 'host_identity_verified', "Host Identity Verified", 'Host Identity Verified')

    # Has availability (pie)
    with c4:
        _flag_pie_panel(dff, 'has_availability', "Has Availability", 'Host Has Availability')

    # ====== Row 3 ======
    c5, c6 = st.columns(2)

    # Host location (bar)
    with c5:
        st.subheader("Host Location")
        if 'host_location' in dff.columns:
            # Collapse the different spellings of Mexico City into a single 'CDMX' label.
            # Non-capturing group: same matches, without the pandas match-group warning.
            pat_cdmx = r'(?:ciudad\s+de\s+m[ée]xico|cdmx|mexico\s+city|m[ée]xico\s*,?\s*d\.?\s*f\.?)'
            location = dff['host_location']
            is_cdmx = location.astype(str).str.contains(pat_cdmx, case=False, na=False, regex=True)
            location = location.mask(is_cdmx, 'CDMX').fillna('Unknown')
            # Blank strings and literal 'nan'/'none'/'unknown' are also reported as 'Unknown'.
            location = location.replace(r'^\s*(nan|none|unknown)?\s*$', 'Unknown', regex=True)

            tabla = _freq_table(location, 'host_location')
            # Only locations with more than 200 listings are plotted.
            filtro = tabla[tabla['count'] > 200]
            _bar_panel(
                filtro.sort_values('count', ascending=False),
                'host_location', 'Host Location', palette,
                tick_angle=45
            )
        else:
            st.warning("No existe la columna 'host_location'.")

    # Host acceptance rate (pie over bins)
    with c6:
        st.subheader("Host Acceptance Rate")
        col = 'host_acceptance_rate'
        if col in dff.columns and dff[col].notna().sum() > 0:
            min1 = float(np.nanmin(dff[col]))
            max1 = float(np.nanmax(dff[col]))
            # Five Likert-style categories; a single slice when every rate is identical.
            if min1 == max1:
                tabla = pd.DataFrame({'_har_cat': [f'{min1:.1f}'], 'count': [dff[col].notna().sum()]})
            else:
                har_cat = pd.cut(
                    dff[col],
                    bins=np.linspace(min1, max1, 6),
                    labels=['Malo', 'Regular', 'Bueno', 'Muy Bueno', 'Excelente'],
                    include_lowest=True
                )
                tabla = _freq_table(har_cat, '_har_cat')
            _pie_panel(tabla, '_har_cat', 'Host Acceptance Rate', palette)
        else:
            st.warning("No hay datos numéricos en 'host_acceptance_rate'.")


# ======================
# OPTION 2: PROPERTY TYPE
# ======================
elif Variable_Cat == "Property type":
    st.subheader("Características de la Propiedad")

    # ===== Row 1: property type & room type =====
    c1, c2 = st.columns(2)

    with c1:
        st.write("**Tipo de Propiedad**")
        if 'property_type' in dff.columns:
            tabla = _freq_table(dff['property_type'], 'property_type')
            # Ten most frequent property types.
            _bar_panel(
                tabla.head(10), 'property_type', 'Property Type', [palette[0]],
                tick_angle=45, text_outside=True
            )
        else:
            st.warning("No existe la columna 'property_type'.")

    with c2:
        st.write("**Tipo de Habitación**")
        if 'room_type' in dff.columns:
            tabla = _freq_table(dff['room_type'], 'room_type')
            _bar_panel(tabla, 'room_type', 'Room Type', [palette[1]], text_outside=True)
        else:
            st.warning("No existe la columna 'room_type'.")

    # ===== Row 2: beds & bathrooms =====
    c3, c4 = st.columns(2)

    with c3:
        st.write("**Número de Camas (Beds)**")
        if 'beds' in dff.columns and dff['beds'].notna().any():
            # Four equal-width bins from 0 to max + 1.
            beds_cat = pd.cut(
                dff['beds'],
                bins=np.linspace(0, dff['beds'].max() + 1, 5),
                labels=['1 cama', '2 camas', '3 camas', '4+ camas'],
                include_lowest=True
            )
            _pie_panel(_freq_table(beds_cat, '_beds_cat'), '_beds_cat',
                       'Distribución de Camas', palette, donut=False)
        else:
            st.warning("No hay datos numéricos en 'beds'.")

    with c4:
        st.write("**Número de Baños (Bathrooms)**")
        if 'bathrooms' in dff.columns and dff['bathrooms'].notna().any():
            # Four equal-width bins from 0 to max + 1.
            bath_cat = pd.cut(
                dff['bathrooms'],
                bins=np.linspace(0, dff['bathrooms'].max() + 1, 5),
                labels=['1 baño', '2 baños', '3 baños', '4+ baños'],
                include_lowest=True
            )
            _pie_panel(_freq_table(bath_cat, '_bath_cat'), '_bath_cat',
                       'Distribución de Baños', palette, donut=False)
        else:
            st.warning("No hay datos numéricos en 'bathrooms'.")

    # ===== Row 3: accommodates (bars) =====
    st.write("**Capacidad de Alojamiento (Accommodates)**")
    if 'accommodates' in dff.columns:
        tabla = _freq_table(dff['accommodates'], 'accommodates').sort_values(by='accommodates')
        _bar_panel(
            tabla, 'accommodates', 'Número de Personas que Acepta', [palette[2]],
            text_outside=True, x_title='Capacidad'
        )
    else:
        st.warning("No existe la columna 'accommodates'.")

# ======================
# OPTION 3: OVERALL
# ======================
elif Variable_Cat == "Overall":
    st.subheader("Resumen General")

    # -------- Row 1: neighbourhood | instant bookable --------
    c1, c2 = st.columns(2)
    with c1:
        st.subheader("Neighbourhood Cleansed")
        if 'neighbourhood_cleansed' in dff.columns:
            tabla = _freq_table(dff['neighbourhood_cleansed'], 'neighbourhood_cleansed')
            _bar_panel(
                tabla.sort_values('count', ascending=False),
                'neighbourhood_cleansed', 'Neighbourhood Cleansed', palette,
                tick_angle=45
            )
        else:
            st.warning("La columna 'neighbourhood_cleansed' no está en el dataset.")

    with c2:
        st.subheader("Instant Bookable")
        if 'instant_bookable' in dff.columns:
            tabla = _freq_table(dff['instant_bookable'], 'instant_bookable')
            _pie_panel(tabla, 'instant_bookable', 'Instant Bookable', palette[:2])
        else:
            st.warning("La columna 'instant_bookable' no está en el dataset.")

    # -------- Row 2: price distribution | availability 365 --------
    c3, c4 = st.columns(2)

    with c3:
        st.subheader("Price Distribution")
        if '_price_num' in dff.columns and dff['_price_num'].notna().sum() > 0:
            fig = px.histogram(
                dff, x='_price_num',
                title='Price Distribution',
                nbins=50,
                color_discrete_sequence=[palette[0]]
            )
            fig.update_layout(xaxis_title='Precio (moneda original)', yaxis_title='Frecuencia')
            show(fig)
        else:
            st.warning("No hay datos numéricos en 'price' después de convertir.")

    with c4:
        st.subheader("Availability 365")
        if 'availability_365' in dff.columns and dff['availability_365'].notna().sum() > 0:
            # Five equal-width bins covering 0 to 365.1 days.
            av_cat = pd.cut(
                dff['availability_365'],
                bins=np.linspace(0, 365.1, 6),
                labels=['1-73', '74-146', '147-219', '220-292', '293-365'],
                include_lowest=True
            )
            tabla = _freq_table(av_cat, '_av_cat')
            _bar_panel(
                tabla.sort_values('count', ascending=False),
                '_av_cat', 'Availability 365 Distribution', palette
            )
        else:
            st.warning("No hay datos numéricos en 'availability_365'.")

    # -------- Row 3: review scores (left half of the row) --------
    c5, _ = st.columns(2)

    with c5:
        st.subheader("Review Scores Value")
        if 'review_scores_value' in dff.columns and dff['review_scores_value'].notna().sum() > 0:
            # Reuse the precomputed categories; rebuild them from the filtered
            # scores when the column is missing or entirely empty.
            rev_cat = dff['_rev_cat'] if '_rev_cat' in dff.columns else None
            if rev_cat is None or rev_cat.isna().all():
                rmin = float(np.nanmin(dff['review_scores_value']))
                rmax = float(np.nanmax(dff['review_scores_value']))
                bins_r = np.linspace(max(3.8, rmin), min(5.1, max(rmax, 5.0)), 6)
                labels_r = ['Malo (3.8–4.1)', 'Regular (4.1–4.3)', 'Bueno (4.4–4.6)', 'Muy Bueno (4.7–4.9)', 'Excelente (5)']
                rev_cat = pd.cut(
                    dff['review_scores_value'],
                    bins=bins_r, labels=labels_r, include_lowest=True
                )

            tabla = _freq_table(rev_cat, '_rev_cat')
            _pie_panel(tabla, '_rev_cat', 'Review Scores', palette)
        else:
            st.warning("No hay datos numéricos en 'review_scores_value'.")

# =====================================================================
# ====================== VIEW: LINEAR REGRESSION ======================
# =====================================================================


def render_regresion_lineal(dff):
    """Render the "Regresión Lineal" Streamlit view.

    Builds the sidebar controls (simple/multiple regression, target column,
    test proportion, standardisation, log target and feature selection), fits
    a scikit-learn ``LinearRegression`` pipeline on a train/test split and
    draws the metrics table plus four diagnostic charts.

    Parameters
    ----------
    dff : pandas.DataFrame
        Frame to model. Its numeric columns are offered as target/predictors;
        the categorical candidates are the hard-coded column names listed
        below, when they exist in the frame.

    Returns
    -------
    None
        Everything is emitted through Streamlit. The function returns early,
        after a warning, when there is nothing that can be modelled.
    """
    import numpy as np
    import pandas as pd
    import plotly.express as px
    import plotly.graph_objects as go
    import streamlit as st
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

    # ---------- Palettes (fallback when the module-level ones are missing) ----------
    global palette, palette2
    if "palette" not in globals():
        palette = ["#4C78A8", "#F58518", "#54A24B", "#E45756", "#72B7B2"]
    if "palette2" not in globals():
        palette2 = "Viridis"

    # Minimum number of rows required before trusting a correlation ranking.
    MIN_ROWS_CORR = 5

    # ---------- Helpers ----------
    def get_numeric_cols(df_):
        """Return the names of the columns whose dtype is numeric."""
        return [c for c in df_.columns if pd.api.types.is_numeric_dtype(df_[c])]

    def default_target(df_):
        """Prefer '_price_num' when it has data; otherwise the first numeric column."""
        if '_price_num' in df_.columns and df_['_price_num'].notna().sum() > 0:
            return '_price_num'
        nums = get_numeric_cols(df_)
        return nums[0] if nums else None

    def rmse(y_true, y_pred):
        """Root mean squared error.

        Computed as sqrt(MSE) so it works on every scikit-learn version
        (the ``squared=`` keyword was deprecated and later removed).
        """
        return float(np.sqrt(mean_squared_error(y_true, y_pred)))

    def safe_onehot():
        """Dense one-hot encoder, compatible with old and new scikit-learn keywords."""
        try:
            return OneHotEncoder(handle_unknown="ignore", drop=None, sparse_output=False)
        except TypeError:
            return OneHotEncoder(handle_unknown="ignore", drop=None, sparse=False)

    def build_pipeline(num_cols, cat_cols, standardize):
        """Preprocessing (scale numerics, one-hot categoricals) followed by OLS."""
        transformers = []
        if num_cols:
            transformers.append(("num", StandardScaler() if standardize else "passthrough", num_cols))
        if cat_cols:
            transformers.append(("cat", safe_onehot(), cat_cols))
        pre = ColumnTransformer(transformers=transformers, remainder="drop")
        return Pipeline([("pre", pre), ("lr", LinearRegression())])

    def build_xy(df_, y_col, feature_cols, log_target):
        """Return (X, y) with incomplete rows removed and the target transformed.

        Rows with a missing value in the target or in any requested feature are
        dropped because ``LinearRegression`` cannot fit NaNs. When ``log_target``
        is set, only rows with y > 0 are kept and y is replaced by log(y), so
        that the later ``np.exp`` really is the inverse transform.
        """
        cols = list(dict.fromkeys(feature_cols))
        work = df_[list(dict.fromkeys([y_col] + cols))].copy()
        work[y_col] = pd.to_numeric(work[y_col], errors="coerce")
        work = work.dropna()
        if log_target:
            # Boolean ndarray mask: avoids label-based reindexing on duplicated indexes.
            work = work.loc[(work[y_col] > 0).to_numpy()]
            target = np.log(work[y_col].astype(float))
        else:
            target = work[y_col]
        return work[cols], target

    def rank_by_correlation(df_, y_col, cols):
        """Rank ``cols`` by |Pearson correlation| with ``y_col`` (descending).

        Falls back to the original column order when fewer than
        ``MIN_ROWS_CORR`` rows have a usable target.
        """
        if not cols:
            return []
        tmp = df_[[y_col] + list(cols)].copy()
        for c in tmp.columns:
            tmp[c] = pd.to_numeric(tmp[c], errors="coerce")
        tmp = tmp.dropna(subset=[y_col])
        if tmp.shape[0] < MIN_ROWS_CORR:
            return list(cols)
        corrs = (
            tmp.corr(numeric_only=True)
            .get(y_col, pd.Series(dtype=float))
            .drop(labels=[y_col], errors="ignore")
        )
        return corrs.abs().sort_values(ascending=False).index.tolist()

    def compute_feature_importance(df, y_col, num_feats_all, cat_feats_all, standardize, log_target):
        """Provisional fit used to rank candidate predictors by coefficient size.

        - Numeric variables: |coefficient|.
        - Categorical variables: L2 norm of the one-hot block of that variable.

        Returns ``(ranked_all, ranked_num, ranked_cat)``; three empty lists when
        the provisional model cannot be fitted (the caller then falls back to a
        correlation ranking).
        """
        if not (num_feats_all or cat_feats_all):
            return [], [], []

        X_tmp, y_tmp = build_xy(df, y_col, num_feats_all + cat_feats_all, log_target)
        if len(y_tmp) < 2:
            return [], [], []

        pipe = build_pipeline(num_feats_all, cat_feats_all, standardize)
        try:
            pipe.fit(X_tmp, y_tmp)
        except Exception:
            # Best-effort estimate only; the caller has a fallback ranking.
            return [], [], []

        coefs = np.abs(np.asarray(pipe.named_steps["lr"].coef_, dtype=float).ravel())
        # Non-finite coefficients must not win the ranking.
        coefs = np.where(np.isfinite(coefs), coefs, 0.0)

        # The ColumnTransformer emits its blocks in declaration order: first one
        # column per numeric feature, then one group of columns per categorical
        # feature (as many columns as fitted categories, since drop=None).
        # Slicing by position is exact, unlike matching names by prefix.
        importancias = {}
        pos = 0
        for f in num_feats_all:
            if pos < len(coefs):
                importancias[f] = float(coefs[pos])
            pos += 1
        if cat_feats_all:
            ohe = pipe.named_steps["pre"].named_transformers_["cat"]
            for c, levels in zip(cat_feats_all, ohe.categories_):
                block = coefs[pos:pos + len(levels)]
                pos += len(levels)
                if block.size:
                    importancias[c] = float(np.linalg.norm(block))

        if not importancias:
            return [], [], []

        ranked = sorted(importancias.items(), key=lambda kv: kv[1], reverse=True)
        ranked_all = [k for k, _ in ranked]
        ranked_num = [k for k in ranked_all if k in num_feats_all]
        ranked_cat = [k for k in ranked_all if k in cat_feats_all]
        return ranked_all, ranked_num, ranked_cat

    # ========================= Sidebar (all controls) =========================
    st.sidebar.header("Configuración del modelo")

    # Regression type (drop-down)
    reg_type = st.sidebar.selectbox("Tipo de regresión", ["Múltiple", "Simple"], index=0)

    # Available variables
    numeric_cols = get_numeric_cols(dff)
    if '_price_num' in dff.columns and '_price_num' not in numeric_cols:
        numeric_cols = ['_price_num'] + numeric_cols

    if not numeric_cols:
        # Without a numeric column there is no possible target.
        st.warning("No hay columnas numéricas disponibles.")
        return

    target_default = default_target(dff) or numeric_cols[0]

    y_col = st.sidebar.selectbox(
        "Variable dependiente (y)",
        options=numeric_cols,
        index=(numeric_cols.index(target_default) if target_default in numeric_cols else 0)
    )

    # Candidate predictors; the target itself is never offered as a predictor.
    all_num_X = [c for c in numeric_cols if c != y_col]
    cat_candidates = [c for c in ['property_type', 'room_type', 'neighbourhood_cleansed',
                                  'instant_bookable', 'host_is_superhost']
                      if c in dff.columns and c != y_col]

    # Common hyper-parameters
    test_size = st.sidebar.slider("Proporción de test", 0.1, 0.4, 0.2, 0.05)
    standardize = st.sidebar.checkbox("Estandarizar numéricas", True)
    log_target = st.sidebar.checkbox("Usar log(y)", False)

    # Controls that depend on the regression type
    if reg_type == "Simple":
        # Single numeric X; suggest the one most correlated with y.
        ranked_simple = rank_by_correlation(dff, y_col, all_num_X)
        best = ranked_simple[0] if ranked_simple else None
        x_simple = st.sidebar.selectbox(
            "Variable X (simple, solo numérica)",
            options=all_num_X,
            index=(all_num_X.index(best) if best in all_num_X else 0)
        )
        num_feats = [x_simple] if all_num_X else []
        cat_feats = []

    else:
        # Multiple: preselect by largest coefficient of a provisional fit.
        ranked_all, ranked_num, ranked_cat = compute_feature_importance(
            dff, y_col, all_num_X, cat_candidates, standardize, log_target
        )
        # Fallback when the provisional fit failed: rank numerics by correlation.
        if not ranked_all:
            ranked_num = rank_by_correlation(dff, y_col, all_num_X)
            ranked_cat = cat_candidates

        top_k_num = min(5, len(ranked_num))
        top_k_cat = min(2, len(ranked_cat))  # few categoricals by default to limit one-hot width
        default_num_multi = ranked_num[:top_k_num]
        default_cat_multi = ranked_cat[:top_k_cat]

        manual_select = st.sidebar.checkbox("Seleccionar variables manualmente", value=False)

        if manual_select:
            num_feats = st.sidebar.multiselect(
                "Variables numéricas (X)",
                options=all_num_X,
                default=default_num_multi
            )
            cat_feats = st.sidebar.multiselect(
                "Variables categóricas (X)",
                options=sorted(cat_candidates),
                default=default_cat_multi
            )
        else:
            num_feats = default_num_multi
            cat_feats = default_cat_multi
            st.sidebar.caption(
                "Variables seleccionadas automáticamente por mayor importancia de coeficientes."
            )

    # ========================= Title =========================
    st.markdown(f"<h1 style='text-align:center;'>Regresión Lineal {reg_type}</h1>", unsafe_allow_html=True)

    # ========================= Data preparation and model =========================
    used_cols = list(dict.fromkeys((num_feats or []) + (cat_feats or [])))
    if not used_cols:
        st.warning("Selecciona al menos una variable independiente.")
        return

    X, y = build_xy(dff, y_col, used_cols, log_target)
    if len(y) < 2:
        # train_test_split needs at least one row on each side.
        st.warning("No hay datos suficientes para ajustar el modelo con las columnas seleccionadas.")
        return

    pipe = build_pipeline(num_feats, cat_feats, standardize)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    pipe.fit(X_train, y_train)
    yhat_train = pipe.predict(X_train)
    yhat_test = pipe.predict(X_test)

    # Back to the original scale for error metrics and charts.
    inv_t = np.exp if log_target else (lambda v: v)
    y_train_eval, y_test_eval = inv_t(y_train), inv_t(y_test)
    yhat_train_eval, yhat_test_eval = inv_t(yhat_train), inv_t(yhat_test)

    rmse_tr, rmse_te = rmse(y_train_eval, yhat_train_eval), rmse(y_test_eval, yhat_test_eval)
    mae_tr = mean_absolute_error(y_train_eval, yhat_train_eval)
    mae_te = mean_absolute_error(y_test_eval, yhat_test_eval)
    # R² stays in the space the model was fitted in (log space when log_target).
    r2_tr, r2_te = r2_score(y_train, yhat_train), r2_score(y_test, yhat_test)

    # ========================= 1) Metrics =========================
    st.markdown("## 1) Métricas del modelo")
    met = pd.DataFrame({
        "Conjunto": ["Train", "Test"],
        "RMSE (espacio original)": [rmse_tr, rmse_te],
        "MAE (espacio original)":  [mae_tr, mae_te],
        "R² (espacio del modelo)": [r2_tr, r2_te],
    })
    st.dataframe(met, use_container_width=True)

    # ========================= 2) Predicted vs actual =========================
    st.markdown("## 2) Predicho vs Real (Test)")
    df_pred = pd.DataFrame({"Real": y_test_eval, "Predicho": yhat_test_eval})
    fig_sc = px.scatter(
        df_pred, x="Real", y="Predicho", trendline="ols",
        color_discrete_sequence=palette, opacity=0.7,
        labels={"Real": "Valor real", "Predicho": "Predicción"}
    )
    # Identity line: a perfect model would put every point on it.
    real_lo, real_hi = df_pred["Real"].min(), df_pred["Real"].max()
    fig_sc.add_trace(go.Scatter(
        x=[real_lo, real_hi],
        y=[real_lo, real_hi],
        mode="lines", name="y = x", line=dict(dash="dash")
    ))
    st.plotly_chart(fig_sc, use_container_width=True)

    # ========================= 3) Error histogram =========================
    st.markdown("## 3) Histograma de errores (Test)")
    errores = y_test_eval - yhat_test_eval
    fig_err = px.histogram(x=errores, nbins=40, color_discrete_sequence=palette,
                           title="Distribución de errores del modelo")
    fig_err.add_vline(x=0, line_dash="dash")
    st.plotly_chart(fig_err, use_container_width=True)

    # ========================= 4) Residuals vs prediction =========================
    st.markdown("## 4) Residuos vs Predicción (Test)")
    df_res = pd.DataFrame({"ŷ": yhat_test_eval, "Residuo": errores})
    fig_res = px.scatter(
        df_res, x="ŷ", y="Residuo", opacity=0.7,
        color_discrete_sequence=palette, labels={"ŷ": "Predicción (ŷ)"}
    )
    fig_res.add_hline(y=0, line_dash="dash")
    st.plotly_chart(fig_res, use_container_width=True)

    # ========================= 5) Variable importance =========================
    st.markdown("## 5) Importancia de variables (coeficientes)")
    # Expanded names: numeric features first, then the one-hot columns.
    feat_display = list(num_feats or [])
    if cat_feats:
        ohe = pipe.named_steps["pre"].named_transformers_.get("cat", None)
        if hasattr(ohe, "get_feature_names_out"):
            feat_display += ohe.get_feature_names_out(cat_feats).tolist()
        elif hasattr(ohe, "categories_"):
            # Older scikit-learn: rebuild "<column>_<category>" names by hand.
            feat_display += [f"{c}_{lvl}" for c, lvls in zip(cat_feats, ohe.categories_) for lvl in lvls]

    coefs = np.array(pipe.named_steps["lr"].coef_).ravel()
    coefs_df = pd.DataFrame({
        "feature": feat_display[:len(coefs)],
        "coef": coefs[:len(feat_display)]
    })
    coefs_df = coefs_df.sort_values("coef", key=lambda s: s.abs(), ascending=False)

    fig_cf = px.bar(
        coefs_df.head(25), x="coef", y="feature",
        orientation="h", color="coef", color_continuous_scale=palette2,
        title="Coeficientes más importantes (|coef|)"
    )
    st.plotly_chart(fig_cf, use_container_width=True)

# Show the linear-regression view when `View` holds this label.
if View == "Regresión Lineal":
    render_regresion_lineal(dff)

# =====================================================================
# ==================== VISTA: REGRESIÓN NO LINEAL =====================
# =====================================================================
def render_regresion_no_lineal(dff):
    """Render the "Regresión No Lineal" Streamlit view.

    Builds the sidebar controls (simple/multiple, target, predictors), fits a
    set of candidate models on the complete numeric rows and compares them
    in-sample:

    - always: linear and polynomial (best degree in ``DEG_MIN..DEG_MAX`` by RMSE);
    - simple regression only: exponential, logarithmic and sigmoid.

    It then draws the comparison table, the fitted curves (simple) or the
    predicted-vs-actual scatters (multiple), the RMSE-vs-degree curve and the
    residual charts of the best model.

    Parameters
    ----------
    dff : pandas.DataFrame
        Frame to model; only its numeric columns are offered in the controls.

    Returns
    -------
    None
        Everything is emitted through Streamlit. The function returns early,
        after a warning, when there is nothing that can be modelled.
    """
    import warnings

    import numpy as np
    import pandas as pd
    import plotly.express as px
    import plotly.graph_objects as go
    import streamlit as st

    from scipy.optimize import curve_fit  # sigmoid fit
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import PolynomialFeatures, StandardScaler

    # ---------- Palettes (fallback when the module-level ones are missing) ----------
    global palette, palette2
    if "palette" not in globals():
        palette = ["#4C78A8", "#F58518", "#54A24B", "#E45756", "#72B7B2"]
    if "palette2" not in globals():
        palette2 = "Viridis"

    # Polynomial degrees explored automatically (no sliders).
    DEG_MIN, DEG_MAX = 1, 6
    # Minimum number of usable points for the exponential / logarithmic fits.
    MIN_POINTS = 5

    # ---------- Helpers ----------
    def rmse(y_true, y_pred):
        """Root mean squared error, computed as sqrt(MSE) for version independence."""
        return float(np.sqrt(mean_squared_error(y_true, y_pred)))

    def metrics_safe(y_true, y_pred):
        """(RMSE, MAE, R²) over the positions where both arrays are finite.

        Returns three NaNs when fewer than two such positions exist.
        """
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        m = np.isfinite(y_true) & np.isfinite(y_pred)
        if m.sum() < 2:
            return np.nan, np.nan, np.nan
        return (
            rmse(y_true[m], y_pred[m]),
            mean_absolute_error(y_true[m], y_pred[m]),
            r2_score(y_true[m], y_pred[m])
        )

    def drop_na_num(df_, cols):
        """Copy ``cols``, coerce them to numeric and drop rows with NaN or ±inf."""
        df2 = df_[list(cols)].copy()
        for c in cols:
            df2[c] = pd.to_numeric(df2[c], errors='coerce')
        df2 = df2.replace([np.inf, -np.inf], np.nan).dropna()
        return df2

    def get_numeric_cols(df_):
        """Return the names of the columns whose dtype is numeric."""
        return [c for c in df_.columns if pd.api.types.is_numeric_dtype(df_[c])]

    def rank_by_coef(df, y_col, X_cols):
        """Rank ``X_cols`` by |coefficient| of a standardised linear fit (descending).

        Falls back to |correlation with y| if the fit raises. Non-finite
        importances count as 0 so they end up last. Returns ``[]`` when no
        complete row is available.
        """
        dat = drop_na_num(df, [y_col] + X_cols)
        if dat.empty:
            return []
        y_arr = dat[y_col].to_numpy().astype(float)
        X_arr = dat[X_cols].to_numpy().astype(float)
        try:
            pipe = make_pipeline(StandardScaler(with_mean=True, with_std=True), LinearRegression())
            pipe.fit(X_arr, y_arr)
            coefs = np.abs(pipe.named_steps["linearregression"].coef_).ravel()
        except Exception:
            coefs = np.abs(np.corrcoef(np.c_[y_arr.reshape(-1, 1), X_arr], rowvar=False)[0, 1:])
        coefs = np.where(np.isfinite(coefs), coefs, 0.0)
        order = np.argsort(coefs)[::-1]
        return [X_cols[i] for i in order]

    def sigmoid(xv, L, k, x0):
        """Logistic curve y = L / (1 + exp(-k (x - x0)))."""
        return L / (1.0 + np.exp(-k * (xv - x0)))

    # ================= Sidebar =================
    st.sidebar.header("Configuración – Regresión No Lineal")

    # Regression type (simple or multiple)
    reg_type = st.sidebar.selectbox("Tipo de regresión", ["Múltiple", "Simple"], index=0)

    # Available variables
    numeric_cols = get_numeric_cols(dff)
    if '_price_num' in dff.columns and '_price_num' not in numeric_cols:
        numeric_cols = ['_price_num'] + numeric_cols

    if not numeric_cols:
        st.warning("No hay columnas numéricas disponibles.")
        return

    # Target variable
    y_default = '_price_num' if '_price_num' in numeric_cols else numeric_cols[0]
    y_col = st.sidebar.selectbox(
        "Variable dependiente (y)",
        options=numeric_cols,
        index=numeric_cols.index(y_default)
    )

    # Candidate predictors and their automatic ranking
    num_X_all = [c for c in numeric_cols if c != y_col]
    ranked = rank_by_coef(dff, y_col, num_X_all) if num_X_all else []

    # --- Variable selection per regression type ---
    if reg_type == "Simple":
        st.sidebar.subheader("Configuración – Regresión Simple")

        manual_select = st.sidebar.checkbox("Seleccionar variable manualmente", value=False)

        if not num_X_all:
            # No predictor exists at all; the warning is shown further down.
            used_num_feats = []
        elif manual_select:
            x_col = st.sidebar.selectbox(
                "Variable independiente (x)",
                options=num_X_all,
                index=0
            )
            used_num_feats = [x_col]
        else:
            best_x = ranked[0] if ranked else num_X_all[0]
            used_num_feats = [best_x]
            st.sidebar.caption(f"Variable seleccionada automáticamente: **{best_x}**")

    else:
        st.sidebar.subheader("Configuración – Regresión Múltiple")

        manual_select = st.sidebar.checkbox("Seleccionar variables manualmente", value=False)

        if manual_select:
            used_num_feats = st.sidebar.multiselect(
                "Variables independientes (X)",
                options=num_X_all,
                default=ranked[:5] if ranked else num_X_all[:5]
            )
        else:
            used_num_feats = ranked[:5] if ranked else num_X_all[:5]
            st.sidebar.caption("Variables seleccionadas automáticamente por mayor importancia de coeficientes.")

    # Title
    st.markdown(f"<h1 style='text-align:center;'>Regresión No Lineal {reg_type}</h1>", unsafe_allow_html=True)

    # ================= Base data =================
    is_simple = reg_type == "Simple"
    if not used_num_feats:
        if is_simple:
            st.warning("No hay variable X seleccionada.")
        else:
            st.warning("No hay variables predictoras seleccionadas automáticamente.")
        return

    x_cols = [used_num_feats[0]] if is_simple else list(used_num_feats)
    base = drop_na_num(dff, x_cols + [y_col])
    if base.empty:
        st.warning("No hay datos válidos para las columnas seleccionadas.")
        return
    x = base[x_cols].to_numpy().astype(float)   # shape (n, 1) in simple mode
    y = base[y_col].to_numpy().astype(float)

    # =============== Models (automatic, no sliders) ===============
    # Each entry: name, in-sample predictions, metrics and a "predict" callable
    # that evaluates the fitted model on a 1-D grid (only used in simple mode).
    results = []
    skipped = []        # models that could not be fitted
    poly_rmses = []     # RMSE per degree, reused for the RMSE-vs-degree chart
    best_deg = None

    # Silence numeric RuntimeWarnings (overflow in exp, etc.) only while fitting,
    # instead of changing the process-wide warning filters.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)

        # ---- Linear (always) ----
        lin = LinearRegression().fit(x, y)
        yhat_lin = lin.predict(x)
        results.append({
            "name": "Lineal",
            "yhat": yhat_lin,
            "rmse": rmse(y, yhat_lin),
            "mae": mean_absolute_error(y, yhat_lin),
            "r2": r2_score(y, yhat_lin),
            "predict": lambda g: lin.predict(np.asarray(g, dtype=float).reshape(-1, 1)).ravel(),
        })

        # ---- Polynomial (best degree by in-sample RMSE) ----
        best_rmse = np.inf
        best_poly = best_poly_mdl = best_poly_yhat = None
        for d in range(DEG_MIN, DEG_MAX + 1):
            poly = PolynomialFeatures(degree=d, include_bias=False)
            Xp = poly.fit_transform(x)
            mdl = LinearRegression().fit(Xp, y)
            yhat_d = mdl.predict(Xp)
            rmse_d = rmse(y, yhat_d)
            poly_rmses.append(rmse_d)
            if np.isfinite(rmse_d) and rmse_d < best_rmse:
                best_rmse, best_deg = rmse_d, d
                best_poly, best_poly_mdl, best_poly_yhat = poly, mdl, yhat_d

        if best_deg is not None:
            results.append({
                "name": f"Polinomial (g={best_deg})",
                "yhat": best_poly_yhat,
                "rmse": best_rmse,
                "mae": mean_absolute_error(y, best_poly_yhat),
                "r2": r2_score(y, best_poly_yhat),
                "predict": lambda g: best_poly_mdl.predict(
                    best_poly.transform(np.asarray(g, dtype=float).reshape(-1, 1))
                ).ravel(),
            })
        else:
            skipped.append("Polinomial")

        # ---- Exponential / logarithmic / sigmoid: single-predictor only ----
        if is_simple:
            xv = x.ravel()

            # Exponential: y = A * exp(b x)  <=>  ln(y) = ln(A) + b x  (needs y > 0)
            mask_ypos = y > 0
            if mask_ypos.sum() >= MIN_POINTS:
                exp_mdl = LinearRegression().fit(xv[mask_ypos].reshape(-1, 1), np.log(y[mask_ypos]))
                exp_a, exp_b = exp_mdl.intercept_, exp_mdl.coef_[0]

                def predict_exp(g):
                    return np.exp(exp_a + exp_b * np.asarray(g, dtype=float))

                yhat_exp = predict_exp(xv)
                r_rmse, r_mae, r_r2 = metrics_safe(y, yhat_exp)
                results.append({"name": "Exponencial", "yhat": yhat_exp, "rmse": r_rmse,
                                "mae": r_mae, "r2": r_r2, "predict": predict_exp})

            # Logarithmic: y = a + b ln(x)  (defined only where x > 0)
            mask_xpos = xv > 0
            if mask_xpos.sum() >= MIN_POINTS:
                log_mdl = LinearRegression().fit(np.log(xv[mask_xpos]).reshape(-1, 1), y[mask_xpos])

                def predict_log(g):
                    g = np.asarray(g, dtype=float)
                    out = np.full(g.shape, np.nan, dtype=float)
                    pos = g > 0
                    if pos.any():
                        out[pos] = log_mdl.predict(np.log(g[pos]).reshape(-1, 1)).ravel()
                    return out

                yhat_log = predict_log(xv)
                r_rmse, r_mae, r_r2 = metrics_safe(y, yhat_log)
                results.append({"name": "Logarítmico", "yhat": yhat_log, "rmse": r_rmse,
                                "mae": r_mae, "r2": r_r2, "predict": predict_log})

            # Sigmoid: y = L / (1 + exp(-k (x - x0)))
            try:
                L0 = np.nanmax(y) if np.isfinite(np.nanmax(y)) else 1.0
                k0 = 1.0 / (np.nanstd(xv) + 1e-8)
                x00 = np.nanmedian(xv)
                popt, _ = curve_fit(sigmoid, xv, y, p0=[L0, k0, x00], maxfev=20000)
            except Exception:
                # Best-effort model: a failed optimisation must not break the
                # view, but it is reported below instead of being hidden.
                skipped.append("Sigmoide")
            else:
                def predict_sig(g):
                    return sigmoid(np.asarray(g, dtype=float), *popt)

                yhat_sig = predict_sig(xv)
                r_rmse, r_mae, r_r2 = metrics_safe(y, yhat_sig)
                results.append({"name": "Sigmoide", "yhat": yhat_sig, "rmse": r_rmse,
                                "mae": r_mae, "r2": r_r2, "predict": predict_sig})

    # ================= Model comparison =================
    comp = pd.DataFrame([{"Modelo": r["name"], "RMSE": r["rmse"], "MAE": r["mae"], "R²": r["r2"]} for r in results])
    comp["rank_rmse"] = comp["RMSE"].rank(method="min")
    comp["rank_r2"] = (-comp["R²"]).rank(method="min")
    comp["score_mix"] = comp["rank_rmse"] + comp["rank_r2"]
    # Mixed criterion announced to the user: sum of both ranks, ties broken by RMSE rank.
    comp = comp.sort_values(["score_mix", "rank_rmse", "rank_r2"]).reset_index(drop=True)

    best_name = comp.loc[0, "Modelo"]
    st.markdown("### Comparativa de modelos")
    st.dataframe(comp[["Modelo", "RMSE", "MAE", "R²"]], use_container_width=True)
    st.success(f"Mejor modelo (criterio mixto RMSE & R²): {best_name}")
    if skipped:
        st.caption("Modelos omitidos (no se pudieron ajustar): " + ", ".join(skipped))

    # ================= Charts =================
    if is_simple:
        # Scatter plus every fitted curve, evaluated on a regular grid.
        st.markdown("### Ajuste de todos los modelos")
        xv = x.ravel()
        x_grid = np.linspace(np.nanmin(xv), np.nanmax(xv), 200)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            curves = [(r["name"], r["predict"](x_grid)) for r in results]

        fig_all = px.scatter(base, x=used_num_feats[0], y=y_col, opacity=0.5,
                             title="Ajustes no lineales", color_discrete_sequence=palette)
        for curve_name, y_grid in curves:
            fig_all.add_trace(go.Scatter(x=x_grid, y=y_grid, mode="lines",
                                         name=curve_name, line=dict(width=2)))
        st.plotly_chart(fig_all, use_container_width=True)
    else:
        # Multiple: predicted vs actual for the two applicable models.
        st.markdown("### Predicho vs Real (múltiple)")
        pred_cols = {"Real": y, "Lineal": yhat_lin}
        poly_col = None
        if best_deg is not None:
            poly_col = f"Polinomial(g={best_deg})"
            pred_cols[poly_col] = best_poly_yhat
        df_pred = pd.DataFrame(pred_cols)
        real_lo, real_hi = df_pred["Real"].min(), df_pred["Real"].max()

        def identity_line():
            """Dashed y = x reference; a perfect model lies on it."""
            return go.Scatter(x=[real_lo, real_hi], y=[real_lo, real_hi],
                              mode="lines", name="y=x", line=dict(dash="dash"))

        # Linear scatter
        fig_sc1 = px.scatter(df_pred, x="Real", y="Lineal", trendline="ols",
                             title="Predicho vs Real – Lineal", color_discrete_sequence=palette, opacity=0.7)
        fig_sc1.add_trace(identity_line())
        st.plotly_chart(fig_sc1, use_container_width=True)

        # Polynomial scatter
        if poly_col is not None:
            fig_sc2 = px.scatter(df_pred, x="Real", y=poly_col, trendline="ols",
                                 title=f"Predicho vs Real – Polinomial (g={best_deg})",
                                 color_discrete_sequence=palette, opacity=0.7)
            fig_sc2.add_trace(identity_line())
            st.plotly_chart(fig_sc2, use_container_width=True)

    # ================= RMSE vs degree =================
    st.markdown("### Error (RMSE) vs grado polinomial")
    degs = list(range(DEG_MIN, DEG_MAX + 1))
    # Reuses the RMSEs collected while searching for the best degree.
    fig_deg = px.line(x=degs, y=poly_rmses, markers=True,
                      labels={"x": "Grado", "y": "RMSE"},
                      title="RMSE vs Grado (Polinomial)",
                      color_discrete_sequence=palette)
    st.plotly_chart(fig_deg, use_container_width=True)

    # ================= Residuals of the best model =================
    yhat_best = next(r["yhat"] for r in results if r["name"] == best_name)
    resid = y - yhat_best

    st.markdown("### Histograma de residuos (mejor modelo)")
    fig_hist = px.histogram(x=resid, nbins=50, labels={"x": "Residuo"},
                            title=f"Distribución de residuos – {best_name}")
    fig_hist.add_vline(x=0, line_dash="dash")
    st.plotly_chart(fig_hist, use_container_width=True)

    st.markdown("### Mapa de calor de residuos (ŷ vs residuo)")
    fig_dh = px.density_heatmap(x=yhat_best, y=resid,
                                labels={"x": "ŷ (predicho)", "y": "Residuo"},
                                title=f"Residuo vs ŷ – {best_name}",
                                nbinsx=40, nbinsy=40, color_continuous_scale=palette2)
    st.plotly_chart(fig_dh, use_container_width=True)


# Show the non-linear regression view when `View` holds this label.
if View == "Regresión No Lineal":
    render_regresion_no_lineal(dff)

# =====================================================================
# ===================== VISTA: REGRESIÓN LOGÍSTICA ====================
# =====================================================================
def render_regresion_logistica(dff):
    """Render the "Regresión Logística" Streamlit view.

    Builds sidebar controls (target, features, test size, regularization,
    decision threshold), fits a scikit-learn logistic-regression pipeline on
    a stratified train/test split of ``dff`` and draws: test metrics at the
    chosen threshold, confusion matrix, ROC and precision-recall curves,
    predicted-probability histogram, metric-vs-threshold curves, coefficient
    bar chart and a strip plot of probabilities per true class.

    Args:
        dff: pandas DataFrame with the data to model. It is not modified.

    Returns:
        None. The function writes directly to the Streamlit page and returns
        early (after showing a message) when the data cannot be modelled:
        no binary column, no selected features, no valid rows, a target with
        a single class, or a split that cannot be stratified.
    """
    import numpy as np
    import pandas as pd
    import streamlit as st
    import plotly.express as px
    import plotly.graph_objects as go

    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import (
        confusion_matrix, roc_curve, auc,
        precision_recall_curve, average_precision_score,
        precision_score, recall_score, f1_score, accuracy_score
    )

    st.subheader("Regresión Logística")

    # ---------------- Helpers ----------------
    def is_binary_series(s: pd.Series) -> bool:
        """Return True when the non-null values of ``s`` look binary.

        Accepts boolean dtypes, numeric columns whose values are a subset of
        {0, 1}, and string-like columns whose values are all in a small set
        of true/false spellings ('t'/'f', 'yes'/'no', ...). An all-null
        column is not considered binary.
        """
        vals = pd.unique(s.dropna())
        if len(vals) == 0:
            return False
        # Booleans first: pandas also reports bool dtypes as numeric, so the
        # check order matters for this branch to be reachable.
        if pd.api.types.is_bool_dtype(s):
            return True
        # Numeric 0/1
        if pd.api.types.is_numeric_dtype(s):
            u = pd.unique(pd.to_numeric(s, errors="coerce").dropna())
            return set(np.unique(u)).issubset({0, 1})
        # Strings such as true/false
        vs = pd.Series(vals).astype(str).str.lower().str.strip()
        bin_like = {"0","1","true","false","t","f","yes","no","y","n"}
        return bool(vs.isin(list(bin_like)).all())

    def to_binary(s: pd.Series) -> pd.Series:
        """Map a binary-looking series to 0.0/1.0, keeping missing values as NaN.

        Missing or unmapped entries stay NaN (instead of raising or being
        silently counted as class 0) so that the caller's ``dropna`` removes
        those rows before modelling.
        """
        if pd.api.types.is_bool_dtype(s):
            return s.astype(float)
        if pd.api.types.is_numeric_dtype(s):
            num = pd.to_numeric(s, errors="coerce")
            # `NaN > 0` is False, so restore NaN where the label was missing.
            return (num > 0).astype(float).where(num.notna())
        # String mapping; anything outside the map (including "nan") -> NaN
        m = {"true":1,"t":1,"yes":1,"y":1,"1":1,
             "false":0,"f":0,"no":0,"n":0,"0":0}
        return s.astype(str).str.lower().str.strip().map(m).astype(float)

    def safe_metrics(y_true, y_prob, thr):
        """Compute accuracy/precision/recall/F1 at threshold ``thr``.

        Pairs where either the label or the probability is not finite are
        ignored; if nothing is left every metric is NaN.
        """
        y_true = np.asarray(y_true, dtype=float)
        y_prob = np.asarray(y_prob, dtype=float)
        m = np.isfinite(y_true) & np.isfinite(y_prob)
        if m.sum() == 0:
            return dict(acc=np.nan, prec=np.nan, rec=np.nan, f1=np.nan)
        y_pred = (y_prob[m] >= thr).astype(int)
        return dict(
            acc = accuracy_score(y_true[m], y_pred),
            prec= precision_score(y_true[m], y_pred, zero_division=0),
            rec = recall_score(y_true[m], y_pred, zero_division=0),
            f1  = f1_score(y_true[m], y_pred, zero_division=0)
        )

    # ---------------- Sidebar: controls ----------------
    st.sidebar.markdown("### Controles — Logística")

    # Offer only the columns that look binary as possible targets
    candidate_targets = [c for c in dff.columns if is_binary_series(dff[c])]
    if not candidate_targets:
        st.warning("No encuentro columnas binarias en tus datos. Crea una y vuelve a esta vista (p. ej. has_availability, host_is_superhost, etc.).")
        return

    target_col = st.sidebar.selectbox("Variable objetivo (binaria)", candidate_targets, key="lg_target")

    # Features: numeric and categorical (everything non-numeric)
    num_cols = [c for c in dff.columns if pd.api.types.is_numeric_dtype(dff[c]) and c != target_col]
    cat_cols = [c for c in dff.columns if (not pd.api.types.is_numeric_dtype(dff[c])) and c != target_col]

    sel_num = st.sidebar.multiselect("Variables numéricas (X)", num_cols, default=num_cols[:5], key="lg_num")
    sel_cat = st.sidebar.multiselect("Variables categóricas (X)", cat_cols, default=[c for c in cat_cols if c in ["room_type","property_type","neighbourhood_cleansed"]], key="lg_cat")

    test_size = st.sidebar.slider("Test size", 0.1, 0.4, 0.2, 0.05, key="lg_test")
    standardize = st.sidebar.checkbox("Estandarizar numéricas", value=True, key="lg_std")
    balanced = st.sidebar.checkbox("Balancear clases (class_weight='balanced')", value=True, key="lg_bal")
    C_val = st.sidebar.number_input("Regularización (C, mayor = menos regularización)", min_value=0.001, value=1.0, step=0.5, key="lg_C")
    thr = st.sidebar.slider("Umbral de decisión", 0.0, 1.0, 0.50, 0.01, key="lg_thr")
    normalize_cm = st.sidebar.checkbox("Normalizar matriz de confusión", value=True, key="lg_cm_norm")

    used_cols = sel_num + sel_cat
    if not used_cols:
        st.info("Selecciona al menos una variable independiente (numérica o categórica).")
        return

    # ---------------- Data preparation ----------------
    # Binary target (NaN kept for rows with a missing label)
    y_raw = dff[target_col]
    if not is_binary_series(y_raw):
        st.warning("La columna objetivo no parece binaria después de limpiar.")
        return
    y_all = to_binary(y_raw)

    # Drop rows with NaN/inf in X or y; `concat` builds a new frame, so the
    # caller's DataFrame is never modified.
    data = pd.concat([dff[used_cols], y_all.rename("__y__")], axis=1).replace([np.inf, -np.inf], np.nan).dropna()
    if data.empty:
        st.warning("Sin datos válidos tras eliminar NaN/inf en X o y.")
        return

    X = data[used_cols]
    y = data["__y__"].astype(int).to_numpy()

    # Logistic regression needs both classes, and a stratified split needs at
    # least two rows of each one; stop with a message instead of a traceback.
    class_counts = np.bincount(y, minlength=2)
    if (class_counts > 0).sum() < 2:
        st.warning("La variable objetivo solo tiene una clase en los datos válidos; no se puede ajustar la regresión logística.")
        return
    if class_counts.min() < 2:
        st.warning("Alguna clase tiene menos de 2 observaciones; no se puede hacer la partición estratificada.")
        return

    # Preprocessing
    transformers = []
    if sel_num:
        transformers.append(("num", StandardScaler() if standardize else "passthrough", sel_num))
    if sel_cat:
        # `sparse_output` replaced `sparse` in newer scikit-learn versions
        try:
            ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
        transformers.append(("cat", ohe, sel_cat))

    pre = ColumnTransformer(transformers=transformers, remainder="drop")

    lr_args = dict(max_iter=1000, C=float(C_val), solver="lbfgs")
    if balanced:
        lr_args["class_weight"] = "balanced"

    pipe = Pipeline(steps=[
        ("pre", pre),
        ("clf", LogisticRegression(**lr_args))
    ])

    # Split (stratified so both partitions keep the class proportions)
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=float(test_size), random_state=42, stratify=y
        )
    except ValueError as exc:
        # e.g. too few rows for the requested test size
        st.warning(f"No se pudo dividir los datos en train/test: {exc}")
        return

    # Fit
    pipe.fit(X_train, y_train)

    # Probabilities of class 1 and hard predictions at the chosen threshold
    y_proba_test = pipe.predict_proba(X_test)[:, 1]
    y_pred_test  = (y_proba_test >= thr).astype(int)

    # ---------------- Metrics at the chosen threshold ----------------
    mets = safe_metrics(y_test, y_proba_test, thr)
    st.markdown("### Métricas (Test) con umbral seleccionado")
    st.write(pd.DataFrame([{
        "Accuracy": round(mets["acc"], 4),
        "Precision": round(mets["prec"], 4),
        "Recall": round(mets["rec"], 4),
        "F1": round(mets["f1"], 4)
    }]))

    # ---------------- Confusion matrix ----------------
    # `labels=[0, 1]` forces a 2x2 matrix even if one class is absent from
    # the test labels/predictions, matching the fixed axis ticks below.
    if normalize_cm:
        cm = confusion_matrix(y_test, y_pred_test, labels=[0, 1], normalize="true")
        cm_title = "Matriz de Confusión (normalizada)"
        zmin, zmax, fmt = 0, 1, ".2f"
    else:
        cm = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
        cm_title = "Matriz de Confusión"
        zmin, zmax, fmt = 0, cm.max(), "d"

    fig_cm = px.imshow(
        cm, text_auto=fmt, zmin=zmin, zmax=zmax, color_continuous_scale= palette2,
        labels=dict(x="Predicho", y="Real", color="Valor"), title=cm_title
    )
    fig_cm.update_xaxes(tickmode="array", tickvals=[0,1], ticktext=["0","1"])
    fig_cm.update_yaxes(tickmode="array", tickvals=[0,1], ticktext=["0","1"])
    st.plotly_chart(fig_cm, use_container_width=True)

    # ---------------- ROC curve and AUC ----------------
    fpr, tpr, _ = roc_curve(y_test, y_proba_test)
    roc_auc = auc(fpr, tpr)
    fig_roc = go.Figure()
    fig_roc.add_trace(go.Scatter(x=fpr, y=tpr, mode="lines", name=f"ROC (AUC={roc_auc:.3f})"))
    fig_roc.add_trace(go.Scatter(x=[0,1], y=[0,1], mode="lines", name="Azar", line=dict(dash="dash")))
    fig_roc.update_layout(title="Curva ROC", xaxis_title="FPR", yaxis_title="TPR")
    show(fig_roc)

    # ---------------- Precision-Recall curve ----------------
    prec, rec, _ = precision_recall_curve(y_test, y_proba_test)
    ap = average_precision_score(y_test, y_proba_test)
    fig_pr = go.Figure()
    fig_pr.add_trace(go.Scatter(x=rec, y=prec, mode="lines", name=f"PR (AP={ap:.3f})", line=dict(color=palette[1])))
    fig_pr.update_layout(title="Curva Precision–Recall", xaxis_title="Recall", yaxis_title="Precision", )
    st.plotly_chart(fig_pr, use_container_width=True)

    # ---------------- Distribution of predicted probabilities ----------------
    fig_hist = px.histogram(x=y_proba_test, nbins=40, title="Distribución de probabilidades predichas (Test)")
    fig_hist.add_vline(x=thr, line_dash="dash", line_color="black")
    fig_hist.update_xaxes(title="P(clase=1)")
    st.plotly_chart(fig_hist, use_container_width=True)

    # ---------------- Threshold curve (Precision/Recall/F1 vs threshold) ----------------
    thresholds = np.linspace(0.0, 1.0, 101)
    precs, recs, f1s = [], [], []
    for t in thresholds:
        m = safe_metrics(y_test, y_proba_test, t)
        precs.append(m["prec"]); recs.append(m["rec"]); f1s.append(m["f1"])
    fig_thr = go.Figure()
    fig_thr.add_trace(go.Scatter(x=thresholds, y=precs, mode="lines", name="Precision", line=dict(color=palette[0])))
    fig_thr.add_trace(go.Scatter(x=thresholds, y=recs, mode="lines", name="Recall", line=dict(color=palette[1])))
    fig_thr.add_trace(go.Scatter(x=thresholds, y=f1s, mode="lines", name="F1", line=dict(color=palette[2])))
    fig_thr.add_vline(x=thr, line_dash="dash", line_color="black")
    fig_thr.update_layout(title="Precision / Recall / F1 vs Umbral", xaxis_title="Umbral", yaxis_title="Score")
    st.plotly_chart(fig_thr, use_container_width=True)

    # ---------------- Variable importance (coefficients) ----------------
    # Rebuild the feature names in the order the ColumnTransformer emits
    # them: numeric columns first, then the one-hot encoded categories.
    feat_names = list(sel_num)
    if sel_cat:
        ohe = pipe.named_steps["pre"].named_transformers_.get("cat")
        if hasattr(ohe, "get_feature_names_out"):
            feat_names += ohe.get_feature_names_out(sel_cat).tolist()
        else:
            # Very old scikit-learn: derive one name per learned category so
            # the list stays aligned with the coefficient vector.
            for c, cats in zip(sel_cat, ohe.categories_):
                feat_names += [f"{c}_{v}" for v in cats]

    coefs = pipe.named_steps["clf"].coef_.ravel()
    if len(feat_names) != len(coefs):
        # Should not happen; fall back to positional names rather than crash
        # or mislabel coefficients.
        feat_names = [f"x{i}" for i in range(len(coefs))]
    coef_df = pd.DataFrame({"feature": feat_names, "coef": coefs})
    coef_df = coef_df.sort_values("coef", key=lambda s: s.abs(), ascending=False)

    fig_coef = px.bar(coef_df.head(30), x="coef", y="feature", orientation="h",
                      title="Importancia de variables (coeficientes)", color="coef",
                      color_continuous_scale= palette2)
    st.plotly_chart(fig_coef, use_container_width=True)

    # ---------------- Predicted probability vs true label ----------------
    fig_sc = px.strip(
        x=y_test.astype(int), y=y_proba_test,
        labels={"x": "Real (0/1)", "y": "P(clase=1)"},
        title="Probabilidades predichas por clase real",
        color_discrete_sequence= [palette[1]]
    )
    fig_sc.add_hline(y=thr, line_dash="dash")
    st.plotly_chart(fig_sc, use_container_width=True)

# Render the logistic regression view when `View` equals its label,
# passing the `dff` DataFrame through to the renderer.
if View == "Regresión Logística":
    render_regresion_logistica(dff)

Overwriting despliegue.py
