In [1]:
import pandas as pd
import numpy as np

In [2]:
df_hoteles = pd.read_csv('./data/hoteles-entrena.csv')

In [3]:
df_hoteles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52981 entries, 0 to 52980
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           52981 non-null  object 
 1   lead_time                       52981 non-null  int64  
 2   stays_in_weekend_nights         52981 non-null  int64  
 3   stays_in_week_nights            52981 non-null  int64  
 4   adults                          52981 non-null  int64  
 5   children                        52981 non-null  object 
 6   meal                            52981 non-null  object 
 7   country                         52681 non-null  object 
 8   market_segment                  52981 non-null  object 
 9   distribution_channel            52981 non-null  object 
 10  is_repeated_guest               52981 non-null  int64  
 11  previous_cancellations          52981 non-null  int64  
 12  previous_bookings_not_canceled  

### Entendiendo las variables

Vamos a hacer un pequeño análisis exploratorio de los datos, empezando por entender los valores de las diferentes variables. 

In [4]:
# Identificamos las variables categóricas (tipo object)
categorical_columns = df_hoteles.select_dtypes(include=['object']).columns
print(f"Variables categóricas encontradas: {len(categorical_columns)}")
print(f"Nombres: {list(categorical_columns)}")
print("\n" + "="*60)
print("NÚMERO DE CATEGORÍAS POR VARIABLE:")
print("="*60)

# Para cada variable categórica, contamos cuántas categorías únicas tiene
for col in categorical_columns:
    unique_count = df_hoteles[col].nunique()
    print(f"{col:30} | {unique_count:4} categorías")
    
print("\n" + "="*60)
print("DETALLE DE LAS CATEGORÍAS:")
print("="*60)

# Mostramos las categorías de cada variable
for col in categorical_columns:
    print(f"\n{col.upper()}:")
    unique_values = df_hoteles[col].unique()
    print(f"  Categorías ({len(unique_values)}): {list(unique_values)}")
    
    # Si hay valores nulos, los mostramos
    null_count = df_hoteles[col].isnull().sum()
    if null_count > 0:
        print(f"  Valores nulos: {null_count}")

Variables categóricas encontradas: 12
Nombres: ['hotel', 'children', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type', 'required_car_parking_spaces', 'arrival_date']

NÚMERO DE CATEGORÍAS POR VARIABLE:
hotel                          |    2 categorías
children                       |    2 categorías
meal                           |    5 categorías
country                        |  146 categorías
market_segment                 |    7 categorías
distribution_channel           |    5 categorías
reserved_room_type             |    9 categorías
assigned_room_type             |   10 categorías
deposit_type                   |    3 categorías
customer_type                  |    4 categorías
required_car_parking_spaces    |    2 categorías
arrival_date                   |  793 categorías

DETALLE DE LAS CATEGORÍAS:

HOTEL:
  Categorías (2): ['Resort_Hotel', 'City_Hotel']

CHILDREN:
  Categorías (2): ['none',

In [5]:
#Describimos las variables numéricas con media, std, min, max, y percentiles
df_hoteles.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
lead_time,52981.0,79.968913,90.959584,0.0,9.0,45.0,124.0,737.0
stays_in_weekend_nights,52981.0,0.928559,0.98873,0.0,0.0,1.0,2.0,18.0
stays_in_week_nights,52981.0,2.463544,1.907669,0.0,1.0,2.0,3.0,42.0
adults,52981.0,1.828901,0.511329,0.0,2.0,2.0,2.0,4.0
is_repeated_guest,52981.0,0.043223,0.203361,0.0,0.0,0.0,0.0,1.0
previous_cancellations,52981.0,0.016176,0.280051,0.0,0.0,0.0,0.0,13.0
previous_bookings_not_canceled,52981.0,0.201959,1.825034,0.0,0.0,0.0,0.0,72.0
booking_changes,52981.0,0.293822,0.732195,0.0,0.0,0.0,0.0,20.0
agent,44338.0,94.40931,114.38034,1.0,9.0,14.0,240.0,535.0
company,3939.0,191.722265,132.407868,6.0,51.0,192.0,270.0,541.0


#### Pequeñas modificaciones en variables

In [6]:
#Convertir la variable objetivo a numérica
df_hoteles['children'] = df_hoteles['children'].map({'children': 1, 'none': 0})
df_hoteles['children'].head()

0    0
1    0
2    0
3    0
4    0
Name: children, dtype: int64

In [7]:

#Nulos en country
df_hoteles['country'] = df_hoteles['country'].fillna('Unknown')

#Fecha de llegada: Convertimos a datetime 
df_hoteles['arrival_date'] = pd.to_datetime(df_hoteles['arrival_date'], format='%Y-%m-%d')
#Extraemos el mes de llegada
df_hoteles['arrival_month'] = df_hoteles['arrival_date'].dt.month
#Extraemos año
df_hoteles['arrival_year'] = df_hoteles['arrival_date'].dt.year
#Extraemos día de la semana
df_hoteles['arrival_dayofweek'] = df_hoteles['arrival_date'].dt.dayofweek

#Para el primer análisis usaremos arrival_month.


### Primer intento

Utilizaremos sólo algunas variables numéricas como `lead_time`, `stays_in_weekend_nights`, `stays_in_week_nights`, `adults`, `arrival_month`,`hotel`,`meal`.

De todas ellas, sólo hotel y meal son categóricas, con pocas categorías. 

In [8]:
#Codificamos categorías. 
X = df_hoteles[['lead_time','stays_in_weekend_nights','stays_in_week_nights',
                'adults','arrival_month','hotel','meal']]
X = pd.get_dummies(X, columns=['hotel','meal'], drop_first=True)
y = df_hoteles['children']

In [9]:
#conjunto de entrenamiento y test
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_cw = LogisticRegression(max_iter=1000,  class_weight='balanced')
model_cw.fit(X_train, y_train)

y_pred = model_cw.predict(X_test)
print(classification_report(y_test,y_pred, zero_division=0))



              precision    recall  f1-score   support

           0       0.96      0.43      0.59      9716
           1       0.11      0.80      0.20       881

    accuracy                           0.46     10597
   macro avg       0.54      0.61      0.39     10597
weighted avg       0.89      0.46      0.56     10597



In [10]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Crear SMOTE
smote = SMOTE(random_state=42)

# Ajustar y transformar SOLO en entrenamiento
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Antes del SMOTE:", y_train.value_counts())
print("Después del SMOTE:", y_train_res.value_counts())

# Entrenar modelo
model_smote = LogisticRegression(max_iter=1000)
model_smote.fit(X_train_res, y_train_res)

# Evaluar
y_pred = model_smote.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

Antes del SMOTE: children
0    38917
1     3467
Name: count, dtype: int64
Después del SMOTE: children
0    38917
1    38917
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       0.93      0.50      0.65      9730
           1       0.09      0.55      0.16       867

    accuracy                           0.51     10597
   macro avg       0.51      0.53      0.40     10597
weighted avg       0.86      0.51      0.61     10597



### Archivo de Prueba

In [16]:
import pandas as pd
from datetime import datetime

# Cargar nuevo CSV
df_nuevo = pd.read_csv('./data/hoteles-prueba.csv')


In [17]:
# Rellenar nulos en country
df_nuevo['country'] = df_nuevo['country'].fillna('Unknown')

# Convertir fecha
df_nuevo['arrival_date'] = pd.to_datetime(df_nuevo['arrival_date'], format='%Y-%m-%d')
df_nuevo['arrival_month'] = df_nuevo['arrival_date'].dt.month

# Seleccionar las mismas columnas predictoras que usaste
X_new = df_nuevo[['lead_time','stays_in_weekend_nights','stays_in_week_nights',
                  'adults','arrival_month','hotel','meal']]

# Codificar categóricas igual que antes
X_new = pd.get_dummies(X_new, drop_first=True)

# Alinear columnas con las del modelo entrenado
X_new = X_new.reindex(columns=X.columns, fill_value=0)


In [18]:
# Probabilidades
probs = model_smote.predict_proba(X_new)[:,1]  # probabilidad de clase 1 (con niños)


In [19]:
# Crear DataFrame de salida
df_salida = pd.DataFrame({
    'id': df_nuevo['id'],
    'prob': probs
})
df_salida.head()

Unnamed: 0,id,prob
0,1,0.506519
1,2,0.525687
2,3,0.33122
3,4,0.525687
4,5,0.528959


In [20]:
fecha = datetime.now().strftime('%y%m%d')  # formato yy_mm_dd
nombre_archivo = f'./data/Salida_{fecha}.csv'

df_salida.to_csv(nombre_archivo, index=False)
print(f'Se guardó el archivo: {nombre_archivo}')


Se guardó el archivo: ./data/Salida_250928.csv
