In [1]:
import pandas as pd
path_data_dir = "../../data/inputs/" 
train_data_file = path_data_dir + "hoteles-entrena.csv" 
test_data_file = path_data_dir + "hoteles-prueba.csv"
#train data reanding
train_data = pd.read_csv(train_data_file, sep=",")
print('')
print('train data info:')
train_data.info()
test_data = pd.read_csv(test_data_file, sep=",")
print('')
print('test data info:')
test_data.info()


train data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52981 entries, 0 to 52980
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           52981 non-null  object 
 1   lead_time                       52981 non-null  int64  
 2   stays_in_weekend_nights         52981 non-null  int64  
 3   stays_in_week_nights            52981 non-null  int64  
 4   adults                          52981 non-null  int64  
 5   children                        52981 non-null  object 
 6   meal                            52981 non-null  object 
 7   country                         52681 non-null  object 
 8   market_segment                  52981 non-null  object 
 9   distribution_channel            52981 non-null  object 
 10  is_repeated_guest               52981 non-null  int64  
 11  previous_cancellations          52981 non-null  int64  
 12  previous_booki

In [2]:
train_data['children'] = train_data['children'].apply(lambda x: 0 if x == 'none' else 1)
type(train_data['children'])

pandas.core.series.Series

In [3]:
missing_in_test = set(train_data.columns) - set(test_data.columns)
print("Columns in train_data but not in test_data:", missing_in_test)

Columns in train_data but not in test_data: {'children'}


In [4]:
# Convert children to numeric
train_data['children'] = pd.to_numeric(train_data['children'], errors='coerce')

# Define target: presence of children (0 or 1)
train_data['has_children'] = (train_data['children'] > 0).astype(int)
y = train_data['has_children']
train_data['children'].isna().sum()


0

In [5]:
train_data['children'].shape
print(train_data['children'])

0        0
1        0
2        0
3        0
4        0
        ..
52976    0
52977    0
52978    0
52979    0
52980    0
Name: children, Length: 52981, dtype: int64


Create a Pipeline for preprocessing and modeling

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Define features (exclude 'children')
X = train_data.drop(columns=['children', 'has_children'])

# Split categorical/numerical
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(include=['int64','float64']).columns.tolist()

# Imputers (no data evalable set median value for numerical, most_frequent for categorical)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Model
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit
clf.fit(X_train, y_train)

print("Validation accuracy:", clf.score(X_val, y_val))


Validation accuracy: 0.9333773709540436


In [7]:
#  prep test data for pipeline}
df_test = test_data.copy()




In [8]:

# Predict on validation set
y_pred = clf.predict(df_test)




In [9]:
# Predict probabilities for validation set (y=1 means has children)
y_proba = clf.predict_proba(df_test)[:, 1]
X_test = df_test.copy()
X_test['id'] = range(1, len(df_test) + 1)

In [10]:
# Create a  DataFrame for submissio 
df_submission = pd.DataFrame({
    'id': X_test['id'],
    'prob': y_proba
})
df_submission.head()
df_submission.shape

(22185, 2)

In [11]:
from datetime import datetime
date = datetime.now().strftime('%y%m%d')  # format yy_mm_dd
new_sumbission_file = f'../../data/outputs/submission_rdf_{date}.csv'

df_submission.to_csv(new_sumbission_file, index=False)
print(f'sumbission saved to: {new_sumbission_file}')

sumbission saved to: ../../data/outputs/submission_rdf_250929.csv
