In [43]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import mlflow
from mlflow.models import infer_signature

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier


from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix
)

# Yellowbrick functions
from yellowbrick.classifier import (
    ConfusionMatrix
)

# Import xgboost
import xgboost as xgb

sns.set_palette("colorblind")
SEED = 42

# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://0.0.0.0:5000")

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load clean data

The data is loaded with the following cleaning step applied:
- Downsampling

In [31]:
from mlops_bootcamp_team10 import dataset

# Run the project's dataset entry point and unpack its two return values.
# NOTE(review): X and y are presumably the feature table and the target column
# (they are later passed to train_test_split) — confirm in mlops_bootcamp_team10.dataset.
X, y = dataset.main()

[32m2024-09-18 13:20:17.280[0m | [1mINFO    [0m | [36mmlops_bootcamp_team10.dataset[0m:[36mmain[0m:[36m37[0m - [1mProcessing dataset...[0m
[32m2024-09-18 13:20:17.280[0m | [1mINFO    [0m | [36mmlops_bootcamp_team10.dataset[0m:[36mmain[0m:[36m38[0m - [1mInput path /Users/xochitl.cedillo/Internal_proyects/MLOps - Bootcamp/MLOps-Bootcamp-Team10/mlops-bootcamp-team10/data/raw/hotel_bookings.csv[0m


100%|██████████| 1/1 [00:00<00:00,  8.74it/s]


[32m2024-09-18 13:20:17.720[0m | [1mINFO    [0m | [36mmlops_bootcamp_team10.data.clean[0m:[36mdownsample[0m:[36m13[0m - [1mDownsampling data...[0m
[32m2024-09-18 13:20:17.797[0m | [34m[1mDEBUG   [0m | [36mmlops_bootcamp_team10.data.clean[0m:[36mdownsample[0m:[36m20[0m - [34m[1mCancelled data size: 1415168[0m
[32m2024-09-18 13:20:17.800[0m | [34m[1mDEBUG   [0m | [36mmlops_bootcamp_team10.data.clean[0m:[36mdownsample[0m:[36m21[0m - [34m[1mNot Canccelled data size: 1415168[0m
[32m2024-09-18 13:20:17.835[0m | [32m[1mSUCCESS [0m | [36mmlops_bootcamp_team10.dataset[0m:[36mmain[0m:[36m44[0m - [32m[1mProcessing dataset complete.[0m
[32m2024-09-18 13:20:17.836[0m | [1mINFO    [0m | [36mmlops_bootcamp_team10.dataset[0m:[36mmain[0m:[36m47[0m - [1mExporting dataset.[0m
[32m2024-09-18 13:20:23.129[0m | [32m[1mSUCCESS [0m | [36mmlops_bootcamp_team10.dataset[0m:[36mmain[0m:[36m49[0m - [32m[1mSuccess! Dataset exported to /U

### Pipeline -> Data Preparation

#### Feature selection

The features that produced the best results were:

- Categorical: 
```python
        'hotel',
        'market_segment',
        'deposit_type',
        'customer_type',
        'meal'
```
- Numerical: 
```python
        'arrival_date_week_number',
        'stays_in_weekend_nights',
        'stays_in_week_nights',
        'lead_time',
        'total_of_special_requests',
        'is_repeated_guest',
        'previous_cancellations',
        'previous_bookings_not_canceled',
```

In [47]:
from mlops_bootcamp_team10.data import prep

# Categorical columns handed to the preprocessor.
variable_selection_categoric = [
    'hotel', 'market_segment', 'deposit_type', 'customer_type', 'meal',
]

# Numerical columns handed to the preprocessor.
variable_selection_numeric = [
    'arrival_date_week_number',
    'stays_in_weekend_nights', 'stays_in_week_nights',
    'lead_time',
    'total_of_special_requests',
    'is_repeated_guest',
    'previous_cancellations', 'previous_bookings_not_canceled',
]

# Build the project preprocessor from the two column selections.
preprocessor = prep.Preprocessor(
    numerical_cols=variable_selection_numeric,
    categorical_cols=variable_selection_categoric,
)

In [37]:
# Hold out 20% of the rows as a test set; SEED makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

### Pipeline ->  Model Training

In [49]:
# Hyperparameters for the random forest.
params = {
    'n_estimators': 110,
    'min_samples_split': 4,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'max_depth': 25,
}

# Seed the forest explicitly. Bootstrap sampling and per-split feature
# subsampling are random, so without random_state every run trains a
# different model. The seed is passed separately so `params` itself is unchanged.
classifier = RandomForestClassifier(**params, random_state=SEED)

# Define the complete pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Step 1: Preprocessing
    ('classifier', classifier),      # Step 2: Model training
])

In [40]:
# Fit the preprocessor on its own, outside the pipeline.
# NOTE(review): the saved output of this cell is an AttributeError
# ('Preprocessor' object has no attribute 'categorical_cols'). The cause is in
# prep.Preprocessor, which is not visible here — TODO confirm that its __init__
# stores `categorical_cols` / `numerical_cols` under those exact attribute names.
preprocessor.fit(X_train, y_train)

AttributeError: 'Preprocessor' object has no attribute 'categorical_cols'

In [35]:
# Train the whole pipeline on the training split, then predict labels for the
# held-out split in one chained call (Pipeline.fit returns the pipeline itself).
predictions = pipeline.fit(X_train, y_train).predict(X_test)

AttributeError: 'Preprocessor' object has no attribute 'categorical_cols'

### Pipeline -> Model Validation