# 0. Development Setting

In [None]:
#  !pip install pymysql boto3 scikit-learn

In [1]:
!pip list

Package                      Version
---------------------------- ---------
absl-py                      1.4.0
anyio                        3.6.2
argon2-cffi                  21.3.0
argon2-cffi-bindings         21.2.0
arrow                        1.2.3
asttokens                    2.2.1
astunparse                   1.6.3
attrs                        22.2.0
backcall                     0.2.0
beautifulsoup4               4.11.2
bleach                       6.0.0
boto3                        1.26.72
botocore                     1.29.72
cachetools                   5.3.0
certifi                      2022.12.7
cffi                         1.15.1
charset-normalizer           3.0.1
comm                         0.1.2
debugpy                      1.6.6
decorator                    5.1.1
defusedxml                   0.7.1
easydict                     1.10
executing                    1.2.0
fastjsonschema               2.16.2
flatbuffers                  23.1.21
fqdn                         1.5.1

In [2]:
!python -V

Python 3.9.16


In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import os 

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

In [3]:
import tensorflow as tf

import pandas as pd 
import numpy as np
import pymysql
import boto3

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from smart_open import open as s_open
from dotenv import load_dotenv
from easydict import EasyDict

from datetime import datetime
import time

load_dotenv()

True

In [4]:
settings = EasyDict()

settings.AWS_ACCESS_KEY_ID = os.environ["AWS_ACCESS_KEY_ID"]
settings.AWS_SECRET_ACCESS_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]
settings.AWS_REGION_NAME = os.environ["REGION_NAME"]
settings.AWS_ACCOUNT_ID = os.environ["AWS_ACCOUNT_ID"]
settings.AWS_BUCKET_NAME = "genia-bucket"
settings.AWS_ATHENA_OUTPUT_LOCATION = "athena/quries"
settings.AWS_ATHENA_DATABASE = "mini_db"

In [5]:
class Boto3Client(object):
    aws_access_key_id = settings.AWS_ACCESS_KEY_ID
    aws_secret_access_key = settings.AWS_SECRET_ACCESS_KEY
    region_name = settings.AWS_REGION_NAME
    bucket_name = settings.AWS_BUCKET_NAME
    athena_database = settings.AWS_ATHENA_DATABASE
    athena_output_location = settings.AWS_ATHENA_OUTPUT_LOCATION
    
    service_name = None
    
    @classmethod
    def get_client(cls):
        options = dict(
            aws_access_key_id=Boto3Client.aws_access_key_id,
            aws_secret_access_key=Boto3Client.aws_secret_access_key,
            region_name=Boto3Client.region_name,
        )
        return boto3.client(cls.service_name, **options)

In [6]:
import io

class S3Client(Boto3Client):
    
    service_name = "s3"
    
    @staticmethod
    def get_s3_df(file_name: str):
        
        clnt = S3Client.get_client()
        obj = clnt.get_object(
                Bucket=S3Client.bucket_name,
                Key=file_name
            )
        
        if ".csv" in file_name:
            return pd.read_csv(obj["Body"])
        
        if ".parquet" in file_name:
            return pd.read_parquet(io.BytesIO(obj["Body"].read()), engine="pyarrow")
    
    @staticmethod
    def upload_s3_df(df: pd.DataFrame, file_name: str):
        try:
            clnt = S3Client.get_client()
            file_name = f"s3://{S3Client.bucket_name}/{file_name}"
            with s_open(file_name, "wb", transport_params=dict(client=clnt)) as out_file:
                df.to_parquet(out_file, engine="pyarrow", compression="gzip", index=False)
            return True
        except Exception as e:
            print("Error occured: ", str(e))
            return False

In [9]:
class AthenaClient(Boto3Client):
    
    service_name = "athena"
    
    output_location = f"s3://{Boto3Client.bucket_name}/{Boto3Client.athena_output_location}"
    
    @staticmethod
    def get_athena_query_exec_id(sql: str):
        
        clnt = AthenaClient.get_client()
        response = clnt.start_query_execution(
            QueryString=sql,
            QueryExecutionContext={"Database": AthenaClient.athena_database},
            ResultConfiguration={"OutputLocation": AthenaClient.output_location},
        )
        # response 내의 StatusCode == 200 확인
        return response["QueryExecutionId"]
    
    @staticmethod
    def collect_query_result(query_exec_id: str):
        
        clnt = AthenaClient.get_client()
        
        WAIT = ["QUEUED", "RUNNING"]
        SUCCESS = ["SUCCEEDED"]
        FAILED = ["FAILED", "CANCELLED"]
        
        while True:
            try:
                result = clnt.get_query_execution(QueryExecutionId=query_exec_id)
                status = result["QueryExecution"]["Status"]["State"]

                if status in SUCCESS:
                    query_result_path = f"{AthenaClient.output_location}/{query_exec_id}.csv"
                    query_result_path = query_result_path.replace(f"s3://{AthenaClient.bucket_name}/", "")
                    return S3Client.get_s3_df(query_result_path)

                if status in FAILED:
                    print(f"FAILED!!! -> {status}")
                    break

                if status in WAIT:
                    print(f"Still Running... -> {status}")
                    time.sleep(0.5)
                    continue

                print(f"unexpected status... -> {status}")
                break

            except Exception as e:
                print(str(e))
                break
        return False
    
    @staticmethod
    def get_athena_sql(sql: str):
        query_exec_id = AthenaClient.get_athena_query_exec_id(sample_sql)
        return AthenaClient.collect_query_result(query_exec_id)

# 1. Load Dataset (Amazon Athena)

In [10]:
# sample_sql = 'SELECT * FROM "mini_db"."teacher-hotel" limit 10000;'
sample_sql = 'SELECT * FROM "mini_db"."teacher-hotel";'
df = AthenaClient.get_athena_sql(sample_sql)
df

Still Running... -> QUEUED
Still Running... -> RUNNING
Still Running... -> RUNNING
Still Running... -> RUNNING
Still Running... -> RUNNING
Still Running... -> RUNNING


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,yyyy,mm
0,Resort Hotel,0,0,2015,October,40,1,0,1,3,...,,0,Transient-Party,120.00,0,2,Check-Out,2015-10-02,2015,10
1,Resort Hotel,1,63,2015,October,40,1,0,2,2,...,,0,Transient,68.40,0,2,Canceled,2015-09-08,2015,10
2,Resort Hotel,0,44,2015,October,40,1,0,3,2,...,,0,Transient,118.50,0,2,Check-Out,2015-10-04,2015,10
3,Resort Hotel,0,19,2015,October,40,1,0,3,2,...,,0,Transient,87.00,0,1,Check-Out,2015-10-04,2015,10
4,Resort Hotel,0,57,2015,October,40,1,0,3,1,...,,0,Transient,48.30,1,2,Check-Out,2015-10-04,2015,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114460,City Hotel,0,129,2017,May,22,31,1,4,2,...,,0,Transient,145.80,0,0,Check-Out,2017-06-05,2017,5
114461,City Hotel,0,24,2017,May,22,31,2,5,1,...,,0,Transient,76.29,0,0,Check-Out,2017-06-07,2017,5
114462,City Hotel,0,2,2017,May,22,31,2,6,1,...,,0,Transient,65.00,0,0,Check-Out,2017-06-08,2017,5
114463,City Hotel,0,38,2017,May,22,30,2,7,2,...,,0,Transient,154.44,0,0,Check-Out,2017-06-08,2017,5


In [11]:
df.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date', 'yyyy', 'mm'],
      dtype='object')

# 2. Preprocessing (EDA)

In [12]:
df.describe()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,yyyy,mm
count,114465.0,114465.0,114465.0,114465.0,114465.0,114465.0,114465.0,114465.0,114461.0,114465.0,...,114465.0,114465.0,98417.0,6724.0,114465.0,114465.0,114465.0,114465.0,114465.0,114465.0
mean,0.370489,102.557681,2016.120264,26.915913,15.800778,0.921006,2.480688,1.849797,0.098016,0.007871,...,0.138296,0.216162,86.028186,189.653331,2.42102,99.145541,0.062578,0.555925,2016.120264,6.490202
std,0.482938,106.347854,0.700094,13.837811,8.779636,0.998451,1.906144,0.582562,0.387911,0.097311,...,1.496095,0.644191,110.448592,131.297639,17.962526,48.266533,0.2455,0.782283,0.700094,3.141477
min,0.0,0.0,2015.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,6.0,0.0,-6.38,0.0,0.0,2015.0,1.0
25%,0.0,17.0,2016.0,16.0,8.0,0.0,1.0,2.0,0.0,0.0,...,0.0,0.0,9.0,66.5,0.0,68.0,0.0,0.0,2016.0,4.0
50%,0.0,68.0,2016.0,27.0,16.0,1.0,2.0,2.0,0.0,0.0,...,0.0,0.0,14.0,179.0,0.0,92.0,0.0,0.0,2016.0,6.0
75%,1.0,157.0,2017.0,39.0,23.0,2.0,3.0,2.0,0.0,0.0,...,0.0,0.0,229.0,270.0,0.0,122.0,0.0,1.0,2017.0,9.0
max,1.0,737.0,2017.0,53.0,31.0,19.0,50.0,55.0,10.0,10.0,...,71.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0,2017.0,12.0


In [13]:
SELECTED_COLUMN = [
    "hotel", "lead_time", "stays_in_weekend_nights", "stays_in_week_nights",
    "adults", "children", "babies", "meal", 
    "market_segment", "distribution_channel",
    "is_repeated_guest", "previous_cancellations",
    "previous_bookings_not_canceled", "reserved_room_type",
    "booking_changes", "deposit_type",
    "days_in_waiting_list", "customer_type", 
    "required_car_parking_spaces", "total_of_special_requests", 
]


In [14]:
X_data = df[SELECTED_COLUMN]
X_data

Unnamed: 0,hotel,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests
0,Resort Hotel,0,0,1,3,0.0,0,BB,Online TA,TA/TO,0,0,0,A,0,No Deposit,0,Transient-Party,0,2
1,Resort Hotel,63,0,2,2,0.0,0,BB,Online TA,TA/TO,0,0,0,E,0,No Deposit,0,Transient,0,2
2,Resort Hotel,44,0,3,2,1.0,0,HB,Direct,Direct,0,0,0,D,0,No Deposit,0,Transient,0,2
3,Resort Hotel,19,0,3,2,0.0,0,BB,Online TA,TA/TO,0,0,0,E,0,No Deposit,0,Transient,0,1
4,Resort Hotel,57,0,3,1,0.0,0,BB,Online TA,TA/TO,0,0,0,A,0,No Deposit,0,Transient,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114460,City Hotel,129,1,4,2,0.0,0,BB,Online TA,TA/TO,0,0,0,D,0,No Deposit,0,Transient,0,0
114461,City Hotel,24,2,5,1,0.0,0,BB,Online TA,TA/TO,0,0,0,D,4,No Deposit,0,Transient,0,0
114462,City Hotel,2,2,6,1,0.0,0,BB,Offline TA/TO,TA/TO,0,0,0,A,0,No Deposit,0,Transient,0,0
114463,City Hotel,38,2,7,2,0.0,0,BB,Direct,Direct,0,0,0,A,1,No Deposit,0,Transient,0,0


## Required Columns
* market_segment
* distribution_channel

## !Do not add column named "reservation_status" for training

In [15]:
X_data_cp = X_data.copy()

In [16]:
# CATEGORY_COLUMNS = ["hotel", "meal", "market_segment", "distribution_channel", "reserved_room_type", "deposit_type", "customer_type", "reservation_status"]
CATEGORY_COLUMNS = ["hotel", "meal", "market_segment", "distribution_channel", "reserved_room_type", "deposit_type", "customer_type"]
lbe_dict = dict()

for col in CATEGORY_COLUMNS:
    lbe = LabelEncoder()
    X_data_cp.loc[:, col] = lbe.fit_transform(X_data.loc[:, col])
    X_data_cp[col] = X_data_cp[col].astype(str)
    lbe_dict[col] = lbe

X_data = X_data_cp
X_data

  X_data_cp.loc[:, col] = lbe.fit_transform(X_data.loc[:, col])
  X_data_cp.loc[:, col] = lbe.fit_transform(X_data.loc[:, col])
  X_data_cp.loc[:, col] = lbe.fit_transform(X_data.loc[:, col])
  X_data_cp.loc[:, col] = lbe.fit_transform(X_data.loc[:, col])
  X_data_cp.loc[:, col] = lbe.fit_transform(X_data.loc[:, col])
  X_data_cp.loc[:, col] = lbe.fit_transform(X_data.loc[:, col])
  X_data_cp.loc[:, col] = lbe.fit_transform(X_data.loc[:, col])


Unnamed: 0,hotel,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests
0,1,0,0,1,3,0.0,0,0,6,3,0,0,0,0,0,0,0,3,0,2
1,1,63,0,2,2,0.0,0,0,6,3,0,0,0,4,0,0,0,2,0,2
2,1,44,0,3,2,1.0,0,2,3,1,0,0,0,3,0,0,0,2,0,2
3,1,19,0,3,2,0.0,0,0,6,3,0,0,0,4,0,0,0,2,0,1
4,1,57,0,3,1,0.0,0,0,6,3,0,0,0,0,0,0,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114460,0,129,1,4,2,0.0,0,0,6,3,0,0,0,3,0,0,0,2,0,0
114461,0,24,2,5,1,0.0,0,0,6,3,0,0,0,3,4,0,0,2,0,0
114462,0,2,2,6,1,0.0,0,0,5,3,0,0,0,0,0,0,0,2,0,0
114463,0,38,2,7,2,0.0,0,0,3,1,0,0,0,0,1,0,0,2,0,0


In [17]:
X_data = pd.get_dummies(X_data)

In [18]:
X_data = X_data.replace(np.nan, 0)

In [20]:
scaler = MinMaxScaler()
X_data_scaled = scaler.fit_transform(X_data)
X_data_scaled

array([[0.        , 0.        , 0.02      , ..., 0.        , 0.        ,
        1.        ],
       [0.08548168, 0.        , 0.04      , ..., 0.        , 1.        ,
        0.        ],
       [0.05970149, 0.        , 0.06      , ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.0027137 , 0.10526316, 0.12      , ..., 0.        , 1.        ,
        0.        ],
       [0.05156038, 0.10526316, 0.14      , ..., 0.        , 1.        ,
        0.        ],
       [0.05156038, 0.10526316, 0.12      , ..., 0.        , 1.        ,
        0.        ]])

In [21]:
y_data = df["is_canceled"]
y_data

0         0
1         1
2         0
3         0
4         0
         ..
114460    0
114461    0
114462    0
114463    0
114464    0
Name: is_canceled, Length: 114465, dtype: int64

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X_data_scaled,
    y_data,
    test_size=0.3,
    random_state=42,
    stratify=y_data,
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((80125, 50), (34340, 50), (80125,), (34340,))

# 3. DeepLearning Model Architectrure

In [23]:
tf.__version__

'2.11.0'

In [24]:
model = tf.keras.models.Sequential()

input_layer = tf.keras.Input(shape=X_train.shape[1:], name="InputLayer")

y = tf.keras.layers.Dense(256, activation="relu")(input_layer)
y = tf.keras.layers.Dropout(0.3)(y)

y = tf.keras.layers.Dense(64, activation="relu")(y)
y = tf.keras.layers.Dropout(0.2)(y)

y = tf.keras.layers.Dense(16, activation="relu")(y)
y = tf.keras.layers.Dropout(0.2)(y)

y = tf.keras.layers.Dense(4, activation="relu")(y)
y = tf.keras.layers.Dropout(0.1)(y)

output_layer = tf.keras.layers.Dense(1, activation="sigmoid", name="OutputLayer")(y)

model = tf.keras.Model(inputs=input_layer, outputs=output_layer, name="FuncModel")
model.summary()

Model: "FuncModel"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 InputLayer (InputLayer)     [(None, 50)]              0         
                                                                 
 dense (Dense)               (None, 256)               13056     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                16448     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 16)                1040      
                                                                 
 dropout_2 (Dropout)         (None, 16)                0 

# 4. Train Model

In [25]:
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    mode="max",
    patience=10,
    restore_best_weights=True,
    verbose=1,
)


history = model.fit(
    X_train, 
    y_train,
    epochs=100,
    validation_split=0.25,
    batch_size=256,
    callbacks=[early_stopping],
    verbose=2,
)

Epoch 1/100
235/235 - 2s - loss: 0.5354 - accuracy: 0.7354 - val_loss: 0.4697 - val_accuracy: 0.7677 - 2s/epoch - 10ms/step
Epoch 2/100
235/235 - 1s - loss: 0.4732 - accuracy: 0.7818 - val_loss: 0.4518 - val_accuracy: 0.7886 - 1s/epoch - 5ms/step
Epoch 3/100
235/235 - 1s - loss: 0.4584 - accuracy: 0.7920 - val_loss: 0.4419 - val_accuracy: 0.7931 - 1s/epoch - 5ms/step
Epoch 4/100
235/235 - 1s - loss: 0.4482 - accuracy: 0.7968 - val_loss: 0.4342 - val_accuracy: 0.8019 - 1s/epoch - 5ms/step
Epoch 5/100
235/235 - 1s - loss: 0.4405 - accuracy: 0.7983 - val_loss: 0.4277 - val_accuracy: 0.8018 - 1s/epoch - 5ms/step
Epoch 6/100
235/235 - 1s - loss: 0.4347 - accuracy: 0.8003 - val_loss: 0.4274 - val_accuracy: 0.7985 - 1s/epoch - 5ms/step
Epoch 7/100
235/235 - 1s - loss: 0.4298 - accuracy: 0.8025 - val_loss: 0.4192 - val_accuracy: 0.8034 - 1s/epoch - 5ms/step
Epoch 8/100
235/235 - 1s - loss: 0.4267 - accuracy: 0.8029 - val_loss: 0.4118 - val_accuracy: 0.8069 - 1s/epoch - 5ms/step
Epoch 9/100
235

# 5. Evaluate Model & Hyperparameter Tuning

In [26]:
"%.4f" % model.evaluate(X_test, y_test)[1]



'0.8241'

In [40]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, [1 if elem[0] >= 0.5 else 0 for elem in model.predict(X_test)])



0.7871144094439166

# 6. Inference Model & Upload S3

In [27]:
sample_sql = 'SELECT * FROM "mini_db"."test-test_hotel";'

test_df = AthenaClient.get_athena_sql(sample_sql)
test_df

Still Running... -> QUEUED
Still Running... -> RUNNING


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,1,122,2017,August,31,1,0,1,2,...,No Deposit,240.0,,0,Transient,209.00,0,0,Canceled,2017-04-03
1,Resort Hotel,1,78,2017,August,31,1,0,3,2,...,No Deposit,240.0,,0,Transient,230.00,0,1,Canceled,2017-05-16
2,Resort Hotel,1,67,2017,August,31,1,0,4,1,...,No Deposit,314.0,,0,Transient,188.60,0,1,Canceled,2017-05-28
3,Resort Hotel,1,69,2017,August,31,1,0,4,2,...,No Deposit,242.0,,0,Transient,230.00,0,1,Canceled,2017-07-11
4,Resort Hotel,1,198,2017,August,31,1,0,4,2,...,No Deposit,240.0,,0,Transient,157.31,0,0,Canceled,2017-01-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4920,City Hotel,0,23,2017,August,35,30,2,5,2,...,No Deposit,394.0,,0,Transient,96.14,0,0,Check-Out,2017-09-06
4921,City Hotel,0,102,2017,August,35,31,2,5,3,...,No Deposit,9.0,,0,Transient,225.43,0,2,Check-Out,2017-09-07
4922,City Hotel,0,34,2017,August,35,31,2,5,2,...,No Deposit,9.0,,0,Transient,157.71,0,4,Check-Out,2017-09-07
4923,City Hotel,0,109,2017,August,35,31,2,5,2,...,No Deposit,89.0,,0,Transient,104.40,0,0,Check-Out,2017-09-07


In [28]:
test_data = test_df[SELECTED_COLUMN]
test_data_cp = test_data.copy()

In [29]:
for col in CATEGORY_COLUMNS:
    test_data_cp.loc[:, col] = lbe_dict[col].transform(test_data.loc[:, col])
    test_data_cp[col] = test_data_cp[col].astype(str)

test_data = test_data_cp
test_data = pd.get_dummies(test_data)
test_data = test_data.replace(np.nan, 0)
test_data

  test_data_cp.loc[:, col] = lbe_dict[col].transform(test_data.loc[:, col])
  test_data_cp.loc[:, col] = lbe_dict[col].transform(test_data.loc[:, col])
  test_data_cp.loc[:, col] = lbe_dict[col].transform(test_data.loc[:, col])
  test_data_cp.loc[:, col] = lbe_dict[col].transform(test_data.loc[:, col])
  test_data_cp.loc[:, col] = lbe_dict[col].transform(test_data.loc[:, col])
  test_data_cp.loc[:, col] = lbe_dict[col].transform(test_data.loc[:, col])
  test_data_cp.loc[:, col] = lbe_dict[col].transform(test_data.loc[:, col])


Unnamed: 0,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,...,reserved_room_type_6,reserved_room_type_7,reserved_room_type_9,deposit_type_0,deposit_type_1,deposit_type_2,customer_type_0,customer_type_1,customer_type_2,customer_type_3
0,122,0,1,2,0.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,78,0,3,2,0.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,67,0,4,1,0.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,69,0,4,2,0.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,198,0,4,2,0.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4920,23,2,5,2,0.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4921,102,2,5,3,0.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4922,34,2,5,2,0.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4923,109,2,5,2,0.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [30]:
test_data_scaled = scaler.transform(test_data)
test_data_scaled

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- distribution_channel_4
- market_segment_7
- reserved_room_type_8


In [31]:
test_data["distribution_channel_4"] = 0
test_data["market_segment_7"] = 0
test_data["reserved_room_type_8"] = 0

test_data_scaled = scaler.transform(test_data)
test_data_scaled

ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.


In [32]:
col_list = list(test_data.columns[:-3])
adj_col_list = None 

for i in range(len(col_list)):
    if col_list[i] == "distribution_channel_3":
        adj_col_list = col_list[:i+1] + ["distribution_channel_4"] + col_list[i+1:]
        break
        
col_list = adj_col_list[:]
for i in range(len(col_list)):
    if col_list[i] == "market_segment_6":
        adj_col_list = col_list[:i+1] + ["market_segment_7"] + col_list[i+1:]
        break
        
col_list = adj_col_list[:]
for i in range(len(col_list)):
    if col_list[i] == "reserved_room_type_7":
        adj_col_list = col_list[:i+1] + ["reserved_room_type_8"] + col_list[i+1:]
        break
        
        
test_data.columns = adj_col_list
test_data

Unnamed: 0,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,...,reserved_room_type_7,reserved_room_type_8,reserved_room_type_9,deposit_type_0,deposit_type_1,deposit_type_2,customer_type_0,customer_type_1,customer_type_2,customer_type_3
0,122,0,1,2,0.0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
1,78,0,3,2,0.0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,67,0,4,1,0.0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
3,69,0,4,2,0.0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4,198,0,4,2,0.0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4920,23,2,5,2,0.0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4921,102,2,5,3,0.0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4922,34,2,5,2,0.0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4923,109,2,5,2,0.0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [33]:
test_data_scaled = scaler.transform(test_data)
test_data_scaled

array([[0.16553596, 0.        , 0.02      , ..., 0.        , 0.        ,
        0.        ],
       [0.10583446, 0.        , 0.06      , ..., 0.        , 0.        ,
        0.        ],
       [0.09090909, 0.        , 0.08      , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.04613297, 0.10526316, 0.1       , ..., 0.        , 0.        ,
        0.        ],
       [0.14789688, 0.10526316, 0.1       , ..., 0.        , 0.        ,
        0.        ],
       [0.27815468, 0.10526316, 0.14      , ..., 0.        , 0.        ,
        0.        ]])

In [34]:
predicted = model.predict(test_data_scaled)
predicted



array([[0.99597925],
       [0.0819718 ],
       [0.05791949],
       ...,
       [0.0032025 ],
       [0.99136275],
       [0.99975187]], dtype=float32)

In [35]:
cnt = 0
fin_result = [1 if y_pred[0] >= 0.5 else 0 for y_pred in predicted]
for y_pred, y_true in zip(fin_result, test_df["is_canceled"]):
    if y_true == y_pred:
        cnt += 1

cnt / len(predicted)

0.6229441624365483

In [39]:
from sklearn.metrics import roc_auc_score

roc_auc_score(test_df["is_canceled"], fin_result)

0.61890589067125

In [36]:
X_data.columns

Index(['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights',
       'adults', 'children', 'babies', 'is_repeated_guest',
       'previous_cancellations', 'previous_bookings_not_canceled',
       'booking_changes', 'days_in_waiting_list',
       'required_car_parking_spaces', 'total_of_special_requests', 'hotel_0',
       'hotel_1', 'meal_0', 'meal_1', 'meal_2', 'meal_3', 'meal_4',
       'market_segment_0', 'market_segment_1', 'market_segment_2',
       'market_segment_3', 'market_segment_4', 'market_segment_5',
       'market_segment_6', 'market_segment_7', 'distribution_channel_0',
       'distribution_channel_1', 'distribution_channel_2',
       'distribution_channel_3', 'distribution_channel_4',
       'reserved_room_type_0', 'reserved_room_type_1', 'reserved_room_type_2',
       'reserved_room_type_3', 'reserved_room_type_4', 'reserved_room_type_5',
       'reserved_room_type_6', 'reserved_room_type_7', 'reserved_room_type_8',
       'reserved_room_type_9', 'deposit_type

In [37]:
for prev, aft in zip(X_data.columns, test_data.columns):
    if prev == aft: continue
    print(prev, aft)

In [38]:
test_data.columns

Index(['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights',
       'adults', 'children', 'babies', 'is_repeated_guest',
       'previous_cancellations', 'previous_bookings_not_canceled',
       'booking_changes', 'days_in_waiting_list',
       'required_car_parking_spaces', 'total_of_special_requests', 'hotel_0',
       'hotel_1', 'meal_0', 'meal_1', 'meal_2', 'meal_3', 'meal_4',
       'market_segment_0', 'market_segment_1', 'market_segment_2',
       'market_segment_3', 'market_segment_4', 'market_segment_5',
       'market_segment_6', 'market_segment_7', 'distribution_channel_0',
       'distribution_channel_1', 'distribution_channel_2',
       'distribution_channel_3', 'distribution_channel_4',
       'reserved_room_type_0', 'reserved_room_type_1', 'reserved_room_type_2',
       'reserved_room_type_3', 'reserved_room_type_4', 'reserved_room_type_5',
       'reserved_room_type_6', 'reserved_room_type_7', 'reserved_room_type_8',
       'reserved_room_type_9', 'deposit_type