# Importing Libraries

In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline

In [None]:
# Load the training split; the first CSV column becomes the row index.
train_data_path = 'train_data_evaluation_part_2.csv'
train = pd.read_csv(train_data_path, index_col=0)

# Load the test split the same way.
# NOTE(review): this file name has "part2" where the train file has "part_2" —
# confirm both paths are spelled as intended.
test_data_path = 'test_data_evaluation_part2.csv'
test = pd.read_csv(test_data_path, index_col=0)

# Stack both splits into one frame so they are preprocessed together.
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0).
# ignore_index=True assigns a fresh 0..n-1 index, so there is no leftover
# 'index' column to drop afterwards; only the 'ID' column is removed.
combined = pd.concat([train, test], ignore_index=True).drop(columns=['ID'])

In [None]:
# Show the column labels of the raw training frame.
train.columns

Index(['ID', 'Nationality', 'Age', 'DaysSinceCreation', 'AverageLeadTime',
       'LodgingRevenue', 'OtherRevenue', 'BookingsCanceled',
       'BookingsNoShowed', 'BookingsCheckedIn', 'PersonsNights', 'RoomNights',
       'DaysSinceLastStay', 'DaysSinceFirstStay', 'DistributionChannel',
       'MarketSegment', 'SRHighFloor', 'SRLowFloor', 'SRAccessibleRoom',
       'SRMediumFloor', 'SRBathtub', 'SRShower', 'SRCrib', 'SRKingSizeBed',
       'SRTwinBed', 'SRNearElevator', 'SRAwayFromElevator',
       'SRNoAlcoholInMiniBar', 'SRQuietRoom'],
      dtype='object')

In [None]:
# Count/frequency encoding of the categorical Nationality column: every label
# is replaced by the number of rows in `combined` that carry that label.
#
# value_counts() skips missing values, so a missing Nationality has no entry in
# the map and stays NaN after .map().
# NOTE: the counts are taken over train and test rows together.
#
# The dtype guard keeps the cell idempotent: once the column holds counts
# (numeric), re-running the cell leaves it untouched instead of encoding the
# counts a second time.
if not pd.api.types.is_numeric_dtype(combined.Nationality):
    # Dictionary mapping each Nationality label to its row count.
    combined_frequency_map = combined.Nationality.value_counts().to_dict()

    # Replace every Nationality label in `combined` by its count.
    combined.Nationality = combined.Nationality.map(combined_frequency_map)


In [None]:
# Eliminate outliers: keep only the rows whose revenue, age and lead-time
# values all fall inside the accepted ranges. A comparison against NaN is
# False, so rows with a missing value in any of these columns are removed too.
within_limits = (
    (combined['LodgingRevenue'] < 10000)
    & (combined['OtherRevenue'] < 3000)
    & (combined['Age'] >= 0)
    & (combined['Age'] < 100)
    & (combined['AverageLeadTime'] >= 0)
    & (combined['AverageLeadTime'] < 500)
)
combined = combined[within_limits]

In [None]:
# Step 1 -> train/test split.
# BookingsCheckedIn is the target; every other column of `combined` is a
# feature. 10% of the rows are held out, with a fixed seed for repeatability.
features = combined.drop(columns=['BookingsCheckedIn'])
target = combined['BookingsCheckedIn']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Nationality,Age,DaysSinceCreation,AverageLeadTime,LodgingRevenue,OtherRevenue,BookingsCanceled,BookingsNoShowed,PersonsNights,RoomNights,...,SRMediumFloor,SRBathtub,SRShower,SRCrib,SRKingSizeBed,SRTwinBed,SRNearElevator,SRAwayFromElevator,SRNoAlcoholInMiniBar,SRQuietRoom
55053,1524,76.0,239,320,452.0,68.0,0,0,8,4,...,0,0,0,0,0,1,0,0,0,0
2693,12418,43.0,1026,14,79.0,9.0,0,0,1,1,...,0,0,0,0,1,0,0,0,0,0
78203,2725,50.0,63,0,0.0,0.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
66729,10232,41.0,147,0,0.0,0.0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
41582,12418,34.0,401,17,393.15,82.15,0,0,15,5,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Peek at five target values. The fixed seed makes the displayed sample the
# same on every Restart & Run All instead of changing from run to run.
y_train.sample(5, random_state=42)

48770    0
56769    0
63806    1
67766    1
81141    1
Name: BookingsCheckedIn, dtype: int64

In [None]:
# Number of distinct values per feature column in the training split.
X_train.nunique()

Nationality               87
Age                       94
DaysSinceCreation       1095
AverageLeadTime          401
LodgingRevenue          9434
OtherRevenue            4133
BookingsCanceled           6
BookingsNoShowed           4
PersonsNights             52
RoomNights                44
DaysSinceLastStay       1104
DaysSinceFirstStay      1105
DistributionChannel        4
MarketSegment              7
SRHighFloor                2
SRLowFloor                 2
SRAccessibleRoom           2
SRMediumFloor              2
SRBathtub                  2
SRShower                   2
SRCrib                     2
SRKingSizeBed              2
SRTwinBed                  2
SRNearElevator             2
SRAwayFromElevator         2
SRNoAlcoholInMiniBar       2
SRQuietRoom                2
dtype: int64

In [None]:
# Distinct values taken by the SRMediumFloor column in the raw training frame.
train['SRMediumFloor'].unique()

array([0, 1])

In [None]:
# Imputation step: fill missing values in positional column 1 (Age, in
# X_train's column order) using SimpleImputer's default strategy (mean).
# All other columns pass through unchanged. ColumnTransformer emits the
# transformed column first and appends the passthrough columns after it.
age_imputer = SimpleImputer()
trf1 = ColumnTransformer(
    transformers=[('impute_age', age_imputer, [1])],
    remainder='passthrough',
)

In [None]:
# One-hot encoding of positional columns 12 and 13 of the previous step's
# output (DistributionChannel and MarketSegment in X_train's column order).
# Categories unseen during fit are encoded as all zeros (handle_unknown='ignore');
# the remaining columns pass through unchanged.
#
# Dense output is requested in a version-tolerant way: scikit-learn 1.2 renamed
# the `sparse` keyword to `sparse_output` and 1.4 removed the old spelling,
# while releases before 1.2 only accept `sparse`.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # Older scikit-learn: `sparse_output` is an unknown keyword there.
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

trf2 = ColumnTransformer([
    ('ohe', ohe, [12, 13])
], remainder='passthrough')

In [None]:
# Scaling: rescale every column produced by the previous step to [0, 1].
#
# The scaler is applied to the whole matrix directly rather than through a
# ColumnTransformer over slice(0, 36). That hard-coded width had no
# remainder='passthrough', so any column beyond index 35 (for example after a
# new category widens the one-hot block) would have been dropped silently.
# For the current 36-column matrix the result is the same.
trf3 = MinMaxScaler()

In [None]:
# Final estimator of the pipeline.
#
# The notebook evaluates this model with accuracy (accuracy_score and
# scoring='accuracy'), which is a classification metric. A regressor can emit
# non-integer predictions, and accuracy_score then raises on the mix of
# discrete targets and continuous predictions. A classifier always predicts
# one of the labels seen in y_train, so the metric is well defined.
from sklearn.tree import DecisionTreeClassifier
trf4 = DecisionTreeClassifier(random_state=0)

In [None]:
# Build the pipeline with explicit step names. Steps run in list order:
# imputation -> one-hot encoding -> scaling -> model.
pipeline_steps = [
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4),
]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Alternate syntax: make_pipeline chains the same four steps but names each one
# automatically after its class instead of taking explicit names.
# NOTE: this assignment rebinds `pipe`, so when the notebook runs top to bottom
# the later cells use this auto-named pipeline.
pipe = make_pipeline(trf1,trf2,trf3,trf4)

In [None]:
# Ask scikit-learn to display estimators as diagrams in notebook output.
from sklearn import set_config
set_config(display='diagram')

In [None]:
# Fit every preprocessing step and the final model on the training split.
pipe.fit(X_train,y_train)

In [None]:
# Explore the Pipeline: mapping of step name -> step object.
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [1])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [12, 13])]),
 'columntransformer-3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 36, None))]),
 'decisiontreeregressor': DecisionTreeRegressor(random_state=0)}

In [None]:
# Run the held-out features through the fitted pipeline to get predictions.
y_pred = pipe.predict(X_test)

In [None]:
# Fraction of held-out rows whose prediction equals the true target exactly.
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9778588501698326

In [None]:
# Cross Validation using Pipeline
# cross validation using cross_val_score: 5 folds over the training split,
# scored with accuracy, reported as the mean across folds.
# NOTE: the Nationality count encoding was applied to `combined` before the
# train/test split, outside the pipeline, so it is not re-fit per fold.
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.9782070771302227

In [None]:
# Export the pipeline object to 'pipe.pkl'.
# The context manager closes (and flushes) the file handle even if pickling
# raises; the bare open() call used before never closed it explicitly.
# NOTE: loading a pickle can execute arbitrary code — only unpickle this file
# from a trusted source.
import pickle

with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)