<a href="https://colab.research.google.com/github/SovetovAleksey/ML_in_business/blob/course_project/PipeLine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install catboost

In [24]:
import warnings
from pathlib import Path

import dill
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion

warnings.filterwarnings('ignore')

In [26]:
# Read the raw reservations table (the path is relative to the working
# directory) and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='Hotel Reservations.csv')
data.head(n=5)

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


In [27]:
# Summarise column dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Booking_ID                            36275 non-null  object 
 1   no_of_adults                          36275 non-null  int64  
 2   no_of_children                        36275 non-null  int64  
 3   no_of_weekend_nights                  36275 non-null  int64  
 4   no_of_week_nights                     36275 non-null  int64  
 5   type_of_meal_plan                     36275 non-null  object 
 6   required_car_parking_space            36275 non-null  int64  
 7   room_type_reserved                    36275 non-null  object 
 8   lead_time                             36275 non-null  int64  
 9   arrival_year                          36275 non-null  int64  
 10  arrival_month                         36275 non-null  int64  
 11  arrival_date   

In [42]:
# Encode the target as integers: 'Not_Canceled' -> 1, 'Canceled' -> 0.
status_codes = {'Not_Canceled': 1, 'Canceled': 0}

# Guard against re-running the cell: mapping an already-encoded column would
# silently turn every label into NaN, because 0/1 are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(data['booking_status']):
    encoded_status = data['booking_status'].map(status_codes)
    # Any value missing from `status_codes` comes back as NaN; fail here
    # rather than hand a target with holes to the split / classifier.
    if encoded_status.isna().any():
        raise ValueError("booking_status contains values other than "
                         "'Not_Canceled' / 'Canceled'")
    data['booking_status'] = encoded_status

In [43]:
# Column groups; each group is routed through its own preprocessing steps.

# Columns imputed with the median and passed through as numbers.
num_features = [
    'no_of_adults',
    'no_of_children',
    'no_of_weekend_nights',
    'no_of_week_nights',
    'required_car_parking_space',
    'lead_time',
    'arrival_year',
    'arrival_month',
    'arrival_date',
    'repeated_guest',
    'no_of_previous_cancellations',
    'no_of_previous_bookings_not_canceled',
    'avg_price_per_room',
    'no_of_special_requests',
]

# Text columns that are fully one-hot encoded.
cat_features = [
    'type_of_meal_plan',
    'room_type_reserved',
]

# Text columns reduced to a single indicator column.
# NOTE(review): only the first dummy column is kept for these features. If
# 'market_segment_type' has more than two distinct values, the remaining
# levels are collapsed together — confirm this is intended.
bin_features = [
    'market_segment_type',
]

In [44]:
# Hold out 30% of the rows for evaluation, stratified on the target so both
# splits keep the same class balance. The target and `Booking_ID` are removed
# from the feature frame.
# The columns are dropped by keyword: passing `axis` positionally to
# DataFrame.drop was deprecated and is rejected by pandas 2.x.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['booking_status', 'Booking_ID']),
    data['booking_status'],
    stratify=data['booking_status'],
    test_size=0.3,
    random_state=1,
)

In [45]:
# Peek at the first three rows of the training features.
X_train.head(3)

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests
27030,2,0,1,3,Meal Plan 2,0,Room_Type 1,178,2018,10,24,Offline,0,0,0,106.25,1
15882,2,0,0,3,Not Selected,0,Room_Type 1,209,2018,10,5,Online,0,0,0,98.1,1
29112,2,0,1,2,Meal Plan 1,0,Room_Type 1,121,2018,6,24,Offline,0,0,0,90.0,0


In [46]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks one item out of ``X`` by label.

    The lookup is ``X[column]`` with a bare label, so for a DataFrame and a
    single column name the result is one-dimensional (a Series).
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``; ``y`` is accepted but ignored."""
        selected = X[self.column]
        return selected
    

class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps a single column of ``X``.

    The column is requested as a one-element list (``X[[key]]``), so for a
    DataFrame the result stays two-dimensional (a one-column frame).
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the column named by ``self.key``."""
        wanted = [self.key]
        return X[wanted]
    
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """One-hot encoder built on ``pd.get_dummies`` with a frozen column set.

    ``fit`` records the dummy column names produced for the fitting data;
    ``transform`` always returns exactly those columns, in that order.
    """

    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        """Remember the dummy column names produced for ``X``."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """Encode ``X`` and align the result to the columns seen in ``fit``.

        Columns recorded in ``fit`` but absent from this batch are added and
        filled with 0; columns that were not recorded in ``fit`` are dropped.
        """
        dummies = pd.get_dummies(X, prefix=self.key)
        return dummies.reindex(columns=self.columns, fill_value=0)
    
    
class OHEEncoderBin(BaseEstimator, TransformerMixin):
    """Encode a column as a single indicator column via ``pd.get_dummies``.

    ``fit`` records the dummy column names produced for the fitting data;
    ``transform`` rebuilds that layout and keeps only the first recorded
    column, returned as a one-column DataFrame.
    """

    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        """Remember the dummy column names produced for ``X``."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """Return the first fitted dummy column of ``X`` as a DataFrame.

        Columns recorded in ``fit`` but absent from this batch are filled
        with 0 before the first column is taken.
        """
        dummies = pd.get_dummies(X, prefix=self.key)
        aligned = dummies.reindex(columns=self.columns, fill_value=0)
        # A list indexer keeps the result two-dimensional.
        return aligned.iloc[:, [0]]
    
    
class TextImputer(BaseEstimator, TransformerMixin):
    """Fill missing values of one column with its most frequent value.

    Parameters
    ----------
    key : label
        Name of the column to impute.

    Attributes
    ----------
    mode : object or None
        Most frequent value of the column in the data passed to ``fit``;
        ``None`` before fitting or when that column had no non-null value.
    """

    def __init__(self, key):
        self.key = key
        self.mode = None

    def fit(self, X, y=None):
        """Learn the most frequent value of ``X[self.key]``."""
        modes = X[self.key].mode()
        # mode() is empty when the column holds no non-null value; keep None
        # in that case instead of failing on the first-element lookup.
        self.mode = modes.iloc[0] if len(modes) else None
        return self

    def transform(self, X):
        """Return a copy of ``X`` with gaps in ``self.key`` filled.

        The input frame is left untouched: writing into it would modify the
        caller's data (for example the train/test frames) as a side effect.
        """
        X = X.copy()
        if self.mode is not None:
            X[self.key] = X[self.key].fillna(self.mode)
        return X
        
        
class NumImputer(BaseEstimator, TransformerMixin):
    """Fill missing values of one numeric column with its median.

    Parameters
    ----------
    key : label
        Name of the column to impute.

    Attributes
    ----------
    median : float or None
        Median of the column in the data passed to ``fit``; ``None`` before
        fitting.
    """

    def __init__(self, key):
        self.key = key
        self.median = None

    def fit(self, X, y=None):
        """Learn the median of ``X[self.key]``."""
        self.median = X[self.key].median()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with gaps in ``self.key`` filled.

        The input frame is left untouched: writing into it would modify the
        caller's data (for example the train/test frames) as a side effect.
        """
        X = X.copy()
        X[self.key] = X[self.key].fillna(self.median)
        return X

In [47]:
# One (name, Pipeline) entry per feature; the FeatureUnion below applies all
# of them to the same input and concatenates their outputs.
final_transformers = []

# Categorical columns: fill gaps with the mode, pick the column, one-hot encode.
for cat_feature in cat_features:
    cat_steps = [
        ('imputer', TextImputer(key=cat_feature)),
        ('selector', FeatureSelector(column=cat_feature)),
        ('ohe', OHEEncoder(key=cat_feature)),
    ]
    cat_transformer = Pipeline(cat_steps)
    final_transformers.append((cat_feature, cat_transformer))

# Numeric columns: fill gaps with the median, keep the column as a frame.
for num_feature in num_features:
    num_steps = [
        ('imputer', NumImputer(key=num_feature)),
        ('selector', NumberSelector(key=num_feature)),
    ]
    num_transformer = Pipeline(num_steps)
    final_transformers.append((num_feature, num_transformer))

# "Binary" columns: same as categorical, but reduced to one indicator column.
for bin_feature in bin_features:
    bin_steps = [
        ('imputer', TextImputer(key=bin_feature)),
        ('selector', FeatureSelector(column=bin_feature)),
        ('ohebin', OHEEncoderBin(key=bin_feature)),
    ]
    bin_transformer = Pipeline(bin_steps)
    final_transformers.append((bin_feature, bin_transformer))

feats = FeatureUnion(final_transformers)

# Full model: feature assembly followed by a CatBoost classifier.
classifier = CatBoostClassifier(
    depth=3,
    iterations=500,
    learning_rate=0.11,
    random_state=1,
    verbose=False,
)
pipeline = Pipeline([
    ('features', feats),
    ('classifier', classifier),
])

In [48]:
# Fit on the training split, then score the hold-out split with the
# probability from the second predict_proba column (label 1 under the 0/1
# target encoding used in this notebook).
pipeline.fit(X_train, y_train)

preds = pipeline.predict_proba(X_test)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_test, preds)

# Points where precision + recall == 0 would give 0/0 = NaN, and np.argmax
# returns the position of a NaN; score those points as 0 instead.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)

# precision/recall have one more entry than thresholds (the final point has
# no threshold), so only the first len(thresholds) points are candidates.
ix = int(np.argmax(fscore[:len(thresholds)]))

# precision_recall_curve counts a sample as positive when score >= threshold,
# so the confusion matrix uses the same inclusive comparison to stay
# consistent with the precision/recall printed below.
cnf_matrix = confusion_matrix(y_test, preds >= thresholds[ix])

print('Best Threshold=%.3f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix],
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.484, F-Score=0.904, Precision=0.867, Recall=0.945


In [49]:
# Output locations are built with pathlib instead of backslash literals:
# 'models\c...', 'data\X...' and 'data\y...' contain invalid escape sequences
# and only act as directory separators on Windows.
MODELS_DIR = Path('models')
DATA_DIR = Path('data')

# Create the target directories up front so the writes below do not fail on
# a fresh checkout / fresh runtime.
for out_dir in (MODELS_DIR, DATA_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Serialise the fitted pipeline (including the custom transformer classes).
with open(MODELS_DIR / 'catboost_class_pipeline.dill', 'wb') as f:
    dill.dump(pipeline, f)

# Persist the exact splits used for training / evaluation, without the index.
X_train.to_csv(DATA_DIR / 'X_train.csv', index=False)
X_test.to_csv(DATA_DIR / 'X_test.csv', index=False)
y_train.to_csv(DATA_DIR / 'y_train.csv', index=False)
y_test.to_csv(DATA_DIR / 'y_test.csv', index=False)