In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler #you can use minmax scaler too
#Import other necessary model libraries, for this example, using Logistic Regression

In [4]:
train_data = pd.read_csv(r"../Data/Corrected Datasets/train-dataset.csv", header = 0)
test_data = pd.read_csv(r"../Data/Corrected Datasets/test-dataset.csv", header = 0)
validation_data = pd.read_csv(r"../Data/Corrected Datasets/validation-dataset.csv", header = 0)

In [23]:
data_quan_cols = ['Age','Discount_Rate', 'Room_Rate','Expected_stay_days', 
                        'Reservation_gap', 'Adults', 'Children','Babies']

In [24]:
data_cat_cols = ['Gender', 'Ethnicity', 'Educational_Level', 'Income', 'Country_region',
                 'Hotel_Type', 'Meal_Type', 'Visted_Previously',
                 'Previous_Cancellations', 'Deposit_type', 'Booking_channel',
                 'Required_Car_Parking', 'Use_Promotion']

In [25]:
quan_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])

quan_transformed = quan_pipeline.fit_transform(train_data[data_quan_cols])

In [48]:
data_pipeline = ColumnTransformer([
    ('numerical', quan_pipeline, data_quan_cols),
    ('categorical', OneHotEncoder(), data_cat_cols),
    
])

train_data_processed = data_pipeline.fit_transform(train_data)

In [49]:
validation_data_processed = data_pipeline.transform(validation_data)

In [29]:
type(train_data_processed)

numpy.ndarray

In [30]:
le = preprocessing.LabelEncoder()

In [31]:
target_train = train_data['Reservation_Status']
le.fit(target_train)

LabelEncoder()

In [32]:
target_validation = validation_data['Reservation_Status']

In [33]:
print(le.classes_)

['Canceled' 'Check-In' 'No-Show']


In [34]:
y_train = le.transform(target_train)
y_valid = le.transform(target_validation)

### Over sampling method to deal with Imbalanced data

#### 1. SMOTE (default sampling_strategy)
#### 2. SMOTE (sampling_strategy)

In [35]:
print(imblearn.__version__)

0.8.0


#### SMOTE

In [87]:
# default SMOTE
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

smote = SMOTE()

X_smote, y_smote = smote.fit_resample(train_data_processed, y_train)
print(Counter(y_train), Counter(y_smote))

model = RandomForestClassifier()
model.fit(X_smote, y_smote)
y_pred = model.predict(validation_data_processed)
print(classification_report(y_valid, y_pred))

Counter({1: 21240, 0: 4134, 2: 2125}) Counter({1: 21240, 0: 21240, 2: 21240})
              precision    recall  f1-score   support

           0       0.35      0.02      0.05       741
           1       0.59      0.99      0.74      1610
           2       0.11      0.00      0.00       398

    accuracy                           0.58      2749
   macro avg       0.35      0.34      0.26      2749
weighted avg       0.45      0.58      0.45      2749



In [88]:
from imblearn.over_sampling import SMOTE
#0: Cancelled, 1: Check--in , 2:No Show

#Oversampling
strategy = {0:4134*5, 1:21240, 2:2125*10}
smote = SMOTE(sampling_strategy=strategy) 

X_smote, y_smote = smote.fit_resample(train_data_processed, y_train)
print(Counter(y_train), Counter(y_smote))

model =RandomForestClassifier()
model.fit(X_smote, y_smote)

y_pred = model.predict(validation_data_processed)
print(classification_report(y_valid, y_pred))



Counter({1: 21240, 0: 4134, 2: 2125}) Counter({2: 21250, 1: 21240, 0: 20670})
              precision    recall  f1-score   support

           0       0.40      0.03      0.06       741
           1       0.59      0.99      0.74      1610
           2       0.20      0.00      0.00       398

    accuracy                           0.59      2749
   macro avg       0.40      0.34      0.27      2749
weighted avg       0.48      0.59      0.45      2749



### Undersampling
https://machinelearningmastery.com/undersampling-algorithms-for-imbalanced-classification/

#### 1. Near Miss - version 3 (Select Examples to keep from Majority class)
#### 2. Tomek Links (Deletes Examples from Majority class)

In [89]:
#NearMiss
from imblearn.under_sampling import NearMiss

strategy = {1: round(21240*0.5), 0: 4134, 2: 2125}
undersample = NearMiss(sampling_strategy=strategy,version=3, n_neighbors=2)
X_near , y_near = undersample.fit_resample(train_data_processed,y_train)

model = RandomForestClassifier().fit(X_near,y_near)
y_pred= model.predict(validation_data_processed)

print(classification_report(y_valid, y_pred))



              precision    recall  f1-score   support

           0       0.30      0.18      0.23       741
           1       0.59      0.82      0.69      1610
           2       0.17      0.03      0.04       398

    accuracy                           0.53      2749
   macro avg       0.35      0.34      0.32      2749
weighted avg       0.45      0.53      0.47      2749



In [91]:
# TomekLinks imbalanced-learn class
from imblearn.under_sampling import TomekLinks

#strategy = {1: round(21240*0.5), 0: 4134, 2: 2125}
undersample = TomekLinks()
X_tom , y_tom = undersample.fit_resample(train_data_processed,y_train)
model = RandomForestClassifier().fit(X_tom,y_tom)
y_pred= model.predict(validation_data_processed)

print(classification_report(y_valid, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       741
           1       0.59      1.00      0.74      1610
           2       0.00      0.00      0.00       398

    accuracy                           0.58      2749
   macro avg       0.20      0.33      0.25      2749
weighted avg       0.34      0.58      0.43      2749



### Random Oversampling and Undersampling Method

In [55]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

over = RandomOverSampler(sampling_strategy = 'auto')
X_over, y_over = over.fit_resample(train_data_processed,y_train)
under = RandomUnderSampler(sampling_strategy='auto')
X_under, y_under = under.fit_resample(X_over, y_over)
print(Counter(y_train), Counter(y_under))

clf_under = RandomForestClassifier().fit(X_under, y_under)
print(classification_report(y_valid,
                     clf_under.predict(validation_data_processed)))

Counter({1: 21240, 0: 4134, 2: 2125}) Counter({0: 21240, 1: 21240, 2: 21240})
              precision    recall  f1-score   support

           0       0.32      0.03      0.05       741
           1       0.59      0.98      0.73      1610
           2       0.00      0.00      0.00       398

    accuracy                           0.58      2749
   macro avg       0.30      0.33      0.26      2749
weighted avg       0.43      0.58      0.44      2749



### Ensemble Method to deal with Imbalanced Data
#### Balanced Bagging Classifier

In [92]:
from imblearn.ensemble import BalancedBaggingClassifier

model = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                  sampling_strategy='auto',
                                  replacement=False,
                                  random_state=0
                                 ).fit(train_data_processed, y_train)

print(classification_report(y_valid,
                            model.predict(validation_data_processed)))


NameError: name 'DecisionTreeClassifier' is not defined

In [67]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(class_weight='balanced_subsample').fit(train_data_processed, y_train)
print(classification_report(y_valid,
                            model.predict(validation_data_processed)))


              precision    recall  f1-score   support

           0       0.50      0.00      0.00       741
           1       0.59      1.00      0.74      1610
           2       0.00      0.00      0.00       398

    accuracy                           0.59      2749
   macro avg       0.36      0.33      0.25      2749
weighted avg       0.48      0.59      0.43      2749



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
