# INTRO DMDW Final Project

## Data preparation

### Import and clean data

In [None]:
import torch
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.init as init
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the reservations CSV, then remove the `Booking_ID` column and every
# row that contains a missing value.
csv_path = '/Users/ponynie/Developer/Python_Code/IntroDMLabChula/Final_Project/Hotel Reservations.csv'
data = pd.read_csv(csv_path)
data = data.drop(['Booking_ID'], axis=1).dropna()
data.head(5)

### Separate categorical columns, numerical columns, and label

In [None]:
# Columns that are cast to pandas `category` dtype below.
categorical_columns = [
    'type_of_meal_plan',
    'room_type_reserved',
    'arrival_year',
    'market_segment_type',
    'required_car_parking_space',
]
# Columns that are used with their raw numeric values.
numerical_columns = [
    'no_of_adults',
    'no_of_children',
    'no_of_weekend_nights',
    'no_of_week_nights',
    'lead_time',
    'arrival_month',
    'arrival_date',
    'repeated_guest',
    'no_of_previous_cancellations',
    'no_of_previous_bookings_not_canceled',
    'avg_price_per_room',
    'no_of_special_requests',
]
# Target column.
label_column = 'booking_status'

# Cast every categorical feature, and then the label, to `category` dtype.
for column in [*categorical_columns, label_column]:
    data[column] = data[column].astype('category')

# Show the category values found for each feature and for the label.
for column in categorical_columns:
    print(data[column].cat.categories, column)
print(data[label_column].cat.categories, "Label")

### Encode categorical columns as integer codes and convert to a Tensor

In [None]:
# Collect the integer category codes of each categorical column, one array
# per column.
categorical_np = []
for column in categorical_columns:
    categorical_np.append(data[column].cat.codes.values)

# Stack the per-column arrays side by side (one column per feature) and turn
# the result into an int64 tensor.
categorical_data = torch.tensor(np.stack(categorical_np, axis=1), dtype=torch.int64)
categorical_data[:10]

### Convert numerical columns to Tensor

In [None]:
# Gather the raw values of each numerical column, stack them side by side
# (one column per feature), and convert the result to a float32 tensor.
numerical_np = [data[column].values for column in numerical_columns]
numerical_data = torch.tensor(np.stack(numerical_np, axis=1), dtype=torch.float32)
numerical_data[:10]

### Convert label to Tensor

In [None]:
# Encode the label as integer class indices.
# `cat.codes` uses the smallest integer dtype that fits the number of
# categories, so the dtype is set to int64 explicitly: that is the type
# `nn.CrossEntropyLoss` expects for class-index targets.
label_codes = data[label_column].cat.codes.values
outputs = torch.tensor(label_codes, dtype=torch.int64).flatten()
outputs[:10]

### Check correctness of dimension

In [None]:
# Display the shapes of the three tensors next to each other so their row
# counts can be compared.
tuple(tensor.shape for tensor in (categorical_data, numerical_data, outputs))

### Embed categorical columns for better performance

In [None]:
# Number of distinct categories in each categorical column.
categorical_column_sizes = []
for column in categorical_columns:
    categorical_column_sizes.append(len(data[column].cat.categories))

# One (num_categories, embedding_dim) pair per column; the embedding width is
# half the category count rounded up, capped at 50.
categorical_embedding_sizes = []
for cardinality in categorical_column_sizes:
    embedding_dim = min(50, (cardinality + 1) // 2)
    categorical_embedding_sizes.append((cardinality, embedding_dim))
print(categorical_embedding_sizes)

### Split into train and test datasets

In [None]:
# Row counts for the split: the last 20% of the rows form the test set and
# everything before them forms the training set.
# NOTE(review): rows are split in file order without shuffling - confirm the
# CSV is not sorted in a way that biases the test set.
total_records = len(data)
test_records = int(total_records * .2)
train_records = total_records - test_records

categorical_train_data, categorical_test_data = (
    categorical_data[:train_records],
    categorical_data[train_records:],
)
numerical_train_data, numerical_test_data = (
    numerical_data[:train_records],
    numerical_data[train_records:],
)
train_label, test_label = outputs[:train_records], outputs[train_records:]

# Print the train/test shapes of each tensor pair.
for train_part, test_part in (
    (categorical_train_data, categorical_test_data),
    (numerical_train_data, numerical_test_data),
    (train_label, test_label),
):
    print(train_part.shape, test_part.shape)

### Define model and network topology

In [None]:
class Model(nn.Module):
    """Feed-forward classifier over embedded categorical and numerical features.

    Every categorical column gets its own ``nn.Embedding``. The embeddings are
    concatenated and passed through dropout, the numerical columns are
    batch-normalized, and both parts are concatenated and fed through a stack
    of ``Linear -> ReLU -> BatchNorm1d -> Dropout`` blocks followed by a final
    ``Linear`` layer that emits one raw score per class (no softmax applied).
    """

    def __init__(self, embedding_size, num_numerical_cols, output_size, layers, p=0.4):
        """Build the embedding tables and the fully connected stack.

        Args:
            embedding_size: Sequence of ``(num_categories, embedding_dim)``
                pairs, one per categorical column, in column order.
            num_numerical_cols: Number of numerical input columns.
            output_size: Number of output scores (classes).
            layers: Sizes of the hidden layers, in order. May be empty, in
                which case the inputs feed the output layer directly.
            p: Dropout probability used after the embeddings and after every
                hidden block.
        """
        super().__init__()
        self.all_embeddings = nn.ModuleList(
            [nn.Embedding(ni, nf) for ni, nf in embedding_size]
        )
        self.embedding_dropout = nn.Dropout(p)
        self.batch_norm_num = nn.BatchNorm1d(num_numerical_cols)

        # Width of the concatenated embedding vector plus the numerical columns.
        num_categorical_cols = sum(nf for _, nf in embedding_size)
        input_size = num_categorical_cols + num_numerical_cols

        all_layers = []
        for hidden_size in layers:
            all_layers.append(nn.Linear(input_size, hidden_size))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(hidden_size))
            all_layers.append(nn.Dropout(p))
            input_size = hidden_size

        # `input_size` now holds the width of the last hidden layer, or the raw
        # input width when `layers` is empty (indexing `layers[-1]` would raise
        # IndexError in that case).
        all_layers.append(nn.Linear(input_size, output_size))

        self.layers = nn.Sequential(*all_layers)

    def forward(self, x_categorical, x_numerical):
        """Return class scores of shape ``(batch_size, output_size)``.

        Args:
            x_categorical: Integer tensor whose column ``i`` holds the category
                codes looked up in the ``i``-th embedding table.
            x_numerical: Float tensor with ``num_numerical_cols`` columns.
        """
        embeddings = [
            embedding(x_categorical[:, idx])
            for idx, embedding in enumerate(self.all_embeddings)
        ]
        x = torch.cat(embeddings, 1)
        x = self.embedding_dropout(x)

        x_numerical = self.batch_norm_num(x_numerical)
        x = torch.cat([x, x_numerical], 1)
        x = self.layers(x)
        return x  # (batch_size, output_size)

### Adjust hidden layers and instantiate model

In [None]:
# Hidden-layer widths, and the number of distinct label codes present in the
# data (used as the size of the output layer).
hidden_layers = [200, 100, 50]
class_count = int(data[label_column].cat.codes.nunique())

model = Model(
    embedding_size=categorical_embedding_sizes,
    num_numerical_cols=numerical_data.shape[1],
    output_size=class_count,
    layers=hidden_layers,
    p=0.4,
)
print(model)

### Move all tensors to the MPS backend for GPU training acceleration

In [None]:
# Use the MPS backend when PyTorch reports it as available; otherwise fall
# back to the CPU.
mps_available = torch.backends.mps.is_available()
print("MPS device found." if mps_available else "MPS device not found.")

mps_device = torch.device("mps" if mps_available else "cpu")

# Move the model and every train/test tensor to the selected device.
model.to(mps_device)
(
    categorical_train_data,
    numerical_train_data,
    categorical_test_data,
    numerical_test_data,
    train_label,
    test_label,
) = (
    tensor.to(mps_device)
    for tensor in (
        categorical_train_data,
        numerical_train_data,
        categorical_test_data,
        numerical_test_data,
        train_label,
        test_label,
    )
)

### Set hyperparameters

In [None]:
# Training configuration: number of full-batch epochs, the classification
# loss, and an Adam optimizer over all model parameters.
epochs = 300
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

### The Training loop

In [None]:
# Full-batch training: every epoch runs one forward/backward pass over the
# whole training set and records the loss for plotting.
aggregated_losses = []

# Make sure Dropout and BatchNorm are in training mode, even if the model was
# switched to evaluation mode before this cell is (re-)run.
model.train()

for epoch in range(1, epochs + 1):
    pred = model(categorical_train_data, numerical_train_data)
    single_loss = loss_function(pred, train_label)
    # Store a detached copy so the list does not keep every epoch's autograd
    # history alive.
    aggregated_losses.append(single_loss.detach())

    # Progress report on epochs 1, 26, 51, ...
    if epoch % 25 == 1:
        print(f'epoch: {epoch:3} loss: {single_loss.item():10.8f}')

    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()
print(f'epoch: {epoch:3} loss: {single_loss.item():10.10f}')

# Copy the recorded losses to the CPU before handing them to matplotlib.
aggregated_losses_on_cpu = [tensor.cpu() for tensor in aggregated_losses]
plt.plot(range(epochs), aggregated_losses_on_cpu)
plt.title('Training Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

### Model Evaluation 

In [None]:
# Switch Dropout and BatchNorm to inference behaviour before scoring the test
# set; otherwise the forward pass below would still drop activations at random
# and normalize with the test batch's own statistics.
model.eval()

# Forward pass on the test set without tracking gradients.
# NOTE: `eval` shadows the Python builtin; the name is kept because the
# following cells read it.
with torch.no_grad():
    eval = model(categorical_test_data, numerical_test_data)
    loss = loss_function(eval, test_label)
print(f'Loss: {loss.item():.8f}')


In [None]:
# Show the first raw class scores, then replace `eval` with the index of the
# highest score in each row (the predicted class).
# The tensor's own argmax is used instead of np.argmax, which would route a
# torch.Tensor through NumPy's conversion and re-wrapping fallback.
print(eval[:5])
eval = eval.detach().cpu().argmax(dim=1)
print(eval[:5])

### Report and Confusion Matrix

In [None]:
# Bring the true labels back to the CPU so scikit-learn can consume them.
test_label = test_label.detach().cpu()

separator = "--------------------------------------------------------------------------------"

# Confusion matrix and per-class report, each followed by a separator line.
report_sections = (
    confusion_matrix(test_label, eval),
    classification_report(test_label, eval),
)
for section in report_sections:
    print(section)
    print(separator)

# Overall accuracy as a percentage.
accuracy_percent = accuracy_score(test_label, eval)*100
print(f"accuracy: {accuracy_percent} %")