# **Cloud Computing**
## Assignment 3

---

## Step 1: Import Necessary Libraries

We import the required libraries to handle data, split it into training and test sets, scale features, train a logistic regression model, and evaluate its performance.

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

## Step 2: Load Dataset

We load the dataset into a pandas DataFrame to explore and process it.

In [2]:
# Relative path: the CSV is expected in the notebook's working directory.
file_path = 'hotel_bookings.csv'
# Read the whole file into a DataFrame; `df` is the raw table used by the following cells.
df = pd.read_csv(file_path)

---

## Step 3: Data Inspection

We inspect the dataset to better understand its structure: sample rows, summary statistics, column types, missing values, and overall size.

In [3]:
# Preview the first rows (pandas shows 5 by default) to check column names and sample values.
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
count,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119386.0,119390.0,119390.0,119390.0,119390.0,119390.0,103050.0,6797.0,119390.0,119390.0,119390.0,119390.0
mean,0.370416,104.011416,2016.156554,27.165173,15.798241,0.927599,2.500302,1.856403,0.10389,0.007949,0.031912,0.087118,0.137097,0.221124,86.693382,189.266735,2.321149,101.831122,0.062518,0.571363
std,0.482918,106.863097,0.707476,13.605138,8.780829,0.998613,1.908286,0.579261,0.398561,0.097436,0.175767,0.844336,1.497437,0.652306,110.774548,131.655015,17.594721,50.53579,0.245291,0.792798
min,0.0,0.0,2015.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,-6.38,0.0,0.0
25%,0.0,18.0,2016.0,16.0,8.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,62.0,0.0,69.29,0.0,0.0
50%,0.0,69.0,2016.0,28.0,16.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,179.0,0.0,94.575,0.0,0.0
75%,1.0,160.0,2017.0,38.0,23.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,229.0,270.0,0.0,126.0,0.0,1.0
max,1.0,737.0,2017.0,53.0,31.0,19.0,50.0,55.0,10.0,10.0,1.0,26.0,72.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0


In [5]:
# Column names, non-null counts and dtypes; useful for spotting columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [6]:
# (number of rows, number of columns) of the raw dataset.
df.shape

(119390, 32)

---

## Step 4: Cleaning Dataset

In this step, we identify and remove duplicate rows.

In [7]:
# Collect the rows that exactly repeat an earlier row across every column
# (the first occurrence of each repeated row is not flagged).
duplicates = df.loc[df.duplicated()]

# Show the flagged rows
duplicates

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
5,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.00,0,1,Check-Out,2015-07-03
22,Resort Hotel,0,72,2015,July,27,1,2,4,2,...,No Deposit,250.0,,0,Transient,84.67,0,1,Check-Out,2015-07-07
43,Resort Hotel,0,70,2015,July,27,2,2,3,2,...,No Deposit,250.0,,0,Transient,137.00,0,1,Check-Out,2015-07-07
138,Resort Hotel,1,5,2015,July,28,5,1,0,2,...,No Deposit,240.0,,0,Transient,97.00,0,0,Canceled,2015-07-01
200,Resort Hotel,0,0,2015,July,28,7,0,1,1,...,No Deposit,240.0,,0,Transient,109.80,0,3,Check-Out,2015-07-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119349,City Hotel,0,186,2017,August,35,31,0,3,2,...,No Deposit,9.0,,0,Transient,126.00,0,2,Check-Out,2017-09-03
119352,City Hotel,0,63,2017,August,35,31,0,3,3,...,No Deposit,9.0,,0,Transient-Party,195.33,0,2,Check-Out,2017-09-03
119353,City Hotel,0,63,2017,August,35,31,0,3,3,...,No Deposit,9.0,,0,Transient-Party,195.33,0,2,Check-Out,2017-09-03
119354,City Hotel,0,63,2017,August,35,31,0,3,3,...,No Deposit,9.0,,0,Transient-Party,195.33,0,2,Check-Out,2017-09-03


In [8]:
# Report how many rows were flagged as repeats of an earlier row
print(f"Number of duplicate rows: {len(duplicates)}")

Number of duplicate rows: 31994


In [9]:
# Build a de-duplicated frame, retaining the first occurrence of each repeated row.
# The raw `df` is left untouched; the result gets its own name.
df_no_duplicates = df.drop_duplicates(keep='first')

print(f"Number of rows after removing duplicates: {len(df_no_duplicates)}")

Number of rows after removing duplicates: 87396


---

## Step 5: Training the Model Offline

### a) Preprocess the Data

In [10]:
# Select the modelling columns (input features and prediction target)
features = ['lead_time', 'previous_cancellations', 'total_of_special_requests', 'adr', 'stays_in_week_nights']  # Features are the inputs that influence the target.
target = 'is_canceled'  # The target is what we want to predict.

# Keep only these columns of the de-duplicated data. The explicit .copy() makes the
# result an independent DataFrame, so assigning to its columns afterwards neither
# triggers pandas' SettingWithCopyWarning nor depends on view-vs-copy behaviour.
df_no_duplicates = df_no_duplicates[features + [target]].copy()

In [11]:
# Handle missing values
# Missing feature values are filled with the column median, which is not affected by outliers.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
imputed_features = imputer.fit_transform(df_no_duplicates[features])  # array of filled feature columns
df_no_duplicates[features] = imputed_features

# A row without a target value cannot be used for prediction, so such rows are discarded.
df_no_duplicates = df_no_duplicates.dropna(subset=[target])

In [12]:
# Separate the prepared frame into the model inputs (X, the feature columns)
# and the label the model should predict (y, the target column).
X, y = df_no_duplicates[features], df_no_duplicates[target]

### b) Scale the Data

In [13]:
# Standardize the features so that all columns are on a comparable scale.
# Without this, large-valued columns such as 'adr' would dominate smaller-valued ones.
# NOTE(review): the scaler is fitted on all rows, before the train/test split done in a
# later cell, so test rows contribute to the scaling statistics. Confirm this is intended.

scaler = StandardScaler()
scaler.fit(X)                    # learn the per-feature scaling statistics
X_scaled = scaler.transform(X)   # apply the learned scaling to every row

### c) Split the Data

In [14]:
# Split the data into training and test sets
# Training data is used to train the model, and test data is used to check how well it performs.
# random_state=42 fixes the shuffle so the same split is produced on every run.
# NOTE(review): the split is not stratified on y; confirm whether class proportions
# should be preserved across the two sets.

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 80% of the data goes into training (X_train, y_train), and 20% into testing (X_test, y_test).

### d) Train Logistic Regression

In [15]:
# Train a Logistic Regression model
# Logistic Regression is used because it's simple, easy to interpret, and works well for binary classification problems.

# Use 'class_weight=balanced' to handle class imbalance (as one class has far more examples than the other).
# random_state=42 is passed so repeated runs use the same seed.
logistic_model = LogisticRegression(class_weight='balanced', random_state=42)

# Fit the model to the training data (X_train and y_train)
logistic_model.fit(X_train, y_train)

### e) Make Predictions

In [16]:
# Use the trained model to predict the target label for every row of the held-out test data
y_pred = logistic_model.predict(X_test)

### f) Evaluate the Model

In [17]:
# Evaluate the fitted model on the held-out test set
test_accuracy = accuracy_score(y_test, y_pred)        # fraction of test predictions that were correct
report_text = classification_report(y_test, y_pred)   # per-class precision, recall and F1

print("Logistic Regression Model Performance:")
print("Coefficients:", logistic_model.coef_)    # one weight per feature: its effect on the prediction
print("Intercept:", logistic_model.intercept_)  # the bias term of the model
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)

Logistic Regression Model Performance:
Coefficients: [[ 0.40511806  0.30750015 -0.36742172  0.40366636  0.07474148]]
Intercept: [-0.08294645]
Accuracy: 0.641533180778032

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.65      0.73     12733
           1       0.40      0.61      0.48      4747

    accuracy                           0.64     17480
   macro avg       0.61      0.63      0.60     17480
weighted avg       0.70      0.64      0.66     17480



#### Interpretation:
- The data contains an imbalance between "not canceled" and "canceled" bookings, making it harder to predict cancellations.
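- Certain features have a clear influence on cancellation behavior: a longer lead time (coefficient ≈ +0.41) and a higher ADR (≈ +0.40) increase the predicted likelihood of cancellation, while more special requests (≈ −0.37) decrease it, which aligns with practical intuition.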
- The model's performance reflects the challenge of predicting cancellations accurately, especially due to false positives and class imbalance.

---

In [18]:
# Import libraries
import numpy as np

# Per-feature mean and standard deviation of the training matrix.
# NOTE(review): X_train is taken from the already-standardized X_scaled, so these values
# come out close to 0 and 1 rather than being the raw-feature statistics. The fitted
# scaler keeps the raw ones in scaler.mean_ / scaler.scale_. Confirm which are needed.
feature_means, feature_stds = np.mean(X_train, axis=0), np.std(X_train, axis=0)

# Print the calculated values
print("Feature Means:", feature_means)
print("Feature Standard Deviations:", feature_stds)

Feature Means: [-0.0005071  -0.00113819 -0.0033188  -0.00357543 -0.00017482]
Feature Standard Deviations: [1.00116478 0.99520665 0.99778344 1.01097902 0.99562047]
