#### Importing Libraries

In [None]:
pip install matplotlib


In [None]:
import pandas as pd                  # DataFrame loading, cleaning and aggregation
import matplotlib.pyplot as plt      # Figure creation, bar/line/pie charts
import seaborn as sns                # Count plots and bar plots drawn directly from DataFrames
import warnings                      # Standard-library warning control
warnings.filterwarnings('ignore')    # Silences every warning for the rest of the session (deprecation notices included)


#### Loading the Dataset

In [None]:
# Read the raw bookings file; the path is relative to the notebook's working directory.
df = pd.read_csv('hotel_booking.csv')

In [None]:
# Remove the columns that, judging by their names, hold guest-identifying details
# before any rows are displayed.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=['name', 'email', 'phone-number', 'credit_card'], errors='ignore')

In [None]:
# Preview the first six bookings.
df.head(n=6)

#### Exploratory Data Analysis and Data Cleaning

In [None]:
# Preview the last five bookings (five is the default for tail()).
df.tail()

In [None]:
# Print pandas' concise summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column labels that remain after the drop above.
df.columns

In [None]:
# Convert the status-date column to datetime so it can be grouped, compared
# against dates and accessed through `.dt` later in the notebook.
df['reservation_status_date'] = df['reservation_status_date'].pipe(pd.to_datetime)

In [None]:
# Summary statistics restricted to the object-dtype (text) columns.
df.describe(include= 'object')

In [None]:
# Show the distinct values of every object-dtype (text) column, one column at a time.
# The column names come straight from select_dtypes, so no summary statistics
# are computed just to enumerate them.
for col in df.select_dtypes(include='object').columns:
    print(col)
    print(df[col].unique())
    print('___' * 40)  # visual separator between columns


In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Drop the 'company' and 'agent' columns entirely, then discard every row that
# still contains a missing value.
# Written as one chained expression (no inplace=True) with errors='ignore' so the
# cell can be re-run without raising KeyError for the already-removed columns.
df = df.drop(columns=['company', 'agent'], errors='ignore').dropna()

In [None]:
# Default describe() summary of the cleaned frame.
df.describe()

In [None]:
# Keep only bookings whose 'adr' is below 5000.
# .copy() makes the result an independent frame: a later cell adds a 'month'
# column to df, and assigning into a filtered slice is ambiguous chained
# assignment (SettingWithCopyWarning) on pandas versions without copy-on-write.
df = df[df['adr'] < 5000].copy()

#### Data Analysis and Visualizations

In [None]:
# Share of bookings per cancellation flag (0 = not canceled, 1 = canceled).
Cancelled_perc = df['is_canceled'].value_counts(normalize=True)
print(Cancelled_perc)

# value_counts() orders its result by frequency, not by flag value. Re-order the
# counts to [0, 1] so each bar is guaranteed to sit under the matching label,
# and use 0 for a flag value that does not occur at all.
status_counts = df['is_canceled'].value_counts().reindex([0, 1], fill_value=0)

plt.figure(figsize=(5, 3))
plt.title('Reservation status count')
plt.bar(['Not canceled', 'Canceled'], status_counts, edgecolor='k', width=0.4)
plt.show()

In [None]:
# Compare the number of canceled and not-canceled bookings for each hotel type.
plt.figure(figsize=(8, 4))
ax1 = sns.countplot(x='hotel', hue='is_canceled', data=df, palette='Blues')

ax1.set_title('Reservation status in different hotels', size=20)
ax1.set_xlabel('hotel')
ax1.set_ylabel('number of reservation')

# A single legend call: an earlier ax1.legend(bbox_to_anchor=...) was replaced
# by this one before rendering and therefore never had any visible effect.
# The labels are matched positionally to the hue levels.
# NOTE(review): assumes the hue levels are drawn in the order 0, 1 — confirm
# against the rendered colours.
ax1.legend(['not canceled', 'canceled'])

plt.show()


In [None]:
# Bookings made at the resort hotel only.
resort_hotel = df[df['hotel'] == 'Resort Hotel']

# Share (normalize=True gives proportions, not counts) of not-canceled (0) and
# canceled (1) bookings at the resort hotel.
resort_hotel_cancel_counts = resort_hotel['is_canceled'].value_counts(normalize=True)

# Display the result so it can be compared with the City Hotel figures.
resort_hotel_cancel_counts

In [None]:
# Bookings made at the city hotel only.
city_hotel = df.loc[df['hotel'] == 'City Hotel']

# Proportion (normalize=True) of not-canceled (0) and canceled (1) city bookings.
city_hotel['is_canceled'].value_counts(normalize=True)

In [None]:
# Replace each hotel subset with its mean 'adr' per reservation status date.
# After this cell both names refer to a one-column frame indexed by date,
# no longer to the row-level bookings.
resort_hotel = resort_hotel.groupby('reservation_status_date').agg({'adr': 'mean'})
city_hotel = city_hotel.groupby('reservation_status_date').agg({'adr': 'mean'})

In [None]:
# Line chart of the daily mean ADR, one line per hotel type.
plt.figure(figsize=(22, 5))
plt.title('Average Daily Rate in City and Resort Hotel', fontsize=30)

# Resort first, then city, so colours and legend order stay as before.
for daily_adr, hotel_label in ((resort_hotel, 'Resort Hotel'), (city_hotel, 'City Hotel')):
    plt.plot(daily_adr.index, daily_adr['adr'], label=hotel_label)

plt.legend(fontsize=20)
plt.show()


In [None]:
# Calendar month (1-12) of the reservation status date, stored as a new column
# that later cells also group by.
df['month'] = df['reservation_status_date'].dt.month

# Canceled vs. not-canceled booking counts for each month.
plt.figure(figsize=(16, 5))
ax1 = sns.countplot(x='month', hue='is_canceled', data=df, palette='bright')

ax1.set_title('Reservation status per month', size=20)
ax1.set_xlabel('Month')
ax1.set_ylabel('Number of reservations')

# A single legend call: an earlier ax1.legend(bbox_to_anchor=...) was replaced
# by this one before rendering and therefore never had any visible effect.
# NOTE(review): labels are matched positionally; assumes hue levels are drawn
# in the order 0, 1 — confirm against the rendered colours.
ax1.legend(['Not canceled', 'Canceled'])

plt.show()

In [None]:
# Total 'adr' of canceled bookings for each month (note: a sum, not a mean).
canceled_by_month = df.loc[df['is_canceled'] == 1].groupby('month')
df_canceled = canceled_by_month[['adr']].sum().reset_index()

# Bar chart with one bar per month.
plt.figure(figsize=(14, 7))
sns.barplot(x='month', y='adr', data=df_canceled)
plt.title('ADR per month', fontsize=30)
plt.show()



In [None]:
# Canceled bookings only; this frame is reused by later cells.
cancelled_data = df.loc[df['is_canceled'] == 1]

# The ten countries with the most canceled bookings (value_counts is already
# sorted from most to least frequent).
top_10_country = cancelled_data['country'].value_counts().head(10)

# Pie chart: one slice per country, percentage printed with two decimals.
plt.figure(figsize=(6, 6))
plt.pie(top_10_country, labels=top_10_country.index, autopct='%.2f')
plt.title('Top 10 countries with reservations cancelled')
plt.show()


In [None]:
# Number of bookings (canceled or not) in each market segment.
df['market_segment'].value_counts()

In [None]:
# Each market segment's share of all bookings (proportions sum to 1).
df['market_segment'].value_counts(normalize=True)

In [None]:
# Each market segment's share of the canceled bookings.
# This is the distribution of cancellations across segments, not the
# cancellation rate within a segment.
cancelled_data['market_segment'].value_counts(normalize=True)

In [None]:
# Mean 'adr' per reservation status date for canceled bookings.
# groupby() already returns its groups sorted by the key, so after reset_index()
# the rows are in date order and no separate sort step is needed.
cancelled_df_adr = (
    cancelled_data
    .groupby('reservation_status_date')[['adr']]
    .mean()
    .reset_index()
)

# Same aggregation for the bookings that were not canceled.
not_cancelled_data = df[df['is_canceled'] == 0]
not_cancelled_df_adr = (
    not_cancelled_data
    .groupby('reservation_status_date')[['adr']]
    .mean()
    .reset_index()
)

# Both series over the full date range.
plt.figure(figsize=(20, 6))
plt.title('Average Daily Rate')
plt.plot(not_cancelled_df_adr['reservation_status_date'], not_cancelled_df_adr['adr'], label='not cancelled')
plt.plot(cancelled_df_adr['reservation_status_date'], cancelled_df_adr['adr'], label='cancelled')

# Without a legend the two labelled lines cannot be told apart; plt.show() also
# keeps the Line2D repr out of the cell output.
plt.legend(fontsize=20)
plt.show()


In [None]:
# Keep only the dates strictly after the start of 2016 and strictly before
# September 2017, for both daily-ADR frames.
cancelled_df_adr = cancelled_df_adr.loc[
    lambda adr: (adr['reservation_status_date'] > '2016') & (adr['reservation_status_date'] < '2017-09')
]

not_cancelled_df_adr = not_cancelled_df_adr.loc[
    lambda adr: (adr['reservation_status_date'] > '2016') & (adr['reservation_status_date'] < '2017-09')
]


In [None]:
# Daily mean ADR for the restricted date window, one line per booking status.
plt.figure(figsize=(20, 6))
plt.title('Average Daily Rate')

plt.plot(not_cancelled_df_adr['reservation_status_date'], not_cancelled_df_adr['adr'], label='not cancelled')
plt.plot(cancelled_df_adr['reservation_status_date'], cancelled_df_adr['adr'], label='cancelled')

plt.legend(fontsize=20)

# End on plt.show() like the other plotting cells, so the Legend object's repr
# is not echoed as the cell's output.
plt.show()
