# EDA Techniques
[Link to the original dataset](https://www.kaggle.com/jessemostipak/hotel-booking-demand?select=hotel_bookings.csv)

# 1 Load and prepare the dataset

In [None]:
import pandas as pd

# Read the bookings CSV from a path relative to the current working directory.
df = pd.read_csv('source/hotel_bookings.csv')
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column labels of the loaded frame.
df.columns

## Prepare the dataset

In [None]:
from datetime import datetime
def get_month(x):
    """Convert a full month name to its month number.

    Args:
        x: Month name in the form accepted by the ``%B`` strptime directive
            (the locale's full month name, e.g. ``"July"``).

    Returns:
        int: The month number, 1 through 12.

    Raises:
        ValueError: If ``x`` does not match the ``%B`` format.
    """
    # Import locally: a later cell runs ``import datetime``, which rebinds the
    # notebook-level name ``datetime`` from the class to the module. Relying on
    # that global would make a re-run of this function fail after that cell.
    from datetime import datetime as _datetime

    return _datetime.strptime(x, "%B").month
# Derive the numeric month from the month-name column; get_month is passed
# directly because wrapping it in a lambda adds nothing.
df['arrival_date_month_number'] = df['arrival_date_month'].apply(get_month)

In [None]:
# Assemble the arrival date from its year / month-number / day columns.
# pd.to_datetime accepts a frame whose columns are named year/month/day, which
# replaces the row-by-row string join and no longer drops a missing component
# from the string before parsing.
arrival_parts = df[
    ['arrival_date_year', 'arrival_date_month_number', 'arrival_date_day_of_month']
].rename(columns={
    'arrival_date_year': 'year',
    'arrival_date_month_number': 'month',
    'arrival_date_day_of_month': 'day',
})
df['arrival_date'] = pd.to_datetime(arrival_parts)

In [None]:
def get_season(date):
    """Map a date to a season name using fixed month/day cut-offs.

    The date is encoded as ``month * 100 + day`` and bucketed as follows:
    321-620 -> 'spring', 621-922 -> 'summer', 923-1222 -> 'fall',
    anything else -> 'winter'.

    Args:
        date: Object exposing ``month`` and ``day`` attributes.

    Returns:
        str: One of 'spring', 'summer', 'fall', 'winter'.
    """
    month_day = 100 * date.month + date.day

    if 321 <= month_day <= 620:
        return 'spring'
    if 621 <= month_day <= 922:
        return 'summer'
    if 923 <= month_day <= 1222:
        return 'fall'
    return 'winter'

In [None]:
# Label every booking with the season of its arrival date; get_season is
# passed directly because wrapping it in a lambda adds nothing.
df['arrival_season'] = df['arrival_date'].apply(get_season)

# 2 Non-visual EDA

In [None]:
from pandas_profiling import ProfileReport

# Build a profiling report over the whole frame and write it to an HTML file.
profile = ProfileReport(df, title="Hotel Booking Report")
profile.to_file("hotel_bookings_eda.html")

# 3 Visual EDA
## 3.1 Univariate Analysis

### Categorical Variables

1) countplot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(15,6))

# Palette is kept in `colors` because the pie-chart cell reuses it.
colors = sns.color_palette('mako_r')
sns.set(font_scale=1.2)
# Name the axis explicitly: since seaborn 0.12 only `data` may be passed
# positionally, so a bare Series is no longer interpreted as x.
sns.countplot(x=df['arrival_date_month'], palette=colors)
plt.show()

2) pie chart

In [None]:
plt.figure(figsize=(10,10))
# Bookings per arrival month, drawn as a pie chart with percentage labels.
# `colors` is the palette defined in the countplot cell.
values = df['arrival_date_month'].value_counts()
values.plot.pie(colors=colors, fontsize=17, autopct='%.2f')
plt.legend(labels=values.index, loc="best")
plt.show()

### Numerical Variables

1) histogram

In [None]:
plt.figure(figsize=(15,6))
sns.set(font_scale=2)
# 50-bin histogram of week-night stays; axis labels are set on the same axes.
plt.hist(df['stays_in_week_nights'], bins=50, color='#40B7AD')
plt.xlabel('stays_in_week_nights')
plt.ylabel('distribution')
plt.show()

2) distplot

In [None]:
plt.figure(figsize=(15,6))
sns.set(font_scale=2)
# Distribution plot of week-night stays.
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot — confirm the installed version and consider migrating.
sns.distplot(df['stays_in_week_nights'], color='#40B7AD')
plt.show()

3) boxplot

In [None]:
plt.figure(figsize=(15,6))
sns.set(font_scale=2)
# Horizontal box plot of week-night stays. The column is bound to x by keyword
# because seaborn 0.12+ treats a lone positional argument as `data`, not x.
sns.boxplot(x=df['stays_in_week_nights'], color='#40B7AD')
plt.show()

4) violin plot

In [None]:
plt.figure(figsize=(15,6))
sns.set(font_scale=2)
# Horizontal violin plot of week-night stays. The column is bound to x by
# keyword because seaborn 0.12+ treats a lone positional argument as `data`.
sns.violinplot(x=df['stays_in_week_nights'], color='#40B7AD')
plt.show()

## 3.2 Bivariate Analysis

### Numerical to Numerical

1) scatter plot

In [None]:
plt.figure(figsize=(15,6))
sns.set(font_scale=2)
# Adults (x) against week-night stays (y). x and y must be keywords: seaborn
# 0.12+ rejects more than one positional argument.
sns.scatterplot(x=df['adults'], y=df['stays_in_week_nights'], color='#40B7AD')
plt.show()

### Numerical to Categorical

1) bar plot

In [None]:
plt.figure(figsize=(15,6))
sns.set(font_scale=2)
# Horizontal bars: adults on x, one bar per arrival month on y. x and y must be
# keywords: seaborn 0.12+ rejects more than one positional argument.
sns.barplot(x=df['adults'], y=df['arrival_date_month'], color='#40B7AD')
plt.show()

### Categorical to Categorical

1) heatmap

In [None]:
plt.figure(figsize=(15,6))
sns.set(font_scale=2)
# Contingency table: customer types as rows, arrival months as columns.
type_by_month = pd.crosstab(df['customer_type'], df['arrival_date_month'])
sns.heatmap(type_by_month, cmap='mako_r')
plt.show()

# 4 Comet for EDA

In [None]:
# comet_ml should be imported at the top of the file
from comet_ml import Experiment

# Create the experiment. No API key is passed here, so it is presumably
# picked up from the Comet configuration/environment — TODO confirm.
experiment = Experiment()

In [None]:
# Log a profile of the full frame to the experiment; the second argument is
# presumably the name it is stored under — TODO confirm.
experiment.log_dataframe_profile(df, "hotel_bookings")

In [None]:
# select families (at least two adults) and index them by arrival date.
# set_index is chained on the filtered result instead of being applied in
# place, so no slice of df is mutated.
families = df[df['adults'] > 1].set_index('arrival_date')

In [None]:
# Count of non-null 'adults' entries per arrival date ('arrival_date' is the index level).
ts_families = families.groupby(level='arrival_date')['adults'].count()

In [None]:
import time
# NOTE(review): this import is unused in this cell and rebinds `datetime`
# (imported earlier as the class) to the module — confirm it can be dropped.
import datetime
# Log one point per arrival date. The step is the date converted to an epoch
# timestamp by time.mktime, which interprets the time tuple as local time.
for arrival_day, n_bookings in ts_families.items():
    step = time.mktime(arrival_day.timetuple())

    experiment.log_metric("ts_families", n_bookings, step=step)

In [None]:
# End the Comet experiment created above.
experiment.end()

# 5 Sweetviz

In [None]:
import sweetviz as sv

# Analyze the whole frame with Sweetviz and render the result to 'report.html'.
report = sv.analyze(df)
report.show_html('report.html') # With default arguments the output file would be "SWEETVIZ_REPORT.html"

In [None]:
# comet_ml should be imported at the top of the file
from comet_ml import Experiment

# Create a new experiment (no API key is passed; presumably read from the
# Comet configuration/environment — TODO confirm), log the Sweetviz report
# to it, then end the experiment.
experiment = Experiment()
report.log_comet(experiment)
experiment.end()