## Initial setup

In [None]:
import pandas as pd
import numpy as np

# Read the bookings CSV into `hb`; the path is relative to the notebook's working directory.
hb = pd.read_csv("../data/hotel_bookings.csv")
# Last expression of the cell: preview the first rows as a quick sanity check of the load.
hb.head()

### Look at the raw dataset

In [None]:
# Summary statistics of the frame exactly as loaded, before any cleaning.
hb.describe()

In [None]:
# Column names, non-null counts and dtypes: a first look at where values are missing.
hb.info()

## Data Cleaning
### Missing values

In [None]:
def info_na(df=None):
    """Print a missing-value report for a DataFrame.

    The report lists every column that contains at least one missing value
    together with the total number of rows and the missing percentage
    (rounded to 4 decimals), or prints "no missing data" when nothing is
    missing. It then prints how many rows and how many columns are
    completely free of missing values.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``hb`` frame is
        used, so existing ``info_na()`` calls keep working.

    Returns
    -------
    None
        The report is written to stdout.
    """
    if df is None:
        # Resolved at call time (not as a default argument) so the report
        # always sees the current `hb`, even after the name has been rebound.
        df = hb

    null_mask = df.isnull()
    na_per_column = null_mask.sum()

    if na_per_column.sum() == 0:
        print("no missing data")
    else:
        missing_data = pd.DataFrame({
            'missing values': na_per_column[na_per_column > 0],
            'total entries': len(df),
        })
        missing_data['percentage'] = round(
            (missing_data['missing values'] / missing_data['total entries']) * 100, 4
        )
        print(missing_data, '\n')

    # Count complete rows / columns straight from the null mask instead of
    # materialising two dropna() copies of the frame just to read their shape.
    complete_row_count = int((~null_mask.any(axis=1)).sum())
    print(f"full data row count {complete_row_count} / {df.shape[0]}")

    complete_column_count = int((na_per_column == 0).sum())
    print(f"full data column count {complete_column_count} / {df.shape[1]}")
    


In [None]:
# Baseline missing-value report for the data as loaded.
info_na()

In [None]:
# For the children column we assume that a missing value means the booking
# had no children.
hb['children'] = hb['children'].fillna(0)

# A missing country cannot be inferred, so label it explicitly as 'Unknown'.
hb['country'] = hb['country'].fillna('Unknown')

# This column contains the id of an agency and will most likely not be needed
# for future analysis, but it can be worth keeping for now. Fill it with 0 as
# an "unknown" marker.
hb['agent'] = hb['agent'].fillna(0)

# Delete the company column because 94% of its data is missing.
# errors='ignore' lets this cell be re-run after the column is already gone
# instead of failing with a KeyError.
hb = hb.drop(columns='company', errors='ignore')

In [None]:
# Re-run the report to see what is still missing after the fills and the `company` drop.
info_na()

In [None]:
# Summary statistics again, now that the missing values have been handled.
hb.describe()

## Data Type Correction

In [None]:
# Inspect the current dtype of every column before correcting them.
hb.dtypes

In [None]:
# Convert arrival date year, month and day of month into a single datetime.
# Keys are the source columns; values are the names pd.to_datetime expects
# when it assembles dates from a DataFrame.
arrival_parts = {
    'arrival_date_year': 'year',
    'arrival_date_month': 'month',
    'arrival_date_day_of_month': 'day',
}

# Guard so the cell can be re-run: once `arrival_date` exists, the source
# columns have been dropped and must not be read again.
if 'arrival_date' not in hb.columns:
    arrival_ymd = (
        hb[list(arrival_parts)]
        .rename(columns=arrival_parts)
        # months are written out as names; convert January to 1, February to 2 etc.
        .assign(month=lambda parts: pd.to_datetime(parts['month'], format='%B').dt.month)
    )
    # aggregate all values into one column representing the arrival date
    hb = hb.assign(arrival_date=pd.to_datetime(arrival_ymd))

# Drop year, month, day and week number: they are redundant now.
# A list (not a set) keeps the label order deterministic, and errors='ignore'
# makes a second run of this cell a no-op instead of a KeyError.
hb = hb.drop(
    columns=[*arrival_parts, 'arrival_date_week_number'],
    errors='ignore',
)

# Target dtype per column. Integers are spelled 'int64' rather than 'int'
# because the width of NumPy's default integer depends on the platform.
# The integer casts rely on NaNs in `children` and `agent` having been
# filled beforehand; astype raises on remaining NaNs.
dtype_mapping = {
    'hotel': 'category',
    'is_canceled': 'bool',
    'lead_time': 'int64',
    'adults': 'int64',
    'children': 'int64',
    'babies': 'int64',
    'meal': 'category',
    'country': 'category',
    'market_segment': 'category',
    'distribution_channel': 'category',
    'is_repeated_guest': 'bool',
    'previous_cancellations': 'int64',
    'previous_bookings_not_canceled': 'int64',
    'reserved_room_type': 'category',
    'assigned_room_type': 'category',
    'booking_changes': 'int64',
    'deposit_type': 'category',
    'agent': 'int64',
    'days_in_waiting_list': 'int64',
    'customer_type': 'category',
    'adr': 'float',
    'required_car_parking_spaces': 'int64',
    'total_of_special_requests': 'int64',
    'reservation_status': 'category',
    'reservation_status_date': 'datetime64[ns]'
}

# Correct the datatype only where it differs from the target.
for column, dtype in dtype_mapping.items():
    if hb[column].dtype == dtype:
        continue
    if dtype == 'datetime64[ns]':
        # parse date strings with to_datetime rather than a raw astype
        hb[column] = pd.to_datetime(hb[column])
    else:
        hb[column] = hb[column].astype(dtype)


In [None]:
# Inspect the dtypes again to confirm the conversions took effect.
hb.dtypes

In [None]:
# Re-run the missing-value report after the dtype conversion to check that
# all values were converted and none ended up missing.
info_na()