# <div style="color:white;display:fill;border-radius:5px;background-color:#0E2031;letter-spacing:0.5px;overflow:hidden"><p style="padding:20px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Contents</p></div>
   
- Libraries
- Configurations
- Loading Data
- Glossary
- **Data Preprocessing** 
    - Missing Data
    - Duplicated Data
    - Data Types
    - Strange Values
    - Feature Engineer

## <div style="color:white;display:fill;border-radius:5px;background-color:#153656;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Libraries</p></div>

In [1]:
# Optional one-time environment setup. Uncomment the lines below to install the
# third-party packages this notebook imports. In a notebook, `%pip install ...`
# is the safer form because it targets the running kernel's environment.
#pip install -Uq pandas
#!pip install -q watermark
#!pip install pycountry-convert

In [2]:
# Basic Tools
import numpy as np
import pandas as pd
from datetime import datetime as dt

# Convert Alpha3 Countries to Country Name and Continents
import pycountry
import pycountry_convert as pc


# File/ OS Tools
import os
import sys
from watermark import watermark

## <div style="color:white;display:fill;border-radius:5px;background-color:#153656;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Configurations</p></div>

In [3]:
# Record the Python/IPython versions, OS and hardware used for this run.
print(watermark())

Last updated: 2023-04-23T17:53:13.356232-03:00

Python implementation: CPython
Python version       : 3.10.6
IPython version      : 8.5.0

Compiler    : GCC 11.3.0
OS          : Linux
Release     : 5.19.0-40-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 12
Architecture: 64bit



In [4]:
# The project root is taken to be the parent of the directory the notebook
# starts in.
# NOTE: a later cell calls os.chdir(ROOT_DIR), so re-running this cell after
# that one would move ROOT_DIR one level further up. Restart the kernel and
# run top to bottom instead.
ATUAL_DIR = os.path.abspath(os.curdir)
ROOT_DIR = os.path.split(ATUAL_DIR)[0]

In [5]:
# Put the project root on sys.path (only once) so modules located there can be imported.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# From here on the working directory is the project root.
# NOTE: this changes what os.getcwd() returns, so the cell that derives
# ROOT_DIR from the working directory must not be re-run after this one.
os.chdir(ROOT_DIR)

In [6]:
def pandas_settings():
    """Apply the pandas display options used throughout this notebook.

    Floats are rendered with thousands separators and four decimals, wide
    frames are not wrapped across several blocks, and at most 35 columns
    and 50 rows are shown.
    """
    display_options = {
        'display.float_format': '{:,.4f}'.format,
        'display.expand_frame_repr': False,
        'display.max_columns': 35,
        'display.max_rows': 50,
    }
    for option_name, option_value in display_options.items():
        pd.set_option(option_name, option_value)


pandas_settings()

## <div style="color:white;display:fill;border-radius:5px;background-color:#153656;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Loading Data</p></div>

In [7]:
# Print the full path of every file found under <ROOT_DIR>/data.
data_dir = os.path.join(ROOT_DIR, "data")
for folder, _subfolders, file_names in os.walk(data_dir):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

/home/alysson/projects/Hotel-Booking-Cancelations/data/data_processed/hotel_bookings_processed.csv
/home/alysson/projects/Hotel-Booking-Cancelations/data/data_raw/hotel_bookings.csv


In [8]:
# Location of the raw bookings CSV, built relative to the project root.
DATA_RAW_PATH = os.path.join(ROOT_DIR, "data", "data_raw", "hotel_bookings.csv")

In [9]:
# Load the raw file once; `data_raw` itself is not modified afterwards.
data_raw = pd.read_csv(DATA_RAW_PATH)

In [10]:
# Work on a copy so the raw frame stays available for comparison.
data = data_raw.copy()

In [11]:
# (rows, columns) of the working frame before any cleaning.
data.shape

(119390, 32)

## <div style="color:white;display:fill;border-radius:5px;background-color:#153656;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Glossary</p></div>

In [12]:
# Column glossary (in Portuguese): the first row is the table header, each
# following row is [column name, meaning].
glossary_pt = [['Colunas', 'Significado'],
              ['hotel', 'Hotel Urbano ou Resort'],
              ['is_canceled', 'Informa se a reserva foi cancelada (1 = Cancelado, 0 = Não Cancelado)'],
              ['lead_time', 'O tempo corrente em dias do momento que a reserva foi feita até o dia reservado'],
              ['market_segment', 'Segmento'] 
              ]
# Disabled: `tabulate` is not imported in this notebook, so the glossary is
# defined above but never displayed.
#print(tabulate(glossary_pt, headers='firstrow', stralign='left', tablefmt='simple'))

## <div style="color:white;display:fill;border-radius:5px;background-color:#153656;letter-spacing:0.5px;overflow:hidden"><p style="padding:15px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Data Preprocessing</p></div>

### <div style="color:white;display:fill;border-radius:5px;background-color:#2E6B8E;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Missing Data</p></div>

In [13]:
# Number of missing values per column.
data.isna().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [14]:
# Missing `children`, `agent` and `company` entries become 0.
# `country` is left untouched here and is filled later in the notebook.
nan_replacements = dict.fromkeys(["children", "agent", "company"], 0)
data = data.fillna(value=nan_replacements)

**Observations:**

- Are the NaN values in `country` there because the customer did not provide a nationality when making the reservation, or because the system was not able to identify it?

- Why are there so many missing values in the `company` column?

### <div style="color:white;display:fill;border-radius:5px;background-color:#2E6B8E;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Duplicated Data</p></div>

In [15]:
# Number of rows that are exact copies of an earlier row (all columns equal).
data.duplicated().sum()

31994

Duplicate rows here are not necessarily a problem. It is possible, for example, to have several reservations for the same room type and the same arrival date that were all booked on the same day. Since we have no further information on whether these rows refer to the same guest, we'll keep them as they are.

In [16]:
# Inspect a few of the duplicated rows.
# NOTE: sample() has no random_state, so the rows shown differ on every run.
data.loc[data.duplicated(), :].sample(5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
6195,Resort Hotel,0,225,2016,May,22,28,1,1,2,0.0,0,HB,DEU,Groups,TA/TO,0,0,0,A,E,0,No Deposit,298.0,0.0,0,Transient-Party,85.0,0,0,Check-Out,2016-05-30
51075,City Hotel,1,309,2016,May,20,13,1,2,2,0.0,0,HB,PRT,Offline TA/TO,TA/TO,0,0,0,A,A,0,No Deposit,3.0,0.0,215,Transient-Party,101.0,0,0,Canceled,2016-02-09
75647,City Hotel,1,290,2015,August,32,3,1,3,2,0.0,0,BB,PRT,Groups,TA/TO,0,1,0,A,A,0,No Deposit,1.0,0.0,0,Transient-Party,62.0,0,0,Canceled,2015-07-06
3659,Resort Hotel,0,165,2015,December,53,30,0,4,2,0.0,0,Undefined,PRT,Groups,TA/TO,0,0,0,A,A,0,No Deposit,308.0,0.0,122,Transient-Party,136.5,0,0,Check-Out,2016-01-03
78633,City Hotel,1,14,2015,October,42,15,0,1,1,0.0,0,BB,PRT,Offline TA/TO,TA/TO,0,1,0,A,A,0,No Deposit,99.0,0.0,0,Transient-Party,100.0,0,0,Canceled,2015-10-13


### <div style="color:white;display:fill;border-radius:5px;background-color:#2E6B8E;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Data Types</p></div>

In [17]:
# Column dtypes and non-null counts, to spot columns that need conversion.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119390 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [18]:
# Parse the status date strings (e.g. "2016-05-30") into datetimes. The format
# is inferred; an explicit one matching the data would be format='%Y-%m-%d'.
data['reservation_status_date'] = pd.to_datetime(data['reservation_status_date'])

In [19]:
# These columns were filled with 0 where missing earlier in the notebook, so
# they no longer contain NaN and can be cast to integers.
data = data.astype({"children": int, "agent": int, "company": int})

### <div style="color:white;display:fill;border-radius:5px;background-color:#2E6B8E;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Strange Values</p></div>

In [20]:
# Row count before removing implausible bookings, for comparison afterwards.
len(data)

119390

In [21]:
# Count bookings that list no guests at all (0 adults, children and babies)
# and yet have a reservation status of "Check-Out".
zero_guests = (data[['children', 'adults', 'babies']] == 0).all(axis=1)
checked_out = data['reservation_status'] == 'Check-Out'
len(data.loc[zero_guests & checked_out])

155

In [22]:
# Remove the bookings with no guests at all whose status is "Check-Out".
zero_guests = (data[['children', 'adults', 'babies']] == 0).all(axis=1)
checked_out = data['reservation_status'] == 'Check-Out'
data = data.loc[~(zero_guests & checked_out)]

In [23]:
# Remove rows whose `adr` is negative, then show the remaining row count.
negative_adr = data['adr'].lt(0)
data = data.loc[~negative_adr]
len(data)

119234

### <div style="color:white;display:fill;border-radius:5px;background-color:#2E6B8E;letter-spacing:0.5px;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;text-align: center;margin:0;font-size:120%">Feature Engineer</p></div>

In [24]:
# "Undefined" and "SC" denote the same category according to the dataset's
# data dictionary, so merge them under "SC".
data['meal'] = data['meal'].replace("Undefined", "SC")

In [25]:
# Derived columns:
#   people    - total guests (adults + children + babies)
#   kids      - children + babies
#   days_stay - total nights (weekend nights + week nights)
# Disabled idea, kept for reference:
#data["adr_pp"] = data["adr"] / (data["adults"] + data["children"])
data = data.assign(
    people=data["adults"] + data["children"] + data["babies"],
    kids=data['children'] + data['babies'],
    days_stay=data['stays_in_weekend_nights'] + data['stays_in_week_nights'],
)

In [26]:
def add_country_names(df):
    """Add a ``country_name`` column looked up from the ``country`` column.

    Each value of ``df['country']`` is passed to pycountry as an alpha-3
    code. When the lookup raises ``AttributeError`` or ``LookupError``,
    the name is ``None``.

    Args:
        df: DataFrame with a ``country`` column. It is modified in place.

    Returns:
        The same DataFrame, with the ``country_name`` column added.
    """
    def _name_for(alpha3_code):
        try:
            return pycountry.countries.get(alpha_3=alpha3_code).name
        except (AttributeError, LookupError):
            return None

    df['country_name'] = [_name_for(code) for code in df['country']]
    return df

In [27]:
def add_continent(df):
    """Add a ``continentes`` column derived from the ``country_name`` column.

    Each country name is converted name -> alpha-2 code -> continent code ->
    continent name with pycountry_convert. Names that cannot be converted
    (including missing values) get ``None``.

    Args:
        df: DataFrame with a ``country_name`` column. It is modified in place.

    Returns:
        The same DataFrame, with the ``continentes`` column added.
    """
    def _continent_for(country_name):
        try:
            alpha2_code = pc.country_name_to_country_alpha2(country_name)
            continent_code = pc.country_alpha2_to_continent_code(alpha2_code)
            return pc.convert_continent_code_to_continent_name(continent_code)
        except Exception:
            # Best effort: an unresolvable name yields None. Catching Exception
            # rather than using a bare `except` lets KeyboardInterrupt and
            # SystemExit propagate, so the loop can still be interrupted.
            return None

    # The column holds few distinct names compared with its number of rows,
    # so each distinct value is resolved once and reused.
    resolved = {}
    continents = []
    for country_name in df['country_name']:
        if country_name not in resolved:
            resolved[country_name] = _continent_for(country_name)
        continents.append(resolved[country_name])
    df['continentes'] = continents

    return df

In [28]:
# Resolve the codes in `country` to country names (new column `country_name`).
data = add_country_names(data)

In [29]:
# Derive the continent of each booking from `country_name` (new column `continentes`).
data = add_continent(data)

In [30]:
# Bookings from Portugal (PRT) are labelled 'Native' instead of their continent.
# NOTE(review): presumably because the hotels are located in Portugal — confirm.
data.loc[data['country'] == 'PRT', 'continentes'] = 'Native'

In [31]:
# Fill the remaining gaps: unknown countries and the names/continents that
# could not be resolved from them. The "Unknow" spelling is part of the data
# written to disk, so it is kept as is.
nan_replacements = {
    "children": 0,
    "agent": 0,
    "company": 0,
    "country": "Unknow",
    "country_name": "Unknow",
    "continentes": "Unknow",
}
data = data.fillna(value=nan_replacements)

In [32]:
# Total number of missing values left in the whole frame (expected: 0).
data.isna().sum().sum()

0

In [35]:
# Rows were dropped above, so rebuild a contiguous 0..n-1 index and discard the old one.
data=data.reset_index(drop=True)

In [37]:
# Save the cleaned frame under <ROOT_DIR>/data/data_processed/.
# The path is built with os.path.join, like DATA_RAW_PATH. The previous
# backslash-separated f-string only addresses that folder on Windows: on
# POSIX systems the backslashes become part of a single file name, and
# "\d" / "\h" are invalid escape sequences.
DATA_PROCESSED_PATH = os.path.join(ROOT_DIR, "data", "data_processed", "hotel_bookings_processed.csv")
os.makedirs(os.path.dirname(DATA_PROCESSED_PATH), exist_ok=True)
# The index is still written (pandas default), so the file layout is unchanged.
data.to_csv(DATA_PROCESSED_PATH)