In [None]:
import pandas as pd
from datetime import datetime

path_to_file = '2_bookings.csv'
bookings = pd.read_csv(path_to_file, encoding='windows-1251', sep=';')
bookings

In [5]:
# replace all symbols with lower case and spaces with '_', respectively
columns_list = bookings.columns
new_colums = []
for i in columns_list:
    new_colums.append(i.lower().replace(' ', '_'))

titles_dict = {columns_list[i]: new_colums[i] for i in range(len(columns_list))}

bookings = bookings.rename(columns=titles_dict)

In [6]:
# users from which countries have made the most successful bookings? Print top 5
bookings.query('is_canceled == 0').value_counts('country').sort_values(ascending=False).head(5)

country
PRT    21071
GBR     9676
FRA     8481
ESP     6391
DEU     6069
dtype: int64

In [8]:
bookings.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_full_date',
       'arrival_date_year', 'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'stays_total_nights', 'adults', 'children',
       'babies', 'meal', 'country', 'reserved_room_type', 'assigned_room_type',
       'customer_type', 'reservation_status', 'reservation_status_date'],
      dtype='object')

In [9]:
# for how many nights are City Hotels booked on average? resort hotel?
bookings.groupby('hotel').agg({'stays_total_nights': 'mean'}).round(2)

Unnamed: 0_level_0,stays_total_nights
hotel,Unnamed: 1_level_1
City Hotel,2.98
Resort Hotel,4.32


In [10]:
# room type is different from booked
bookings.query('assigned_room_type != reserved_room_type')

Unnamed: 0,hotel,is_canceled,lead_time,arrival_full_date,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,...,adults,children,babies,meal,country,reserved_room_type,assigned_room_type,customer_type,reservation_status,reservation_status_date
2,Resort Hotel,0,7,2015-07-01,2015,July,27,1,0,1,...,1,0.0,0,BB,GBR,A,C,Transient,Check-Out,2015-07-02
12,Resort Hotel,0,68,2015-07-01,2015,July,27,1,0,4,...,2,0.0,0,BB,USA,D,E,Transient,Check-Out,2015-07-05
15,Resort Hotel,0,68,2015-07-01,2015,July,27,1,0,4,...,2,0.0,0,BB,IRL,D,E,Transient,Check-Out,2015-07-05
17,Resort Hotel,0,12,2015-07-01,2015,July,27,1,0,1,...,2,0.0,0,BB,IRL,A,E,Transient,Check-Out,2015-07-02
18,Resort Hotel,0,0,2015-07-01,2015,July,27,1,0,1,...,2,0.0,0,BB,FRA,A,G,Transient,Check-Out,2015-07-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119273,City Hotel,0,213,2017-08-28,2017,August,35,28,1,3,...,1,0.0,0,HB,PRT,A,K,Transient-Party,Check-Out,2017-09-01
119274,City Hotel,0,213,2017-08-28,2017,August,35,28,1,3,...,1,0.0,0,HB,PRT,A,K,Transient-Party,Check-Out,2017-09-01
119289,City Hotel,0,25,2017-08-30,2017,August,35,30,0,3,...,3,0.0,0,BB,ITA,E,F,Transient,Check-Out,2017-09-02
119297,City Hotel,0,332,2017-08-31,2017,August,35,31,0,2,...,2,0.0,0,BB,GBR,D,F,Transient,Check-Out,2017-09-02


In [11]:
# what was the most frequent booking month in 2016? Has the most popular month changed in 2017?
bookings.query('arrival_date_year == 2016') \
        .value_counts('arrival_date_month') \
        .sort_values(ascending=False)


arrival_date_month
October      6203
May          5478
April        5428
September    5394
June         5292
August       5063
March        4824
July         4572
November     4454
February     3891
December     3860
January      2248
dtype: int64

In [12]:
bookings.query('arrival_date_year == 2017') \
        .value_counts('arrival_date_month') \
        .sort_values(ascending=False)

arrival_date_month
May         6313
April       5661
June        5647
July        5313
March       4970
August      4925
February    4177
January     3681
dtype: int64

In [13]:
# which month of City Hotel bookings had the most cancellations in 2015? 2016? 2017?
bookings \
        .query('hotel == "City Hotel" and is_canceled == 1') \
        .groupby('arrival_date_year', as_index=False) \
        ['arrival_date_month'].value_counts().sort_values(['arrival_date_year', 'count'])

Unnamed: 0,arrival_date_year,arrival_date_month,count
5,2015,November,301
4,2015,December,668
3,2015,July,939
2,2015,August,1232
1,2015,October,1321
0,2015,September,1543
17,2016,January,438
16,2016,February,930
15,2016,July,1043
14,2016,December,1072


In [29]:
# which of the cells adults, children, and babies has the largest mean value?
bookings[['adults', 'children', 'babies']].mean().idxmax

<bound method Series.idxmax of adults      1.856403
children    0.103890
babies      0.007949
dtype: float64>

In [30]:
# create the total_kids column by concatenating the children and babies columns
# for which type of hotels did the average value of the variable turn out to be the largest? 
bookings['total_kids'] = bookings.children + bookings.babies
bookings.groupby('hotel').total_kids.mean()

hotel
City Hotel      0.096311
Resort Hotel    0.142586
Name: total_kids, dtype: float64

In [66]:
# create a has_kids column (True if the client specified at least one child, otherwise False)
# check which user group has the highest churn rate
bookings['has_kids'] = bookings.total_kids != 0
rates = bookings.groupby('has_kids')['is_canceled'].value_counts(normalize=True)
round(rates * 100, 2)

has_kids  is_canceled
False     0              62.78
          1              37.22
True      0              65.05
          1              34.95
Name: is_canceled, dtype: float64