### Hotel Booking Analysis


In [2]:
import pyspark

In [3]:
import pandas as pd

In [4]:
df=pd.read_csv('hotel_bookings.csv')

In [5]:
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,no,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,7/1/2015
1,Resort Hotel,no,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,7/1/2015
2,Resort Hotel,no,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,7/2/2015
3,Resort Hotel,no,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,7/2/2015
4,Resort Hotel,no,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,7/3/2015


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  object 
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [7]:
from pyspark.sql import SparkSession

In [8]:
spark=SparkSession.builder.appName('pos').getOrCreate()

In [9]:
fd=spark.read.csv('hotel_bookings.csv',header=True,inferSchema=True)

In [10]:
fd.show()

+------------+-----------+---------+-----------------+------------------+------------------------+-------------------------+-----------------------+--------------------+------+--------+------+----+-------+--------------+--------------------+-----------------+----------------------+------------------------------+------------------+------------------+---------------+------------+-----+-------+--------------------+-------------+------+---------------------------+-------------------------+------------------+-----------------------+
|       hotel|is_canceled|lead_time|arrival_date_year|arrival_date_month|arrival_date_week_number|arrival_date_day_of_month|stays_in_weekend_nights|stays_in_week_nights|adults|children|babies|meal|country|market_segment|distribution_channel|is_repeated_guest|previous_cancellations|previous_bookings_not_canceled|reserved_room_type|assigned_room_type|booking_changes|deposit_type|agent|company|days_in_waiting_list|customer_type|   adr|required_car_parking_spaces|to

In [11]:
fd.printSchema()

root
 |-- hotel: string (nullable = true)
 |-- is_canceled: string (nullable = true)
 |-- lead_time: integer (nullable = true)
 |-- arrival_date_year: integer (nullable = true)
 |-- arrival_date_month: string (nullable = true)
 |-- arrival_date_week_number: integer (nullable = true)
 |-- arrival_date_day_of_month: integer (nullable = true)
 |-- stays_in_weekend_nights: integer (nullable = true)
 |-- stays_in_week_nights: integer (nullable = true)
 |-- adults: integer (nullable = true)
 |-- children: string (nullable = true)
 |-- babies: integer (nullable = true)
 |-- meal: string (nullable = true)
 |-- country: string (nullable = true)
 |-- market_segment: string (nullable = true)
 |-- distribution_channel: string (nullable = true)
 |-- is_repeated_guest: integer (nullable = true)
 |-- previous_cancellations: integer (nullable = true)
 |-- previous_bookings_not_canceled: integer (nullable = true)
 |-- reserved_room_type: string (nullable = true)
 |-- assigned_room_type: string (nullabl

In [12]:
fd.columns

['hotel',
 'is_canceled',
 'lead_time',
 'arrival_date_year',
 'arrival_date_month',
 'arrival_date_week_number',
 'arrival_date_day_of_month',
 'stays_in_weekend_nights',
 'stays_in_week_nights',
 'adults',
 'children',
 'babies',
 'meal',
 'country',
 'market_segment',
 'distribution_channel',
 'is_repeated_guest',
 'previous_cancellations',
 'previous_bookings_not_canceled',
 'reserved_room_type',
 'assigned_room_type',
 'booking_changes',
 'deposit_type',
 'agent',
 'company',
 'days_in_waiting_list',
 'customer_type',
 'adr',
 'required_car_parking_spaces',
 'total_of_special_requests',
 'reservation_status',
 'reservation_status_date']

In [13]:
fd.select('is_canceled')

DataFrame[is_canceled: string]

##### a)What is the overall cancellation rate for hotel bookings?


In [14]:
cancelled_booking=fd.filter('is_canceled="yes"').count()

In [15]:
cancelled_booking

44224

In [16]:
total_booking=fd.select('is_canceled').count()

In [17]:
total_booking

119390

In [18]:
cancellation_rate=(cancelled_booking/total_booking)*100

In [19]:
cancellation_rate

37.041628277075134

#### -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

##### b)What is the average length of stay (in nights) for hotel bookings?


In [20]:
fd=fd.withColumn('total_nights',fd['stays_in_weekend_nights']+fd['stays_in_week_nights'])

In [21]:
fd.agg({'total_nights':'mean'}).show()

+------------------+
| avg(total_nights)|
+------------------+
|3.4279001591423066|
+------------------+



#### -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

##### c)Which market segments contribute the most to hotel bookings?

In [37]:
dd=fd.filter('is_canceled="no"')

In [38]:
ff=dd.groupby('market_segment').count()

In [39]:
dd.select('market_segment').show()

+--------------+
|market_segment|
+--------------+
|        Direct|
|        Direct|
|        Direct|
|     Corporate|
|     Online TA|
|     Online TA|
|        Direct|
|        Direct|
|     Online TA|
|     Online TA|
|     Online TA|
|     Online TA|
|     Online TA|
| Offline TA/TO|
|     Online TA|
|     Corporate|
|        Direct|
|     Online TA|
|        Direct|
|        Direct|
+--------------+
only showing top 20 rows



In [40]:
ff.show()

+--------------+-----+
|market_segment|count|
+--------------+-----+
| Offline TA/TO|15908|
| Complementary|  646|
|        Direct|10672|
|     Corporate| 4303|
|     Online TA|35738|
|        Groups| 7714|
|      Aviation|  185|
+--------------+-----+



In [41]:
ff.agg({'count':'max'}).show()

+----------+
|max(count)|
+----------+
|     35738|
+----------+



In [42]:
ff.show()

+--------------+-----+
|market_segment|count|
+--------------+-----+
| Offline TA/TO|15908|
| Complementary|  646|
|        Direct|10672|
|     Corporate| 4303|
|     Online TA|35738|
|        Groups| 7714|
|      Aviation|  185|
+--------------+-----+



In [36]:
ff.collect()

[Row(market_segment='Offline TA/TO', count=8311),
 Row(market_segment='Complementary', count=97),
 Row(market_segment='Direct', count=1934),
 Row(market_segment='Corporate', count=992),
 Row(market_segment='Online TA', count=20739),
 Row(market_segment='Groups', count=12097),
 Row(market_segment='Aviation', count=52),
 Row(market_segment='Undefined', count=2)]

In [29]:
max_count_row = ff.groupby()

print(max_count_row)

GroupedData[grouping expressions: [], value: [market_segment: string, count: bigint], type: GroupBy]


In [89]:
ff.collect()

[Row(market_segment='Offline TA/TO', count=8311),
 Row(market_segment='Complementary', count=97),
 Row(market_segment='Direct', count=1934),
 Row(market_segment='Corporate', count=992),
 Row(market_segment='Online TA', count=20739),
 Row(market_segment='Groups', count=12097),
 Row(market_segment='Aviation', count=52),
 Row(market_segment='Undefined', count=2)]