# Expedia Case Study

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the case-study data; the relative path is resolved against the current working directory.
df = pd.read_csv('case_study_data.csv')

In [4]:
# Preview the first rows to see which columns are present.
df.head()

Unnamed: 0,search_date,arrival,departure,num_adults,num_children,search_id,hotel_id,hotel_price,is_promo,hotel_feature_1,hotel_feature_2,hotel_feature_3,hotel_feature_4,hotel_feature_5,booked
0,25,457,471,3,2,0,517,1724.38,0,64.490309,85.0,7,0,0,0
1,25,457,471,3,2,0,517,1724.38,0,64.490309,85.0,7,0,0,0
2,25,457,471,3,2,0,29771,1905.54,1,25.826343,52.0,13,1,1,0
3,25,458,463,2,0,1,517,2077.95,0,64.490309,85.0,9,0,0,0
4,25,458,463,2,0,1,40744,6822.1,0,24.959968,52.0,11,1,2,0


In [5]:
# Length of each searched stay: departure minus arrival, computed element-wise.
df['stay_length'] = df['departure'].sub(df['arrival'])

In [6]:
# Sanity check: the shortest stay. A value below 1 would mean a departure on or before the arrival.
df['stay_length'].min()

1

In [7]:
# Summary statistics per numeric column; a `count` below the number of rows indicates missing values.
df.describe()

Unnamed: 0,search_date,arrival,departure,num_adults,num_children,search_id,hotel_id,hotel_price,is_promo,hotel_feature_1,hotel_feature_2,hotel_feature_3,hotel_feature_4,hotel_feature_5,booked,stay_length
count,46647.0,46647.0,46647.0,46647.0,46647.0,46647.0,46647.0,46647.0,46647.0,45445.0,46322.0,46647.0,46647.0,46647.0,46647.0,46647.0
mean,11.734517,126.788046,134.479902,2.37057,0.812335,5509.648659,23095.564302,2599.055621,0.50149,76.849742,63.108847,59.068729,5.982335,2.357601,0.006603,7.691856
std,7.336887,90.867556,92.517336,0.912667,0.949223,4460.689701,13480.392766,4490.091588,0.500003,41.74783,22.846125,51.378153,6.014773,1.811249,0.08099,4.336129
min,0.0,0.0,1.0,1.0,0.0,0.0,1.0,30.37,0.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,6.0,49.0,54.0,2.0,0.0,1656.0,11474.0,559.4,0.0,46.238512,52.0,16.0,1.0,1.0,0.0,5.0
50%,12.0,109.0,116.0,2.0,0.0,4488.0,23309.0,1336.6,1.0,67.712628,52.0,46.0,4.0,1.0,0.0,7.0
75%,17.0,187.0,197.0,3.0,2.0,8727.0,33668.0,3090.51,1.0,128.117134,90.0,86.0,9.0,4.0,0.0,10.0
max,25.0,493.0,501.0,8.0,6.0,16015.0,46647.0,175454.41,1.0,143.777606,100.0,287.0,36.0,7.0,1.0,28.0


In [8]:
# Note: hotel_feature_1 and hotel_feature_2 have missing values -- their counts above (45445 and 46322) are below the 46647 total rows.

## Cleaning The Data

### Missing Values

First, let's remove any rows with missing data. We start by counting how many entries are missing in each affected column.

In [9]:
# Report how many rows lack each of the two features that have missing values.
# print() is called with a single parenthesised argument so the cell runs on
# both Python 2 and Python 3 (the bare print statement is a SyntaxError on 3).
print("Number of missing hotel_feature_1 entries: " + str(df['hotel_feature_1'].isnull().sum()))
print("Number of missing hotel_feature_2 entries: " + str(df['hotel_feature_2'].isnull().sum()))

Number of missing hotel_feature_1 entries: 1202
Number of missing hotel_feature_2 entries: 325


In [10]:
# Peek at the first few rows in which hotel_feature_1 is missing.
df.loc[df['hotel_feature_1'].isnull()].head()

Unnamed: 0,search_date,arrival,departure,num_adults,num_children,search_id,hotel_id,hotel_price,is_promo,hotel_feature_1,hotel_feature_2,hotel_feature_3,hotel_feature_4,hotel_feature_5,booked,stay_length
232,20,51,52,2,0,21,4099,177.75,0,,52.0,9,13,4,0,1
275,20,51,52,2,0,21,42149,160.99,1,,95.0,27,1,0,0,1
515,20,78,81,2,1,29,4099,1993.71,0,,52.0,11,13,4,0,3
520,20,78,81,2,1,29,42149,1682.63,1,,95.0,31,1,0,0,3
531,23,315,320,4,0,31,42149,561.95,1,,95.0,43,1,0,0,5


For now we are just going to remove all rows that have missing data (in the future we might instead impute the missing values, e.g. by filling them in with the column mean).

In [11]:
# Number of booked rows before any rows are dropped (baseline for the comparison below).
int((df['booked'] == 1).sum())

308

In [12]:
# Drop every row that is missing hotel_feature_1 or hotel_feature_2.
# The surviving rows keep their original labels, so the index is no longer
# contiguous after this cell.
df = df.dropna(subset=['hotel_feature_1', 'hotel_feature_2'])

In [13]:
# Only the last expression of a cell is displayed, so the total row count is
# printed explicitly instead of being silently discarded.
print("Rows remaining after dropping missing values: " + str(len(df)))
# Number of booked rows that survived the drop.
len(df[df['booked'] == 1])

299

So I have lost 9 of the 308 "booked = 1" rows (only about 3%, i.e. roughly 1/30th, of the booked rows).

# Graphing Data to Explore It

Now let's graph the columns of df to see if we have any strange looking values

### Search Date

In [14]:
# Number of rows for each distinct search_date value.
# NOTE(review): `search_date` is not used by any cell visible here -- confirm it is needed later.
search_date = df.groupby('search_date').size()

In [15]:
# Histogram of search_date; the title names the column so the figure stands on its own.
df['search_date'].plot.hist(orientation='vertical', title='Distribution of search_date')

<matplotlib.axes._subplots.AxesSubplot at 0x115729750>

### Arrival and Departure

In [16]:
# Histogram of arrival; the title names the column so the figure stands on its own.
df['arrival'].plot.hist(orientation='vertical', title='Distribution of arrival')


<matplotlib.axes._subplots.AxesSubplot at 0x115729750>

In [17]:
# Histogram of departure; the title names the column so the figure stands on its own.
df['departure'].plot.hist(orientation='vertical', title='Distribution of departure')

<matplotlib.axes._subplots.AxesSubplot at 0x115729750>

### Number of Adults and Children

In [18]:
# Histogram of num_adults; the title names the column so the figure stands on its own.
df['num_adults'].plot.hist(orientation='vertical', title='Distribution of num_adults')

<matplotlib.axes._subplots.AxesSubplot at 0x115729750>

In [19]:
# Histogram of num_children; the title names the column so the figure stands on its own.
df['num_children'].plot.hist(orientation='vertical', title='Distribution of num_children')

<matplotlib.axes._subplots.AxesSubplot at 0x115729750>

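Do the gaps in the above histograms suggest that there is a problem with the data? Probably not: both columns hold small integer counts, so the default 10 equal-width bins leave some bins empty. **TODO:** re-plot with one bin per integer value to confirm.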

### Number of Hotels

In [20]:
# Count the distinct hotel_id values. print() is called as a function so the
# cell runs on Python 3 as well as Python 2.
print("The number of hotels in our dataset is " + str(len(df['hotel_id'].unique())))

The number of hotels in our dataset is 343


### Is_Promo Ratio

In [21]:
# Histogram of is_promo; the title names the column so the figure stands on its own.
df['is_promo'].plot.hist(orientation='vertical', title='Distribution of is_promo')

<matplotlib.axes._subplots.AxesSubplot at 0x115729750>

## Hotel Features

#### Hotel Feature 2

In [22]:
# Histogram of hotel_feature_2; the title names the column so the figure stands on its own.
df['hotel_feature_2'].plot.hist(orientation='vertical', title='Distribution of hotel_feature_2')

<matplotlib.axes._subplots.AxesSubplot at 0x115729750>

#### Hotel Feature 3

In [23]:
# Histogram of hotel_feature_3; the title names the column so the figure stands on its own.
df['hotel_feature_3'].plot.hist(orientation='vertical', title='Distribution of hotel_feature_3')

<matplotlib.axes._subplots.AxesSubplot at 0x115729750>

#### Hotel Feature 4

In [29]:
# Histogram of hotel_feature_4; the title names the column so the figure stands on its own.
df['hotel_feature_4'].plot.hist(orientation='vertical', title='Distribution of hotel_feature_4')

<matplotlib.axes._subplots.AxesSubplot at 0x115729750>

#### Hotel Feature 5

In [25]:
# Histogram of hotel_feature_5; the title names the column so the figure stands on its own.
df['hotel_feature_5'].plot.hist(orientation='vertical', title='Distribution of hotel_feature_5')

<matplotlib.axes._subplots.AxesSubplot at 0x115729750>

## Booked

In [26]:
# Histogram of booked; the title names the column so the figure stands on its own.
df['booked'].plot.hist(orientation='vertical', title='Distribution of booked')

<matplotlib.axes._subplots.AxesSubplot at 0x115729750>

## Stay Length

In [27]:
# Histogram of stay_length; the title names the column so the figure stands on its own.
df['stay_length'].plot.hist(orientation='vertical', title='Distribution of stay_length')

<matplotlib.axes._subplots.AxesSubplot at 0x115729750>