In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw case-study data into a pandas DataFrame for exploration.
df = pd.read_csv('case_study_data.csv')

In [3]:
# Preview the first rows to check the columns and some sample values.
df.head()

Unnamed: 0,search_date,arrival,departure,num_adults,num_children,search_id,hotel_id,hotel_price,is_promo,hotel_feature_1,hotel_feature_2,hotel_feature_3,hotel_feature_4,hotel_feature_5,booked
0,25,457,471,3,2,0,517,1724.38,0,64.490309,85.0,7,0,0,0
1,25,457,471,3,2,0,517,1724.38,0,64.490309,85.0,7,0,0,0
2,25,457,471,3,2,0,29771,1905.54,1,25.826343,52.0,13,1,1,0
3,25,458,463,2,0,1,517,2077.95,0,64.490309,85.0,9,0,0,0
4,25,458,463,2,0,1,40744,6822.1,0,24.959968,52.0,11,1,2,0


In [8]:
# Tally the rows per hotel, then pick the hotel_id with the largest tally.
hotel_row_counts = df['hotel_id'].value_counts()
most_common_hotel = hotel_row_counts.idxmax()
most_common_hotel

33668

In [18]:
# Price per night = total price divided by the stay length
# (departure minus arrival), kept to two decimals.
stay_length = df['departure'] - df['arrival']
df['hotel_ppn'] = df['hotel_price'] / stay_length
df = df.round({'hotel_ppn': 2})

In [19]:
# Re-display the first rows to confirm the new hotel_ppn column.
df.head()

Unnamed: 0,search_date,arrival,departure,num_adults,num_children,search_id,hotel_id,hotel_price,is_promo,hotel_feature_1,hotel_feature_2,hotel_feature_3,hotel_feature_4,hotel_feature_5,booked,hotel_ppn
0,25,457,471,3,2,0,517,1724.38,0,64.490309,85.0,7,0,0,0,123.17
1,25,457,471,3,2,0,517,1724.38,0,64.490309,85.0,7,0,0,0,123.17
2,25,457,471,3,2,0,29771,1905.54,1,25.826343,52.0,13,1,1,0,136.11
3,25,458,463,2,0,1,517,2077.95,0,64.490309,85.0,9,0,0,0,415.59
4,25,458,463,2,0,1,40744,6822.1,0,24.959968,52.0,11,1,2,0,1364.42


In [25]:
# Sample rows for the most frequently listed hotel, restricted to searches
# by two adults with no children and hotel_feature_3 == 4.
# Uses the computed `most_common_hotel` rather than a hard-coded id so the
# cell stays correct if the underlying data changes.
# NOTE(review): the meaning of hotel_feature_3 == 4 is not documented here.
df[
    (df['hotel_id'] == most_common_hotel)
    & (df['num_adults'] == 2)
    & (df['num_children'] == 0)
    & (df['hotel_feature_3'] == 4)
].head()

Unnamed: 0,search_date,arrival,departure,num_adults,num_children,search_id,hotel_id,hotel_price,is_promo,hotel_feature_1,hotel_feature_2,hotel_feature_3,hotel_feature_4,hotel_feature_5,booked,hotel_ppn
863,11,42,43,2,0,74,33668,72.32,0,69.67783,90.0,4,1,1,0,72.32
1680,16,65,75,2,0,155,33668,2173.91,1,69.67783,90.0,4,1,1,0,217.39
2075,10,22,29,2,0,181,33668,430.36,1,69.67783,90.0,4,1,1,0,61.48
2076,10,22,29,2,0,181,33668,430.36,1,69.67783,90.0,4,1,1,0,61.48
3056,23,50,57,2,0,223,33668,734.1,0,69.67783,90.0,4,1,1,0,104.87


# Regression Model

In [26]:
import graphlab

In [35]:
# Load the same CSV into a GraphLab SFrame for the regression model.
hotels = graphlab.SFrame.read_csv('case_study_data.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,int,int,int,int,int,int,float,int,float,float,int,int,int,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [36]:
# Keep only rows where both features are present. `!= None` is deliberate:
# the comparison result is used as a row mask, so `is not None` would not work.
for feature_column in ('hotel_feature_1', 'hotel_feature_2'):
    hotels = hotels[hotels[feature_column] != None]

In [37]:
# Row count remaining after filtering out rows with missing features.
len(hotels)

45120

In [41]:
# Regression target: total price divided by the stay length
# (departure minus arrival).
nights_stayed = hotels['departure'] - hotels['arrival']
hotels['price_per_night'] = hotels['hotel_price'] / nights_stayed

In [42]:
# Split rows at random, roughly 80% train / 20% test; the fixed seed keeps
# the split reproducible across runs.
train_data, test_data = hotels.random_split(fraction=0.8, seed=0)

In [43]:
# Report the size of each split. The parenthesised single-argument form
# prints the same text under Python 2 and is valid syntax under Python 3.
print("Number of train_data entries " + str(len(train_data)))
print("Number of test data entries " + str(len(test_data)))

Number of train_data entries 36136
Number of test data entries 8984


In [44]:
# Predictors used for the price-per-night regression.
features = [
    'num_adults',
    'num_children',
    'hotel_feature_1',
    'hotel_feature_2',
    'hotel_feature_3',
    'hotel_feature_4',
    'hotel_feature_5',
]

# Fit a linear regression on the training split; no validation set is passed
# because evaluation is done separately on test_data.
model = graphlab.linear_regression.create(
    train_data,
    target='price_per_night',
    features=features,
    validation_set=None,
)

In [45]:
# Fetch the fitted coefficients (one row per term, including the intercept)
# and display them.
weight_summary = model.get('coefficients')
weight_summary

name,index,value,stderr
(intercept),,350.958963548,18.0926011643
num_adults,,37.8501023842,3.57636420058
num_children,,34.9363021742,3.47390898888
hotel_feature_1,,-1.44454936866,0.0841970875517
hotel_feature_2,,-1.3456940313,0.142364097599
hotel_feature_3,,0.372748436082,0.0652479901496
hotel_feature_4,,10.8530820835,0.579477182498
hotel_feature_5,,-10.8091841155,1.83422496218


In [46]:
# In-sample predictions on the training split, used to sanity-check the
# range of predicted prices per night.
predictions = model.predict(train_data)

In [48]:
# Largest predicted price per night; SArray.max() reduces natively instead
# of iterating element by element through the Python builtin.
predictions.max()

901.3231289596091

In [50]:
# Smallest predicted price per night; SArray.min() reduces natively instead
# of iterating element by element through the Python builtin.
predictions.min()

61.312471004148136

### Compute the residual sum of squares (RSS)

In [51]:
def residual_sum_squares(model, data, target='price_per_night'):
    """Return the residual sum of squares (RSS) of `model` on `data`.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(data)``.
    data : mapping-like
        Dataset passed to ``model.predict`` and indexed by column name to
        obtain the observed values.
    target : str, optional
        Name of the column holding the observed values. Defaults to
        ``'price_per_night'``, the column this function originally used.

    Returns
    -------
    The sum over all rows of ``(prediction - observed) ** 2``.

    Notes
    -----
    Assumes the predictions and the target column support elementwise
    arithmetic and a ``.sum()`` reduction (as graphlab SArray and numpy
    arrays do).
    """
    predicted = model.predict(data)
    # Residuals: predicted minus observed, row by row.
    residuals = predicted - data[target]
    # Use the container's own reduction rather than the Python builtin `sum`,
    # which would iterate over every element in the interpreter.
    return (residuals * residuals).sum()

In [52]:
# Evaluate the fitted model on the held-out test split and display its RSS.
rss_predictions = residual_sum_squares(model, test_data)
rss_predictions

3779839162.9201465