# Table of Contents
 <p>

In [25]:
import os
import pandas as pd
import numpy as np
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go

import cufflinks as cf

from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer

import matplotlib.pyplot as plt
import seaborn as sns

# Set options
init_notebook_mode(connected=True)
%matplotlib inline
pd.set_option('display.max_columns', 500)

In [26]:
% ls data/
datapath='data/'

[1m[36marchives[m[m/         demographics.csv  listings.csv      venues.csv
calendar.csv      econ_state.csv    real_estate.csv


In [27]:
def reformat_prices(df, columns):
    assert isinstance(columns, list), "{} is not a list".format(columns)
    for column in columns:
        df[column] = df[column].replace( '[\$,)]','', regex=True).astype(float)
    return df

In [28]:
%%time
listings_df = pd.read_csv(datapath + 'listings.csv')
# Reformat t, f columns
for column in ['instant_bookable', 'has_availability']:
    listings_df[column] = listings_df[column].apply(lambda x: True if x == 't' else False)
listings_df = reformat_prices(listings_df, ['price'])
listings_df.head()
listings_df.info()


Columns (9) have mixed types. Specify dtype option on import or set low_memory=False.



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56417 entries, 0 to 56416
Data columns (total 29 columns):
accommodates                   56417 non-null float64
amenities                      56417 non-null object
availability_30                56417 non-null int64
bathrooms                      56238 non-null float64
bed_type                       56417 non-null object
bedrooms                       56363 non-null float64
beds                           56331 non-null float64
cancellation_policy            56417 non-null object
city                           56417 non-null object
has_availability               56417 non-null bool
host_id                        56417 non-null int64
id                             56417 non-null int64
instant_bookable               56417 non-null bool
latitude                       56417 non-null float64
longitude                      56417 non-null float64
metropolitan                   56417 non-null object
name                           56410 non-nul

In [29]:
# Removing dict structure
listings_df['amenities'] = listings_df['amenities'].apply(lambda x: x.replace("\"", "").replace("{", "").replace("}", "").split(','))
listings_df['amenities'] = listings_df['amenities'].apply(lambda x: [amenity.lower() for amenity in x])

In [30]:
# List all amenities
all_amenities = set()
for index, value in listings_df['amenities'].iteritems():
    all_amenities = all_amenities.union(set(value))

# Cleaning
all_amenities.remove('')
all_amenities.remove('translation missing: en.hosting_amenity_49')
all_amenities.remove('translation missing: en.hosting_amenity_50')
len(all_amenities)
all_amenities

90

{'24-hour check-in',
 'air conditioning',
 'baby bath',
 'baby monitor',
 'babysitter recommendations',
 'bathtub',
 'bbq grill',
 'bed linens',
 'breakfast',
 'buzzer/wireless intercom',
 'cable tv',
 'carbon monoxide detector',
 'cat(s)',
 'changing table',
 'children’s books and toys',
 'children’s dinnerware',
 'cleaning before checkout',
 'coffee maker',
 'cooking basics',
 'crib',
 'dishes and silverware',
 'dishwasher',
 'dog(s)',
 'doorman',
 'doorman entry',
 'dryer',
 'elevator in building',
 'essentials',
 'ethernet connection',
 'ev charger',
 'extra pillows and blankets',
 'family/kid friendly',
 'fire extinguisher',
 'fireplace guards',
 'first aid kit',
 'free parking on premises',
 'free parking on street',
 'game console',
 'garden or backyard',
 'gym',
 'hair dryer',
 'hangers',
 'heating',
 'high chair',
 'hot tub',
 'hot water',
 'indoor fireplace',
 'internet',
 'iron',
 'keypad',
 'kitchen',
 'laptop friendly workspace',
 'lock on bedroom door',
 'lockbox',
 'long

In [31]:
%%time
for amenities in sorted(all_amenities):
    listings_df[amenities] = listings_df['amenities'].apply(lambda x: 1 if amenities in x else 0)

CPU times: user 3.62 s, sys: 28.1 ms, total: 3.65 s
Wall time: 3.67 s


In [32]:
listings_df.iloc[0, 1]
print(listings_df.iloc[0, 29:])

['wireless internet',
 'air conditioning',
 'pool',
 'kitchen',
 'free parking on premises',
 'gym',
 'hot tub',
 'indoor fireplace',
 'heating',
 'family/kid friendly',
 'suitable for events',
 'washer',
 'dryer',
 'smoke detector',
 'carbon monoxide detector',
 'first aid kit',
 'fire extinguisher',
 'essentials',
 'shampoo',
 'lock on bedroom door',
 '24-hour check-in',
 'hangers',
 'hair dryer',
 'iron',
 'laptop friendly workspace',
 'translation missing: en.hosting_amenity_50']

24-hour check-in              1
air conditioning              1
baby bath                     0
baby monitor                  0
babysitter recommendations    0
bathtub                       0
bbq grill                     0
bed linens                    0
breakfast                     0
buzzer/wireless intercom      0
cable tv                      0
carbon monoxide detector      1
cat(s)                        0
changing table                0
children’s books and toys     0
children’s dinnerware         0
cleaning before checkout      0
coffee maker                  0
cooking basics                0
crib                          0
dishes and silverware         0
dishwasher                    0
dog(s)                        0
doorman                       0
doorman entry                 0
dryer                         1
elevator in building          0
essentials                    1
ethernet connection           0
ev charger                    0
                             ..
pack ’n 

In [33]:
reg_data = listings_df.select_dtypes(include=['int64','float64'])
reg_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56417 entries, 0 to 56416
Columns: 106 entries, accommodates to wireless internet
dtypes: float64(13), int64(93)
memory usage: 45.6 MB


In [34]:
reg_data.dropna(inplace=True)
reg_data.shape
y = reg_data['price']
x = reg_data.drop(['price', 'latitude', 'longitude'], axis=1)

cv = ShuffleSplit(n=5)
scores = cross_val_score(cv=cv,
                         X=x,
                         y=y,
                         estimator=LinearRegression(),
                         scoring=make_scorer(mean_squared_error))
print("MSE error = {}".format(scores.mean()))



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



(42854, 106)

MSE error = 66050.20170586483


In [35]:
reg = RidgeCV()
reg.fit(x, y)
coeffs = pd.DataFrame(reg.coef_, index=x.columns)
coeffs.iplot(kind='bar')

RidgeCV(alphas=(0.1, 1.0, 10.0), cv=None, fit_intercept=True, gcv_mode=None,
    normalize=False, scoring=None, store_cv_values=False)

PlotlyRequestError: No message

In [None]:
# See what drives the price
corr = reg_data.corr()
plt.figure(figsize=(12, 9));
sns.heatmap(corr, 
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values);

In [None]:
trace = go.Heatmap(corr)
data = [trace]
iplot(data)

In [None]:
sep = int(reg_data.shape[0] * .8)
y_train = reg_data['price'].iloc[:sep]
x_train = reg_data.drop('price', axis=1)[:sep]

y_test = reg_data['price'].iloc[sep:]
x_test = reg_data.drop('price', axis=1)[sep:]

reg = LinearRegression()
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)

print(mean_squared_error(y_pred, y_test))