In [168]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [296]:
# Read the listings table from the CSV that sits next to the notebook.
data_pricing = pd.read_csv(filepath_or_buffer='./sf_airbnb_clean.csv')

In [297]:
# Column names that contain the substring 'review'.
list(filter(lambda col: 'review' in col, data_pricing.columns))

['number_of_reviews',
 'number_of_reviews_ltm',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'reviews_per_month']

In [258]:
# Bucket the overall rating into 4 equal-width intervals (rows with a missing
# rating stay NaN).
pd.cut(x=data_pricing['review_scores_rating'], bins=4)

0       (80.0, 100.0]
1       (80.0, 100.0]
2       (80.0, 100.0]
3       (80.0, 100.0]
4       (80.0, 100.0]
            ...      
8106              NaN
8107              NaN
8108              NaN
8109              NaN
8110              NaN
Name: review_scores_rating, Length: 8111, dtype: category
Categories (4, interval[float64]): [(19.92, 40.0] < (40.0, 60.0] < (60.0, 80.0] < (80.0, 100.0]]

In [274]:
# Display the column index of the listings table.
data_pricing.columns

Index(['id', 'host_id', 'host_response_time', 'host_response_rate',
       'host_is_superhost', 'host_listings_count', 'host_total_listings_count',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'latitude', 'longitude', 'is_location_exact',
       'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'bed_type', 'price', 'security_deposit', 'cleaning_fee',
       'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights',
       'minimum_minimum_nights', 'maximum_minimum_nights',
       'minimum_maximum_nights', 'maximum_maximum_nights',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'number_of_reviews_ltm',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication'

In [265]:
# Mean overall rating among rows flagged host_is_superhost == 1.
# Row mask and column are selected in a single .loc call.
data_pricing.loc[data_pricing['host_is_superhost'] == 1, 'review_scores_rating'].mean()

97.41317179248213

In [266]:
# Mean overall rating among rows flagged host_is_superhost == 0.
# Row mask and column are selected in a single .loc call.
data_pricing.loc[data_pricing['host_is_superhost'] == 0, 'review_scores_rating'].mean()

93.43705536653263

In [269]:
# Median price among rows flagged host_is_superhost == 1.
data_pricing.loc[data_pricing['host_is_superhost'] == 1, 'price'].median()

150.0

In [270]:
# Median price among rows flagged host_is_superhost == 0.
data_pricing.loc[data_pricing['host_is_superhost'] == 0, 'price'].median()

153.0

In [357]:
# Median of the `listing_id` column among rows flagged host_is_superhost == 0.
# NOTE(review): `listing_id` appears to be added to data_pricing only by the
# merge with `number_reviews` further down (where it holds a per-listing review
# count), so that merge cell has to run before this one.
data_pricing.loc[data_pricing['host_is_superhost'] == 0, 'listing_id'].median()

2.0

In [358]:
# Median of the `listing_id` column among rows flagged host_is_superhost == 1.
# NOTE(review): `listing_id` appears to be added to data_pricing only by the
# merge with `number_reviews` further down (where it holds a per-listing review
# count), so that merge cell has to run before this one.
data_pricing.loc[data_pricing['host_is_superhost'] == 1, 'listing_id'].median()

4.0

In [275]:
# Median host_listings_count among rows flagged host_is_superhost == 1.
data_pricing.loc[data_pricing['host_is_superhost'] == 1, 'host_listings_count'].median()

2.0

In [276]:
# Median host_listings_count among rows flagged host_is_superhost == 0.
data_pricing.loc[data_pricing['host_is_superhost'] == 0, 'host_listings_count'].median()

3.0

In [279]:
# Read the second listings file (listings_sf.csv) from the notebook directory.
listings_sf = pd.read_csv(filepath_or_buffer='./listings_sf.csv')

In [283]:
# Replace missing `last_review` entries with 0; all other columns are untouched.
listings_sf = listings_sf.fillna({'last_review': 0})

In [290]:
# Read the reviews table from the notebook directory.
reviews = pd.read_csv(filepath_or_buffer='./reviews.csv')

In [291]:
# Parse the `date` column into pandas datetimes so year/month can be extracted.
reviews = reviews.assign(date=pd.to_datetime(reviews['date']))

In [342]:
# Derive calendar year and month columns from each review's date.
reviews = reviews.assign(
    year=lambda frame: pd.DatetimeIndex(frame['date']).year,
    month=lambda frame: pd.DatetimeIndex(frame['date']).month,
)

In [347]:
# Count reviews per listing for September 2019.
#
# The output columns are pinned explicitly to 'index' (the listing id) and
# 'listing_id' (the review count), which are the names the later merge relies
# on (left_on='index'). A bare value_counts().reset_index() produces
# ('index', 'listing_id') on pandas < 2.0 but ('listing_id', 'count') on
# pandas >= 2.0, which would break that merge.
number_reviews = (
    reviews.loc[(reviews['year'] == 2019) & (reviews['month'] == 9), 'listing_id']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='listing_id')
)

Unnamed: 0,index,listing_id
0,33942157,23
1,364397,17
2,958507,17
3,15343159,16
4,21122475,15


In [356]:
# Attach the per-listing review counts to the listings table.
# In `number_reviews`, 'index' is matched against the listing `id` and
# 'listing_id' carries the count; how='right' keeps every row of data_pricing,
# so listings without a matching count get NaN.
#
# Columns left behind by a previous run of this cell are dropped first, which
# makes the cell safe to re-run: without this, a second run would merge onto an
# already-merged frame and create suffixed duplicates (index_x, listing_id_y, ...).
data_pricing = number_reviews.merge(
    data_pricing.drop(columns=['index', 'listing_id'], errors='ignore'),
    left_on='index',
    right_on='id',
    how='right',
)

0       23.0
1       17.0
2       17.0
3       16.0
4       15.0
        ... 
8106     NaN
8107     NaN
8108     NaN
8109     NaN
8110     NaN
Name: listing_id, Length: 8111, dtype: float64

In [278]:
# Number of rows per value of the host_is_superhost flag (NaN rows are not counted).
data_pricing['host_is_superhost'].value_counts()

0.0    4557
1.0    3546
Name: host_is_superhost, dtype: int64

In [261]:
# Frequency of each distinct overall rating value, most common first.
data_pricing['review_scores_rating'].value_counts()

100.0    1912
98.0      768
99.0      717
97.0      604
96.0      478
95.0      374
94.0      250
93.0      239
90.0      174
80.0      167
92.0      144
91.0      101
87.0       76
89.0       75
88.0       70
60.0       57
85.0       48
70.0       30
86.0       28
84.0       26
83.0       21
40.0       13
20.0       11
82.0       11
73.0        9
75.0        8
78.0        7
81.0        7
77.0        6
76.0        5
74.0        5
50.0        3
67.0        3
79.0        2
72.0        2
69.0        1
68.0        1
71.0        1
47.0        1
56.0        1
64.0        1
65.0        1
55.0        1
30.0        1
Name: review_scores_rating, dtype: int64

In [325]:
# Fresh copy of the listings CSV for modelling, without the two identifier
# columns (`id`, `host_id`).
data = pd.read_csv('./sf_airbnb_clean.csv').drop(columns=['id', 'host_id'])

In [326]:
# One-hot encode the non-numeric columns of `data`; numeric columns pass
# through unchanged.
dummy_encoded = pd.get_dummies(data=data)

In [327]:
# Preview the first rows of the encoded frame.
dummy_encoded.head()

Unnamed: 0,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,is_location_exact,accommodates,...,calendar_updated_a week ago,calendar_updated_never,calendar_updated_today,calendar_updated_yesterday,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60
0,100.0,1.0,1.0,1.0,1.0,1.0,37.76931,-122.43386,1.0,3,...,0,0,0,0,0,1,0,0,0,0
1,100.0,1.0,2.0,2.0,1.0,1.0,37.75402,-122.45805,1.0,2,...,0,0,0,0,0,0,0,1,0,0
2,80.0,0.0,2.0,2.0,1.0,1.0,37.74511,-122.42102,1.0,5,...,0,0,0,0,0,0,0,1,0,0
3,86.0,1.0,10.0,10.0,1.0,1.0,37.76669,-122.4525,1.0,2,...,0,0,0,0,0,0,0,1,0,0
4,86.0,1.0,10.0,10.0,1.0,1.0,37.76487,-122.45183,1.0,2,...,0,0,0,0,0,0,0,1,0,0


In [328]:
# Remove the raw coordinate columns from the feature frame.
dummy_encoded = dummy_encoded.drop(columns=['longitude', 'latitude'])

In [329]:
# Keep only rows that have an overall rating (the regression target below).
dummy_encoded = dummy_encoded.dropna(subset=['review_scores_rating'])

In [330]:
# Drop the columns excluded from the rating model: the other review-derived
# columns, host listing counts, the min/max-nights variants, and price.
dummy_encoded = dummy_encoded.drop(
    columns=[
        'number_of_reviews',
        'number_of_reviews_ltm',
        'review_scores_accuracy',
        'review_scores_cleanliness',
        'review_scores_checkin',
        'host_listings_count',
        'minimum_minimum_nights',
        'host_total_listings_count',
        'maximum_minimum_nights',
        'minimum_maximum_nights',
        'maximum_maximum_nights',
        'minimum_nights_avg_ntm',
        'maximum_nights_avg_ntm',
        'review_scores_communication',
        'review_scores_location',
        'review_scores_value',
        'reviews_per_month',
        'price',
    ]
)

In [241]:
# Sanity check: list any remaining column whose name contains 'longitude'.
list(filter(lambda col: 'longitude' in col, dummy_encoded.columns))

[]

In [242]:
# Per-column count of missing values in X, largest first.
# NOTE(review): in file order `X` is only assigned in a later cell, so this
# cell depends on that one having been executed first.
(
    X.isna()
    .sum()
    .reset_index()
    .rename(columns={'index': 'colnames', 0: 'nulls'})
    .sort_values(by='nulls', ascending=False)
)

Unnamed: 0,colnames,nulls
0,host_response_rate,0
133,calendar_updated_3 months ago,0
113,calendar_updated_14 months ago,0
114,calendar_updated_15 months ago,0
115,calendar_updated_16 months ago,0
...,...,...
61,neighbourhood_cleansed_Parkside,0
62,neighbourhood_cleansed_Potrero Hill,0
63,neighbourhood_cleansed_Presidio,0
64,neighbourhood_cleansed_Presidio Heights,0


In [331]:
# Regression target: the overall review rating.
y = dummy_encoded.loc[:, 'review_scores_rating']

In [332]:
# Feature matrix: every remaining column except the target, with any missing
# values replaced by 0.
X = dummy_encoded.drop(columns='review_scores_rating').fillna(0)

In [333]:
# Random forest of 1000 trees (depth capped at 50), seeded for repeatability,
# fitted on the full feature matrix with no hold-out split in this cell.
forest = RandomForestRegressor(
    n_estimators=1000,
    max_depth=50,
    random_state=42,
)
forest.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [334]:
# Pair every feature name with its importance from the fitted forest.
# The frame is built from a dict rather than by passing a set ({'colnames'})
# as `columns`: a set is unordered, and newer pandas releases reject a set as
# the `columns` argument.
columns = pd.DataFrame({
    'colnames': X.columns,
    'importance': forest.feature_importances_,
})

In [335]:
# Rank features from most to least important.
columns = columns.sort_values(by='importance', ascending=False)

In [336]:
# Persist the ranked importances (including the frame's index) next to the notebook.
columns.to_csv(path_or_buf='./ratings_random_forest4.csv')

In [323]:
# Feature names that contain the substring 'listings'.
list(filter(lambda col: 'listings' in col, X.columns))

['host_total_listings_count',
 'calculated_host_listings_count',
 'calculated_host_listings_count_entire_homes',
 'calculated_host_listings_count_private_rooms',
 'calculated_host_listings_count_shared_rooms']

In [185]:
# Candidate tree depths for the grid search.
parameters = dict(max_depth=[10, 20, 50])

In [186]:
# Grid-search the forest over `parameters` using 2-fold cross-validation on
# all available cores, then fit on the full data.
clf = GridSearchCV(
    estimator=forest,
    param_grid=parameters,
    cv=2,
    n_jobs=-1,
    verbose=1,
)
clf.fit(X, y)

Fitting 2 folds for each of 3 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  1.3min finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [50, 75, 100]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=1)

In [189]:
# Show the estimator refit with the best parameter combination found by the search.
clf.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [182]:
# (rows, columns) of the feature matrix.
X.shape

(6460, 187)

In [198]:
import statsmodels.api as sm

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


In [337]:
# Ordinary least squares of the rating on the same feature matrix.
# No explicit constant column is added here.
mod = sm.OLS(endog=y, exog=X)

In [338]:
# Estimate the OLS model; `res` holds the fitted results object.
res = mod.fit()

In [339]:
# Print the plain-text regression summary table.
print(res.summary())

                             OLS Regression Results                             
Dep. Variable:     review_scores_rating   R-squared:                       0.221
Model:                              OLS   Adj. R-squared:                  0.200
Method:                   Least Squares   F-statistic:                     10.81
Date:                  Sat, 07 Dec 2019   Prob (F-statistic):          2.95e-231
Time:                          18:20:45   Log-Likelihood:                -21437.
No. Observations:                  6460   AIC:                         4.321e+04
Df Residuals:                      6294   BIC:                         4.433e+04
Df Model:                           165                                         
Covariance Type:              nonrobust                                         
                                                      coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
