In [32]:
#write_csv(airbnb_df, "../data/airbnb_df.csv")

## Importing libraries

In [214]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

In [301]:
airbnb_df = pd.read_csv("../data/airbnb_df.csv")

## Handling Null Values

In [302]:
nan_df = pd.DataFrame(airbnb_df.isnull().sum()).sort_values(by=[0], ascending=False)
nan_df[nan_df[0]!=0]

Unnamed: 0,0
square_feet,2152
monthly_price,2120
weekly_price,2057
security_deposit,751
cleaning_fee,431
review_scores_value,307
review_scores_location,307
review_scores_communication,307
review_scores_checkin,307
review_scores_cleanliness,307


# Dropping Other Features

In [303]:
dates_list = ['calendar_last_scraped', 'first_review', 'last_review', 'host_since', 'calendar_updated']
airbnb_df.drop(dates, axis=1, inplace=True)

text_feat = ['host_location', 'street']
airbnb_df.drop(text_feat, axis=1, inplace=True)


In [325]:
airbnb_df['require_guest_phone_verification']

0       False
1       False
2       False
3       False
4       False
        ...  
2157    False
2158    False
2159    False
2160    False
2161    False
Name: require_guest_phone_verification, Length: 2162, dtype: bool

In [327]:
others = ['host_name', 'host_response_time', 'host_verifications', 'host_has_profile_pic', 'city', 'state', 'zipcode', 'market',
'smart_location', 'country_code', 'country', 'amenities', 'extra_people']

airbnb_df.drop(others, axis=1, inplace=True)


## Extracting Numeric and Categoric features

In [328]:
numeric_features = (airbnb_df.select_dtypes(include=[np.number]).columns)[2:].to_list() #excluding id and host_id
numeric_features.remove('price')

categorical_features = []
for i in airbnb_df.columns:
    if i not in (airbnb_df.select_dtypes(include=[np.number]).columns).to_list():
        categorical_features.append(i)

In [329]:
numeric_features

['host_response_rate',
 'host_listings_count',
 'latitude',
 'longitude',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'square_feet',
 'weekly_price',
 'monthly_price',
 'security_deposit',
 'cleaning_fee',
 'guests_included',
 'minimum_nights',
 'maximum_nights',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'number_of_reviews',
 'number_of_reviews_ltm',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'calculated_host_listings_count',
 'calculated_host_listings_count_entire_homes',
 'calculated_host_listings_count_private_rooms',
 'calculated_host_listings_count_shared_rooms',
 'reviews_per_month']

In [330]:
## Drop columns with 50% or more missing values
many_nans = list(df.columns[df.isnull().mean() > 0.5])
df.drop(many_nans, axis=1, inplace=True)

# Filling the null values in numerical features with the mean and categorical features with the mode

In [331]:
## Fill numerical missing data with mean value

imp_mean = SimpleImputer(missing_values= np.nan, strategy= 'mean')
imp_mean = imp_mean.fit(airbnb_df[numeric_features])
airbnb_df[numeric_features] = imp_mean.transform(airbnb_df[numeric_features])

## Fill nans for price using median
airbnb_df['price'] = airbnb_df['price'].fillna(airbnb_df['price'].median())

## Fill categorical missing data with most frequent value
most_freq = SimpleImputer(missing_values= np.nan, strategy= 'most_frequent')
most_freq = most_freq.fit(airbnb_df[categorical_features])
airbnb_df[categorical_features] = most_freq.transform(airbnb_df[categorical_features])

#Testing if all null values are replaced
non_nan_df = pd.DataFrame(airbnb_df.isnull().sum())
non_nan_df

Unnamed: 0,0
id,0
host_id,0
host_response_rate,0
host_is_superhost,0
host_listings_count,0
host_identity_verified,0
neighbourhood_cleansed,0
latitude,0
longitude,0
is_location_exact,0


In [332]:
non_nan_df[non_nan_df[0]!=0]

Unnamed: 0,0


In [333]:
categorical_features

['host_is_superhost',
 'host_identity_verified',
 'neighbourhood_cleansed',
 'is_location_exact',
 'property_type',
 'room_type',
 'bed_type',
 'has_availability',
 'requires_license',
 'instant_bookable',
 'is_business_travel_ready',
 'require_guest_profile_picture',
 'require_guest_phone_verification']

# Initializing Pre-processing

In [334]:
preprocessor = ColumnTransformer(
    transformers=[
        ('scale', StandardScaler(), numeric_features),
        ('ohe', OneHotEncoder(drop='first'), categorical_features)])

# Splitting Dataset

In [335]:
X = airbnb_df.drop(columns=['price', 'id', 'host_id'])
y = airbnb_df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=313)

## Analysis

In [289]:
# Define models
rgr_models = {
    'SVR': GridSearchCV(SVR(kernel='rbf'),
                        param_grid = {'gamma': ['auto', 'scale'],
                                      'C': [0.1, 10, 100]},
                        cv = 3,
                        scoring = 'neg_mean_squared_error'),
    'RFR': GridSearchCV(RandomForestRegressor(),
                        param_grid = {'max_depth': [1, 5, 20]},
                        cv = 3,
                        scoring = 'neg_mean_squared_error')}

In [337]:
rfr =  GridSearchCV(RandomForestRegressor(),
                        param_grid = {'max_depth': [1, 5, 20]},
                        cv = 3,
                        scoring = 'neg_mean_squared_error')

In [345]:
rgr = make_pipeline(preprocessor, rfr)

rgr.fit(X_train, y_train.to_numpy().ravel())


Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scale',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  ['host_response_rate',
                                                   'host_listings_count',
                                                   'latitude', 'longitude',
                                                   'accommodates', 'bathrooms',
                                                   'bedrooms', 'beds',
                                                   'square_feet',
                   

In [349]:
X_test

Unnamed: 0,host_response_rate,host_is_superhost,host_listings_count,host_identity_verified,neighbourhood_cleansed,latitude,longitude,is_location_exact,property_type,room_type,...,requires_license,instant_bookable,is_business_travel_ready,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
1850,100.00000,True,6.0,True,Vieux-Moulin,46.85145,-71.23105,False,Apartment,Entire home/apt,...,False,True,False,False,False,11.0,3.0,8.0,0.0,0.80
712,91.00000,True,2.0,False,Vieux-Québec/Cap-Blanc/Colline parlementaire,46.81268,-71.21090,False,Condominium,Entire home/apt,...,False,True,False,False,False,2.0,2.0,0.0,0.0,3.18
133,100.00000,False,1.0,False,Saint-Sacrement,46.79245,-71.26210,True,Apartment,Private room,...,False,False,False,False,False,1.0,0.0,1.0,0.0,1.37
1741,100.00000,False,1.0,False,Saint-Jean-Baptiste,46.81107,-71.22101,True,Apartment,Entire home/apt,...,False,True,False,False,False,1.0,1.0,0.0,0.0,9.09
230,96.00000,False,94.0,False,Vieux-Québec/Cap-Blanc/Colline parlementaire,46.81738,-71.20566,True,Apartment,Entire home/apt,...,False,True,False,False,False,50.0,50.0,0.0,0.0,0.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1259,100.00000,False,7.0,False,Lairet,46.82855,-71.24331,True,Loft,Entire home/apt,...,False,True,False,False,False,9.0,6.0,3.0,0.0,5.69
244,96.29853,False,1.0,False,Vieux-Québec/Cap-Blanc/Colline parlementaire,46.80129,-71.22144,True,House,Entire home/apt,...,False,False,False,False,False,1.0,1.0,0.0,0.0,0.18
281,100.00000,False,4.0,False,Saint-Sauveur,46.80876,-71.23556,False,Apartment,Entire home/apt,...,False,False,False,False,False,3.0,3.0,0.0,0.0,0.13
453,100.00000,False,4.0,True,Saint-Jean-Baptiste,46.80978,-71.21913,True,Apartment,Private room,...,False,False,False,False,False,4.0,1.0,3.0,0.0,7.12


In [348]:
print(f"Score: {rgr.score(X_test, y_test)}")


ValueError: Found unknown categories ['Dorm'] in column 4 during transform

In [None]:
# for model_name, model in rgr_models.items():
#     rgr = make_pipeline(preprocessor,
#                         model)
#     rgr.fit(X_train, y_train)
#     y_test[model_name] = rgr.predict(X_test)
#     print(f"{model_name} best hyperparams = {model.best_params_}.")
#     print(f"{model_name} RMSE = {np.sqrt(mean_squared_error(y_test[model_name], y_test['Age'])):.2f} years")