### Import Libraries, Read in Data and Merge into one big dataset with year indicator variable

In [29]:
import numpy as np 
import pandas as pd
from wordcloud import STOPWORDS
import string
import datetime

In [30]:
# Read in wrangled NYC airbnb data from 2016-2020

df16 = pd.read_csv("../input/math301-final-project-data/airbnb16.csv", index_col=0)
df17 = pd.read_csv("../input/math301-final-project-data/airbnb17.csv", index_col=0)
df18 = pd.read_csv("../input/math301-final-project-data/airbnb18.csv", index_col=0)
df19 = pd.read_csv("../input/math301-final-project-data/airbnb19.csv", index_col=0)
df20 = pd.read_csv("../input/math301-final-project-data/airbnb20.csv", index_col=0)

In [31]:
# Add year variable to all the data sources
for dataf,year in zip([df16,df17,df18,df19,df20], ['16','17','18','19','20']):
    dataf['year'] = '20' + year

In [32]:
# Merge data sources into one big dataset
df = pd.concat([df16,df17,df18,df19,df20])

### Encoding Features + Feature Engineering + Split into train, test data

#### Feature Creation from "name" column

In [33]:
df['name'].fillna('missing',inplace=True) # fill ONE empty value in "name" column with word "missing"

########## Basic Meta Features for Text data

# word_count
df['name_wc'] = df['name'].apply(lambda x: len(str(x).split()))

# unique_word_count
df['name_unique_wc'] = df['name'].apply(lambda x: len(set(str(x).split())))

# stop_word_count
df['name_stop_wc'] = df['name'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))

# mean_word_length
df['name_mean_wl'] = df['name'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

# char_count
df['name_cc'] = df['name'].apply(lambda x: len(str(x)))

# punctuation_count
df['name_pc'] = df['name'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

# Drop "name" column
del df['name']

#### Drop socioeconomic, demographic variables

In [34]:
# Drop socioeconomic, demographic variables

for feat in df.columns.tolist():
    if 'avg' in feat:
        del df[feat]
    elif 'total' in feat:
        del df[feat]

#### Create features for last_review

In [35]:
# Create features for last_review (last review month, last review day)

df1 = df.copy()[df.last_review != 'No Review']
df2 = df.copy()[df.last_review == 'No Review']

df1['last_review_month'] = df1['last_review'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d").month)
df1['last_review_day'] = df1['last_review'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d").day)

df2['last_review_month'] = 0
df2['last_review_day'] = 0

df = pd.concat([df1,df2], axis=0, sort=True)

del df['last_review']

#### Encode categorical variables into numerical

In [36]:
train_idx =  df[df.year != '2020'].index
test_idx = df[df.year == '2020'].index

In [37]:
# Create Dummy variables for categorical features

df = pd.get_dummies(df, columns= ['neighbourhood_group', 'room_type','year'], drop_first=True)

In [38]:
# Label Encode Neighborhoods (because too many variables will be created if we take the dummy variable approach)

from sklearn.preprocessing import LabelEncoder

label_encoder= LabelEncoder()
nb_encoded = label_encoder.fit_transform(df[['neighbourhood']].values.ravel())

In [39]:
# Replace neighborhood column with integer label encoded data

df['neighbourhood'] = pd.Series(nb_encoded)

#### Split into train and test data

In [40]:
train = df.copy().iloc[train_idx] # 2016-2019
test = df.copy().iloc[test_idx] # 2020

In [44]:
# Divide into X and y vectors/arrays

X_train = train.drop(labels=['price'],axis=1).values
y_train = train[['price']].values

X_test = test.drop(labels=['price'],axis=1).values
y_test = test[['price']].values

### Baseline Models

In [78]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, median_absolute_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import KFold

In [84]:
# Simple Linear Regression

lr = LinearRegression()

print("SLR Cross Validation MSE: {}".format(round(-cross_val_score(lr, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean(),2)))
print("SLR Cross Validation Median Absolute Error: {}".format(round(-cross_val_score(lr, X_train, y_train, cv=5, scoring='neg_median_absolute_error').mean(),2)))
print("SLR Cross Validation Mean Absolute Error: {}".format(round(-cross_val_score(lr, X_train, y_train, cv=5, scoring='neg_mean_absolute_error').mean(), 2)))
print("SLR Cross Validation R-Squared: {}".format(round(-cross_val_score(lr, X_train, y_train, cv=5, scoring='r2').mean(), 2)))

lr.fit(X_train,y_train)

print("SLR Predicton MSE: {}".format(round(mean_squared_error(y_test,lr.predict(X_test)), 2 )))
print("SLR Predicton Median Absolute Error: {}".format(round(median_absolute_error(y_test, lr.predict(X_test)),2 )))
print("SLR Predicton Mean Absolute Error: {}".format(round(mean_absolute_error(y_test, lr.predict(X_test)), 2)))
print("SLR Predicton R-Squared: {}".format(round(r2_score(y_test, lr.predict(X_test)), 2 )))

SLR Cross Validation MSE: 24103.7
SLR Cross Validation Median Absolute Error: 31.33
SLR Cross Validation Mean Absolute Error: 50.59
SLR Cross Validation R-Squared: -0.15
SLR Predicton MSE: 22959.27
SLR Predicton Median Absolute Error: 30.71
SLR Predicton Mean Absolute Error: 49.62
SLR Predicton R-Squared: 0.15


In [85]:
# Ridge Regression

ridge = Ridge(alpha=0.5)

print("Ridge Cross Validation MSE: {}".format(round(-cross_val_score(ridge, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean(),2)))
print("Ridge Cross Validation Median Absolute Error: {}".format(round(-cross_val_score(ridge, X_train, y_train, cv=5, scoring='neg_median_absolute_error').mean(),2)))
print("Ridge Cross Validation Mean Absolute Error: {}".format(round(-cross_val_score(ridge, X_train, y_train, cv=5, scoring='neg_mean_absolute_error').mean(), 2)))
print("Ridge Cross Validation R-Squared: {}".format(round(-cross_val_score(ridge, X_train, y_train, cv=5, scoring='r2').mean(), 2)))

ridge.fit(X_train,y_train)

print("Ridge Predicton MSE: {}".format(round(mean_squared_error(y_test,ridge.predict(X_test)), 2 )))
print("Ridge Predicton Median Absolute Error: {}".format(round(median_absolute_error(y_test, ridge.predict(X_test)),2 )))
print("Ridge Predicton Mean Absolute Error: {}".format(round(mean_absolute_error(y_test, ridge.predict(X_test)), 2)))
print("Ridge Predicton R-Squared: {}".format(round(r2_score(y_test, ridge.predict(X_test)), 2 )))

Ridge Cross Validation MSE: 24103.69
Ridge Cross Validation Median Absolute Error: 31.33
Ridge Cross Validation Mean Absolute Error: 50.59
Ridge Cross Validation R-Squared: -0.15
Ridge Predicton MSE: 22959.25
Ridge Predicton Median Absolute Error: 30.7
Ridge Predicton Mean Absolute Error: 49.62
Ridge Predicton R-Squared: 0.15


In [86]:
# Lasso Regression

lasso = Lasso(alpha=0.1)

print("Lasso Cross Validation MSE: {}".format(round(-cross_val_score(lasso, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean(),2)))
print("Lasso Cross Validation Median Absolute Error: {}".format(round(-cross_val_score(lasso, X_train, y_train, cv=5, scoring='neg_median_absolute_error').mean(),2)))
print("Lasso Cross Validation Mean Absolute Error: {}".format(round(-cross_val_score(lasso, X_train, y_train, cv=5, scoring='neg_mean_absolute_error').mean(), 2)))
print("Lasso Cross Validation R-Squared: {}".format(round(-cross_val_score(lasso, X_train, y_train, cv=5, scoring='r2').mean(), 2)))

lasso.fit(X_train,y_train)

print("Lasso Predicton MSE: {}".format(round(mean_squared_error(y_test,lasso.predict(X_test)), 2 )))
print("Lasso Predicton Median Absolute Error: {}".format(round(median_absolute_error(y_test, lasso.predict(X_test)),2 )))
print("Lasso Predicton Mean Absolute Error: {}".format(round(mean_absolute_error(y_test, lasso.predict(X_test)), 2)))
print("Lasso Predicton R-Squared: {}".format(round(r2_score(y_test, lasso.predict(X_test)), 2 )))

Lasso Cross Validation MSE: 24129.95
Lasso Cross Validation Median Absolute Error: 31.24
Lasso Cross Validation Mean Absolute Error: 50.48
Lasso Cross Validation R-Squared: -0.15
Lasso Predicton MSE: 22983.24
Lasso Predicton Median Absolute Error: 30.59
Lasso Predicton Mean Absolute Error: 49.49
Lasso Predicton R-Squared: 0.15


In [87]:
# ElasticNet

en = ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=42, selection='cyclic', tol=0.0001, warm_start=False)

print("ElasticNet Cross Validation MSE: {}".format(round(-cross_val_score(en, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean(),2)))
print("ElasticNet Cross Validation Median Absolute Error: {}".format(round(-cross_val_score(en, X_train, y_train, cv=5, scoring='neg_median_absolute_error').mean(),2)))
print("ElasticNet Cross Validation Mean Absolute Error: {}".format(round(-cross_val_score(en, X_train, y_train, cv=5, scoring='neg_mean_absolute_error').mean(), 2)))
print("ElasticNet Cross Validation R-Squared: {}".format(round(-cross_val_score(en, X_train, y_train, cv=5, scoring='r2').mean(), 2)))

en.fit(X_train,y_train)

print("ElasticNet Predicton MSE: {}".format(round(mean_squared_error(y_test,en.predict(X_test)), 2 )))
print("ElasticNet Predicton Median Absolute Error: {}".format(round(median_absolute_error(y_test, en.predict(X_test)),2 )))
print("ElasticNet Predicton Mean Absolute Error: {}".format(round(mean_absolute_error(y_test, en.predict(X_test)), 2)))
print("ElasticNet Predicton R-Squared: {}".format(round(r2_score(y_test, en.predict(X_test)), 2 )))

ElasticNet Cross Validation MSE: 25923.21
ElasticNet Cross Validation Median Absolute Error: 42.77
ElasticNet Cross Validation Mean Absolute Error: 59.93
ElasticNet Cross Validation R-Squared: -0.08
ElasticNet Predicton MSE: 24724.06
ElasticNet Predicton Median Absolute Error: 42.34
ElasticNet Predicton Mean Absolute Error: 58.98
ElasticNet Predicton R-Squared: 0.08
