<a href="https://colab.research.google.com/github/SDS-AAU/SDS-2020/blob/master/M3/workshop1/M3_W1_AirBnb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# M3 - Deep Learning Workshop 1

In this workshop we will revisit the [AirBnb dataset](http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/data/listings.csv.gz) that we used in M1. 

In [277]:
# Import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

#from sklearn.metrics import mean_squared_error

# Import Keras libraries and metrics
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.metrics import mean_squared_error

In [None]:
# import imputation
from fancyimpute import IterativeImputer

In [None]:
# Read the gzipped Inside Airbnb listings snapshot for Copenhagen straight from the web
LISTINGS_URL = 'http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/data/listings.csv.gz'
listings = pd.read_csv(LISTINGS_URL, compression='gzip')

In [None]:
# Preview the first five rows of the raw listings
listings.head(n=5)

In [None]:
# Full per-column summary (dtype and non-null count) of the raw listings
listings.info(verbose=True)

cleaning and selecting data

In [None]:
# Parse the price strings (e.g. "$1,200.00") into floats.
# regex=False is passed explicitly: '$' is a regex end-of-string anchor and the
# default of `regex` differs between pandas versions, so relying on the default
# can leave the currency symbol in place and make the float cast fail.
listings['price'] = (
    listings['price']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)

In [None]:
# Parse the cleaning-fee strings into floats (missing fees stay NaN).
# regex=False makes ',' and '$' literal characters regardless of the pandas
# version; as a regex, '$' would match the end of the string instead.
listings['cleaning_fee'] = (
    listings['cleaning_fee']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)

In [None]:
# Target variable: nightly price plus cleaning fee.
# NOTE(review): NaN in either column propagates into price_total — confirm that
# listings without a cleaning fee are meant to end up with a missing total.
listings['price_total'] = listings['price'].add(listings['cleaning_fee'])

In [None]:
# Keep only listings strictly below the 95th percentile of total price
price_cap = listings['price_total'].quantile(0.95)
listings_clean = listings.loc[listings['price_total'] < price_cap]

In [None]:
# Keep listings with more than 5 reviews.
# The mask is built from listings_clean itself (not the unfiltered `listings`)
# so its index matches the frame being filtered and pandas does not have to
# reindex a mismatched boolean key.
listings_clean = listings_clean[listings_clean.number_of_reviews > 5]

In [None]:
# Restrict to the listed property types and bed types
is_wanted_property = listings_clean['property_type'].isin(['Apartment', 'Condominium', 'House'])
is_wanted_bed = listings_clean['bed_type'].isin(['Real Bed', 'Pull-out Sofa'])
listings_clean = listings_clean.loc[is_wanted_property & is_wanted_bed]

In [279]:
# Regression target: total price of the filtered listings
y = listings_clean['price_total']

In [280]:
# Select the feature columns by position.
# NOTE(review): these positions are tied to the column layout of this specific
# listings file — verify against listings_clean.columns before reusing with
# another snapshot.
FEATURE_POSITIONS = [39, 51, 52, 53, 54, 55, 56, 57]
X = listings_clean.iloc[:, FEATURE_POSITIONS]

In [281]:
# One-hot encode the non-numeric feature columns
X = pd.get_dummies(data=X)

scaling the inputs

In [282]:
# Create the scaler that is fit on the first four feature columns in the next cell
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
# Standardise the first four feature columns.
# Assigning by column label replaces the columns outright; writing floats
# through .iloc into an existing integer column is an upcasting setitem that
# newer pandas versions may warn about or reject.
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test-set statistics leak into the features — consider fitting on X_train only.
numeric_cols = X.columns[:4]
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

In [None]:
# Fill missing values in the first four feature columns with an iterative imputer
imputer = IterativeImputer()
numeric_block = X.iloc[:, :4]
X.iloc[:, :4] = imputer.fit_transform(numeric_block)

In [None]:
# Count remaining missing values per column
X.isna().sum()

splitting the data in train - test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

benchmark non-neural models

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least squares benchmark fitted on the training split
model_ols = LinearRegression()
model_ols.fit(X=X_train, y=y_train)

In [None]:
# Score of the OLS benchmark on the held-out test split
model_ols.score(X=X_test, y=y_test)

In [None]:
# Test-set RMSE of the OLS benchmark
ols_pred = model_ols.predict(X_test)
np.sqrt(mean_squared_error(y_test, ols_pred))

In [None]:
# Second non-neural benchmark: an XGBoost regressor created with its default arguments
from xgboost import XGBRegressor
model_xgb = XGBRegressor()

In [None]:
# Fit the XGBoost benchmark on the training split
model_xgb.fit(X=X_train, y=y_train)

In [None]:
# Score of the XGBoost benchmark on the held-out test split
model_xgb.score(X=X_test, y=y_test)

In [None]:
# Test-set RMSE of the XGBoost benchmark
xgb_pred = model_xgb.predict(X_test)
np.sqrt(mean_squared_error(y_test, xgb_pred))

In [None]:
# (rows, columns) of the training features; the column count is the input width used by the networks below
X_train.shape

baseline neural model

In [283]:
# Baseline network: one hidden layer of 10 ReLU units and a linear output.
# The input width is taken from the data instead of a hard-coded 23, so the
# cell keeps working if the set of dummy columns changes.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(10, activation='relu', input_shape=(n_features,)))
model.add(Dense(1, kernel_initializer='normal'))
# `metrics` is documented as a list; a bare callable is rejected by newer Keras.
model.compile(optimizer='adam', loss='mean_squared_error',
              metrics=[mean_squared_error])

In [None]:
# Train the baseline network; 10% of the training rows are held out for validation
fit_options = dict(epochs=200, batch_size=64, verbose=2, validation_split=0.1)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Plot the training history for MSE.
# The second curve comes from validation_split, i.e. a slice of the training
# data, so it is labelled 'validation' rather than 'test'.
fig, ax = plt.subplots()
ax.plot(history.history['mean_squared_error'], label='train')
ax.plot(history.history['val_mean_squared_error'], label='validation')
ax.set(title='model MSE', ylabel='MSE', xlabel='epoch')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Loss and metric of the baseline network on the held-out test split
model.evaluate(x=X_test, y=y_test)

In [None]:
# Test-set RMSE of the current model, computed from the evaluated MSE loss
# (first element of the evaluate() result) instead of a number pasted from an
# earlier run, which goes stale on every retrain.
np.sqrt(model.evaluate(X_test, y_test, verbose=0)[0])

In [None]:
# Peek at the first four (scaled) training columns without dumping the whole frame
X_train.iloc[:, :4].head()

## Prevent overfitting

In [None]:
# Larger network (256 -> 16 -> 1) without any regularisation, used as the
# reference point for the overfitting counter-measures that follow.
# The input width is derived from the data instead of a hard-coded 23.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(n_features,)))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
# `metrics` is documented as a list; a bare callable is rejected by newer Keras.
model.compile(optimizer='adam', loss='mean_squared_error',
              metrics=[mean_squared_error])


In [None]:
# Train silently for 20 epochs; 10% of the training rows are held out for validation
fit_options = dict(epochs=20, batch_size=32, verbose=0, validation_split=0.1)
history = model.fit(X_train, y_train, **fit_options)

In [None]:

# Loss and metric of the larger network on the held-out test split
model.evaluate(x=X_test, y=y_test)

![](https://miro.medium.com/max/1400/0*iNI8Oc80Eunm8NgI)

https://matthewmcateer.me/blog/optimal-brain-damage/
http://yann.lecun.com/exdb/publis/pdf/lecun-90b.pdf


In [None]:
# Same 256 -> 16 -> 1 network, with dropout after the first hidden layer.
# The input width is derived from the data instead of a hard-coded 23.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(n_features,)))
model.add(Dropout(0.3)) # <<-- Added dropout
model.add(Dense(16, activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
# `metrics` is documented as a list; a bare callable is rejected by newer Keras.
model.compile(optimizer='adam', loss='mean_squared_error',
              metrics=[mean_squared_error])


In [None]:
# plot_model is only imported further down in the notebook, so on a fresh
# "Run All" this cell would raise NameError; import it here before first use.
from keras.utils import plot_model

plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# Train the dropout network silently for 20 epochs with a 10% validation hold-out
fit_options = dict(epochs=20, batch_size=32, verbose=0, validation_split=0.1)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Loss and metric of the dropout network on the held-out test split
model.evaluate(x=X_test, y=y_test)

In [None]:
from keras.regularizers import l1

### Regularization

In [None]:
# Same 256 -> 16 -> 1 network, with an L1 activity regulariser on the second
# hidden layer, trained right away.
# The input width is derived from the data instead of a hard-coded 23.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(n_features,)))
model.add(Dense(16, activation='relu', activity_regularizer=l1(0.001))) # <-- add activity regularizer
model.add(Dense(1, kernel_initializer='normal'))
# `metrics` is documented as a list; a bare callable is rejected by newer Keras.
model.compile(optimizer='adam', loss='mean_squared_error',
              metrics=[mean_squared_error])

history = model.fit(X_train,
                    y_train,
                    epochs=20,
                    batch_size=32,
                    verbose=0, validation_split=0.1)

In [None]:
# Loss and metric of the regularised network on the held-out test split
model.evaluate(x=X_test, y=y_test)

## Bonus: Multi-branch architecture

This chunk uses the functional Keras API, which is more flexible than the Sequential model. With it you can, for instance, combine text and images with tabular data in a single model. 
Here I'm going to define two branches: one for all the dummy variables and the other for the "normal" numerical variables.

In [None]:
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import concatenate
from keras.utils.vis_utils import plot_model

In [None]:
# Two-branch regression network built with the functional API.
# Branch A takes the first four feature columns, branch B the remaining ones;
# the width of B is derived from the data instead of a hard-coded 19.
n_numeric = 4
n_dummies = X_train.shape[1] - n_numeric

# define two sets of inputs
inputA = Input(shape=(n_numeric,))
inputB = Input(shape=(n_dummies,))
# the first branch operates on the first input
x1 = Dense(8, activation="relu")(inputA)
x1 = Dense(4, activation="relu")(x1)
x1 = Model(inputs=inputA, outputs=x1)
# the second branch operates on the second input
x2 = Dense(64, activation="relu")(inputB)
x2 = Dense(32, activation="relu")(x2)
x2 = Dense(4, activation="relu")(x2)
x2 = Model(inputs=inputB, outputs=x2)
# combine the output of the two branches
combined = concatenate([x1.output, x2.output])
# apply a FC layer and then a regression prediction on the combined outputs.
# The tensors get their own names: the original reused `y`, which overwrote
# the target Series and broke any re-run of the train/test split cell.
merged = Dense(2, activation="relu")(combined)
price_output = Dense(1, activation="linear")(merged)
# our model will accept the inputs of the two branches and
# then output a single value
model = Model(inputs=[x1.input, x2.input], outputs=price_output)

In [None]:
# Compile the multi-branch model with MSE as both the loss and the tracked metric.
# `metrics` is documented as a list; a bare callable is rejected by newer Keras.
model.compile(optimizer='adam', loss='mean_squared_error',
              metrics=[mean_squared_error])

In [None]:
# Render the model graph, with layer names and tensor shapes, to model_plot.png
plot_model(model, show_shapes=True, show_layer_names=True, to_file='model_plot.png')

In [None]:
# Feed the first four columns to branch A and the remaining columns to branch B
train_numeric = X_train.iloc[:, :4]
train_dummies = X_train.iloc[:, 4:]
history = model.fit(
    [train_numeric, train_dummies],
    y_train,
    epochs=50,
    batch_size=32,
    verbose=0,
    validation_split=0.1,
)

In [None]:
# Plot the training history for MSE.
# The second curve comes from validation_split, i.e. a slice of the training
# data, so it is labelled 'validation' rather than 'test'.
fig, ax = plt.subplots()
ax.plot(history.history['mean_squared_error'], label='train')
ax.plot(history.history['val_mean_squared_error'], label='validation')
ax.set(title='model MSE', ylabel='MSE', xlabel='epoch')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Loss and metric of the multi-branch model on the held-out test split
test_inputs = [X_test.iloc[:, :4], X_test.iloc[:, 4:]]
model.evaluate(test_inputs, y_test)

In [None]:
# Test-set RMSE of the multi-branch model, computed from the evaluated MSE loss
# (first element of the evaluate() result) instead of a number pasted from an
# earlier run, which goes stale on every retrain.
np.sqrt(model.evaluate([X_test.iloc[:, :4], X_test.iloc[:, 4:]], y_test, verbose=0)[0])