<h1>Advanced Machine Learning course - assignment No. 1</h1>
<h2>Simone Paolo Mottadelli 820786</h2>

<hr>
<h3>Configuration of the environment</h3>
<hr>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import tensorflow as tf
import sklearn_pandas
from sklearn import preprocessing
from keras.utils.generic_utils import get_custom_objects
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from keras.models import Sequential, Input, Model
from keras.layers import Dense, Activation, LeakyReLU
from keras.optimizers.schedules import ExponentialDecay
from keras.optimizers import Adam
from keras import backend as K

!pip install keras

<hr>
<h3>Import the dataset</h3>
<hr>

In [None]:
# Read the training features and discard the "Unnamed: 0" column
# (presumably an index column written out when the CSV was saved).
df_dataset = pd.read_csv("X_train.csv").drop(columns=["Unnamed: 0"])

# Attach the regression target, read from its own file, as the "price" column.
target_frame = pd.read_csv("y_train.csv")
df_dataset["price"] = target_frame["price"]

# Leave the assembled frame as the last expression so the cell displays it.
df_dataset

<hr><h3>Dataset exploration</h3><hr>

In [None]:
# 3x3 overview figure: eight histograms followed by a latitude/longitude scatter.
plt.figure(num=None, figsize=(20, 14), dpi=80, facecolor='w', edgecolor='k')

# (column, extra plt.hist keyword arguments) in panel order; an empty dict
# means matplotlib's default binning is used for that column.
raw_histogram_panels = [
  ("minimum_nights", {"bins": 50}),
  ("number_of_reviews", {"bins": 50}),
  ("reviews_per_month", {"bins": 50}),
  ("calculated_host_listings_count", {"bins": 50}),
  ("availability_365", {}),
  ("Private_room", {}),
  ("Entire_home/apt", {}),
  ("price", {"bins": 100}),
]

for panel_index, (column_name, hist_options) in enumerate(raw_histogram_panels, start=1):
  plt.subplot(3, 3, panel_index)
  plt.hist(df_dataset[column_name], **hist_options)
  plt.title(column_name)

# Last panel: listing coordinates drawn as red dots.
plt.subplot(3, 3, 9)
plt.plot(df_dataset["latitude"], df_dataset["longitude"], "ro")
plt.title("geo position")
plt.xlabel("latitude")
plt.ylabel("longitude")

plt.show()

In [None]:
# Summary statistics of the training frame as loaded, shown ahead of the
# outlier filtering and rescaling done in the Preprocessing cells below.
df_dataset.describe()

<hr><h3>Preprocessing</h3><hr>

In [None]:
# Outlier filter 1: discard listings priced at or above the 96th percentile.
price_cap = np.quantile(df_dataset["price"], 0.96)
overpriced_rows = df_dataset.index[df_dataset["price"] >= price_cap]
df_dataset = df_dataset.drop(index=overpriced_rows)

# Outlier filter 2: discard listings whose price is exactly 0.
zero_price_rows = df_dataset.index[df_dataset["price"] == 0]
df_dataset = df_dataset.drop(index=zero_price_rows)

# Summary statistics after both filters.
df_dataset.describe()

In [None]:
# Apply the log(x + 1) transformation to the columns in `log_columns`, then
# min-max normalize every column in `scaled_columns` to the [0, 1] range.
#
# NOTE: this cell rewrites df_dataset in place, so running it a second time
# without reloading the data applies the log transformation again.
log_columns = [
  "minimum_nights",
  "number_of_reviews",
  "reviews_per_month",
  "calculated_host_listings_count",
  "availability_365",
]
# latitude and longitude are only rescaled, not log-transformed.
scaled_columns = log_columns + ["latitude", "longitude"]

df_dataset[log_columns] = np.log(df_dataset[log_columns] + 1)

# Per-column (x - min) / (max - min), computed for all scaled columns at once.
column_min = df_dataset[scaled_columns].min()
column_range = df_dataset[scaled_columns].max() - column_min
df_dataset[scaled_columns] = (df_dataset[scaled_columns] - column_min) / column_range


In [None]:
# Same 3x3 overview as in the exploration section, redrawn from the current
# (filtered and rescaled) contents of df_dataset.
plt.figure(num=None, figsize=(20, 14), dpi=80, facecolor='w', edgecolor='k')

# Columns drawn with 50 bins, columns drawn with matplotlib's default binning,
# and finally the target drawn with 100 bins -- listed in panel order.
fifty_bin_columns = ["minimum_nights", "number_of_reviews", "reviews_per_month",
                     "calculated_host_listings_count"]
default_bin_columns = ["availability_365", "Private_room", "Entire_home/apt"]

panel_specs = [(name, {"bins": 50}) for name in fifty_bin_columns]
panel_specs += [(name, {}) for name in default_bin_columns]
panel_specs.append(("price", {"bins": 100}))

slot = 0
for feature, options in panel_specs:
  slot += 1
  plt.subplot(3, 3, slot)
  plt.hist(df_dataset[feature], **options)
  plt.title(feature)

# Ninth panel: listing coordinates drawn as red dots.
plt.subplot(3, 3, 9)
plt.plot(df_dataset["latitude"], df_dataset["longitude"], "ro")
plt.xlabel("latitude")
plt.ylabel("longitude")
plt.title("geo position")

plt.show()

<hr><h3>Regression</h3><hr>

In [None]:
# split dataset in train and test sets (the 10% split is used as validation
# data while fitting each candidate below)
df_train, df_test = train_test_split(df_dataset, test_size=0.10, random_state=123)

# set of activation functions: register the LeakyReLU variants under string
# names so they can be passed to Dense(activation=...) like the built-in ones
leaky_relu_names = []
for alpha in (0.3, 0.2, 0.1):
  leaky_name = 'leaky-relu(alpha={})'.format(alpha)
  get_custom_objects().update({leaky_name: Activation(LeakyReLU(alpha=alpha))})
  leaky_relu_names.append(leaky_name)
act_func = ['relu', 'elu'] + leaky_relu_names + ['selu', 'swish']


def _build_candidate(nfeatures, output_activation):
  """Build and compile one candidate network.

  The network has four hidden Dense layers of 14 ReLU units each; only the
  activation of the single-unit output layer is varied.

  Args:
    nfeatures: number of input features.
    output_activation: activation name used for the output layer.

  Returns:
    The compiled Sequential model (Adam optimizer, MSE loss, RMSE metric).
  """
  candidate = Sequential()
  candidate.add(Input(shape=(nfeatures,)))
  for _ in range(4):
    candidate.add(Dense(14, activation="relu"))
  candidate.add(Dense(1, activation=output_activation))
  candidate.compile(optimizer="adam", loss='mse', metrics = [tf.keras.metrics.RootMeanSquaredError()])
  return candidate


# Loop-invariant data: convert the frames once instead of on every iteration.
# The last column is the target, every other column is a feature.
train_values = np.array(df_train)
validation_values = np.array(df_test)
train_features, train_target = train_values[:, :-1], train_values[:, -1]
validation_features, validation_target = validation_values[:, :-1], validation_values[:, -1]
nfeatures = df_train.shape[1] - 1

# One History object per activation, in the same order as act_func.
result = []
for act in act_func:
  print("\nEvaluating ...", act)
  model = _build_candidate(nfeatures, act)
  history = model.fit(train_features, train_target,
          batch_size=1024,
          epochs=1000,
          verbose=0,
          validation_data=(validation_features, validation_target))

  result.append(history)

  # Reset Keras' global state before building the next candidate.
  # NOTE(review): this presumably also keeps the metric keys in
  # history.history identical across runs -- confirm before reordering.
  K.clear_session()
  del model

In [None]:
# Plot of the validation root mean squared error vs
# the train root mean squared error, one panel per activation function.
plt.figure(figsize=(16,30))

# RMSE reached at the last epoch, in the same order as act_func.
rmse_finale_val = []
rmse_finale_tr = []
for counter, (act_name, act_function) in enumerate(zip(act_func, result), start=1):
  validation_curve = act_function.history['val_root_mean_squared_error']
  train_curve = act_function.history['root_mean_squared_error']
  rmse_finale_val.append(validation_curve[-1])
  rmse_finale_tr.append(train_curve[-1])

  plt.subplot(5, 2, counter)
  plt.ylabel('RMSE')
  plt.xlabel('Epochs')
  plt.title(act_name)
  # Label both curves so the legend tells validation and train apart.
  plt.plot(validation_curve, label='validation')
  plt.plot(train_curve, label='train')
  plt.legend()

plt.show()

for act_name, final_val, final_tr in zip(act_func, rmse_finale_val, rmse_finale_tr):
  print('Validation RMSE of the activation function {} is {}'.format(act_name, round(final_val, 5)))
  print('Train RMSE of the activation function {} is {}'.format(act_name, round(final_tr, 5)))

<hr>
<h3>Build the model with the whole dataset</h3>
<hr>

In [None]:
# Final network: four hidden Dense layers of 14 ReLU units and a single
# ReLU output unit, trained on every row of df_dataset (no held-out split).
nfeatures = df_dataset.shape[1] - 1

model = Sequential()
model.add(Input(shape=(nfeatures,)))
for _ in range(4):
  model.add(Dense(14, activation="relu"))
model.add(Dense(1, activation="relu"))
model.compile(optimizer='adam', loss='mse')

# The last column is the target; every other column is a feature.
full_values = np.array(df_dataset)
full_features = full_values[:, :-1]
full_target = full_values[:, -1]

model.fit(full_features, full_target,
          batch_size=1024,
          epochs=500,
          verbose=2)

<hr><h3>Preprocess X_test</h3><hr>

In [None]:
# Columns that receive log(x + 1), and the full set of min-max scaled columns.
log_columns = [
  "minimum_nights",
  "number_of_reviews",
  "reviews_per_month",
  "calculated_host_listings_count",
  "availability_365",
]
scaled_columns = log_columns + ["latitude", "longitude"]


def _load_training_reference():
  """Rebuild the log-transformed training features used to fit the scaling.

  Repeats the training pipeline up to (but not including) the min-max step:
  load X_train.csv / y_train.csv, drop rows priced at or above the 96th
  percentile, drop rows priced at 0, then apply log(x + 1) to `log_columns`.
  These steps must stay in sync with the training preprocessing cells.

  Returns:
    DataFrame whose per-column min/max define the scaling the model was
    trained with.
  """
  reference = pd.read_csv("X_train.csv").drop(columns=["Unnamed: 0"])
  reference["price"] = pd.read_csv("y_train.csv")["price"]
  price_cap = np.quantile(reference["price"], 0.96)
  reference = reference.drop(index=reference.index[reference["price"] >= price_cap])
  reference = reference.drop(index=reference.index[reference["price"] == 0])
  reference[log_columns] = np.log(reference[log_columns] + 1)
  return reference


df_x_test = pd.read_csv("X_test.csv")
df_x_test = df_x_test.drop(columns=["Unnamed: 0"])

# apply log(x + 1) transformation
df_x_test[log_columns] = np.log(df_x_test[log_columns] + 1)

# Normalize with the TRAINING min/max rather than X_test's own, so a given raw
# value maps to the same scaled value the model saw during training. Test
# values outside the training range therefore fall outside [0, 1].
training_reference = _load_training_reference()
training_min = training_reference[scaled_columns].min()
training_range = training_reference[scaled_columns].max() - training_min
df_x_test[scaled_columns] = (df_x_test[scaled_columns] - training_min) / training_range

<hr><h3>Predict the prices on X_test</h3><hr>

In [None]:
# Predict a price for every row of df_x_test and flatten the model output
# into a 1-D vector (presumably model.predict returns shape (n, 1)).
predictions = np.transpose(model.predict(np.array(df_x_test))).flatten()

# Write one rounded integer price per line; the context manager closes the
# file even if a write or conversion fails part-way through.
with open("Simone_Paolo_Mottadelli_820786_score1.txt", "w") as file:
  for predicted_price in predictions:
    file.write(str(int(np.round(predicted_price))) + "\n")