# Tresure Machine Learning Model Development

## External Module and Library Dependencies

In [None]:
import os
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Extract data

In [None]:
!wget https://raw.githubusercontent.com/Tresure-Bangkit2023/ml-tresure/main/data/user.csv
!wget https://raw.githubusercontent.com/Tresure-Bangkit2023/ml-tresure/main/data/tourism_with_id.csv
!wget https://raw.githubusercontent.com/Tresure-Bangkit2023/ml-tresure/main/data/tourism_rating.csv

In [None]:
# Load each downloaded CSV file into its own pandas DataFrame.
df_user = pd.read_csv('user.csv')
df_tourism_with_id = pd.read_csv('tourism_with_id.csv')
df_tourism_rating = pd.read_csv('tourism_rating.csv')

## Transform data

In [None]:
# Preview the first rows of the user table.
df_user.head()

In [None]:
# Preview the first rows of the tourism (places) table.
df_tourism_with_id.head()

In [None]:
# Preview the first rows of the ratings table.
df_tourism_rating.head()

### User DataFrame

In [None]:
# Column dtypes and non-null counts of the user table.
df_user.info()

In [None]:
# Summary statistics of the numeric columns of the user table.
df_user.describe()

These are the summary statistics of the numeric columns of the user DataFrame. The **average age** of the users in this dataset is **28.7**, ranging from 18 to 40.

### Tourism with ID DataFrame

In [None]:
# Column dtypes and non-null counts of the tourism table (before any column is dropped).
df_tourism_with_id.info()

In [None]:
# Summary statistics of the numeric columns of the tourism table (before any column is dropped).
df_tourism_with_id.describe()

We have 10 columns with 437 non-null values. However, there are also two unknown columns: one has no values at all and the other merely repeats the index. We can drop those last 2 columns. The **Time_Minutes** column has only 205 non-null values, so we do not consider it helpful either.

We also want to drop the **Description**, **Coordinate**, **Lat**, and **Long** columns because they are not going to be used in the machine learning model.

In [None]:
# Remove the columns that are not used by the model: free-text and location fields,
# the sparsely populated Time_Minutes, and the two stray 'Unnamed' columns.
# errors='ignore' keeps this cell re-runnable: the result is assigned back to the same
# name, so on a second run the columns are already gone and a plain drop() would raise
# a KeyError.
df_tourism_with_id = df_tourism_with_id.drop(
    columns = [
        'Description',
        'Time_Minutes',
        'Coordinate',
        'Lat',
        'Long',
        'Unnamed: 11',
        'Unnamed: 12',
    ],
    errors = 'ignore',
)
df_tourism_with_id.info()

Now we have only 6 columns left to feed into the machine learning model. Let's inspect the first 5 entries.

In [None]:
# Preview the tourism table after the unused columns were dropped.
df_tourism_with_id.head()

In [None]:
# Summary statistics of the numeric columns that remain in the tourism table.
df_tourism_with_id.describe()

These are the summary statistics of the numeric columns of the tourism_with_id DataFrame. The **average price** is Rp24,652, ranging from Rp0 to Rp900,000, and the **average rating** is 4.44, ranging from 3.4 to 5.0.

### Tourism Rating DataFrame

In [None]:
# Column dtypes and non-null counts of the ratings table.
df_tourism_rating.info()

So, we have 3 columns and 10000 non-null entries comprising the user and the place_id with the corresponding ratings.

In [None]:
# Preview the first rows of the ratings table.
df_tourism_rating.head()

In [None]:
# Summary statistics of the ratings table before normalization.
df_tourism_rating.describe()

Next, normalize the ratings column using min-max normalization.


In [None]:
# MinMaxScaler expects a 2-D input, so the ratings are reshaped into a single column
# before being rescaled and written back over the original column.
rating_column = np.array(df_tourism_rating['Place_Ratings']).reshape(-1, 1)
df_tourism_rating['Place_Ratings'] = MinMaxScaler().fit_transform(rating_column)

# Check the value range after scaling.
df_tourism_rating.describe()

## Modeling


In [None]:
# `df` is an alias of the ratings table (no copy is made).
df = df_tourism_rating

# Target: the normalised rating. Features: every remaining column.
# NOTE(review): the model unstacks the feature columns positionally as (user, place) —
# confirm that the remaining column order is User_Id, Place_Id.
y = df['Place_Ratings']
X = df.drop(columns = ['Place_Ratings'])

# Hold out 30 % of the rows, then split that remainder in half:
# 70 % train, 15 % validation, 15 % test.
x_train, x_rem, y_train, y_rem = train_test_split(
    X, y, test_size = 0.3, random_state = 1
)
x_val, x_test, y_val, y_test = train_test_split(
    x_rem, y_rem, test_size = 0.5, random_state = 1
)

In [None]:
# Show (rows, columns) of the train, validation and test feature sets, in that order.
for feature_split in (x_train, x_val, x_test):
    print(feature_split.shape)

In [None]:
# Number of users = rows of the user table; number of places = rows of the tourism table.
# NOTE(review): these counts size the embedding tables, which presumes ids run
# contiguously from 1 to the row count — confirm against the data.
num_users = len(df_user)
num_places = len(df_tourism_with_id)

print(num_users)
print(num_places)

In [None]:
class MatrixFactorization(tf.keras.Model):
    """Embedding-based recommender that scores a (user id, place id) pair.

    Each id is looked up in its own embedding table, the two vectors are
    multiplied element-wise, and the product is fed to a single sigmoid unit,
    so every prediction lies in (0, 1).

    Args:
        num_users: number of users; the user embedding table gets
            ``num_users + 1`` rows so that an id equal to ``num_users`` is valid.
        num_places: number of places; the place embedding table gets
            ``num_places + 1`` rows for the same reason.
        embedding_size: length of each embedding vector.
        **kwargs: forwarded to ``tf.keras.Model``.
    """

    def __init__(self, num_users, num_places, embedding_size = 128, **kwargs):
        super().__init__(**kwargs)

        # Attributes (+1 so the largest id still indexes inside the table)
        self.num_users = num_users + 1
        self.num_places = num_places + 1
        self.embedding_size = embedding_size

        # Users embedding layer.
        # `input_length` is intentionally not passed: it is deprecated in recent
        # Keras releases and is not needed by a subclassed model.
        self.users_embedding = tf.keras.layers.Embedding(
            input_dim = self.num_users,
            output_dim = self.embedding_size,
            name = 'users_embedding',
            embeddings_initializer = tf.keras.initializers.HeNormal(),
            embeddings_regularizer = tf.keras.regularizers.L2(1e-6),
        )
        # Places embedding layer
        self.places_embedding = tf.keras.layers.Embedding(
            input_dim = self.num_places,
            output_dim = self.embedding_size,
            name = 'places_embedding',
            embeddings_initializer = tf.keras.initializers.HeNormal(),
            embeddings_regularizer = tf.keras.regularizers.L2(1e-6),
        )
        # Flatten layer (shared by both branches; it has no weights)
        self.flatten = tf.keras.layers.Flatten(name = 'flatten')
        # Element-wise product of the two embedding vectors
        self.multiply = tf.keras.layers.Multiply(name = 'multiply')
        # Output layer: one sigmoid unit
        self.out = tf.keras.layers.Dense(1, activation = 'sigmoid', name = 'out')

    def call(self, inputs):
        """Score a batch of (user id, place id) pairs.

        Args:
            inputs: integer tensor whose axis 1 holds exactly two entries,
                the user id followed by the place id.

        Returns:
            The output of the sigmoid layer, one value per input row.
        """
        # Split the two id columns apart
        users, places = tf.unstack(inputs, axis = 1)

        # Look up each id in its own embedding table
        users = self.users_embedding(users)
        places = self.places_embedding(places)

        # Make sure both branches are rank-2 before they are combined
        users = self.flatten(users)
        places = self.flatten(places)

        # Element-wise product of the user and place vectors
        matrix = self.multiply([users, places])

        # Dense output layer with sigmoid activation
        return self.out(matrix)

In [None]:
# Hyperparameters
# Length of the user/place embedding vectors passed to MatrixFactorization.
EMBEDDING_SIZE = 64
# Number of epochs for the final training run.
EPOCHS = 150
# Learning rate given to the Adam optimizer below.
PREFERRED_LEARNING_RATE = 2e-3
# The targets are the min-max-scaled ratings, i.e. values in [0, 1].
LOSS = tf.keras.losses.BinaryCrossentropy()
OPTIMIZER = tf.keras.optimizers.Adam(learning_rate = PREFERRED_LEARNING_RATE)
# NOTE(review): these metric objects are created once and passed to every compile() call
# that uses METRICS (the learning-rate sweep and the final model) — confirm sharing is intended.
# NOTE(review): TopKCategoricalAccuracy targets multi-class outputs, whereas the model emits a
# single sigmoid value per pair — confirm this metric is meaningful here.
METRICS = [tf.keras.metrics.MeanSquaredError(), tf.keras.metrics.TopKCategoricalAccuracy()]

In [None]:
def adjust_learning_rate(x, y, epochs = 100):
    """Run a learning-rate sweep on a freshly initialised model.

    A new MatrixFactorization model is trained while the learning rate is raised
    every epoch, so that loss can afterwards be plotted against learning rate.

    Args:
        x: training features, passed straight to ``model.fit``.
        y: training targets, passed straight to ``model.fit``.
        epochs: number of sweep epochs (default 100, the previously hard-coded value).

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    model = MatrixFactorization(num_users, num_places, EMBEDDING_SIZE)

    # Exponential schedule: starts at 1e-6 and grows tenfold every 20 epochs.
    lr_schedule = tf.keras.callbacks.LearningRateScheduler(
        lambda epoch: 1e-6 * 10 ** (epoch / 20)
    )

    # Default Adam optimizer; its learning rate is overridden each epoch by the scheduler.
    model.compile(loss = LOSS,
                  optimizer = 'adam',
                  metrics = METRICS)

    return model.fit(x, y, epochs = epochs, callbacks = [lr_schedule])

In [None]:
# Run the training with dynamic LR
lr_history = adjust_learning_rate(x_train, y_train)

In [None]:
# tf.keras 2.x records the scheduled rate under "lr"; Keras 3 records it under "learning_rate".
lr_key = "lr" if "lr" in lr_history.history else "learning_rate"

# Loss as a function of learning rate, on a logarithmic x-axis.
plt.semilogx(lr_history.history[lr_key], lr_history.history["loss"])
plt.axis([1e-6, 10, 0, 10])
plt.xlabel("learning rate")
plt.ylabel("loss")
plt.title("Learning-rate sweep");

In [None]:
# Create the recommender with the configured embedding size.
model = MatrixFactorization(
    num_users = num_users,
    num_places = num_places,
    embedding_size = EMBEDDING_SIZE,
)

# Attach optimizer, loss and metrics, create the weights for inputs of shape
# (batch, 2), then print the layer overview.
model.compile(optimizer = OPTIMIZER, loss = LOSS, metrics = METRICS)
model.build(input_shape = (None, 2))
model.summary()

In [None]:
# Train for EPOCHS epochs and evaluate on the validation split after every epoch.
validation_pair = (x_val, y_val)
history = model.fit(x_train, y_train, validation_data = validation_pair, epochs = EPOCHS)

In [None]:
# Draw one figure per tracked quantity — first the mean squared error, then the loss —
# each showing the training curve and the validation curve.
for curve_name in ('mean_squared_error', 'loss'):
    plt.plot(history.history[curve_name])
    plt.plot(history.history['val_' + curve_name])
    plt.title('model ' + curve_name)
    plt.ylabel(curve_name)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()


## Evaluation

In [None]:
# Evaluate on the held-out test split; return_dict=True yields a {name: value} mapping.
model.evaluate(x_test, y_test, return_dict = True)

### Testing

In [None]:
# Pick one random user from the test split.
# NOTE(review): sample() is called without random_state, so the selected user
# changes from run to run.
user_id = x_test.sample(1)['User_Id'].values[0]
user_id

In [None]:
# All rating rows that belong to the sampled user.
places_visited_by_user = df_tourism_rating[df_tourism_rating.User_Id == user_id]

# Unique Place_Ids from the tourism table that the user has not rated, as an (n, 1) column.
already_rated = df_tourism_with_id['Place_Id'].isin(places_visited_by_user.Place_Id.values)
places_not_visited = df_tourism_with_id[~already_rated].Place_Id.values
places_not_visited = np.expand_dims(list(set(places_not_visited)), axis = 1)

# One (user_id, place_id) row per candidate place.
user_column = [[user_id]] * len(places_not_visited)
user_places_array = np.hstack((user_column, places_not_visited))

In [None]:
# Predicted score for every (user, candidate place) row, flattened to a 1-D array.
ratings = model.predict(user_places_array).flatten()

In [None]:
# Number of recommendations to show. A single constant keeps the slice, the heading
# and the printed list in agreement (the heading used to say 7 while 15 were selected).
TOP_N_RECOMMENDATIONS = 15

# Indices of the highest predicted scores, best first, and the matching Place_Ids.
top_ratings_indices = ratings.argsort()[-TOP_N_RECOMMENDATIONS:][::-1]
recommended_place_ids = [places_not_visited[i][0] for i in top_ratings_indices]

print('User ' + str(user_id))
print('===' * 15,'\n')
print('----' * 15)
print('Top 10 most rated places from the user')
print('----' * 15)

# Place_Ids of the (up to) 10 places the user rated highest.
top_place_user = (
    places_visited_by_user.sort_values(
        by = ['Place_Ratings'],
        ascending=False
    )
    .head(10)
    .Place_Id.values
)

df_tourism_with_id_rows = df_tourism_with_id[df_tourism_with_id['Place_Id'].isin(top_place_user)]
for row in df_tourism_with_id_rows.itertuples():
    print(row.Place_Name, ':', row.Category)

print('')
print('----' * 15)
print('Top {} place recommendation'.format(TOP_N_RECOMMENDATIONS))
print('----' * 15)

# isin() returns the rows in table order, which would throw away the predicted ranking.
# Re-sort by each place's position in recommended_place_ids so that the printed
# numbering really is the rank.
rank_by_place_id = {place_id: rank for rank, place_id in enumerate(recommended_place_ids)}
recommended_place = (
    df_tourism_with_id[df_tourism_with_id['Place_Id'].isin(recommended_place_ids)]
    .sort_values(by = 'Place_Id', key = lambda place_ids: place_ids.map(rank_by_place_id))
)
for i, row in enumerate(recommended_place.itertuples(), start = 1):
    print(i,'.',
          row.Place_Name, '\n   ',
          row.Category, ',', 'Harga Tiket Masuk ',
          row.Price, ',', 'Rating Wisata ',
          row.Rating,'\n'
         )

print('==='*15)

In [None]:
# Indices of the 15 highest predicted scores, best first.
# NOTE(review): the recommendation cell above already computes this ranking — this cell
# looks redundant.
top_ratings_indices = ratings.argsort()[-15:][::-1]

In [None]:
model.save('/kaggle/working/saved_model')
!zip keras_saved_model.zip '/kaggle/working/saved_model'

In [None]:
# Keras-level loading, currently disabled:
# new_model = tf.keras.models.load_model('/kaggle/working/saved_model')

# Load the export through the low-level SavedModel API instead.
new_model = tf.saved_model.load('/kaggle/working/saved_model')
# Variables restored with the model.
model_variables = new_model.variables
# Callable for the "serving_default" signature of the export.
model_function = new_model.signatures["serving_default"]

In [None]:
def predict(model, user_id, places_not_visited, n = 50):
    """
    Recommend places for a single user with a loaded SavedModel.

    Args:
        model              : object returned by tf.saved_model.load; must expose a
                             "serving_default" signature
        user_id            : int, id of the user to recommend for
        places_not_visited : 1-D iterable of Place_Id values to score (duplicates
                             are ignored)
        n                  : int, maximum number of recommendations returned

    Returns:
        recommended_place_ids: list of Place_Id values, highest predicted score
                               first; empty when n <= 0 or there are no candidates
    """

    # De-duplicate and sort the candidates so the result does not depend on set ordering
    candidate_ids = np.array(sorted(set(places_not_visited)))

    # Guard: ratings.argsort()[-0:] would return *every* index, and an empty candidate
    # list cannot be stacked into a (user, place) array
    if n <= 0 or len(candidate_ids) == 0:
        return []

    # One (user_id, place_id) row per candidate
    candidate_column = np.expand_dims(candidate_ids, axis = 1)
    places_to_predict = np.hstack(
        ([[user_id]] * len(candidate_column), candidate_column)
    )

    # A SavedModel signature returns a dict {output name: tensor}; the model has a
    # single output, so take the only value and turn it into a flat NumPy array
    model_function = model.signatures["serving_default"]
    outputs = model_function(tf.constant(places_to_predict))
    ratings = next(iter(outputs.values())).numpy().ravel()

    # Indices of the n highest scores, best first
    top_ratings_indices = ratings.argsort()[-n:][::-1]
    recommended_place_ids = [candidate_ids[i] for i in top_ratings_indices]

    return recommended_place_ids

In [None]:
# Candidate places: every Place_Id of the tourism table that the sampled user has not rated.
# NOTE(review): this rebinds places_not_visited to a 1-D array, whereas the earlier cell
# stored it as an (n, 1) column.
visited_place_ids = places_visited_by_user.Place_Id.values
places_not_visited = df_tourism_with_id.loc[
    ~df_tourism_with_id['Place_Id'].isin(visited_place_ids), 'Place_Id'
].values

# Reload the exported model and ask for the 10 best candidates.
new_model = tf.saved_model.load('/kaggle/working/saved_model')

predict(new_model, user_id, places_not_visited, 10)
