In [None]:
# Mount Google Drive so the check-in CSVs and embedding outputs under
# /content/drive are reachable (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **Bibliotecas**

In [None]:
import pandas as pd
import geopandas
import os
import numpy as np
from collections import defaultdict
from numpy.linalg import norm
from numpy.linalg import inv as inverse
import scipy.sparse as sparse
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Silence every Python warning for the rest of the notebook.
# NOTE(review): this also hides NumPy RuntimeWarnings (e.g. log2(0) or
# division by zero) raised while the HMRM matrices are built.
import warnings
warnings.filterwarnings('ignore')

### **Optimizer**

In [None]:
class Optimizer:

    def __init__(self):
        """Create an optimizer whose matrices are all empty placeholders.

        Every attribute below is replaced with real data when ``start`` runs.
        """
        # Statistics derived from the check-ins by the ``_create_*`` methods.
        self._user_location_frequency = np.array([])
        self._user_time_frequency = np.array([])
        self._location_co_ocurrency = np.array([])
        self._location_time = np.array([])
        # Multiplier applied to the norm of every learned matrix in
        # ``_objective_function`` and to identity terms in the update methods.
        self._weight = 0.001
        # Learned matrices; their shapes are set in ``_initialize_parameters``.
        self.activity_location = np.array([])
        self.activity_time = np.array([])
        self.user_activity = np.array([])
        self.activity_embedding = np.array([])
        self.target_location_embedding = np.array([])
        self.context_location_embedding = np.array([])
        self.time_slot_embedding = np.array([])

    def _create_user_location_frequency_matrix(self, users_checkins):
        placeids = users_checkins["placeid"].tolist()
        userids = users_checkins["userid"]
        total_users = len(users_checkins["userid"].unique())
        total_places = len(users_checkins["placeid"].unique())
        print(f'total_places: {total_places}, total_users: {total_users}\n')

        self._user_location_frequency = sparse.lil_matrix((total_users, total_places))

        for i in range(len(placeids)):
            self._user_location_frequency[userids[i], placeids[i]] += 1

    def _create_user_time_frequency_matrix(self, users_checkins: pd.DataFrame):
        users_checkins_sorted = users_checkins.sort_values(by=["datetime"])

        users_ids = users_checkins_sorted["userid"]
        datetimes = pd.to_datetime(users_checkins["datetime"])
        total_users = len(users_checkins["userid"].unique())

        self._user_time_frequency = np.zeros((total_users, 48))

        for i, j in zip(users_ids, datetimes):
            if j.weekday() >= 5:
                self._user_time_frequency[i][j.hour + 24] += 1
            else:
                self._user_time_frequency[i][j.hour] += 1

    def _create_location_coocurrency_matrix(self, users_checkins):
        try:
            users_checkins_sorted = users_checkins.sort_values(by=["datetime"])
            locations = users_checkins_sorted["placeid"].tolist()
            number_of_locations = len(users_checkins["placeid"].unique())

            self._location_co_ocurrency = sparse.lil_matrix(
                (number_of_locations, number_of_locations)
            )  ##location co occurency represents memory for save memory

            for i in range(len(locations)):
                for j in range(1, 6):
                    if (i - j) < 0:
                        break
                    self._location_co_ocurrency[locations[i], locations[i - j]] += 1
                for j in range(1, 6):
                    if (i + j) > len(locations) - 1:
                        break
                    self._location_co_ocurrency[locations[i], locations[j + i]] += 1
            sum_of_dl = np.sum(self._location_co_ocurrency)
            l_occurrency = np.sum(self._location_co_ocurrency, axis=1).reshape(-1, 1)
            c_occurrency = np.sum(self._location_co_ocurrency, axis=0).reshape(1, -1)

            for i in range(number_of_locations):
                line = self._location_co_ocurrency[i].toarray()
                ##PMI em subdivisoes da matriz esparsa
                self._location_co_ocurrency[i] = np.maximum(
                    np.log2(
                        np.maximum(line * sum_of_dl, 1)
                        / (l_occurrency[i] * c_occurrency)
                    ),
                    0,
                )

        except Exception as e:
            raise e

    def _create_location_time_matrix(self, users_checinks):
        locations = users_checinks["placeid"].tolist()
        datetimes = users_checinks["datetime"].tolist()
        total_locations = len(users_checinks["placeid"].unique())
        Dt = np.zeros((total_locations, 48))

        for i in range(len(locations)):
            if datetimes[i].weekday() >= 5:
                Dt[locations[i]][datetimes[i].hour + 24] += 1
            else:
                Dt[locations[i]][datetimes[i].hour] += 1

        sum_of_dt = np.sum(Dt)
        l_occurrency = np.sum(Dt, axis=1).reshape(-1, 1)
        c_occurrency = np.sum(Dt, axis=0).reshape(1, -1)

        mult = l_occurrency * c_occurrency
        mult[mult == 0] = -1

        tmp = np.maximum(Dt * sum_of_dt, 1) / mult
        tmp[tmp < 0] = 0
        self._location_time = np.maximum(np.log2(tmp), 0)

    def _objective_function(self, l2_weight):
        def first_component(l2_weight):
            first_equation = l2_weight * norm(
                (
                    self._user_location_frequency
                    - np.dot(self.user_activity, self.activity_location.T)
                )
            )

            second_equation = (1 - l2_weight) * norm(
                (
                    self._user_time_frequency
                    - np.dot(self.user_activity, self.activity_time.T)
                )
            )
            return first_equation + second_equation

        def second_component(l2_weight):
            first_equation = l2_weight * norm(
                (
                    self._location_co_ocurrency
                    - np.dot(
                        self.target_location_embedding,
                        self.context_location_embedding.T,
                    )
                )
            )
            second_equation = (1 - l2_weight) * norm(
                (
                    self._location_time
                    - np.dot(self.target_location_embedding, self.time_slot_embedding.T)
                )
            )
            return first_equation + second_equation

        def third_component(l2_weight):
            first_equation = l2_weight * norm(
                (
                    self.activity_location
                    - np.dot(self.context_location_embedding, self.activity_embedding.T)
                )
            )
            second_equation = (1 - l2_weight) * norm(
                (
                    self.activity_time
                    - np.dot(self.time_slot_embedding, self.activity_embedding.T)
                )
            )
            return first_equation + second_equation

        activity_modeling_component = first_component(l2_weight)
        trajectory_embedding_component = second_component(l2_weight)
        collaborative_learning_component = third_component(l2_weight)

        objective_function = (
            activity_modeling_component
            + trajectory_embedding_component
            + collaborative_learning_component
        )
        objective_function += self._weight * norm(self.user_activity)
        objective_function += self._weight * norm(self.activity_time)
        objective_function += self._weight * norm(self.activity_embedding)
        objective_function += self._weight * norm(self.activity_location)
        objective_function += self._weight * norm(self.context_location_embedding)
        objective_function += self._weight * norm(self.target_location_embedding)
        objective_function += self._weight * norm(self.time_slot_embedding)

        return objective_function

    def _initialize_parameters(self, checkins, K, M):
        total_locations = len(checkins["placeid"].unique())
        total_users = len(checkins["userid"].unique())
        time_slot = 48

        # print("\nDurante a construção:")
        self.activity_location = np.random.normal(size=(total_locations, K))
        # print("activity location:", self.activity_location.shape)

        self.activity_time = np.random.normal(size=(time_slot, K))
        # print("activity time:", self.activity_time.shape)

        self.user_activity = np.random.normal(size=(total_users, K))
        # print("user activity:", self.user_activity.shape)

        self.activity_embedding = np.random.normal(size=(K, M))
        # print("activity embedding:", self.activity_embedding.shape)

        self.target_location_embedding = np.random.normal(size=(total_locations, M))
        # print("target location embedding:", self.target_location_embedding.shape)

        self.context_location_embedding = np.random.normal(size=(total_locations, M))
        # print("context location embedding:", self.context_location_embedding.shape)

        self.time_slot_embedding = np.random.normal(size=(time_slot, M))
        # print("time slot embedding:", self.time_slot_embedding.shape)

    def user_activity_embedding_function(self, K, l2_weight):
        first_equation = (
            l2_weight * (self._user_location_frequency * self.activity_location)
        ) + ((1 - l2_weight) * np.dot(self._user_time_frequency, self.activity_time))
        second_equation = (
            l2_weight * np.dot(self.activity_location.T, self.activity_location)
        ) + (
            (1 - l2_weight) * np.dot(self.activity_time.T, self.activity_time)
            + (l2_weight * np.identity(K))
        )
        return np.dot(first_equation, inverse(second_equation))

    def acticity_location_embedding_function(self, K, l2_weight):
        first_equation = l2_weight * (
            (self._user_location_frequency.T * self.user_activity)
            + np.dot(self.context_location_embedding, self.activity_embedding.T)
        )
        second_equation = (
            l2_weight * np.dot(self.user_activity.T, self.user_activity)
        ) + ((self._weight + l2_weight) * np.identity(K))
        return np.dot(first_equation, inverse(second_equation))

    def activity_time_embedding_function(self, K, l2_weight):
        first_equation = (1 - l2_weight) * (
            np.dot(self._user_time_frequency.T, self.user_activity)
            + np.dot(self.time_slot_embedding, self.activity_embedding.T)
        )
        second_equation = (1 - l2_weight) * (
            np.dot(self.user_activity.T, self.user_activity)
            + (1 - self._weight + l2_weight) * np.identity(K)
        )
        return np.dot(first_equation, inverse(second_equation))

    def activity_embedding_function(self, M, l2_weight):
        first_equation = (
            l2_weight
            * np.dot(self.activity_location.T, self.context_location_embedding)
        ) + ((1 - l2_weight) * np.dot(self.activity_time.T, self.time_slot_embedding))
        second_equation = (
            (
                l2_weight
                * np.dot(
                    self.context_location_embedding.T, self.context_location_embedding
                )
            )
            + (
                (1 - l2_weight)
                * np.dot(self.time_slot_embedding.T, self.time_slot_embedding)
            )
            + (self._weight * np.identity(M))
        )
        return np.dot(first_equation, inverse(second_equation))

    def target_location_embedding_function(self, M, l2_weight):
        first_equation = (
            l2_weight * self._location_co_ocurrency * self.context_location_embedding
        ) + ((1 - l2_weight) * np.dot(self._location_time, self.time_slot_embedding))

        second_equation = (
            (
                l2_weight
                * np.dot(
                    self.context_location_embedding.T, self.context_location_embedding
                )
            )
            + (
                (1 - l2_weight)
                * np.dot(self.time_slot_embedding.T, self.time_slot_embedding)
            )
            + (self._weight * np.identity(M))
        )

        return np.dot(first_equation, inverse(second_equation))

    def context_location_embedding_function(self, M, l2_weight):
        first_equation = l2_weight * (
            self._location_co_ocurrency.T * self.target_location_embedding
            + np.dot(self.activity_location, self.activity_embedding)
        )
        second_equation = (
            l2_weight
            * (
                np.dot(self.target_location_embedding.T, self.target_location_embedding)
                + np.dot(self.activity_embedding.T, self.activity_embedding)
            )
        ) + (self._weight * np.identity(M))
        return np.dot(first_equation, inverse(second_equation))

    def time_slot_embedding_function(self, M, l2_weight):
        first_equation = (1 - l2_weight) * (
            np.dot(self._location_time.T, self.target_location_embedding)
            + np.dot(self.activity_time, self.activity_embedding)
        )
        second_equation = (
            (1 - l2_weight)
            * (
                np.dot(self.target_location_embedding.T, self.target_location_embedding)
                + np.dot(self.activity_embedding.T, self.activity_embedding)
            )
        ) + (self._weight * np.identity(M))
        return np.dot(first_equation, inverse(second_equation))

    def _optimize_parameters(self, K, M, l2_weight):
        self.user_activity = self.user_activity_embedding_function(K, l2_weight)
        self.user_activity[self.user_activity < 0] = 0

        self.activity_location = self.acticity_location_embedding_function(K, l2_weight)
        self.activity_location[self.activity_location < 0] = 0

        self.activity_time = self.activity_time_embedding_function(K, l2_weight)
        self.activity_time[self.activity_time < 0] = 0

        self.activity_embedding = self.activity_embedding_function(M, l2_weight)
        self.target_location_embedding = self.target_location_embedding_function(
            M, l2_weight
        )
        self.context_location_embedding = self.context_location_embedding_function(
            M, l2_weight
        )
        self.time_slot_embedding = self.time_slot_embedding_function(M, l2_weight)

    def start(self, checkins, l2_weight=0.1, K=10, M=100):
        print(f'\nInicando o HMRM...')
        checkins["datetime"] = pd.to_datetime(checkins["datetime"])

        self._create_user_location_frequency_matrix(checkins)
        self._create_location_coocurrency_matrix(checkins)
        self._create_user_time_frequency_matrix(checkins)
        self._create_location_time_matrix(checkins)

        print(f'\nMatrizes criadas...')

        self._initialize_parameters(checkins, K, M)

        value = 100000

        print("\nOtimizando os parâmetros")
        for i in range(10):
            print(i)
            self._optimize_parameters(K, M, l2_weight)
            objective_func = self._objective_function(l2_weight)

            # print("user activity:", self.user_activity) # theta
            # print("activity location:", self.activity_location) # Al
            # print("activity time:", self.activity_time) # At
            # print("activity embedding:", self.activity_embedding) # Ea
            # print("target location embedding:", self.target_location_embedding) # El
            # print("context location embedding:", self.context_location_embedding) # Ec
            # print("time slot embedding:", self.time_slot_embedding) # Et

            if (value - objective_func) <= 0.1:
                break
            value = objective_func

### **HMRM Baseline**

In [None]:
class HmrmBaseline:
    """Train HMRM embeddings from a check-in CSV file.

    Attributes:
        optimizer: The ``Optimizer`` that learns the embeddings.
        input_file: Path of the check-in CSV read by ``start``.
        weight: Passed to ``Optimizer.start`` as ``l2_weight``.
        K: Passed to ``Optimizer.start`` as ``K``.
        embedding_size: Passed to ``Optimizer.start`` as ``M``.
    """

    def __init__(self, file=None, weight=0.5, K=7, embedding_size=50):
        self.optimizer = Optimizer()
        self.input_file = file
        self.weight = weight
        self.K = K
        self.embedding_size = embedding_size

    def start(self):
        """Read the check-ins, run the optimizer and return the embeddings.

        Returns:
            A DataFrame with one row per distinct place. The first
            ``2 * embedding_size`` columns are the context-location embedding
            followed by the target-location embedding; ``category`` (when the
            input has that column) and the original ``placeid`` come last.
        """
        # Columns that contain any missing value are dropped entirely.
        users_checkin = pd.read_csv(self.input_file, index_col=False).dropna(
            axis=1
        )

        # Accept files that still carry the raw 'local_datetime' column name.
        if (
            "datetime" not in users_checkin.columns
            and "local_datetime" in users_checkin.columns
        ):
            users_checkin = users_checkin.rename(
                columns={"local_datetime": "datetime"}
            )

        # factorize() numbers values by first appearance and returns the
        # original values in that same order, so code i <-> original_placeids[i].
        user_codes, _ = pd.factorize(users_checkin["userid"])
        place_codes, original_placeids = pd.factorize(users_checkin["placeid"])
        users_checkin["userid"] = user_codes.astype(int)
        users_checkin["placeid"] = place_codes.astype(int)

        self.optimizer.start(users_checkin, self.weight, self.K, self.embedding_size)

        df = pd.DataFrame(
            data=np.concatenate(
                (
                    self.optimizer.context_location_embedding,
                    self.optimizer.target_location_embedding,
                ),
                axis=1,
            )
        )

        if "category" in users_checkin.columns:
            # First category seen for each place. Place codes first appear in
            # increasing order, so the de-duplicated rows line up with the
            # embedding rows 0..n-1 without scanning the frame once per place.
            first_rows = users_checkin.drop_duplicates(subset="placeid")
            df["category"] = first_rows["category"].to_numpy()
        else:
            print("erro: 'category' column not found; embeddings returned without labels")

        df["placeid"] = np.asarray(original_placeids)

        return df

### **Gerando os embeddings gerais com hmrm**

In [None]:
def select_checkins(path, state_name, min_checkins=40,
                    base_dir='/content/drive/MyDrive/Graduacao/POC/Dados'):
  """Keep the check-ins of active users and save them for HMRM.

  Args:
    path: CSV file with the raw check-ins of one region.
    state_name: Region label used in log messages and output file names.
    min_checkins: Minimum number of check-ins a user needs to be kept
      (default 40).
    base_dir: Root folder under which ``checkins/`` and ``embeddings/`` live.

  Returns:
    ``(input_file, output_file)``: the path of the filtered CSV that was just
    written and the path where the embeddings are meant to be written.
  """
  state_checkins = pd.read_csv(path)
  print(f'shape {state_name}-checkins: {state_checkins.shape}')
  print(f'\ncolumns {state_name}-checkins: {state_checkins.columns}')

  if 'local_datetime' in state_checkins.columns:
    state_checkins = state_checkins.rename(columns={'local_datetime': 'datetime'})
    print(f"\nRenamed 'local_datetime' to 'datetime'")
    print(f'\ncolumns {state_name}-checkins: {state_checkins.columns}')

  checkins_per_user = state_checkins['userid'].value_counts()
  users_ids = checkins_per_user[checkins_per_user >= min_checkins].index.tolist()

  print(f'\nnumber of users_ids: {len(users_ids)}')

  filtred_checkins = state_checkins[state_checkins['userid'].isin(users_ids)]
  print(f'\nshape filtred-{state_name}: {filtred_checkins.shape}')
  print(f'\ncolumns filtred-{state_name}: {filtred_checkins.columns}')

  input_file = f'{base_dir}/checkins/{state_name}-filtrado/{state_name}-filtrado.csv'
  output_file = f'{base_dir}/embeddings/{state_name}-embeddings.csv'

  # to_csv does not create missing folders; without these calls a new region
  # fails with an OSError before the check below can report anything.
  os.makedirs(os.path.dirname(input_file), exist_ok=True)
  os.makedirs(os.path.dirname(output_file), exist_ok=True)

  filtred_checkins.to_csv(input_file, index=False)

  if os.path.exists(input_file):
      print(f'\n{state_name}-filtered checkins saved successfully at {input_file}\n')
  else:
      print(f'\nError saving {state_name}-filtered checkins at {input_file}\n')

  return input_file, output_file

In [None]:
# Raw check-in CSV of each region, stored on the mounted Google Drive.
# Each path is meant to be passed to select_checkins() together with the region name.
path_alabama = '/content/drive/MyDrive/Graduacao/POC/Dados/checkins/alabama-complete/checkins_Alabama.csv'
path_arizona = '/content/drive/MyDrive/Graduacao/POC/Dados/checkins/arizona-complete/checkins_Arizona.csv'
path_virginia = '/content/drive/MyDrive/Graduacao/POC/Dados/checkins/virginia-complete/checkins_Virginia.csv'
path_chicago = '/content/drive/MyDrive/Graduacao/POC/Dados/checkins/chicago-complete/checkins_chicago.csv'
path_florida = '/content/drive/MyDrive/Graduacao/POC/Dados/checkins/florida-complete/checkins_florida.csv'
path_georgia = '/content/drive/MyDrive/Graduacao/POC/Dados/checkins/georgia-complete/checkins_georgia.csv'
path_nova_york = '/content/drive/MyDrive/Graduacao/POC/Dados/checkins/nova_york-complete/checkins_nova_york.csv'

#### alabama

In [None]:
# Alabama: filter the check-ins, train HMRM (weight=0.1, K=7, embedding_size=50)
# and write the resulting embeddings to output_file.
input_file, output_file = select_checkins(path_alabama, 'alabama')

hmrm = HmrmBaseline(input_file, 0.1, 7, 50)
embeddings = hmrm.start()
embeddings.to_csv(output_file, index=False)

print('### EMBEDDINGS GERADOS ###')

print(embeddings.shape)
print(embeddings.columns)

shape alabama-checkins: (93402, 8)

columns alabama-checkins: Index(['userid', 'category', 'placeid', 'local_datetime', 'latitude',
       'longitude', 'country_name', 'state_name'],
      dtype='object')

number of users_ids: 418

shape filtred-alabama: (76041, 8)

columns filtred-alabama: Index(['userid', 'category', 'placeid', 'local_datetime', 'latitude',
       'longitude', 'country_name', 'state_name'],
      dtype='object')

alabama-filtered checkins saved successfully at /content/drive/MyDrive/Graduacao/POC/Dados/checkins/alabama-filtrado/alabama-filtrado.csv



KeyError: 'datetime'

#### arizona

In [None]:
# Arizona: filter the check-ins, train HMRM (weight=0.1, K=7, embedding_size=50)
# and write the resulting embeddings to output_file.
input_file, output_file = select_checkins(path_arizona, 'arizona')

hmrm = HmrmBaseline(input_file, 0.1, 7, 50)
embeddings = hmrm.start()
embeddings.to_csv(output_file, index=False)

print('### EMBEDDINGS GERADOS ###')

print(embeddings.shape)
print(embeddings.columns)

shape arizona-checkins: (188860, 8)

columns arizona-checkins: Index(['userid', 'category', 'placeid', 'local_datetime', 'latitude',
       'longitude', 'country_name', 'state_name'],
      dtype='object')

Renamed 'local_datetime' to 'datetime'

columns arizona-checkins: Index(['userid', 'category', 'placeid', 'datetime', 'latitude', 'longitude',
       'country_name', 'state_name'],
      dtype='object')

number of users_ids: 756

shape filtred-arizona: (152210, 8)

columns filtred-arizona: Index(['userid', 'category', 'placeid', 'datetime', 'latitude', 'longitude',
       'country_name', 'state_name'],
      dtype='object')


KeyboardInterrupt: 

#### virginia

In [None]:
# Virginia: filter the check-ins, train HMRM (weight=0.1, K=7, embedding_size=50)
# and write the resulting embeddings to output_file.
input_file, output_file = select_checkins(path_virginia, 'virginia')

hmrm = HmrmBaseline(input_file, 0.1, 7, 50)
embeddings = hmrm.start()
embeddings.to_csv(output_file, index=False)

print(embeddings.shape)
print(embeddings.columns)

shape virginia-checkins: (247600, 8)

columns virginia-checkins: Index(['userid', 'category', 'placeid', 'local_datetime', 'latitude',
       'longitude', 'country_name', 'state_name'],
      dtype='object')

Renamed 'local_datetime' to 'datetime'

columns virginia-checkins: Index(['userid', 'category', 'placeid', 'datetime', 'latitude', 'longitude',
       'country_name', 'state_name'],
      dtype='object')

number of users_ids: 1059

shape filtred-virginia: (195378, 8)

columns filtred-virginia: Index(['userid', 'category', 'placeid', 'datetime', 'latitude', 'longitude',
       'country_name', 'state_name'],
      dtype='object')

virginia-filtered checkins saved successfully at /content/drive/MyDrive/Graduacao/POC/Dados/checkins/virginia-filtrado/virginia-filtrado.csv


Inicando o HMRM...
total_places: 20947, total_users: 1059



KeyboardInterrupt: 

#### chicago

In [None]:
# Chicago: filter the check-ins, train HMRM (weight=0.1, K=7, embedding_size=50)
# and write the resulting embeddings to output_file.
input_file, output_file = select_checkins(path_chicago, 'chicago')

hmrm = HmrmBaseline(input_file, 0.1, 7, 50)
embeddings = hmrm.start()
embeddings.to_csv(output_file, index=False)

print(embeddings.shape)
print(embeddings.columns)

shape chicago-checkins: (198407, 14)

columns chicago-checkins: Index(['userid', 'category', 'placeid', 'datetime', 'latitude', 'longitude',
       'country_name', 'state_name', 'geometry', 'index_right', 'TRACTCE',
       'GEOID', 'cell', 'eID'],
      dtype='object')

number of users_ids: 861

shape filtred-chicago: (137925, 14)

columns filtred-chicago: Index(['userid', 'category', 'placeid', 'datetime', 'latitude', 'longitude',
       'country_name', 'state_name', 'geometry', 'index_right', 'TRACTCE',
       'GEOID', 'cell', 'eID'],
      dtype='object')

chicago-filtered checkins saved successfully at /content/drive/MyDrive/Graduacao/POC/Dados/checkins/chicago-filtrado/chicago-filtrado.csv

(9092, 102)
Index([         0,          1,          2,          3,          4,          5,
                6,          7,          8,          9,
       ...
               92,         93,         94,         95,         96,         97,
               98,         99, 'category',  'placeid'],
   

#### georgia

In [None]:
# Georgia: filter the check-ins, train HMRM (weight=0.1, K=7, embedding_size=50)
# and write the resulting embeddings to output_file.
input_file, output_file = select_checkins(path_georgia, 'georgia')

hmrm = HmrmBaseline(input_file, 0.1, 7, 50)
embeddings = hmrm.start()
embeddings.to_csv(output_file, index=False)

print('### EMBEDDINGS GERADOS ###')

print(embeddings.shape)
print(embeddings.columns)

shape georgia-checkins: (332198, 8)

columns georgia-checkins: Index(['userid', 'category', 'placeid', 'local_datetime', 'latitude',
       'longitude', 'country_name', 'state_name'],
      dtype='object')

Renamed 'local_datetime' to 'datetime'

columns georgia-checkins: Index(['userid', 'category', 'placeid', 'datetime', 'latitude', 'longitude',
       'country_name', 'state_name'],
      dtype='object')

number of users_ids: 1159

shape filtred-georgia: (276308, 8)

columns filtred-georgia: Index(['userid', 'category', 'placeid', 'datetime', 'latitude', 'longitude',
       'country_name', 'state_name'],
      dtype='object')

georgia-filtered checkins saved successfully at /content/drive/MyDrive/Graduacao/POC/Dados/checkins/georgia-filtrado/georgia-filtrado.csv


Inicando o HMRM...
total_places: 23452, total_users: 1159


Matrizes criadas...

Otimizando os parâmetros
0
1
2
3
4
5
6
7
8
9
### EMBEDDINGS GERADOS ###
(23452, 102)
Index([         0,          1,          2,          3,   

### **SVM**

In [None]:
# 5-fold cross-validation of a linear SVM on the HMRM embeddings.
# NOTE(review): `features_alabama` is not defined in any visible cell; it must
# be loaded before this cell runs. This cell assumes the layout written by
# HmrmBaseline.start: 100 embedding columns followed by the label column at
# position 100 - confirm against the cell that builds it.
label_position = 100
features = features_alabama.iloc[:, :label_position]  # all 100 embedding columns
labels = features_alabama.iloc[:, label_position]

# Fix the class order once, from the whole data set, so every fold reports
# per-class metrics in the same order and with the same length.
class_labels = sorted(set(labels))

kf = KFold(n_splits=5, shuffle=True, random_state=42)
split = kf.split(features, labels)
fscores, precisions, recalls = [], [], []

acc = []
w_avg_f = []
m_avg_f = []

w_avg_p = []
m_avg_p = []

w_avg_r = []
m_avg_r = []

for train_index, test_index in split:

    # KFold yields positional indices, so rows are selected with .iloc.
    X_train, Y_train = features.iloc[train_index], labels.iloc[train_index]
    x_test, y_test = features.iloc[test_index], labels.iloc[test_index]

    model = svm.SVC(
        kernel="linear", decision_function_shape='ovo',  class_weight="balanced")
    model.fit(X_train, Y_train)

    y_predicted = model.predict(x_test)

    precision, recall, fscore, support = score(
        y_test, y_predicted, labels=class_labels, zero_division=0)
    acc.append(accuracy_score(y_test, y_predicted))

    fscores.append(fscore)
    precisions.append(precision)
    recalls.append(recall)

    w_avg_f.append(f1_score(y_test, y_predicted, average='weighted'))
    m_avg_f.append(f1_score(y_test, y_predicted, average='macro'))

    w_avg_p.append(precision_score(
        y_test, y_predicted, average='weighted'))
    m_avg_p.append(precision_score(y_test, y_predicted, average='macro'))

    w_avg_r.append(recall_score(y_test, y_predicted, average='weighted'))
    m_avg_r.append(recall_score(y_test, y_predicted, average='macro'))

In [None]:
# One row per fold, one column per class, followed by the aggregate columns.
name_columns = list(class_labels)
metrics_f = pd.DataFrame(fscores, columns=name_columns)
metrics_p = pd.DataFrame(precisions, columns=name_columns)
metrics_r = pd.DataFrame(recalls, columns=name_columns)

for column, values in (
    ("accuracy", acc),
    ("macro avg", m_avg_f),
    ("weighted avg", w_avg_f),
):
    metrics_f[column] = values

for column, values in (("weighted avg", w_avg_p), ("macro avg", m_avg_p)):
    metrics_p[column] = values

for column, values in (("weighted avg", w_avg_r), ("macro avg", m_avg_r)):
    metrics_r[column] = values

for heading, table in (
    ("\nMétricas precision:", metrics_p),
    ("\n\nMétricas recall:", metrics_r),
    ("\n\nMétricas fscore:", metrics_f),
):
    print(heading)
    display(table)

In [None]:
# Distribution of the per-class F-score over the cross-validation folds.
# The class columns come from `name_columns` (built with the metric tables)
# instead of a hard-coded [0..6] list, so the plot follows the data.
melted_metrics_f = metrics_f[name_columns].melt()
palette = sns.color_palette("husl", n_colors=len(melted_metrics_f["variable"].unique()))

fig, ax = plt.subplots()
sns.boxplot(x="variable", y="value", hue="variable", data=melted_metrics_f,
            palette=palette, ax=ax)
# Each box is one class (its spread is over folds), so label it as such.
ax.set_xlabel("Class")
ax.set_ylabel("F-score")
ax.set_title("F-score per class across folds")
ax.legend(title="Class", loc="upper right")
plt.show()

Analisando as métricas, podemos concluir que o desempenho do modelo na classificação dos POIs com base no embedding gerado pelo HMRM não é muito alto. Isso sugere que o embedding pode não capturar todas as características importantes dos dados de check-in do Alabama, levando a um desempenho relativamente baixo na classificação dos POIs.

Porém, isso também pode ser consequência da definição dos parâmetros do próprio HMRM; talvez valha a pena estudar melhor, por exemplo, o número de componentes latentes (K), o peso e o tamanho do embedding. **=>** ***Se for isso, tenho algumas dúvidas:***

***1. faz sentido testar diferentes valores como no exemplo comentado na main? até achar um que dê resultados melhores?***

***2. ou esses resultados são satisfatórios já que o MTL "aprenderia e melhoraria" as informações?***