In [4]:
import pandas as pd
import numpy as np

from datetime import datetime
import os

In [5]:
def get_dataset_paths(state_name, root_path='/Users/vitor/Desktop/mestrado/ingred/data/output'):
    """Build the file paths used by the pre-processing steps for one dataset.

    Side effect: creates ``{root_path}/{state_name}/pre-processing/`` when it
    does not exist yet.

    Args:
        state_name: Dataset folder name, also used as the file-name prefix
            (e.g. ``'alabama'``).
        root_path: Base output directory (``str`` or ``os.PathLike``). Defaults
            to the location the notebook was originally written against; pass
            another directory to run the notebook on a different machine.

    Returns:
        list[str]: Paths in a fixed order, because callers index the list
        positionally:
            [0] embeddings CSV                 (``{state}-embeddings.csv``)
            [1] filtered check-ins CSV         (``{state}-filtrado.csv``)
            [2] next-POI sequences CSV         (``pre-processing/nextpoi-sequences.csv``)
            [3] next-POI model input CSV       (``pre-processing/nextpoi-input.csv``)
            [4] category-POI model input CSV   (``pre-processing/categorypoi-input.csv``)
    """
    root_path = os.fspath(root_path)
    state_dir = f'{root_path}/{state_name}'

    path_embeddings = f'{state_dir}/{state_name}-embeddings.csv'
    path_filtrated = f'{state_dir}/{state_name}-filtrado.csv'

    # Create the pre-processing directory if it does not exist.
    os.makedirs(f'{state_dir}/pre-processing/', exist_ok=True)
    path_nextpoi_sequences = f'{state_dir}/pre-processing/nextpoi-sequences.csv'
    path_nextpoi_input = f'{state_dir}/pre-processing/nextpoi-input.csv'
    path_categorypoi_input = f'{state_dir}/pre-processing/categorypoi-input.csv'

    paths = [path_embeddings, path_filtrated, path_nextpoi_sequences, path_nextpoi_input, path_categorypoi_input]

    return paths


def get_embeddings(path, state_name):
    """Load an embeddings CSV and report its shape.

    Args:
        path: Location of the CSV file to read.
        state_name: Label used only in the printed shape message.

    Returns:
        pandas.DataFrame: The CSV contents exactly as parsed by ``pd.read_csv``.
    """
    frame = pd.read_csv(path)
    rows_and_columns = frame.shape
    print(f'embeddings_df {state_name} shape: {rows_and_columns}')
    return frame

In [6]:
# alabama: resolve the dataset file paths, then load the embeddings CSV (paths[0])
paths_alabama = get_dataset_paths('alabama')
embeddings_alabama = get_embeddings(paths_alabama[0], 'alabama')

embeddings_df alabama shape: (10499, 102)


In [None]:
# arizona: resolve the dataset file paths, then load the embeddings CSV (paths[0])
paths_arizona = get_dataset_paths('arizona')
embeddings_arizona = get_embeddings(paths_arizona[0], 'arizona')

embeddings_df arizona shape: (16357, 102)


In [None]:
# virginia: resolve the dataset file paths, then load the embeddings CSV (paths[0])
paths_virginia = get_dataset_paths('virginia')
embeddings_virginia = get_embeddings(paths_virginia[0], 'virginia')

embeddings_df virginia shape: (20947, 102)


In [None]:
# chicago: resolve the dataset file paths, then load the embeddings CSV (paths[0])
paths_chicago = get_dataset_paths('chicago')
embeddings_chicago = get_embeddings(paths_chicago[0], 'chicago')

embeddings_df chicago shape: (9092, 102)


In [None]:
# georgia: resolve the dataset file paths, then load the embeddings CSV (paths[0])
paths_georgia = get_dataset_paths('georgia')
embeddings_georgia = get_embeddings(paths_georgia[0], 'georgia')

embeddings_df georgia shape: (23452, 102)


### **category-poi input preprocessing**

#### preprocessing

In [7]:
def gen_categorypoi_input(embeddings, paths):
    """Save the embeddings, indexed by ``placeid``, as the category-POI input CSV.

    This function only re-indexes the embeddings it receives (described in the
    original notes as the ones generated by HMRM) and writes them to
    ``paths[4]``; no values are transformed.

    Args:
        embeddings: DataFrame with a ``placeid`` column. It is not modified:
            ``set_index`` returns a new frame, so no explicit copy is needed.
        paths: Path list whose element ``[4]`` is the destination CSV.

    Returns:
        None. The result is the CSV written to ``paths[4]`` (with ``placeid``
        as the index column) plus the printed shape/column summary.
    """
    output_path = paths[4]

    categorypoi_input = embeddings.set_index('placeid')
    print(f'categorypoi_input shape: {categorypoi_input.shape}')
    print(f'categorypoi_input columns: {categorypoi_input.columns}\n')

    categorypoi_input.to_csv(output_path)

    # Report whether the file is present on disk after the write.
    if os.path.exists(output_path):
        print(f'categorypoi_input saved successfully at {output_path}')
    else:
        print(f'Error: Failed to save categorypoi_input at {output_path}')

#### generating input

In [8]:
# alabama: write the category-POI input CSV (paths_alabama[4])
gen_categorypoi_input(embeddings_alabama, paths_alabama)

categorypoi_input shape: (10499, 101)
categorypoi_input columns: Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '91', '92', '93', '94', '95', '96', '97', '98', '99', 'category'],
      dtype='object', length=101)

categorypoi_input saved successfully at /Users/vitor/Desktop/mestrado/ingred/data/output/alabama/pre-processing/categorypoi-input.csv


In [None]:
# arizona: write the category-POI input CSV (paths_arizona[4])
gen_categorypoi_input(embeddings_arizona, paths_arizona)

categorypoi_input shape: (16357, 101)
categorypoi_input columns: Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '91', '92', '93', '94', '95', '96', '97', '98', '99', 'category'],
      dtype='object', length=101)

categorypoi_input saved successfully at /content/drive/MyDrive/POC/Dados/pre-processing/arizona/categorypoi-input.csv


In [None]:
# virginia: write the category-POI input CSV (paths_virginia[4])
gen_categorypoi_input(embeddings_virginia, paths_virginia)

categorypoi_input shape: (20947, 101)
categorypoi_input columns: Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '91', '92', '93', '94', '95', '96', '97', '98', '99', 'category'],
      dtype='object', length=101)

categorypoi_input saved successfully at /content/drive/MyDrive/POC/Dados/pre-processing/virginia/categorypoi-input.csv


In [None]:
# chicago: write the category-POI input CSV (paths_chicago[4])
gen_categorypoi_input(embeddings_chicago, paths_chicago)

categorypoi_input shape: (9092, 101)
categorypoi_input columns: Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '91', '92', '93', '94', '95', '96', '97', '98', '99', 'category'],
      dtype='object', length=101)

categorypoi_input saved successfully at /content/drive/MyDrive/Graduacao/POC/Dados/pre-processing/chicago/categorypoi-input.csv


In [None]:
# georgia: write the category-POI input CSV (paths_georgia[4])
gen_categorypoi_input(embeddings_georgia, paths_georgia)

categorypoi_input shape: (23452, 101)
categorypoi_input columns: Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '91', '92', '93', '94', '95', '96', '97', '98', '99', 'category'],
      dtype='object', length=101)

categorypoi_input saved successfully at /content/drive/MyDrive/Graduacao/POC/Dados/pre-processing/georgia/categorypoi-input.csv


### **next-poi input preprocessing**

#### preprocessing

In [9]:
def sort_checkins(checkins):
    """Parse check-in timestamps and return them in visiting order.

    Args:
        checkins: Iterable of strings formatted as ``'%Y-%m-%d %H:%M:%S'``.

    Returns:
        list[datetime]: The parsed timestamps, ordered by day and then by
        time of day.

    Raises:
        ValueError: If any string does not match the expected format
            (raised by ``datetime.strptime``).
    """
    parsed = []
    for raw in checkins:
        parsed.append(datetime.strptime(raw, '%Y-%m-%d %H:%M:%S'))

    # Order by visit day first, then by time within the day.
    parsed.sort(key=lambda moment: (moment.date(), moment.time()))
    return parsed


def generate_sequences(places_visiteds, max_sequences_per_user):
    """Cut one user's ordered visits into fixed-length windows of 10 places.

    Windows start every 9 positions, so the last place of one window is the
    first place of the next. A window that runs past the end of the list is
    left-padded with ``-1`` up to length 10. When ``len(places_visiteds) - 1``
    is a multiple of 9, the final window therefore holds nine ``-1`` pads
    followed by the last place only.

    Args:
        places_visiteds: List of place ids in visiting order.
        max_sequences_per_user: Accepted for interface compatibility; this
            implementation does not use it, so no cap is applied.

    Returns:
        ``0`` when the user has 5 visits or fewer (sentinel the caller filters
        on), otherwise a list of 10-element lists.
    """
    window_size, stride, pad_value = 10, 9, -1
    total_visits = len(places_visiteds)

    if not total_visits > 5:
        return 0

    windows = []
    start = 0
    while start < total_visits:
        window = places_visiteds[start:start + window_size]
        missing = window_size - len(window)
        if missing > 0:
            # Short tail: pad on the left so the real visits stay at the end.
            window = [pad_value] * missing + window
        windows.append(window)
        start += stride

    return windows

In [10]:
def gen_sequences_csv(users_ids, sequences, paths):
    """Flatten every user's visit sequences into one CSV row per sequence.

    Rows are collected in a list and the DataFrame is built once, instead of
    growing a DataFrame with ``.loc[row] = ...`` for each sequence (which
    re-allocates the frame on every insertion).

    Args:
        users_ids: Iterable of user ids to export, in output order.
        sequences: DataFrame indexed by user id whose first column holds, for
            each user, a list of sequences (each a list of place ids).
        paths: Path list whose element ``[2]`` is the destination CSV.

    Returns:
        None. Writes ``paths[2]`` with columns ``userid, 1, ..., 10`` (no index
        column) and prints the resulting shape and a success/error message.
    """
    column_names = ['userid'] + list(range(1, 11))
    records = []

    for user_id in users_ids:
        user_sequences = sequences.loc[user_id].iloc[0]

        for trajectory in user_sequences:
            # Positions are 1-based so they line up with the column names.
            record = {position: place for position, place in enumerate(trajectory, start=1)}
            record['userid'] = user_id
            records.append(record)

    nextpoi_sequences = pd.DataFrame(records, columns=column_names)
    print(f'nextpoi_sequences shape: {nextpoi_sequences.shape}')

    # Finally, save every sequence to a single CSV.
    nextpoi_sequences.to_csv(paths[2], index=False)
    if os.path.exists(paths[2]):
        print(f'Success: nextpoi_sequences saved successfully at {paths[2]}\n')
    else:
        print(f'Error: Failed to save nextpoi_sequences at {paths[2]}\n')

In [11]:
def _append_nextpoi_rows(samples, column_names, csv_path):
    """Append ``samples`` (one flat row of cell values each) to ``csv_path``.

    The header is written only when the file does not exist yet, so calling
    this with an empty ``samples`` list still creates a header-only file.
    """
    write_header = not os.path.exists(csv_path)
    chunk = pd.DataFrame(samples, columns=column_names)
    chunk.to_csv(csv_path, mode='a', header=write_header, index=False)


def gen_nextpoi_input_csv(df_nextpoi_sequences, paths, embeddings_with_category, embeddings_without_category,
                          save_step=1000):
    """Build the next-POI model input row by row and checkpoint it to ``paths[3]``.

    For each sequence of 10 place ids, the output row is the concatenation of:
      * the embeddings of places 1..9, flattened (9 x embedding width values),
      * the ``category`` of places 2..10 (9 values),
      * the sequence's ``userid`` (1 value).

    Resumable: if ``paths[3]`` already exists, the number of rows it holds is
    taken as the number of sequences already processed and work restarts from
    the next one. The existing file is not validated against
    ``df_nextpoi_sequences``; delete it to regenerate from scratch.

    New rows are buffered in memory and appended to the CSV in chunks, rather
    than inserting each row into a DataFrame and rewriting the full file at
    every checkpoint.

    Args:
        df_nextpoi_sequences: DataFrame indexed by ``userid`` with columns
            labelled ``'1'`` .. ``'10'`` holding place ids.
        paths: Path list whose element ``[3]`` is the destination CSV.
        embeddings_with_category: DataFrame indexed by place id with a
            ``category`` column.
        embeddings_without_category: DataFrame indexed by place id holding
            only the embedding columns.
        save_step: A checkpoint is flushed whenever the row number is a
            multiple of this value.

    Returns:
        None. The result is the CSV at ``paths[3]``.

    Raises:
        KeyError: If a place id in a sequence is missing from the embeddings.
        ValueError: If a built row does not have ``9 * width + 10`` values.
    """
    nextpoi_input_file = paths[3]

    if os.path.exists(nextpoi_input_file):
        # Only the row count is needed to resume, so read a single column.
        start_index = pd.read_csv(nextpoi_input_file, usecols=['userid']).shape[0]
        print(f'Já existe e recomecei do indice {start_index} no nextpoi_input\n')
    else:
        start_index = 0

    # 9 embeddings + 9 target categories + userid (910 columns for width 100).
    embedding_width = embeddings_without_category.shape[1]
    column_names = list(range(embedding_width * 9 + 9)) + ['userid']

    # Labels of the places whose category is the prediction target.
    target_labels = [str(position) for position in range(2, 11)]

    total_rows = len(df_nextpoi_sequences)
    remaining = df_nextpoi_sequences.iloc[start_index:]
    pending = []

    for row_number, (userid, sequence) in enumerate(remaining.iterrows(), start=start_index):
        targets = sequence.loc[target_labels]
        target_categories = embeddings_with_category.loc[targets]['category']
        history = sequence.drop('10')

        poi_embedding = np.array(embeddings_without_category.loc[history])
        sample = np.append(poi_embedding, list(target_categories) + [userid])
        pending.append(sample)

        if row_number % save_step == 0:
            _append_nextpoi_rows(pending, column_names, nextpoi_input_file)
            pending.clear()
            print(f'Saved {row_number} rows to {nextpoi_input_file}')
            print(f'Processando linha {row_number} do df_nextpoi_sequences')

    # Flush whatever is left; also creates a header-only file when there was
    # nothing to process and no file existed.
    _append_nextpoi_rows(pending, column_names, nextpoi_input_file)
    print(f'Final save: {max(start_index, total_rows)} rows to {nextpoi_input_file}')

In [12]:
def gen_nextpoi_input(embeddings, paths, max_sequences_per_user):
    """Run the whole next-POI pre-processing pipeline for one dataset.

    Steps: read the filtered check-ins (``paths[1]``), order them per user,
    cut each user's visits into 10-place sequences, write those sequences to
    ``paths[2]``, then expand them into the model input at ``paths[3]``.

    Args:
        embeddings: DataFrame with ``placeid``, the embedding columns and a
            ``category`` column. It is not modified.
        paths: Path list as produced by ``get_dataset_paths``.
        max_sequences_per_user: Forwarded to ``generate_sequences``.

    Returns:
        pandas.DataFrame: The next-POI input re-read from ``paths[3]``.
    """
    # Order each user's check-ins by visit date and time.
    ordered_checkins = pd.read_csv(paths[1]).sort_values(by=['userid', 'datetime'])
    checkins_by_user = ordered_checkins.groupby('userid').agg(list).reset_index()
    checkins_by_user['datetime'] = checkins_by_user['datetime'].apply(sort_checkins)
    print(f'checkins_by_user shape: {checkins_by_user.shape}\n')

    # From the visit order, build sequences of 10 POIs where the last one is
    # the one to predict; users that yield the 0 sentinel are dropped.
    visit_sequences = checkins_by_user['placeid'].apply(
        lambda visited: generate_sequences(visited, max_sequences_per_user))
    checkins_sequence_by_user = checkins_by_user[['userid']].assign(visit_sequence=visit_sequences)
    has_sequences = checkins_sequence_by_user['visit_sequence'] != 0
    checkins_sequence_by_user = checkins_sequence_by_user[has_sequences]
    print(f'checkins_sequence_by_user shape{checkins_sequence_by_user.shape}\n')

    # Each user's sequences, indexed by userid.
    sequences = checkins_sequence_by_user.set_index('userid')

    # Every user being considered.
    users_ids = sequences.index.unique()
    print(f'total users: {len(users_ids)}\n')

    # Embeddings with category, indexed by placeid.
    embeddings_with_category = embeddings.copy().set_index('placeid')
    print(f'embeddings with category shape: {embeddings_with_category.shape}')

    # Embeddings without category, indexed by placeid.
    embeddings_without_category = embeddings.drop(columns=['category']).set_index('placeid')
    print(f'embeddings without category shape: {embeddings_without_category.shape}')

    # All-zero embedding registered under place id -1, used by padded sequences.
    emb_zeros = [0] * 100

    embeddings_with_category.loc[-1] = emb_zeros + [0]
    embeddings_with_category.loc[-1, 'category'] = 'None'

    padding_row = pd.DataFrame([emb_zeros], columns=embeddings_without_category.columns, index=[-1])
    embeddings_without_category = pd.concat([embeddings_without_category, padding_row])
    print(f'embeddings without category shape after add emb_zeros: {embeddings_without_category.shape}\n')

    # Write one CSV row per visit sequence of each user.
    gen_sequences_csv(users_ids, sequences, paths)

    # Read the place-id sequences back, indexed by the sequence's userid.
    df_nextpoi_sequences = pd.read_csv(paths[2]).set_index('userid')

    # Generate the final CSV: the next-POI input.
    gen_nextpoi_input_csv(df_nextpoi_sequences, paths, embeddings_with_category, embeddings_without_category)

    return pd.read_csv(paths[3])

#### generating input

##### alabama

In [13]:
# alabama: build the next-POI sequences and model input, then load the result
# NOTE(review): `a` is a second name bound to the same DataFrame; it looks like a
# leftover scratch alias — confirm nothing uses it before removing.
a = nextpoi_input_alabama = gen_nextpoi_input(embeddings_alabama, paths_alabama, max_sequences_per_user=50)

checkins_by_user shape: (465, 20)

checkins_sequence_by_user shape(465, 2)

total users: 465

embeddings with category shape: (10499, 101)
embeddings without category shape: (10499, 100)
embeddings without category shape after add emb_zeros: (10500, 100)

nextpoi_sequences shape: (10582, 11)
Success: nextpoi_sequences saved successfully at /Users/vitor/Desktop/mestrado/ingred/data/output/alabama/pre-processing/nextpoi-sequences.csv

def_nextpoi_sequences shape: (10582, 10)

            1      2      3      4      5      6      7      8      9     10
userid                                                                      
18       9531   9531   9532   9533   9693   9695  10599  25712  25732  25743
18      25743  25762  26573  26580  27015  26573  25712  25712  26580  26580
18      26580  29832  27015  26573  31937  34482   9695  35621  33042  26580
18      26580  27015  26573  25712  26580  25712  26573  27015  26573  26580
18      26580  46500  25712  25732  26580  25712  26573  70

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1026f63a0>>
Traceback (most recent call last):
  File "/Users/vitor/Desktop/mestrado/ingred/.venv/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Saved 2000 rows to /Users/vitor/Desktop/mestrado/ingred/data/output/alabama/pre-processing/nextpoi-input.csv
Processando linha 2000 do df_nextpoi_sequences


KeyboardInterrupt: 

##### arizona

In [None]:
# arizona: build the next-POI sequences and model input, then load the result
nextpoi_input_arizona = gen_nextpoi_input(embeddings_arizona, paths_arizona, max_sequences_per_user=50)

checkins_by_user shape: (756, 8)

checkins_sequence_by_user shape(756, 2)

total users: 756

embeddings with category shape: (16357, 101)
embeddings without category shape: (16357, 100)
embeddings without category shape after add emb_zeros: (16358, 100)

nextpoi_sequences shape: (17242, 11)
Success: nextpoi_sequences saved successfully at /content/drive/MyDrive/POC/Dados/pre-processing/arizona/nextpoi-sequences.csv

def_nextpoi_sequences shape: (17242, 10)

Saved 0 rows to /content/drive/MyDrive/POC/Dados/pre-processing/arizona/nextpoi-input.csv
Processando linha 0 do df_nextpoi_sequences
Saved 1000 rows to /content/drive/MyDrive/POC/Dados/pre-processing/arizona/nextpoi-input.csv
Processando linha 1000 do df_nextpoi_sequences
Saved 2000 rows to /content/drive/MyDrive/POC/Dados/pre-processing/arizona/nextpoi-input.csv
Processando linha 2000 do df_nextpoi_sequences
Saved 3000 rows to /content/drive/MyDrive/POC/Dados/pre-processing/arizona/nextpoi-input.csv
Processando linha 3000 do df_ne

##### virginia

In [None]:
# virginia: build the next-POI sequences and model input, then load the result
nextpoi_input_virginia = gen_nextpoi_input(embeddings_virginia, paths_virginia, max_sequences_per_user=50)

checkins_by_user shape: (1059, 8)

checkins_sequence_by_user shape(1059, 2)

total users: 1059

embeddings with category shape: (20947, 101)
embeddings without category shape: (20947, 100)
embeddings without category shape after add emb_zeros: (20948, 100)

nextpoi_sequences shape: (22189, 11)
Success: nextpoi_sequences saved successfully at /content/drive/MyDrive/POC/Dados/pre-processing/virginia/nextpoi-sequences.csv

def_nextpoi_sequences shape: (22189, 10)

Saved 0 rows to /content/drive/MyDrive/POC/Dados/pre-processing/virginia/nextpoi-input.csv
Processando linha 0 do df_nextpoi_sequences
Saved 1000 rows to /content/drive/MyDrive/POC/Dados/pre-processing/virginia/nextpoi-input.csv
Processando linha 1000 do df_nextpoi_sequences
Saved 2000 rows to /content/drive/MyDrive/POC/Dados/pre-processing/virginia/nextpoi-input.csv
Processando linha 2000 do df_nextpoi_sequences
Saved 3000 rows to /content/drive/MyDrive/POC/Dados/pre-processing/virginia/nextpoi-input.csv
Processando linha 3000 

##### chicago

In [None]:
# chicago: build the next-POI sequences and model input, then load the result
nextpoi_input_chicago = gen_nextpoi_input(embeddings_chicago, paths_chicago, max_sequences_per_user=50)

checkins_by_user shape: (861, 14)

checkins_sequence_by_user shape(861, 2)

total users: 861

embeddings with category shape: (9092, 101)
embeddings without category shape: (9092, 100)
embeddings without category shape after add emb_zeros: (9093, 100)

nextpoi_sequences shape: (15706, 11)
Success: nextpoi_sequences saved successfully at /content/drive/MyDrive/Graduacao/POC/Dados/pre-processing/chicago/nextpoi-sequences.csv

def_nextpoi_sequences shape: (15706, 10)

Saved 0 rows to /content/drive/MyDrive/Graduacao/POC/Dados/pre-processing/chicago/nextpoi-input.csv
Processando linha 0 do df_nextpoi_sequences
Saved 1000 rows to /content/drive/MyDrive/Graduacao/POC/Dados/pre-processing/chicago/nextpoi-input.csv
Processando linha 1000 do df_nextpoi_sequences
Saved 2000 rows to /content/drive/MyDrive/Graduacao/POC/Dados/pre-processing/chicago/nextpoi-input.csv
Processando linha 2000 do df_nextpoi_sequences
Saved 3000 rows to /content/drive/MyDrive/Graduacao/POC/Dados/pre-processing/chicago/n

##### georgia

In [None]:
# georgia: build the next-POI sequences and model input, then load the result
nextpoi_input_georgia = gen_nextpoi_input(embeddings_georgia, paths_georgia, max_sequences_per_user=50)

checkins_by_user shape: (1159, 8)

checkins_sequence_by_user shape(1159, 2)

total users: 1159

embeddings with category shape: (23452, 101)
embeddings without category shape: (23452, 100)
embeddings without category shape after add emb_zeros: (23453, 100)

nextpoi_sequences shape: (31197, 11)
Success: nextpoi_sequences saved successfully at /content/drive/MyDrive/Graduacao/POC/Dados/pre-processing/georgia/nextpoi-sequences.csv

def_nextpoi_sequences shape: (31197, 10)

Já existe e recomecei do indice 26001 no nextpoi_input

Saved 27000 rows to /content/drive/MyDrive/Graduacao/POC/Dados/pre-processing/georgia/nextpoi-input.csv
Processando linha 27000 do df_nextpoi_sequences
Saved 28000 rows to /content/drive/MyDrive/Graduacao/POC/Dados/pre-processing/georgia/nextpoi-input.csv
Processando linha 28000 do df_nextpoi_sequences
Saved 29000 rows to /content/drive/MyDrive/Graduacao/POC/Dados/pre-processing/georgia/nextpoi-input.csv
Processando linha 29000 do df_nextpoi_sequences
Saved 30000 r