In [1]:
import pandas as pd
import numpy as np
from shapely import wkt
import datetime as dt
import time

In [2]:
def isSubseq(seq, sub):
    """Return True if `sub` is an ordered subsequence of `seq`.

    The elements of `sub` must appear in `seq` in the same relative order,
    but they do not have to be adjacent. Matching is greedy: each element of
    `sub` is matched against the earliest still-unused position in `seq`.

    Parameters
    ----------
    seq : sequence
        The sequence to search in.
    sub : sequence
        The pattern to look for. Must support `len()` and integer indexing.

    Returns
    -------
    bool
        True when every element of `sub` was matched in order, else False.
        An empty `sub` is a subsequence of every sequence (including an
        empty one), so it always yields True.
    """
    m = len(sub)
    # An empty pattern is trivially contained; without this guard `sub[0]`
    # below would raise IndexError for any non-empty `seq`.
    if m == 0:
        return True

    j = 0  # index of the next element of `sub` that still needs a match
    for element in seq:
        if element == sub[j]:
            j += 1
            if j == m:
                return True
    return False

In [3]:
# Record the wall-clock start time; the last cell subtracts it to report the total runtime.
start = time.time()

In [None]:
# Minimum fraction of trips that must contain a pattern for it to be reported as frequent.
MIN_SUPPORT_RATIO = 0.01
# Exclusive upper bound on `finished_at`; trips finishing at or after this instant are dropped.
CUTOFF = pd.Timestamp("1970-01-31", tz="UTC")

for city in ['D','C','B']:
    data = pd.read_csv(f'triplegs_{city}.csv')

    # Due to the scale of the data, limit the frequency mining to only the first month's data
    # (30 days) to manage computational demands effectively.
    # NOTE(review): utc=True assumes any tz-naive timestamps in the CSV are meant as UTC — confirm.
    data['finished_at'] = pd.to_datetime(data['finished_at'], utc=True)
    # .copy() so the column assignments below write to an independent frame rather than to a
    # slice of the unfiltered one.
    data = data[data['finished_at'] < CUTOFF].copy().reset_index(drop=True)

    data['geom'] = data['geom'].apply(wkt.loads)
    # Snap every coordinate to the nearest whole number (np.round keeps the float dtype) so
    # that nearby points compare equal, giving one (n_points, n_dims) array per trip.
    data['sequences'] = data['geom'].apply(lambda geom: np.round(np.array(list(geom.coords))))

    print(data.shape)

    sequence_count = len(data)
    min_sup = sequence_count * MIN_SUPPORT_RATIO

    # Each trip becomes a list of hashable coordinate tuples.
    sequences = [list(map(tuple, coords)) for coords in data['sequences']]

    # Result rows have the format [pattern, support, support_indexes].
    freq_arr = []

    # --- frequent 1-sequences ---------------------------------------------------------------
    # Build an inverted index item -> ascending list of trip indexes in a single pass over the
    # data, instead of scanning every trip once per distinct item.
    item_support = {}
    for idx, seq in enumerate(sequences):
        # dict.fromkeys de-duplicates within the trip while keeping first-seen order, so each
        # trip contributes at most once to an item's support.
        for item in dict.fromkeys(seq):
            item_support.setdefault(item, []).append(idx)

    for item, support_indexes in item_support.items():
        if len(support_indexes) >= min_sup:
            freq_arr.append([item, len(support_indexes), support_indexes])

    print(f'Frequent 1-sequences for city {city}: {len(freq_arr)}')

    # --- frequent 2-sequences ---------------------------------------------------------------
    # Iterate over a snapshot because freq_arr is extended inside the loop.
    for item, _, item_indexes in list(freq_arr):
        # (item, follower) -> ascending list of trip indexes in which `follower` occurs
        # somewhere after the FIRST occurrence of `item`. This is the same condition a greedy
        # subsequence match of the pair would test, evaluated once per trip instead of once
        # per (candidate, trip) combination.
        pair_support = {}
        for idx in item_indexes:
            seq = sequences[idx]
            # `item` is guaranteed to be present because idx comes from its support list.
            followers = seq[seq.index(item) + 1:]
            for follower in dict.fromkeys(followers):
                pair_support.setdefault((item, follower), []).append(idx)

        for pair, support_indexes in pair_support.items():
            if len(support_indexes) >= min_sup:
                freq_arr.append([pair, len(support_indexes), support_indexes])

    # Rows are emitted in first-seen order (deterministic) rather than set-iteration order.
    freq_df = pd.DataFrame(freq_arr, columns = ['itemset', 'support', 'support_indices'])
    freq_df.to_csv(f'frequent_subsequences_{city}.csv', index = False)
    print(f'Frequent sequences for city {city}: {len(freq_arr)}')

(382178, 6)
Frequent 1-sequences for city D: 0
Frequent sequences for city D: 0
(981350, 6)
Frequent 1-sequences for city C: 6
Frequent sequences for city C: 6
(1294493, 6)


In [None]:
# Report the wall-clock time elapsed since `start` was recorded, converted from seconds to minutes.
end = time.time()
print(f'{(end-start)/60} minutes')