In [1]:
import pandas as pd
import numpy as np
from shapely import wkt
import datetime as dt
import time

# Count support for all candidate 1-sequences

In [2]:
def isSubseq(seq, sub):
    n = len(seq)
    m = len(sub)
    j = 0

    for i in range(n):
        if seq[i] == sub[j]:
            j+=1
        if j == m:
            return True
    return False

def GSP(data, support, city):
    # Due to the scale of the data, limit the frequency mining to only the first month's data (30 days) to manage computational demands effectively.
    data['finished_at'] = pd.to_datetime(data['finished_at'])
    day31 = pd.date_range("1970-01-31", periods=1, freq="h")
    day31 = pd.to_datetime(day31, utc = True)
    data = data[data['finished_at']<day31[0]]
    
    data['geom'] = data['geom'].apply(wkt.loads)
    data['sequences'] = data['geom'].apply(lambda x: np.round(np.array(list(x.coords)))) # round to integer and create a list of coordinates
    data = data.reset_index(drop=True)

    print(data.shape)

    # use array to record subsequence counts with format [[elements], support, [support_indexes]]
    sequence_count = len(data)
    sequences = []
    items = set()
    
    for i in range(sequence_count):
        part = list(map(tuple, data['sequences'].iloc[i]))
        sequences.append(part)
        
    try:
        freq_df = pd.read_pickle(f'1-set_support_{city}.pkl')

    except:
        for i in range(sequence_count):
            for j in range(len(sequences[i])):
                items.add(tuple(sequences[i][j]))
    
        print(f'Candidate 1-sequences for city {city}: {len(items)}')
    
        freq_arr = []
        
        # count occurrences of candidate 1-sequences
        for item in items:
            support = 0
            support_indexes = []
            for i in range(sequence_count):
                if item in sequences[i]:
                    support+=1
                    support_indexes.append(i)
                    
            freq_arr.append([item, support, support_indexes])
    
        freq_df = pd.DataFrame(freq_arr, columns = ['itemset', 'support', 'support_indexes'])
        freq_df.to_pickle(f'1-set_support_{city}.pkl')

    min_sup = sequence_count * support
    freq_df = freq_df[freq_df['support']>=min_sup].reset_index(drop = True)
    print(f'Frequent 1-sequences for city {city}: {len(freq_df)}')
    
    # iterate over frequent 1-sequences
    for index in range(len(freq_df)):
        items = set()
        item_arr = freq_df.iloc[index]

        # generate candidate 2-sequences
        for i in item_arr['support_indexes']:
            n = len(sequences[i])

            for j in range(n-1):
                if sequences[i][j] == item_arr['itemset']:
                    for k in range(j+1, n):
                        items.add((item_arr['itemset'],sequences[i][k]))
                    break
                    
        # count occurrences of candidate 2-sequences
        for item in items:
            support = 0
            support_indexes = []

            # for m in range(sequence_count): # simple
            for m in item_arr['support_indexes']: # improved
                if isSubseq(sequences[m], item):
                    support+=1
                    support_indexes.append(m)
                    
            if support >= min_sup:
                freq_df = pd.concat([freq_df, pd.DataFrame([[item, support, support_indexes]], columns = ['itemset', 'support', 'support_indexes'])])

    freq_df.to_csv(f'frequent_itemset_{city}.csv', index = False)
    print(f'Frequent sequences for city {city}: {len(freq_df)}')

    return freq_df

In [None]:
start = time.time()

for city in ['D','C','B']:
    data = pd.read_csv(f'triplegs_{city}_31.csv')
    freq_df = GSP(data, 0.001, city)

end = time.time()
print(f'{(end-start)/60} minutes')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['geom'] = data['geom'].apply(wkt.loads)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sequences'] = data['geom'].apply(lambda x: np.round(np.array(list(x.coords)))) # round to integer and create a list of coordinates


(495921, 6)
Frequent 1-sequences for city D: 586
Frequent sequences for city D: 587


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['geom'] = data['geom'].apply(wkt.loads)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sequences'] = data['geom'].apply(lambda x: np.round(np.array(list(x.coords)))) # round to integer and create a list of coordinates


(1318140, 6)
Frequent 1-sequences for city C: 952


In [None]:
start = time.time()

data = pd.concat([pd.read_csv('triplegs_A_first.csv'),pd.read_csv('triplegs_A_first.csv')])
freq_df = GSP(data, 0.001, 'A')

end = time.time()
print(f'{(end-start)/60} minutes')