# Generalized Sequential Pattern mining

#### There is no reference library for GSP. The implementation used in this notebook does not support time constraints such as min gap and max span.

In [1]:
from gsp import *

#### Let's use a simple dataset

In [3]:
# Toy sequence database: one row per sequence, one inner list per element.
# An element's position is its timestep: index 0 is timestep 0, index 1 is
# timestep 1, and so on.
dataset = [
    [["a"], ["a", "b", "c"], ["a", "c"], ["c"]],
    [["a"], ["c"], ["b", "c"]],
    [["a", "b"], ["d"], ["c"], ["b"], ["c"]],
    [["a"], ["c"], ["b", "c"]],
]


In [4]:
# apriori takes the dataset, the minimum support and a boolean verbose flag.
# Judging by the counts in the output, the minimum support (2 here) looks like
# an absolute number of sequences rather than a fraction -- TODO confirm in gsp.
result_set = apriori(dataset, 2, verbose=False)

In [5]:
# Display the mined patterns; the output lists (pattern, count) pairs.
result_set

[([['a']], 4),
 ([['b']], 4),
 ([['c']], 4),
 ([['a', 'b']], 2),
 ([['b', 'c']], 3),
 ([['a'], ['b']], 4),
 ([['a'], ['c']], 4),
 ([['b'], ['c']], 2),
 ([['c'], ['b']], 3),
 ([['c'], ['c']], 4),
 ([['a'], ['b'], ['c']], 2),
 ([['a'], ['b', 'c']], 3),
 ([['a'], ['c'], ['b']], 3),
 ([['a'], ['c'], ['c']], 4),
 ([['a', 'b'], ['c']], 2),
 ([['b'], ['c'], ['c']], 2),
 ([['c'], ['b', 'c']], 2),
 ([['a'], ['c'], ['b', 'c']], 2),
 ([['a', 'b'], ['c'], ['c']], 2)]

In [6]:
# With verbose=True, apriori also prints its intermediate steps for each level:
# the candidates generated, the candidates left after pruning, and the result.
res2 = apriori(dataset, 3, verbose=True)

Candidates generated, lvl 2: [[['a', 'b']], [['a', 'c']], [['b', 'c']], [['a'], ['a']], [['a'], ['b']], [['a'], ['c']], [['b'], ['a']], [['b'], ['b']], [['b'], ['c']], [['c'], ['a']], [['c'], ['b']], [['c'], ['c']]]
Candidates pruned, lvl 2: [[['a', 'b']], [['a', 'c']], [['b', 'c']], [['a'], ['a']], [['a'], ['b']], [['a'], ['c']], [['b'], ['a']], [['b'], ['b']], [['b'], ['c']], [['c'], ['a']], [['c'], ['b']], [['c'], ['c']]]
Result, lvl 2: [([['b', 'c']], 3), ([['a'], ['b']], 4), ([['a'], ['c']], 4), ([['c'], ['b']], 3), ([['c'], ['c']], 4)]
Candidates generated, lvl 3: [[['a'], ['b', 'c']], [['a'], ['c'], ['b']], [['a'], ['c'], ['c']], [['b', 'c'], ['b']], [['b', 'c'], ['c']], [['c'], ['b', 'c']], [['c'], ['c'], ['b']], [['c'], ['c'], ['c']]]
Candidates pruned, lvl 3: [[['a'], ['b', 'c']], [['a'], ['c'], ['b']], [['a'], ['c'], ['c']], [['c'], ['b', 'c']], [['c'], ['c'], ['b']], [['c'], ['c'], ['c']]]
Result, lvl 3: [([['a'], ['b', 'c']], 3), ([['a'], ['c'], ['b']], 3), ([['a'], ['c'],

In [7]:
# Display the patterns obtained with the minimum support raised to 3.
res2

[([['a']], 4),
 ([['b']], 4),
 ([['c']], 4),
 ([['b', 'c']], 3),
 ([['a'], ['b']], 4),
 ([['a'], ['c']], 4),
 ([['c'], ['b']], 3),
 ([['c'], ['c']], 4),
 ([['a'], ['b', 'c']], 3),
 ([['a'], ['c'], ['b']], 3),
 ([['a'], ['c'], ['c']], 4)]

## Example with a bigger dataset

In [9]:
# Dataset of points of interest in Venice: one sequence per line of the file.
with open('./datasets/sequences_of_poits.text') as f:
    raw_lines = f.readlines()

# Every whitespace-separated token becomes a single-item event. split() with no
# arguments already ignores leading/trailing whitespace, so no strip() is needed.
# NOTE(review): the last token of each line is discarded -- presumably it is not
# a point of interest (terminator/label?); confirm against the file format.
content = [[[token] for token in line.split()[:-1]] for line in raw_lines]

print("Number of input sequences: ", len(content))
print("Total number of events: ", sum(len(sequence) for sequence in content))

Number of input sequences:  1728
Total number of events:  8638


In [10]:
# Inspect the first parsed sequence: a list of single-item events.
content[0]

[["Torre_dell'orologio"],
 ['Museo_Correr'],
 ['Chiesa_Santa_Maria_della_Fava'],
 ['Fontego_dei_Tedeschi'],
 ['Chiesa_di_San_Barnaba'],
 ["Gallerie_dell'Accademia"],
 ['Palazzo_Santa_Maria_del_Giglio'],
 ['Campo_San_Zaccaria']]

In [11]:
# Inspect a second example sequence.
content[13]

[['Chiesa_di_San_Trovaso'], ['Museo_Correr'], ['Campo_San_Benedetto']]

In [12]:
# Mine the Venice sequences with a minimum support of 60, then report how many
# entries the result holds.
res3 = apriori(content, 60, verbose=False)
print(len(res3))

85


In [13]:
# Print only the patterns with more than two elements, each followed on the
# next line by the count stored alongside it.
for entry in res3:
    pattern = entry[0]
    if len(pattern) <= 2:
        continue
    print(pattern, entry[1], sep="\n")

[['Fontego_dei_Tedeschi'], ['Palazzo_Ducale'], ['Palazzo_Ducale']]
60
[['Fontego_dei_Tedeschi'], ["Torre_dell'orologio"], ['Palazzo_Ducale']]
60
[['Museo_Correr'], ['Palazzo_Ducale'], ['Palazzo_Ducale']]
62
[['Palace_Gardens'], ['Palazzo_Ducale'], ['Palazzo_Ducale']]
61
[['Palazzo_Ducale'], ['Fontego_dei_Tedeschi'], ['Palazzo_Ducale']]
65
[['Palazzo_Ducale'], ['Museo_Correr'], ['Palazzo_Ducale']]
80
[['Palazzo_Ducale'], ['Palace_Gardens'], ['Palazzo_Ducale']]
80
[['Palazzo_Ducale'], ['Palazzo_Ducale'], ['Fontego_dei_Tedeschi']]
60
[['Palazzo_Ducale'], ['Palazzo_Ducale'], ['Palace_Gardens']]
63
[['Palazzo_Ducale'], ['Palazzo_Ducale'], ['Palazzo_Ducale']]
109
[['Palazzo_Ducale'], ['Palazzo_Ducale'], ["Torre_dell'orologio"]]
80
[['Palazzo_Ducale'], ["Torre_dell'orologio"], ['Palazzo_Ducale']]
106
[["Torre_dell'orologio"], ['Palazzo_Ducale'], ['Palazzo_Ducale']]
83
[["Torre_dell'orologio"], ['Palazzo_Ducale'], ["Torre_dell'orologio"]]
64
