# SEQUENTIAL PATTERN MINING

In [2]:
#### Importing Libraries
import numpy as np
import pandas as pd
from matplotlib import cm
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# load dataset
from numpy import dstack
from pandas import read_csv
 
# load a single file as a numpy array
def load_file(filepath):
    """Read a whitespace-delimited, header-less text file into a NumPy array.

    Parameters
    ----------
    filepath : str or path-like
        Location of the text file to read.

    Returns
    -------
    numpy.ndarray
        The parsed table, as returned by ``DataFrame.values``.
    """
    # sep=r'\s+' is the documented replacement for delim_whitespace=True,
    # which is deprecated in recent pandas releases and emits a FutureWarning.
    dataframe = pd.read_csv(filepath, header=None, sep=r'\s+')
    return dataframe.values
 
# load a list of files, such as x, y, z data for a given variable
def load_group(filenames, prefix=''):
    """Load several files and stack them depth-wise into a single array.

    Each entry of ``filenames`` is appended to ``prefix`` and read with
    ``load_file``; the per-file arrays are then combined with ``np.dstack``,
    so every file occupies one index along the third axis of the result.
    """
    per_file_arrays = [load_file(prefix + name) for name in filenames]
    return np.dstack(per_file_arrays)
 
# load a dataset group, such as train or test
def load_dataset(group):
    """Load the inertial signals and the class labels of one dataset split.

    ``group`` names the split directory (this notebook uses 'train' and
    'test'). Nine signal files are read from ``<group>/Inertial Signals/``
    in this order: total acceleration, body acceleration and body gyroscope,
    each for the x, y and z axes. Labels come from ``<group>/y_<group>.txt``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the stacked signal array produced by
        ``load_group`` and ``y`` is the label array produced by ``load_file``.
    """
    signal_dir = group + '/Inertial Signals/'
    # Order matters: it decides which signal lands on which index of the
    # last axis of X.
    signal_kinds = ('total_acc', 'body_acc', 'body_gyro')
    signal_files = [
        kind + '_' + axis + '_' + group + '.txt'
        for kind in signal_kinds
        for axis in 'xyz'
    ]
    X = load_group(signal_files, signal_dir)
    y = load_file(group + '/y_' + group + '.txt')
    return X, y
 
# Load the training split and print the shapes of its signal and label arrays
X_train, y_train = load_dataset('train')
print(X_train.shape, y_train.shape)

# Load the test split the same way and print its shapes
X_test, y_test = load_dataset('test')
print(X_test.shape, y_test.shape)

(7352, 128, 9) (7352, 1)
(2947, 128, 9) (2947, 1)


In [4]:
# Keep only the first stacked signal (index 0 is total_acc_x, the first file
# listed in load_dataset). The ndim guards make this cell safe to re-run:
# once the arrays are 2-D, slicing them again with [:, :, 0] would raise an
# IndexError.
if X_train.ndim == 3:
    X_train = X_train[:, :, 0]
if X_test.ndim == 3:
    X_test = X_test[:, :, 0]

In [6]:
# Report the shape of each split after the single-signal selection
for label, split in (('X_train shape:', X_train), ('X_test shape:', X_test)):
    print(label, split.shape)

X_train shape: (7352, 128)
X_test shape: (2947, 128)


In [8]:
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.piecewise import SymbolicAggregateApproximation

In [28]:
# Display the shape of the training array that is about to be scaled
X_train.shape

(7352, 128)

In [9]:
# Build the tslearn mean/variance scaler, configured with mu=0 and std=1
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)  # Rescale time series

In [12]:
# Rescale the training series; the result is kept separately in X_scaled.
# X_test is not transformed anywhere in the visible cells.
X_scaled = scaler.fit_transform(X_train)

In [13]:
# NOTE(review): this displays X_train again, which the scaling step did not
# modify; X_scaled.shape may have been the intended check — confirm.
X_train.shape

(7352, 128)

In [97]:
# SAX transform: configure the symbolic approximation applied to the scaled series
n_paa_segments = 12  # passed to the transformer as n_segments
n_sax_symbols = 8  # passed to the transformer as alphabet_size_avg
sax = SymbolicAggregateApproximation(n_segments=n_paa_segments, alphabet_size_avg=n_sax_symbols)

In [98]:
# Fit the SAX transformer on the scaled training series and encode them in one step
X_sax = sax.fit_transform(X_scaled)

In [99]:
# Display the shape of the SAX-encoded array
X_sax.shape

(7352, 12, 1)

In [100]:
# Flatten each SAX-encoded series into a 1-D array, one entry per series
X_seq = [series.ravel() for series in X_sax]

In [101]:
# Number of flattened sequences collected for pattern mining
len(X_seq)

7352

In [102]:
from prefixspan import PrefixSpan

In [103]:
# Exploratory: print the PrefixSpan class help to see which methods it offers.
# NOTE(review): leftover API inspection; consider removing it from the final notebook.
help(PrefixSpan)

Help on class PrefixSpan in module prefixspan.prefixspan:

class PrefixSpan(builtins.object)
 |  PrefixSpan(db)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, db)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  defaultkey lambda patt, matches
 |  
 |  frequent = PrefixSpan_frequent(self, minsup, closed=False, generator=False, key=None, bound=None, filter=None, callback=None)
 |  
 |  topk = PrefixSpan_topk(self, k, closed=False, generator=False, key=None, bound=None, filter=None, callback=None)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [104]:
# Build the PrefixSpan object over the flattened symbol sequences;
# it is queried with ps.frequent(...) in the following cell.
ps = PrefixSpan(X_seq)

In [108]:
# Mine sequential patterns whose support is at least 10% of the sequences.
# NOTE: despite the "_80" suffix (kept because later cells use these names),
# the threshold applied here is 10%, not 80%.
MIN_SUPPORT_FRACTION = 0.10
MIN_PATTERN_LENGTH = 4
# Derive the absolute threshold from the data instead of hard-coding 7352.
patterns_80 = ps.frequent(len(X_seq) * MIN_SUPPORT_FRACTION)  # min_sup
# Each result is a (support, pattern) pair; keep only the longer patterns.
listafreq_80 = [
    entry for entry in patterns_80 if len(entry[1]) >= MIN_PATTERN_LENGTH
]
len(listafreq_80)

526

In [116]:
# Rank the retained patterns from highest to lowest support, then preview the top ten
new_list = list(listafreq_80)
new_list.sort(key=lambda entry: entry[0], reverse=True)
new_list[:10]

[(1250, [4, 4, 4, 4]),
 (1239, [2, 2, 2, 2]),
 (1221, [4, 4, 4, 3]),
 (1218, [3, 3, 3, 3]),
 (1209, [4, 4, 3, 3]),
 (1209, [2, 2, 2, 6]),
 (1209, [2, 6, 2, 2]),
 (1198, [4, 3, 3, 3]),
 (1197, [2, 2, 6, 2]),
 (1189, [6, 2, 2, 2])]

In [None]:
# Re-filter the already mined patterns_80 down to patterns with at least four
# symbols. No new mining happens here: the ps.frequent call that used to head
# this cell was disabled and never given a complete threshold.
listafreq_80 = [entry for entry in patterns_80 if len(entry[1]) >= 4]
len(listafreq_80)

In [113]:
# Order the filtered patterns by descending support and show the ten most frequent
new_list = list(listafreq_80)
new_list.sort(key=lambda pair: pair[0], reverse=True)
new_list[:10]

[(1250, [4, 4, 4, 4]),
 (1239, [2, 2, 2, 2]),
 (1221, [4, 4, 4, 3]),
 (1218, [3, 3, 3, 3]),
 (1209, [4, 4, 3, 3]),
 (1209, [2, 2, 2, 6]),
 (1209, [2, 6, 2, 2]),
 (1198, [4, 3, 3, 3]),
 (1197, [2, 2, 6, 2]),
 (1189, [6, 2, 2, 2])]