# Sequential Pattern Mining

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
import re
import textdistance
import nltk
import ast

from collections import OrderedDict
from sklearn import preprocessing
from numpy import nan
from random import randint
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from data.tgsp import *
from data.spmf import Spmf
from prefixspan import PrefixSpan
from gsppy.gsp import GSP
from apyori import apriori
from sklearn.cluster import AgglomerativeClustering 
from tqdm import tqdm
from functools import reduce

In [None]:
# Resources needed by normalize(): stop-word lists, the Punkt tokenizer and
# the averaged-perceptron POS tagger.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Recent NLTK releases (3.9+) look the tokenizer and tagger up under new
# resource names; fetch those too so word_tokenize / pos_tag keep working.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
# Load the cleaned dataset (tab-separated, comma used as decimal mark).
df = pd.read_csv("data/clean_df.csv", sep='\t', decimal=',')
# Print the column dtypes and non-null counts.
df.info()

## Products Generalization

In [None]:
# Distinct product descriptions present in the dataset.
products = df['ProductDescription'].unique()
print('Total products [{}]'.format(len(products)))

Generalization of products

In [None]:
def _stop_word_set():
    """Return the NLTK stop words of every language as a set, loaded once."""
    cached = getattr(_stop_word_set, '_cache', None)
    if cached is None:
        # stopwords.words() reads every language file from disk; doing that
        # once instead of once per token is what keeps normalize() usable.
        cached = frozenset(stopwords.words())
        _stop_word_set._cache = cached
    return cached


def normalize(text):
    """Reduce a product description to its informative nouns.

    Steps: lower-case, replace every run of non-alphanumeric characters with
    a space, strip digits, drop NLTK stop words, keep only tokens whose POS
    tag starts with ``'N'`` and discard colour/material words.

    Args:
        text: the raw description; any object is accepted and passed through
            ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces (possibly ``''``).
    """
    # Lower-case and collapse every run of non-alphanumerics into one space.
    # (No separate whitespace-collapsing pass is needed after this.)
    text = re.sub(r'[^a-z0-9]+', ' ', str(text).lower())

    # Removing numbers
    text = re.sub(r'[0-9]+', '', text)

    # NLTK stop-word removal
    stop_words = _stop_word_set()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]

    # Colour/material words carry no information about the product itself.
    # They are removed as whole tokens: a substring replace would also damage
    # unrelated words (e.g. "bored" -> "bo", "hundred" -> "hund").
    banned = {'red', 'blue', 'green', 'metal', 'pink', 'silver', 'yellow',
              'white', 'orange'}

    # Keep nouns only, according to POS tagging.
    nouns = [
        word
        for word, tag in nltk.pos_tag(tokens)
        if tag.startswith('N') and word not in banned
    ]
    return ' '.join(nouns)

def group_texts(texts, threshold=0.5):
    """Replace each text with the most central text of its cluster.

    Texts are normalized with ``normalize``, compared pairwise with the
    Jaro-Winkler distance and grouped by complete-linkage agglomerative
    clustering.

    Args:
        texts: iterable of raw texts.
        threshold: linkage distance above which clusters are not merged;
            this parameter needs to be tuned carefully.

    Returns:
        A list with one entry per input text, holding the normalized text of
        that text's cluster representative (the member with the smallest
        summed distance to the other members).
    """
    normalized_texts = np.array([normalize(text) for text in texts])
    len_list = len(normalized_texts)
    print('[DEBUG] Normalization Done')

    # Agglomerative clustering needs at least two samples; with fewer, every
    # text is trivially its own representative.
    if len_list < 2:
        return normalized_texts.tolist()

    # Full pairwise distance matrix (1 - similarity).
    distances = np.array([
        [1 - textdistance.jaro_winkler(left, right) for right in normalized_texts]
        for left in tqdm(normalized_texts)
    ])

    params = dict(distance_threshold=threshold, linkage="complete", n_clusters=None)
    try:
        # Current scikit-learn names the argument `metric`.
        model = AgglomerativeClustering(metric="precomputed", **params)
    except TypeError:
        # Older scikit-learn only knows the former name `affinity`.
        model = AgglomerativeClustering(affinity="precomputed", **params)
    clustering = model.fit(distances)

    centers = dict()
    for cluster_id in set(clustering.labels_):
        index = clustering.labels_ == cluster_id
        # Summed distance of every member to the other members of the cluster.
        centrality = distances[:, index][index].sum(axis=1)
        centers[cluster_id] = normalized_texts[index][centrality.argmin()]

    return [centers[i] for i in clustering.labels_]

# Generalize every distinct description to its cluster representative.
reduced_prod = group_texts(products)

print('------------Starting Products [{}]------------\n\n{}\n'.format(len(products), products))

print('------------Clustered Products [{}]------------\n\n{}'.format(len(set(reduced_prod)), reduced_prod))

# Share of distinct descriptions eliminated by the clustering:
# (before - after) / before, expressed as a percentage.
print('\nProducts reduced by {}%'.format(100*(len(products)-len(set(reduced_prod)))/len(products)))

In [None]:
# Assign the result back to the column. Calling replace(..., inplace=True) on
# the Series returned by df[...] is chained assignment, which does not update
# df when pandas Copy-on-Write is active.
df['ProductDescription'] = df['ProductDescription'].replace(products, reduced_prod)

Convert each cart date into a day number spanning 2010–2011 (day of the year, plus 365 for dates in 2011), then build the list of carts for each customer.

In [None]:
# Drop rows whose generalized description is missing or contains the token
# "nan". Word boundaries keep descriptions that merely contain those letters
# (e.g. "banana"); .copy() makes the result an independent frame so the
# column assignment below is well defined.
df = df[~df['ProductDescription'].str.contains(r'\bnan\b', na=True)].copy()

# Day number across the two years: day of year, plus 365 for dates in 2011.
# The column is parsed once, vectorized, instead of twice per row.
cart_dates = pd.to_datetime(df['CartDate'])
df['CartDate'] = cart_dates.dt.dayofyear + (cart_dates.dt.year == 2011) * 365

# One list of products per (customer, day) cart, then one list of carts per
# customer; groupby sorts its keys, so carts are in ascending day order.
groups = df.groupby(['CustomerID','CartDate'])['ProductDescription'].apply(list).reset_index(name='Products')
groups = groups.groupby('CustomerID')['Products'].apply(list).reset_index(name='Itemset')
# Number of carts of each customer.
groups['Carts'] = groups['Itemset'].apply(len)

groups.head()

In [None]:
# One row per customer holding the sorted, de-duplicated purchase days.
timestamps = (
    df.groupby(['CustomerID'])['CartDate']
      .apply(lambda days: sorted(set(days)))
      .reset_index(name='DayList')
)
timestamps.head()

In [None]:
# Load the customer classification (tab-separated, comma as decimal mark).
behaviour = pd.read_csv("data/customer_behaviour_class.csv", sep='\t', decimal=',')
# Only the customer id and its spending class are needed for the merge.
behaviour = behaviour[['CustomerID', 'CustomerType']]
behaviour.head()

In [None]:
# Join day lists, cart lists and spending class on the customer id.
merged_df = (
    timestamps
    .merge(groups, on='CustomerID')
    .merge(behaviour, on='CustomerID')
)
merged_df.head()

In [None]:
itemset, tmp = {}, {}

# Keep only customers with more than one cart.
merged_df = merged_df[merged_df['Carts'] > 1]

# For every spending profile collect the customers' cart lists (itemset)
# and the matching day lists (tmp).
for profile, customer_type in (('HIGH', 'high-spending'),
                               ('MEDIUM', 'medium-spending'),
                               ('LOW', 'low-spending')):
    segment = merged_df[merged_df['CustomerType'] == customer_type]
    itemset[profile] = segment['Itemset'].tolist()
    tmp[profile] = segment['DayList'].tolist()

In [None]:
# Persist the merged per-customer sequences; this file is read back below.
# The list-valued columns are written in their string form and the default
# index is written as an extra column.
merged_df.to_csv('data/[MERGED]SequenceDataset.csv')

## Algorithm Analysis

### TGSP

In [None]:
# Spending profiles and the minSupport value used for each of them.
profiles, vals = ['LOW','MEDIUM','HIGH'], [80,600,250]

# NOTE(review): this call passes sequences, timestamps and minSupport/minGap/
# maxGap/minInterval, so it presumably targets the `apriori` exported by
# data.tgsp. The later `from apyori import apriori` import rebinds that name;
# confirm which function is actually in scope here.
for profile, min_support in zip(profiles, vals):
    print('\n\n[{}] Profile \t [{}] Records\n'.format(profile, len(itemset[profile])))
    result_set, rules, freq = apriori(
        itemset[profile],
        tmp[profile],
        minSupport=min_support,
        minGap=1,
        maxGap=7,
        minInterval=1,
        verbose=True,
    )

### PrefixSpan

In [None]:
# Reload the merged sequence dataset from disk; note that this rebinds `df`.
df = pd.read_csv("data/[MERGED]SequenceDataset.csv", sep=',')
df.head()

In [None]:
# The CSV stores each customer's list of carts as text: `tmp` is a list of
# strings, each of which is the literal form of a list of lists.
tmp = df['Itemset'].values.astype(list).tolist()
# Parse every string back into real Python lists.
li = [ast.literal_eval(serialized) for serialized in tmp]
# Flatten customers -> carts: `products` holds one list of items per cart.
products = [cart for customer_carts in li for cart in customer_carts]

In [None]:
# Take unique products for each cart. The items are sorted because the
# iteration order of a set of strings changes between interpreter runs, which
# would make the order-sensitive miners below irreproducible; sorting also
# matches how the carts are prepared for SPMF.
products = [sorted(set(x)) for x in products]

In [None]:
# Number of carts (one entry of `products` per cart).
len(products)

In [None]:
# Build the PrefixSpan miner over the list of carts.
ps = PrefixSpan(products)

Patterns occurring in at least 10% of the carts

In [None]:
# Patterns with support of at least 1500, most frequent first.
# NOTE(review): 1500 is presumably 10% of the number of carts; confirm
# against len(products).
pattern_10 = sorted(ps.frequent(1500), key=lambda entry: entry[0], reverse=True)
print(pattern_10)

In [None]:
# (support, label) for the 20 most frequent patterns; the items of a pattern
# are put on separate lines of the tick label.
pat_tup = [(support, '\n'.join(items)) for support, items in pattern_10[:20]]
freq = [support for support, _ in pat_tup]
prod = [label for _, label in pat_tup]

plt.figure(figsize=(15,5))
plt.bar(prod, freq)
plt.xlabel('Product Description')
plt.ylabel('Frequence of Pattern')
plt.xticks(prod, rotation=45, ha='right')
plt.yticks(range(0, len(products), 1500))
plt.title('Top 20 of 10% Frequent patterns')
plt.show()

### GSPPy

In [None]:
# Run GSPPy over the carts with a minimum support argument of 0.1.
gsp_res = GSP(products).search(0.1)

In [None]:
# Take the first element of the search result (a pattern -> count mapping,
# presumably the single-item patterns) as a list of pairs.
gsp_res = list(gsp_res[0].items())
# (count, label) pairs, highest count first.
gsp_tup = sorted(
    ((count, '\n'.join(items)) for items, count in gsp_res),
    key=lambda pair: pair[0],
    reverse=True,
)
gsp_tup

In [None]:
# Bar chart of the 20 most frequent GSPPy patterns.
# gsp_tup holds (count, label) pairs sorted by count, highest first.
freq = [t[0] for t in gsp_tup[:20]]
prod = [t[1] for t in gsp_tup[:20]]
plt.figure(figsize=(15,5))
plt.bar(prod, freq)
plt.ylabel('Frequence of Pattern')
# y ticks every 500 up to the number of carts
plt.yticks(range(0, len(products), 500))
plt.xticks(prod, rotation=45, ha='right')
plt.xlabel('Product Description')
plt.title('Top 20 of 10% Frequent patterns GSPPy')
#plt.savefig('gsppy.png', bbox_inches = "tight")
plt.show()

### Apyori

In [None]:
# Mine association rules over the carts and materialize the results so they
# can be traversed more than once.
apriori_res = list(apriori(products, min_support=0.1, min_confidence=0.5))

In [None]:
# Print every rule, grouped by the itemset it was generated from.
for record in apriori_res:
    # record[2] contains all the rules for this itemset
    for rule in record[2]:
        antecedent = ', '.join(rule[0])
        consequent = ', '.join(rule[1])
        print("Rule: " + antecedent + " --> " + consequent)
        print("Confidence: "+ str(rule[2]))
        print("------------------------------------")

    # record[1] is the support of the itemset
    print("Support: " + str(record[1]))
    print("=====================================")

In [None]:
# One (support, "antecedent --> consequent", confidence) triple per rule.
# record[1] is the itemset support, record[2] the rules derived from it.
apyori_triples = [
    (record[1], ', '.join(rule[0]) + " --> " + ', '.join(rule[1]), rule[2])
    for record in apriori_res
    for rule in record[2]
]

In [None]:
# Order the rules by support, then by confidence, highest first.
apyori_triples.sort(key=lambda li: (li[0], li[2]), reverse=True)

In [None]:
# Top 20 rules as (Support, Rule, Confidence) triples
apyori_triples[:20]

### SPMF

In [None]:
# Prepare the input for the SPMF package, which takes a list (customers) of
# lists (carts) of lists (items).
# The reloaded dataframe stores every customer as text, in the form
# ["[[s1, s2], [s3, s4]]", "[[s5, s6]]"]; parse each string back into lists.
tmp = df['Itemset'].values.astype(list).tolist()
spmf_list = [ast.literal_eval(serialized) for serialized in tmp]

# SPMF works on numbers, so every product description is label-encoded.
# The encoder is fitted on the flat list of all products.
all_prod = [item for carts in spmf_list for cart in carts for item in cart]
le = preprocessing.LabelEncoder().fit(all_prod)

# Encode each cart, drop duplicated products and sort the codes.
spmf_list = [
    [sorted(set(le.transform(cart))) for cart in carts]
    for carts in spmf_list
]

In [None]:
# Basic statistics of the SPMF input: customers, carts and products.
print("Customer: "+ str(len(spmf_list)))
n_customer = len(spmf_list)
n_carts = sum(len(customer) for customer in spmf_list)
# Carts counted one by one give the same total as summing per customer.
n_cart = n_carts
n_prod = sum(len(cart) for customer in spmf_list for cart in customer)
print("Average Carts per Customer: "+ str(n_carts/n_customer))
print("Average Products per Cart: "+ str(n_prod/n_cart))

In [None]:
def enc2str(df, le):
    """Decode the label-encoded patterns of an SPMF result frame.

    For every entry of ``df['pattern']`` the element at index 0 is split on
    single spaces, each piece is cast to ``int`` and the resulting codes are
    passed to ``le.inverse_transform`` to recover the original products.

    NOTE(review): only element 0 of each pattern is decoded; if a pattern can
    hold more than one element the others are ignored. Confirm this is
    intended.

    Args:
        df: frame with a ``pattern`` column whose entries look like
            ``['n1 n2 n3']``.
        le: the fitted label encoder used to encode the products.

    Returns:
        A Series with one decoded result of ``le.inverse_transform`` per row.
    """
    def decode(pattern):
        codes = [int(token) for token in pattern[0].split(' ')]
        return le.inverse_transform(codes)

    return df['pattern'].apply(decode)

In [None]:
def plotSpmf(df, le, n=20, percentage=10, algorithm='', total_size=2000):
    """Plot the ``n`` most supported patterns of an SPMF result frame.

    Note that ``df['pattern']`` is overwritten in place with the decoded
    product names returned by ``enc2str``.

    Args:
        df: SPMF result frame with ``pattern`` and ``sup`` columns.
        le: label encoder used to encode the products.
        n: number of patterns shown in the plot.
        percentage: support percentage, used only in the title.
        algorithm: SPMF algorithm name, used only in the title.
        total_size: upper bound of the y ticks (size of the dataset passed
            to SPMF).
    """
    df['pattern'] = enc2str(df, le)

    # (label, support) pairs ordered by support, highest first.
    ranked = sorted(
        (('\n'.join(list(items)), support)
         for items, support in zip(df['pattern'], df['sup'])),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top = ranked[:n]
    prod = [label for label, _ in top]
    freq = [support for _, support in top]

    # Prepare plot
    plt.figure(figsize=(15,5))
    plt.bar(prod, freq)
    plt.xlabel('Product Description')
    plt.ylabel('Frequence of Pattern')
    plt.xticks(prod, rotation=45, ha='right')
    plt.yticks(range(0, total_size, 500))
    plt.title('Top '+str(n)+' of '+ str(percentage)+'% Frequent patterns SPMF '+algorithm)
    plt.show()

In [None]:
# Arguments = min support, max length, verbose
# Only the first argument (0.1) is passed here.
spmf = Spmf("PrefixSpan", input_direct=spmf_list, spmf_bin_location_dir='./data', arguments=[0.1])
spmf.run()
# Collect the mined patterns as a dataframe and plot the top 20.
patt_PS = spmf.to_pandas_dataframe()
plotSpmf(patt_PS, le, n=20, percentage=10, algorithm='PrefixSpan', total_size=len(spmf_list))

In [None]:
# Same input and argument (0.1), mined with the SPMF "GSP" algorithm.
spmf = Spmf("GSP", input_direct=spmf_list, spmf_bin_location_dir='./data', arguments=[0.1])
spmf.run()
# Collect the mined patterns as a dataframe and plot the top 20.
patt_GSP = spmf.to_pandas_dataframe()
plotSpmf(patt_GSP, le, n=20, percentage=10, algorithm='GSP', total_size=len(spmf_list))

In [None]:
# Same input and argument (0.1), mined with the SPMF "SPADE" algorithm.
spmf = Spmf("SPADE", input_direct=spmf_list, spmf_bin_location_dir='./data', arguments=[0.1])
spmf.run()
# Collect the mined patterns as a dataframe and plot the top 20.
patt_SPD = spmf.to_pandas_dataframe()
plotSpmf(patt_SPD, le, n=20, percentage=10, algorithm='SPADE', total_size=len(spmf_list))

In [None]:
#Arguments = min support, min pattern size, max pattern size, max gap
# Only the first argument (0.1) is passed here.
spmf = Spmf("SPAM", input_direct=spmf_list, spmf_bin_location_dir='./data', arguments=[0.1])
spmf.run()
# Collect the mined patterns as a dataframe and plot the top 20.
patt_SPM = spmf.to_pandas_dataframe()
plotSpmf(patt_SPM, le, n=20, percentage=10, algorithm='SPAM', total_size=len(spmf_list))

In [None]:
# SPAM again, this time with a second argument of 2; the plot label
# describes this run as patt_len>=2.
spmf = Spmf("SPAM", input_direct=spmf_list, spmf_bin_location_dir='./data', arguments=[0.1, 2])
spmf.run()
# Collect the mined patterns as a dataframe and plot the top 20.
patt_SPM2 = spmf.to_pandas_dataframe()
plotSpmf(patt_SPM2, le, n=20, percentage=10, algorithm='SPAM (patt_len>=2)', total_size=len(spmf_list))

In [None]:
# Same input and argument (0.1), mined with the SPMF "LAPIN" algorithm.
spmf = Spmf("LAPIN", input_direct=spmf_list, spmf_bin_location_dir='./data', arguments=[0.1])
spmf.run()
# Collect the mined patterns as a dataframe and plot the top 20.
patt_LPN = spmf.to_pandas_dataframe()
plotSpmf(patt_LPN, le, n=20, percentage=10, algorithm='LAPIN', total_size=len(spmf_list))