In [11]:
import numpy as np
import pandas as pd
import json

In [12]:
from mlxtend.frequent_patterns import association_rules, fpmax, fpgrowth, apriori

In [25]:
from mlxtend.preprocessing import TransactionEncoder

In [41]:
# Load the transactions that the TransactionEncoder is fitted on further down.
# 'arr_0' is the name np.savez gives to the first array saved positionally.
# The archive is opened in a context manager so the underlying file handle is
# closed deterministically instead of relying on garbage collection.
with np.load('dataset_values.npz') as dataset_file:
    dataset_values = dataset_file['arr_0']
# Keep the original end state: the name exists but no longer references the archive.
dataset_file = None

In [42]:
# Load one value per dataset row; it is used below as the exponent of the
# row's sampling weight (weight = 2 ** rows_lens[i]).
# NOTE(review): the file is called 'rows_probas' but the variable 'rows_lens';
# confirm what the stored quantity actually is.
# The archive is opened in a context manager so the underlying file handle is
# closed deterministically instead of relying on garbage collection.
with np.load('rows_probas.npz') as rows_lens_file:
    rows_lens = rows_lens_file['arr_0']
# Keep the original end state: the name exists but no longer references the archive.
rows_lens_file = None

In [43]:
# Positional ids 0..n-1 of the rows, one per entry of rows_lens.
rows_idecies = np.arange(0, rows_lens.shape[0])

# Unnormalised sampling weight of every row: 2 ** rows_lens[i].
# - `np.float` was removed in NumPy 1.24, so the explicit np.float64 is used.
# - The power is computed in floating point for the whole array at once, which
#   also avoids int64 overflow / "negative integer power" errors that an
#   integer-typed rows_lens would trigger with `2 ** rows_lens[i]`.
# Assumes rows_lens is one-dimensional (one exponent per row) - TODO confirm.
proba = np.power(2.0, np.asarray(rows_lens, dtype=np.float64))

# Normalise so the weights form a probability vector (sums to 1).
proba /= np.sum(proba)

In [68]:
# id_inv_map: integer item id -> 'column__value' key. JSON object keys are
# always strings, so they are converted back to int while loading.
with open('id_inv_map.json', 'r') as f:
    id_inv_map = {int(item_id): key for item_id, key in json.load(f).items()}

# unique_ids: the reverse lookup, 'column__value' key -> integer item id.
unique_ids = dict(zip(id_inv_map.values(), id_inv_map.keys()))

# original_cols: the column names that are combined with a value to build the
# 'column__value' keys.
with open('original_cols.json', 'r') as f:
    original_cols = json.load(f)

In [26]:
# Learn the set of distinct items from the transactions, then encode every
# transaction as one row of per-item flags.
te = TransactionEncoder()
te.fit(dataset_values)
te_ary = te.transform(dataset_values)

In [29]:
# Wrap the encoded matrix in a DataFrame: one row per transaction, one column
# per item, labelled with the encoder's item ids (te.columns_).
df = pd.DataFrame(te_ary, columns=te.columns_)

In [30]:
# Preview the first three encoded transactions (boolean flag per item id).
df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2742,2743,2744,2745,2746,2747,2748,2749,2750,2751
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [65]:
def key_gen(col_name, val):
    """Build the item key for a (column, value) pair: '<col_name>__<val>'."""
    return f'{col_name}__{val}'


def filter_binary_dataset(binary_df: pd.DataFrame, unique_ids:dict, original_cols: list):
    """Remove, in place, the item columns that encode the value -1.

    For every name in ``original_cols`` the key ``'<name>__-1'`` is looked up
    in ``unique_ids``; when present, the column of ``binary_df`` labelled with
    the mapped id is dropped.
    NOTE(review): -1 presumably marks a missing/unknown value - confirm.

    Parameters
    ----------
    binary_df : pd.DataFrame
        Frame whose column labels are item ids. It is modified in place.
    unique_ids : dict
        Maps item keys (as produced by ``key_gen``) to item ids.
    original_cols : list
        Column names used to build the ``'<name>__-1'`` keys.

    Returns
    -------
    None
        The function only mutates ``binary_df``.
    """
    missing_keys = (key_gen(col_name, -1) for col_name in original_cols)
    to_remove = [unique_ids[key] for key in missing_keys if key in unique_ids]
    # errors='ignore' makes the call idempotent: running the cell a second time
    # (columns already gone) no longer raises KeyError.
    binary_df.drop(columns=to_remove, inplace=True, errors='ignore')


In [69]:
# Drop the '<column>__-1' item columns from df. The frame is modified in place
# and the call returns nothing.
filter_binary_dataset(df, unique_ids, original_cols)

In [23]:
def calculate_real_support(frequent_items: pd.DataFrame, df: pd.DataFrame, verbose: bool = True):
    """Recompute the support of every itemset against ``df``.

    The support of an itemset is the fraction of rows of ``df`` in which all
    of the itemset's columns are truthy.

    Parameters
    ----------
    frequent_items : pd.DataFrame
        Frame with an ``'itemsets'`` column (iterables of column labels of
        ``df``) and a ``'support'`` column. It is not modified.
    df : pd.DataFrame
        Binary (boolean or 0/1) frame with one column per item.
    verbose : bool, default True
        When true, print ``<recomputed support> <original support>`` for every
        itemset, as the function has always done.

    Returns
    -------
    pd.DataFrame
        A copy of ``frequent_items`` whose ``'support'`` column holds the
        recomputed values.
    """
    result = frequent_items.copy()

    for i in frequent_items.index:
        cols = list(frequent_items.at[i, 'itemsets'])

        # A row supports the itemset when every selected column is truthy.
        # Reducing in NumPy also covers the empty itemset (every row matches,
        # support 1.0) instead of failing on cols[0].
        row_matches = df[cols].to_numpy(dtype=bool).all(axis=1)
        result.at[i, 'support'] = row_matches.mean()

        if verbose:
            print(result.at[i, 'support'], frequent_items.at[i, 'support'])

    return result

In [80]:
def merge_frequent_patterns(frequent_patterns: list):
    """Concatenate several frequent-itemset frames, dropping duplicate itemsets.

    Parameters
    ----------
    frequent_patterns : list
        Frames that each have a ``'support'`` and an ``'itemsets'`` column.
        The itemsets must be hashable (e.g. ``frozenset``).

    Returns
    -------
    pd.DataFrame
        Frame with columns ``['support', 'itemsets']`` and a fresh 0..n-1
        index. When an itemset occurs more than once, the support of its
        first occurrence (in input order) is kept.
    """
    seen = set()
    supports = []
    itemsets = []

    for frequent_pattern in frequent_patterns:
        # Iterate over the values directly: the previous `.iloc[label]` lookup
        # was only correct for frames with a default 0..n-1 index.
        for support, itemset in zip(frequent_pattern['support'], frequent_pattern['itemsets']):
            if itemset in seen:
                continue
            seen.add(itemset)
            supports.append(support)
            itemsets.append(itemset)

    # Build the frame once instead of growing it one row at a time.
    return pd.DataFrame({'support': supports, 'itemsets': itemsets},
                        columns=['support', 'itemsets'])
    
def get_frequent_patterns(binary_df: pd.DataFrame, proba: np.ndarray, trials=10, min_support=0.5,
                          sample_size=100000, random_state=None):
    """Mine maximal frequent itemsets over several weighted random sub-samples.

    For each trial, rows of ``binary_df`` are drawn without replacement with
    probabilities ``proba``, ``fpmax`` is run on the sub-sample, and the
    per-trial results are finally merged with ``merge_frequent_patterns``.

    Parameters
    ----------
    binary_df : pd.DataFrame
        Binary frame, one row per transaction and one column per item.
    proba : np.ndarray
        Sampling probability of every row; must have one entry per row of
        ``binary_df`` and sum to 1.
    trials : int, default 10
        Number of sub-samples to mine.
    min_support : float, default 0.5
        Minimum support passed to ``fpmax`` (relative to the sub-sample).
    sample_size : int, default 100000
        Rows drawn per trial; capped at the number of rows of ``binary_df``.
    random_state : int or None, default None
        Seed for a dedicated ``np.random.RandomState``. ``None`` keeps using
        NumPy's global random state, as before.

    Returns
    -------
    pd.DataFrame
        Merged frame with columns ``['support', 'itemsets']``. Supports are
        those measured on the sub-sample in which the itemset was first found.

    Raises
    ------
    ValueError
        Raised by the sampler, e.g. when ``proba`` does not have one entry
        per row or has fewer non-zero entries than the sample size.
    """
    n_rows = binary_df.shape[0]
    # Sampling without replacement cannot draw more rows than exist.
    sample_size = min(sample_size, n_rows)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    frequent_patterns = []

    for i in range(trials):
        print(i)
        # Sample from the frame that was passed in, not from notebook globals.
        selected_rows = rng.choice(n_rows, size=sample_size, replace=False, p=proba)
        sub_sample = binary_df.iloc[selected_rows]

        fpmax_set = fpmax(sub_sample, min_support=min_support, use_colnames=True)

        frequent_patterns.append(fpmax_set)

    frequent_patterns = merge_frequent_patterns(frequent_patterns)
    return frequent_patterns

In [81]:
# Mine maximal frequent itemsets (fpmax) on 20 weighted random sub-samples with
# a minimum support of 0.4, merged into a single frame. Prints the trial number
# as progress.
frequent_patterns = get_frequent_patterns(df, proba, trials=20, min_support=0.4)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [82]:
def decompose_key(k:str):
    """Split an item key '<col_name>__<val>' into ``(col_name, int(val))``.

    The key is split on the LAST '__' so that column names which themselves
    contain '__' (or end with '_') are still parsed correctly.

    Raises
    ------
    ValueError
        If the key contains no '__' separator or the value part is not an
        integer literal.
    """
    col_name, val = k.rsplit('__', 1)
    return col_name, int(val)

In [83]:
# Attach a human-readable description to every item of every merged pattern.
# `res` becomes a list of (items, [support]) tuples, where `items` is a list of
# (column name, value, description) triples.
# NOTE(review): the file name is spelled 'descritption.json' - presumably it
# matches the file on disk; rename both together if it is ever corrected.
with open('descritption.json', 'r') as f:  # closed automatically after loading
    desc = json.load(f)

res = []
for _, itemset in frequent_patterns.iterrows():
    l = [itemset['support']]

    it = []
    # A separate loop name is used so the item id no longer shadows the row index.
    for item_id in itemset['itemsets']:

        col_name, val = decompose_key(id_inv_map[item_id])
        col_desc = desc[col_name]['description']

        if col_desc['type'] == 'mono':
            # 'mono' columns carry one description regardless of the value.
            d = col_desc['value']
        else:
            # Otherwise the description is looked up per value (string key).
            d = col_desc[str(val)]

        it.append((col_name, val, d))

    res.append((it, l))

In [84]:
# Print every described pattern, each followed by a blank line.
for r in res:  # slice res (e.g. res[s:s + 10]) to page through the output
    print(r)
    print()

([('grav', 1, 'Indemne'), ('actp', 0, 'non spécifié ou non applicable')], [0.41472])

([('trajet', 5, 'Promenade - loisirs')], [0.42622])

([('agg', 2, 'En agglomération'), ('int', 1, 'Hors intersection')], [0.40003])

([('agg', 2, 'En agglomération'), ('sexe', 1, 'Homme')], [0.4245])

([('catv', 7, 'VL uniquement'), ('agg', 2, 'En agglomération')], [0.45489])

([('agg', 2, 'En agglomération'), ('lum', 1, 'Plein Jour')], [0.46438])

([('actp', 0, 'non spécifié ou non applicable'), ('catu', 1, 'Pilote'), ('atm', 1, 'Normal'), ('place', 1, 'Place occupée'), ('agg', 2, 'En agglomération')], [0.40017])

([('actp', 0, 'non spécifié ou non applicable'), ('lum', 1, 'Plein Jour'), ('sexe', 1, 'Homme')], [0.41886])

([('catv', 7, 'VL uniquement'), ('actp', 0, 'non spécifié ou non applicable'), ('sexe', 1, 'Homme')], [0.41847])

([('actp', 0, 'non spécifié ou non applicable'), ('int', 1, 'Hors intersection'), ('sexe', 1, 'Homme')], [0.43107])

([('actp', 0, 'non spécifié ou non applicable'), ('c

In [89]:
# Derive association rules from the merged itemsets.
# support_only=True: only the rule support is computed; the other metric
# columns (confidence, lift, ...) are left as NaN, as the displayed table shows.
# Presumably needed because fpmax keeps only maximal itemsets, so the supports
# of antecedents/consequents are not available - TODO confirm.
# Per mlxtend's documentation, min_threshold=0.4 then applies to the support.
frequent_assoc_rules = association_rules(frequent_patterns, min_threshold=0.4, support_only=True)

In [90]:
# Display the generated rules (antecedent/consequent item ids and support).
frequent_assoc_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(2570),(2596),,,0.41472,,,,
1,(2596),(2570),,,0.41472,,,,
2,(1499),(1501),,,0.40003,,,,
3,(1501),(1499),,,0.40003,,,,
4,(1499),(2575),,,0.42450,,,,
...,...,...,...,...,...,...,...,...,...
261,"(1512, 2596)",(1499),,,0.45610,,,,
262,"(1499, 2596)",(1512),,,0.45610,,,,
263,(1512),"(1499, 2596)",,,0.45610,,,,
264,(1499),"(1512, 2596)",,,0.45610,,,,
