In [None]:
import os
import sys
import numpy as np
import pandas as pd
import pysubgroup as ps
import pickle
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn import tree

In [None]:
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'SDQL'))

In [None]:
# Directory holding the saved CSV inputs. It keeps a trailing separator because
# later cells build file names by plain concatenation (saved_path + file name).
# Joining with an empty last component appends the platform's own separator
# instead of a hard-coded Windows backslash, so the path also works on POSIX.
saved_path = os.path.join(os.path.dirname(os.getcwd()), 'Saved Data', '')

In [None]:
from sd_analysis import *
from subgroup_discovery import *
from sd_postprocessing import *

In [None]:
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
sns.set(color_codes=True)
%matplotlib inline
plt.rcParams["figure.figsize"] = [16, 6]

In [None]:
import warnings
warnings.filterwarnings("ignore")
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')

In [None]:
requetes = pd.read_csv(saved_path + 'requetes.csv', index_col=[0])

## USE CASES

### Use case 1: Execution time

#### Table = mvtrealise

In [None]:
table = 'fr.infologic.stocks.gestion.modele.mvtrealise'

In [None]:
requetes_ = get_df_table (requetes, table) 
get_analysis(requetes_)

In [None]:
# The dict literal below is evaluated and discarded (a cell only displays its
# last expression); it acts as a label for the run configured in this cell.
{'algorithm' : 'beam search' , 'mesure' : 'Support'}

# Subgroup discovery on the selected table with the 'Support' measure and
# target 'discDurations', searching one level deep with a beam of 200.
result1_supp = sd_binary_table (requetes, 
                       table   = table,
                       _target = 'discDurations',
                       mesure  = 'Support',
                       _depth  = 1,
                       threshold = 10000,
                       result_size = 200,
                       algorithm   = 'Beam Search',
                       _beam_width = 200)
res1_supp = result1_supp.to_dataframe()
# Show only the subgroups whose 'coverage_sg' exceeds 0.94.
res1_supp[res1_supp['coverage_sg'] > 0.94]

In [None]:
requetes.columns[:33]

In [None]:
res1_supp[~res1_supp['subgroup'].str.contains('==0')]

In [None]:
res1_supp['subgroup'][166]

In [None]:
{'algorithm' : 'beam search' , 'mesure' : 'Lift'}

result1_lift = sd_binary_table (requetes, 
                       table   = table,
                       _target = 'discDurations',
                       mesure  = 'Lift',
                       _depth  = 2,
                       threshold = 10000,
                       result_size = 100,
                       algorithm   = 'Beam Search',
                       _beam_width = 100)
res1_lift = result1_lift.to_dataframe()

In [None]:
res1_lift[res1_lift['target_share_sg'] > 0.95].sort_values(by='coverage_sg', ascending = False)[:10]

In [None]:
res1_lift['subgroup'][12]

In [None]:
{'algorithm' : 'beam search' , 'mesure' : 'WRAcc'}

result1_wracc = sd_binary_table (requetes, 
                       table   = table,
                       _target = 'discDurations',
                       mesure  = 'WRAcc',
                       _depth  = 2,
                       threshold = 10000,
                       result_size = 100,
                       algorithm   = 'Beam Search',
                       _beam_width = 100)
res1_wracc = result1_wracc.to_dataframe()
res1_wracc[res1_wracc['positives_sg'] > 10].sort_values(by='target_share_sg', ascending = False)[:10]

In [None]:
{'algorithm' : 'beam search' , 'mesure' : 'Binomial'}

result1_binomial = sd_binary_table (requetes, 
                       table   = table,
                       _target = 'discDurations',
                       mesure  = 'Binomial',
                       _depth  = 2,
                       threshold = 10000,
                       result_size = 100,
                       algorithm   = 'Beam Search',
                       _beam_width = 100)
res1_binomial = result1_binomial.to_dataframe()
res1_binomial[res1_binomial['positives_sg'] < 20000].sort_values(by='target_share_sg', ascending = False)[:10]

In [None]:
res1_binomial[res1_binomial['positives_sg'] < 20700].sort_values(by='target_share_sg', ascending = False)[:]

In [None]:
# Binomial results whose description mentions 'versionBDD'. The mask has to be
# built from the frame being indexed: the original indexed res1_wracc with a mask
# computed on res1_binomial, which is only aligned by row label and therefore
# selected unrelated WRAcc rows.
res1_binomial[res1_binomial['subgroup'].str.contains('versionBDD')]

In [None]:
res1_binomial['subgroup'][18]

In [None]:
similarity_sgs(result1_wracc.to_descriptions(), 20, requetes_, color=True)

In [None]:
# NOTE(review): result2_std and requetes_tables are not assigned in any visible
# cell of this notebook (unless a star import provides them), so this cell is
# expected to fail on a fresh kernel — confirm where they should come from.
d, d_names, sg_names = greedy_jaccard(result2_std.to_descriptions(),20, requetes_tables, 0.5)
# Print every key of d_names (presumably the retained subgroups — TODO confirm).
for sg in d_names.keys() :
    print(sg)

In [None]:
similarity_dendrogram(result2_std.to_descriptions(), 20, requetes_tables)

In [None]:
indices2_std = similarity_dendrogram(result2_std.to_descriptions(), 20, requetes_tables,truncated = True, p = 14)

In [None]:
res2_raf = res2_std[res2_std.index.isin(indices2_std)]
res2_raf

#### DeclinaisonCOP = Ventes with > 100 users

In [None]:
dict_conds = {'nbUtilisateurs' : {'gt' : 100}, 'DeclinaisonCOP' : 'Ventes'}       

In [None]:
# Numerical-target subgroup discovery restricted by dict_conds, with target
# 'durationMS' and the 'mean' measure; keeps the 10 best subgroups of depth 1.
result3 = sd_numerical_conds (requetes,
                          dict_conds = dict_conds ,
                          _target = 'durationMS',
                          mesure = 'mean',
                          coef_sg_size = 0.5,
                          _depth  = 1,
                          result_size = 10,
                          algorithm   = 'Beam Search',
                          _beam_width = 20)
res3 = result3.to_dataframe()
# Display the full result table.
res3

In [None]:
res3['mean_sg']/res3['mean_dataset']

In [None]:
res3['subgroup'][3]

In [None]:
# NOTE(review): within this notebook, plot_distribution_numeric_test and
# requetes_ventes are first defined in later cells, so this cell raises NameError
# when run top to bottom on a fresh kernel (unless the star-imported project
# modules also provide them) — the defining cells should come first.
plot_distribution_numeric_test(result3.to_descriptions(), requetes_ventes, 1, 'durationMS')

In [None]:
def plot_distribution_numeric_test(sgs, data, bins, target):
    """Overlay the target distribution of the first subgroups on the whole data set.

    Draws density-normalised step histograms of ``target`` for up to the first
    four entries of ``sgs`` (labelled subgroup1..subgroup4) and a filled
    histogram for all of ``data``, everything on a shared value range.

    Parameters
    ----------
    sgs : sequence
        Subgroup results; element ``[1]`` of each entry must expose
        ``covers(data)``, whose result is used to index ``data``.
    data : pandas.DataFrame
        Data the subgroups are evaluated on; must contain the ``target`` column.
    bins : int
        Accepted for signature compatibility but NOT used: every histogram is
        drawn with 20 bins, exactly as before.
        NOTE(review): the existing caller passes 1 here, so honouring it would
        change the figure — confirm before wiring it in.
    target : str
        Name of the numeric column to plot.

    Returns
    -------
    None
        The figure is drawn on a new matplotlib figure as a side effect.
    """
    plt.subplots(figsize=(4, 2))
    target_values_data = data[target].values
    # Common range so subgroup and overall histograms share the same bin edges.
    value_range = (np.amin(target_values_data), np.amax(target_values_data))
    maxlim = 0
    # Slicing (rather than indexing 0..3) tolerates result lists with fewer
    # than four subgroups instead of raising IndexError.
    for rank, entry in enumerate(sgs[:4], start=1):
        sg = entry[1]
        target_values_sg = data[sg.covers(data)][target].values
        if target_values_sg.size == 0:
            # An empty cover has no maximum and nothing to draw; skip it
            # rather than letting np.amax raise ValueError.
            continue
        plt.hist(target_values_sg, bins=20, range=value_range, linewidth=1.5,
                 histtype=u'step', alpha=0.5, label="subgroup" + str(rank), density=True)
        maxlim = max(maxlim, np.amax(target_values_sg))
    plt.hist(target_values_data, bins=20, range=value_range, linewidth=1.5,
             alpha=0.5, label="Overall Data", density=True)
    # Fixed padding of 42000 beyond the largest subgroup value (constant kept
    # from the original; its rationale is not documented).
    plt.xlim(0, maxlim + 42000)
    plt.xlabel('time')
    #plt.yscale('log')
    plt.ticklabel_format(axis="x", style="sci", scilimits=(0,0))
    plt.legend(loc='upper right')

In [None]:
# NOTE(review): despite the names result3_mean / res_mean3, this run is
# configured with mesure = 'median' — the names and the measure disagree.
result3_mean = sd_numerical_conds (requetes,
                          dict_conds = dict_conds ,
                          _target = 'durationMS',
                          mesure = 'median',
                          coef_sg_size = 0.5,
                          _depth  = 1,
                          result_size = 10,
                          algorithm   = 'Beam Search',
                          _beam_width = 100)
res_mean3 = result3_mean.to_dataframe()
# Display the full result table.
res_mean3

In [None]:
# NOTE(review): despite the names result3_std / res_std, this run is configured
# with mesure = 't-score' — the names and the measure disagree.
result3_std = sd_numerical_conds (requetes,
                          dict_conds = dict_conds ,
                          _target = 'durationMS',
                          mesure = 't-score',
                          coef_sg_size = 0.5,
                          _depth  = 1,
                          result_size = 100,
                          algorithm   = 'Beam Search',
                          _beam_width = 100)
res_std = result3_std.to_dataframe()
# Display the results ordered by decreasing 'size_sg'.
res_std.sort_values(by='size_sg', ascending = False)

In [None]:
res_std[13:17]

In [None]:
requetes_ventes = get_df_conditions(requetes, dict_conds)

In [None]:
similarity_dendrogram(result3_mean.to_descriptions(), 20, requetes_ventes)

In [None]:
result3_std.to_descriptions()[:20]

In [None]:
indices = similarity_dendrogram(result3_std.to_descriptions(), 20, requetes_ventes,truncated = True, p = 10)

In [None]:
indices

In [None]:
# Ratio mean_sg / std_sg for the first 20 rows. The parentheses matter: without
# them the [:20] slice binds to the denominator only, and index alignment then
# yields NaN for every row past the 20th instead of truncating the result.
(res_std['mean_sg'] / res_std['std_sg'])[:20]

In [None]:
def plot_distribution_numeric(sg, data, bins, target):
    """Compare the target distribution inside one subgroup with the whole data set.

    Two density-normalised histograms of ``target`` are drawn on a new figure:
    one for the rows selected by ``sg.covers(data)`` and one for all of ``data``.
    Both use 100 bins over the overall min/max of the target; the ``bins``
    argument is accepted but not used. The x axis is cut at the subgroup's
    largest target value.
    """
    plt.subplots(figsize=(4, 3))
    overall_values = data[target].values
    covered_values = data[sg.covers(data)][target].values
    # Shared settings so both histograms are binned identically.
    shared = dict(
        bins=100,
        range=(np.amin(overall_values), np.amax(overall_values)),
        alpha=0.5,
        density=True,
    )
    plt.hist(covered_values, label="subgroup(1)", **shared)
    plt.hist(overall_values, label="Overall Data", **shared)
    plt.xlim(0, np.amax(covered_values))
    plt.xlabel('time')
    #plt.yscale('log')
    plt.ticklabel_format(axis="x", style="sci", scilimits=(0,0))
    plt.legend(loc='upper right')

In [None]:
def plot_distribution_numeric_test(sgs, data, bins, target,
                                   positions=(2, 7, 16, 19), labels=(7, 1, 8, 5)):
    """Overlay the target distribution of selected subgroups on the whole data set.

    For every selected subgroup the description is printed and a
    density-normalised step histogram of ``target`` is drawn; a filled histogram
    of the whole ``data`` is added and the figure is shown.

    NOTE(review): an earlier cell of this notebook defines a function with the
    same name; this definition replaces it once this cell has run.

    Parameters
    ----------
    sgs : sequence
        Subgroup results; element ``[1]`` of each entry must expose
        ``covers(data)``, whose result is used to index ``data``.
    data : pandas.DataFrame
        Data the subgroups are evaluated on; must contain the ``target`` column.
    bins : int
        Number of histogram bins used for every histogram.
    target : str
        Name of the numeric column to plot.
    positions : sequence of int, optional
        1-based positions in ``sgs`` of the subgroups to draw. Defaults to the
        selection that used to be hard-coded, ``(2, 7, 16, 19)``.
    labels : sequence, optional
        Suffixes appended to ``"subgroup"`` in the legend, one per position.
        Defaults to the previously hard-coded ``(7, 1, 8, 5)``.

    Raises
    ------
    ValueError
        If ``positions`` and ``labels`` differ in length.
    """
    if len(positions) != len(labels):
        raise ValueError("positions and labels must have the same length")
    plt.subplots(figsize=(4, 2))
    target_values_data = data[target].values
    # Common range so subgroup and overall histograms share the same bin edges.
    value_range = (np.amin(target_values_data), np.amax(target_values_data))
    maxlim = 0
    for position, label in zip(positions, labels):
        sg = sgs[position - 1][1]
        print(sg)
        target_values_sg = data[sg.covers(data)][target].values
        if target_values_sg.size == 0:
            # An empty cover has no maximum and nothing to draw; skip it
            # rather than letting np.amax raise ValueError.
            continue
        plt.hist(target_values_sg, bins=bins, range=value_range, linewidth=1.5,
                 histtype=u'step', alpha=0.5, label="subgroup" + str(label), density=True)
        maxlim = max(maxlim, np.amax(target_values_sg))
    plt.hist(target_values_data, bins=bins, range=value_range, linewidth=1.5,
             alpha=0.5, label="Overall Data", density=True)
    # Fixed padding of 42000 beyond the largest subgroup value (constant kept
    # from the original; its rationale is not documented).
    plt.xlim(0, maxlim + 42000)
    plt.xlabel('time')
    #plt.yscale('log')
    plt.ticklabel_format(axis="x", style="sci", scilimits=(0,0))
    plt.ticklabel_format(axis="y", style="sci", scilimits=(0,0))
    plt.legend(loc='upper right')
    plt.show()

In [None]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

In [None]:
# Histogram of durationMS over requetes_ventes, with the same 80 bins that the
# plot_distribution_numeric_test call on this frame uses. The original cell read
# `data[target]`, but `data` and `target` are not assigned at notebook level in
# any visible cell (they are only parameter names of the plotting helpers).
target_values_data = requetes_ventes['durationMS'].values
plt.hist(target_values_data, bins = 80,range = (np.amin(target_values_data),np.amax(target_values_data)),linewidth=1.5 ,alpha=0.5, label="Overall Data", density=True)

In [None]:
plot_distribution_numeric_test(result3_std.to_descriptions(), requetes_ventes, 80, 'durationMS')

In [None]:
res_std['subgroup'][6]

In [None]:
result3_std.to_descriptions()[:20]

### Use case 2: ASH

In [None]:
requetes[requetes['concurrence'] > 0.5].shape

In [None]:
requetes[requetes['concurrence'] > 0.5]['versionCOP'].value_counts()

In [None]:
# Keep the rows of one specific versionCOP, then discard columns that carry no
# information: first those that are 0 everywhere, then those holding a single
# distinct value.
requetes_conc = requetes[requetes['versionCOP'] == 'V15_2L.d211415.21/10/2020 16:48']
requetes_conc = requetes_conc.loc[:, (requetes_conc != 0).any(axis=0)]
single_valued = [name for name in requetes_conc.columns
                 if requetes_conc[name].unique().size == 1]
requetes_conc = requetes_conc.drop(columns=single_valued)

In [None]:
requetes_conc['conc_disc'] = requetes_conc['concurrence'].apply(lambda x : 1 if x > 0.5 else 0)

In [None]:
# The dict literal below is evaluated and discarded (a cell only displays its
# last expression); it acts as a label for the run configured in this cell.
{'algorithm' : 'beam search' , 'mesure' : 'Support'}

# NOTE(review): the Lift / WRAcc / Binomial runs on requetes_conc pass
# features_ignore = ['concurrence'], but this Support run does not, although the
# target 'conc_disc' is derived from 'concurrence' — confirm this is intended.
result_ash_supp = sd_binary_conds (requetes_conc,
                       dict_conds = {},
                       _target = 'conc_disc',
                       mesure  = 'Support',
                       _depth  = 1,
                       threshold = 10000,
                       result_size = 100,
                       algorithm   = 'Beam Search',
                       _beam_width = 100)
res_ash_supp = result_ash_supp.to_dataframe()
# Show only the subgroups whose 'coverage_sg' exceeds 0.94.
res_ash_supp[res_ash_supp['coverage_sg'] > 0.94]

In [None]:
{'algorithm' : 'beam search' , 'mesure' : 'Lift'}

result_ash_lift = sd_binary_conds (requetes_conc,
                       dict_conds = {},
                       _target = 'conc_disc',
                       mesure  = 'Lift',
                       _depth  = 2,
                       threshold = 10000,
                       result_size = 100,
                       algorithm   = 'Beam Search',
                       _beam_width = 100,
                       features_ignore = ['concurrence'])
res_ash_lift = result_ash_lift.to_dataframe()
res_ash_lift[res_ash_lift['target_share_sg'] > 0.95].sort_values(by='coverage_sg', ascending = False)[:10]

In [None]:
res_ash_lift['subgroup'][1]

In [None]:
{'algorithm' : 'beam search' , 'mesure' : 'WRAcc'}

result_ash_wracc = sd_binary_conds (requetes_conc,
                       dict_conds = {},
                       _target = 'conc_disc',
                       mesure  = 'WRAcc',
                       _depth  = 1,
                       threshold = 10000,
                       result_size = 100,
                       algorithm   = 'Beam Search',
                       _beam_width = 100,
                      features_ignore = ['concurrence'])
res_ash_wracc = result_ash_wracc.to_dataframe()
res_ash_wracc[res_ash_wracc['positives_sg'] > 5].sort_values(by='target_share_sg', ascending = False)[:10]

In [None]:
{'algorithm' : 'beam search' , 'mesure' : 'Binomial'}

result_ash_bin = sd_binary_conds (requetes_conc,
                       dict_conds = {},
                       _target = 'conc_disc',
                       mesure  = 'Binomial',
                       _depth  = 2,
                       threshold = 10000,
                       result_size = 100,
                       algorithm   = 'Beam Search',
                       _beam_width = 100,
                      features_ignore=['concurrence'])
res_ash_bin = result_ash_bin.to_dataframe()
res_ash_bin[res_ash_bin['positives_sg'] > 10].sort_values(by='target_share_sg', ascending = False)[:10]

In [None]:
res_ash_bin['subgroup'][75]

In [None]:
requetes[requetes['instanceCode'] == 'LDC_pop11501']['DeclinaisonCOP'].unique()

In [None]:
res_ash_bin[res_ash_bin['positives_sg'] > 10].sort_values(by='target_share_sg', ascending = False)[:]

In [None]:
res_ash_bin[~res_ash_bin['subgroup'].str.contains('==0')]

### Use case 3: Alerts

In [None]:
requetes[requetes['nbSessionBDBloquee_disc']>0]['instanceCode'].value_counts()

In [None]:
# Instance codes having at least one row with nbSessionBDBloquee_disc > 0.
# NOTE(review): this notebook assigns 'nbSessionBDBloquee_disc' on `requetes` in
# a later cell; unless the CSV already carries that column, that cell must run first.
alert_instances = requetes.loc[requetes['nbSessionBDBloquee_disc'] > 0, 'instanceCode'].dropna().unique()
# Keep every request belonging to one of those instances. Exact membership
# (isin) replaces the former regex built by joining the codes with '|', which
# matched substrings, treated regex metacharacters in a code as syntax, and
# matched every row when no instance qualified (empty pattern).
requetes_alertes = requetes[requetes['instanceCode'].isin(alert_instances)]
# Drop columns that are 0 everywhere, then columns holding a single distinct value.
requetes_alertes = requetes_alertes.loc[:, (requetes_alertes != 0).any(axis=0)]
constant_columns = [name for name in requetes_alertes.columns
                    if requetes_alertes[name].unique().size == 1]
requetes_alertes = requetes_alertes.drop(columns=constant_columns)

In [None]:
instance = 'LDC_reg00101'

In [None]:
def discretize_alert (x) :
    """Turn an alert label into a binary flag.

    Returns 1 when ``x`` equals 'Critique', 'Bloquant' or 'Alarme', and 0 for
    any other value.
    """
    flagged_levels = ('Critique', 'Bloquant', 'Alarme')
    return int(any(x == level for level in flagged_levels))

In [None]:
requetes_alertes['nbSessionBDBloquee_disc'] = requetes_alertes['nbSessionBDBloquee'].apply(lambda x : discretize_alert(x))

In [None]:
requetes['nbSessionBDBloquee_disc'] = requetes['nbSessionBDBloquee'].apply(lambda x : discretize_alert(x))

In [None]:
requetes_tmp2 = get_df_instance (requetes, instance) 

In [None]:
for column in requetes_tmp2.columns :
    if requetes_tmp2[column].unique().size == 1 : 
        del requetes_tmp2[column]

In [None]:
# NOTE(review): res_alrt_lift is only assigned in a later cell (the Lift run on
# requetes_alertes), so this lookup raises NameError when the notebook is run
# top to bottom on a fresh kernel.
res_alrt_lift['subgroup'][2]

In [None]:
# The dict literal below is evaluated and discarded (a cell only displays its
# last expression); it acts as a label for the run configured in this cell.
{'algorithm' : 'beam search' , 'mesure' : 'Support'}

# NOTE(review): the Lift / WRAcc / Binomial runs on requetes_alertes pass
# features_ignore = ['nbSessionBDBloquee','DeclinaisonCOP','instanceCode'], but
# this Support run does not — confirm this is intended.
result_alrt_supp = sd_binary_conds (requetes_alertes,
                       dict_conds = {},
                          _target = 'nbSessionBDBloquee_disc',
                          mesure  = 'Support',
                          _depth  = 1,
                          threshold = 10000,
                          result_size = 150,
                          algorithm   = 'Beam Search',
                          _beam_width = 150)
res_alrt_supp = result_alrt_supp.to_dataframe()
# Show only the subgroups whose 'coverage_sg' exceeds 0.99.
res_alrt_supp[res_alrt_supp['coverage_sg'] > 0.99]

In [None]:
{'algorithm' : 'beam search' , 'mesure' : 'Lift'}

result_alrt_lift = sd_binary_conds (requetes_alertes,
                       dict_conds = {},
                          _target = 'nbSessionBDBloquee_disc',
                          mesure  = 'Lift',
                          _depth  = 1,
                          threshold = 10000,
                          result_size = 100,
                          algorithm   = 'Beam Search',
                          _beam_width = 100,
                         features_ignore=['nbSessionBDBloquee','DeclinaisonCOP','instanceCode'])
res_alrt_lift = result_alrt_lift.to_dataframe()
res_alrt_lift[res_alrt_lift['target_share_sg'] > 0.8].sort_values(by='coverage_sg', ascending = False)[:10]

In [None]:
res_alrt_lift[res_alrt_lift['target_share_sg'] >= 0.9]

In [None]:
res_alrt_lift[res_alrt_lift['subgroup'].str.contains('modele.cdeligliv')]

In [None]:
res_alrt_lift['subgroup'][56]

In [None]:
{'algorithm' : 'beam search' , 'mesure' : 'WRAcc'}

result_alrt_wracc = sd_binary_conds (requetes_alertes,
                       dict_conds = {},
                          _target = 'nbSessionBDBloquee_disc',
                          mesure  = 'WRAcc',
                          _depth  = 1,
                          threshold = 10000,
                          result_size = 100,
                          algorithm   = 'Beam Search',
                          _beam_width = 100,
                        features_ignore=['nbSessionBDBloquee','DeclinaisonCOP','instanceCode'])
res_alrt_wracc = result_alrt_wracc.to_dataframe()
res_alrt_wracc[res_alrt_wracc['positives_sg'] > 25].sort_values(by='target_share_sg', ascending = False)[:15]

In [None]:
{'algorithm' : 'beam search' , 'mesure' : 'Binomial'}

result_alrt_bin = sd_binary_conds (requetes_alertes,
                       dict_conds = {},
                          _target = 'nbSessionBDBloquee_disc',
                          mesure  = 'Binomial',
                          _depth  = 1,
                          threshold = 10000,
                          result_size = 100,
                          algorithm   = 'Beam Search',
                          _beam_width = 100,
                        features_ignore=['nbSessionBDBloquee','DeclinaisonCOP','instanceCode'])
res_alrt_bin = result_alrt_bin.to_dataframe()
res_alrt_bin[res_alrt_bin['positives_sg'] > 25].sort_values(by='target_share_sg', ascending = False)[:15]

In [None]:
result_alrt_bin.to_descriptions()[:10]

In [None]:
similarity_sgs(result_alrt_bin.to_descriptions(), 10, requetes_alertes, color=True)

In [None]:
def similarity_dendrogram(result_descriptions, result_size, data, truncated = False, p = None, labels = None):
    """Draw a single-linkage dendrogram of the subgroups.

    The frame returned by ``similarity_sgs`` is converted to ``1 - values``,
    condensed with ``squareform`` and clustered with single linkage.

    NOTE(review): earlier cells of this notebook call ``similarity_dendrogram``
    before this definition runs, presumably resolved through the star imports;
    this definition takes over from this cell onwards — confirm that is wanted.

    Parameters
    ----------
    result_descriptions : sequence
        Subgroup results; element ``[1]`` of each entry is used (via ``str``)
        when leaf labels have to be generated.
    result_size : int
        Number of subgroups, forwarded to ``similarity_sgs``.
    data : pandas.DataFrame
        Data forwarded to ``similarity_sgs``.
    truncated : bool, optional
        When true, draw a dendrogram truncated to its last ``p`` merged
        clusters and return leaf indices (see Returns).
    p : int, optional
        Number of clusters kept by the truncated dendrogram.
    labels : sequence of str, optional
        Leaf labels, one per subgroup. When omitted, the hand-written
        presentation labels below are used if their count matches the number of
        subgroups (the previous behaviour); otherwise labels are generated from
        ``result_descriptions``. Previously the fixed list was used
        unconditionally, which fails as soon as the number of subgroups is not 10.

    Returns
    -------
    list of int or None
        In truncated mode, for each drawn leaf, the running count of original
        observations preceding it in drawing order (as computed by the original
        code). ``None`` otherwise.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of subgroups.
    """
    plt.subplots(figsize=(10, 5))
    dist_df = similarity_sgs(result_descriptions, result_size, data, color=False)
    n_leaves = dist_df.shape[0]

    # Hand-written labels kept from the original cell (used for one specific figure).
    preset_names = ['WHERE_ventes.commandesfactures.modele.bonliv.datdepart > 0',
                    'anomalyRepartition = Alarm',
                    'anomalyRepartition = Info',
                    'hour : [10 : 14]',
                    'activSessions = Info',
                    'application > 0.6',
                    'day = Sunday',
                    'configuration > 0.1',
                    'anomalyRepartition = Critique',
                    'poolActivConnection = Info']
    if labels is not None:
        sgNames = list(labels)
    elif n_leaves == len(preset_names):
        sgNames = preset_names
    else:
        sgNames = [str(x[1]) for x in result_descriptions[:result_size]]
    if len(sgNames) != n_leaves:
        raise ValueError("expected %d labels, got %d" % (n_leaves, len(sgNames)))

    mat = 1 - dist_df.values
    dists = squareform(mat)
    linkage_matrix = linkage(dists, "single")
    if truncated:
        r = dendrogram(linkage_matrix, labels=sgNames, leaf_rotation=90, p = p, truncate_mode='lastp')
        count = 0
        l_count = []
        for sg in r['ivl'] :
            # Collapsed clusters are drawn as "(n)"; any leaf label containing
            # '(' is therefore interpreted as a cluster of n observations.
            if '(' in sg :
                print(sgNames[count])
                l_count.append(count)
                count = count + int(sg[sg.find('(') + 1 : sg.find(')')])
            else :
                print(sg)
                l_count.append(count)
                count = count + 1

        # Smallest merge height among the drawn links, converted back with 1 - d.
        jaccard_threshold = 1- min([j for i in r['dcoord'] for j in i[1:-1]])
        print(jaccard_threshold)
        return l_count

    else :
        dendrogram(linkage_matrix, labels=sgNames, leaf_rotation=90)

In [None]:
similarity_dendrogram(result_alrt_bin.to_descriptions(), 10, requetes_alertes)

In [None]:
indices = similarity_dendrogram(result_alrt_bin.to_descriptions(), 10, requetes_alertes,truncated = True, p = 5)

In [None]:
plot_distribution_numeric_test(result_alrt_bin.to_descriptions(), requetes_alertes, 1, 'durationMS')

In [None]:
requetes.columns[:33]

In [None]:
requetes[requetes['moyenneNbSessionsActives'] == 'Alarme']['instanceCode'].unique()

In [None]:
requetes['moyenneNbSessionsActives'].value_counts()

In [None]:
requetes.shape

In [None]:
requetes.drop_duplicates().shape