In [1]:
import numpy as np
import pandas as pd
import pydotplus as ptp

from datetime import time
from datetime import datetime

# my classes
import import_ipynb
import helper_general as my

importing Jupyter notebook from helper.ipynb


## Support, Confidence, State

In [2]:
def prettyRule(df_X, df_Y):
    """Render a rule as text: ``V[L-U], V[L-U] --> V[L-U], ...``.

    Parameters
    ----------
    df_X, df_Y : pd.DataFrame
        Antecedent and consequent of the rule, one row per attribute,
        with at least the columns 'V', 'L' and 'U'.

    Returns
    -------
    str
        Antecedent items and consequent items, each joined by ', ' and
        separated by ' --> '.
    """
    def _interval(row):
        # One attribute rendered as "name[lower-upper]".
        return f"{row['V']}[{row['L']}-{row['U']}]"

    antecedent = df_X.apply(_interval, axis=1).values
    consequent = df_Y.apply(_interval, axis=1).values
    return ' --> '.join([', '.join(antecedent), ', '.join(consequent)])

In [3]:
def rule_supp_conf(df_database, df_X, df_Y ):
    """Compute counts, supports and confidence of the rule X --> Y on a database.

    Parameters
    ----------
    df_database : pd.DataFrame
        One row per tuple; must contain a column for every attribute named
        in the 'V' column of df_X / df_Y.
    df_X, df_Y : pd.DataFrame
        Antecedent and consequent, one row per attribute, with the columns
        'V' (column name in df_database), 'L' and 'U' (inclusive bounds).

    Returns
    -------
    dict
        Keys 'count_Ant', 'sup_Antec', 'count_Cons', 'sup_Cons',
        'count_Rule', 'sup_Rule', 'conf_Rule'.

    Notes
    -----
    A side with no attributes is satisfied by every tuple. Supports are 0
    when the database is empty, and the confidence is 0 when no tuple
    satisfies the antecedent.
    """
    n_tuples = len(df_database)

    def _satisfied(df_side):
        # Boolean mask over the tuples: True where EVERY interval of the side holds.
        mask = pd.Series(np.ones(n_tuples, dtype=bool), index=df_database.index)
        for attribute, lower, upper in zip(df_side['V'], df_side['L'], df_side['U']):
            column = df_database[attribute]
            mask = mask & (lower <= column) & (column <= upper)
        return mask

    def _ratio(numerator, denominator):
        # Guarded division, same convention the confidence already used.
        return 0 if denominator == 0 else numerator / denominator

    posX = _satisfied(df_X)
    posY = _satisfied(df_Y)
    posXY = posX & posY

    count_ant = posX.sum()
    count_cons = posY.sum()
    count_rule = posXY.sum()

    return {'count_Ant': count_ant, 'sup_Antec': _ratio(count_ant, n_tuples),
            'count_Cons': count_cons, 'sup_Cons': _ratio(count_cons, n_tuples),
            'count_Rule': count_rule, 'sup_Rule': _ratio(count_rule, n_tuples),
            'conf_Rule': _ratio(count_rule, count_ant)}

In [4]:
def rule_State(ruleInfo, supp, conf):
    """Classify a rule against minimum support and confidence thresholds.

    Parameters
    ----------
    ruleInfo : mapping
        Must provide 'sup_Rule'; 'conf_Rule' is read only when the support
        check passes.
    supp, conf : number
        Minimum support and minimum confidence.

    Returns
    -------
    str
        'Not supported', 'Not confident' or 'Confident'.
    """
    state = 'Confident'
    if ruleInfo['sup_Rule'] < supp:
        state = 'Not supported'
    elif ruleInfo['conf_Rule'] < conf:
        state = 'Not confident'
    return state

## Generalization Rules

In [5]:
def generalizations(df_Rules): # rule_state
    """Return the pairs of rule ids (G, g) that pass both the antecedent and the consequent check.

    Parameters
    ----------
    df_Rules : pd.DataFrame
        One row per (rule, attribute) with the columns 'idRule', 'V', 'L',
        'U' and 'Side'; rows with Side == 'X' are used as antecedent
        attributes and rows with Side == 'Y' as consequent attributes.

    Returns
    -------
    pd.DataFrame
        Columns 'idRule_G' and 'idRule_g', one row per pair that appears in
        both the antecedent result and the consequent result.
    """
    
    # NB: works for rules written in the format: pd.DataFrame(columns={'idRule','V','L','U','Side'})
    
    # Preprocessing:-------------------------------------------
    # Split the rules by side and count how many attributes each rule has on each side.
    tempX = df_Rules[df_Rules['Side'] == 'X']
    tempY = df_Rules[df_Rules['Side'] == 'Y']
    df_Origin_lenX = tempX.groupby('idRule').agg({'V':'count'}).rename(columns={'V':'lenAnt_Orig'}).reset_index()
    df_Origin_lenY = tempY.groupby('idRule').agg({'V':'count'}).rename(columns={'V':'lenCon_Orig'}).reset_index()
    #my.Display([tempX, df_Origin_lenX, tempY, df_Origin_lenY],names=['tempX', 'df_Origin_lenX', 'tempY', 'df_Origin_lenY'], axis=1) 

    # Only Antecedent:----------------------------------------------
    # Pair every antecedent attribute with the same attribute in the other rules
    tempX = pd.merge(tempX, tempX, on=['V','Side'], suffixes=('_G','_g'))  # Merge: G generalizes g (or g specializes G)
    tempX = tempX[ tempX['idRule_G'] != tempX['idRule_g'] ] # drop rules joined with themselves
    tempX = tempX[(tempX['L_G'] <= tempX['L_g']) & (tempX['U_g'] <= tempX['U_G'])] # generalization check: G's interval contains g's interval
    #tempX = tempX[(tempX['L_g'] <= tempX['L_G']) & (tempX['U_G'] <= tempX['U_g'])] # specialization check: G's interval is inside g's interval

    # Count the surviving attributes per (G, g) pair and keep the pairs where that
    # count equals the number of antecedent attributes of G (join on idRule == idRule_G).
    # NOTE(review): the original comment stated the requirement in terms of len(g[Att]);
    # the code compares against the attribute count of G -- confirm which one is intended.
    df_lenAnt = tempX.groupby(['idRule_G','idRule_g'])['V'].count().reset_index().rename(columns={'V':'lenAnt_G&g'})
    df_lenAnt = pd.merge(df_Origin_lenX, df_lenAnt, left_on=['idRule'], right_on=['idRule_G'])
    df_Ant = df_lenAnt[ df_lenAnt['lenAnt_Orig']==df_lenAnt['lenAnt_G&g']]

    # Only Consequent:----------------------------------------------
    # Pair every consequent attribute with the same attribute in the other rules
    tempY = pd.merge(tempY, tempY, on=['V','Side'], suffixes=('_G','_g')) # Merge: G generalizes g
    tempY = tempY[ tempY['idRule_G'] != tempY['idRule_g'] ] # drop rules joined with themselves
    tempY = tempY[(tempY['L_g'] <= tempY['L_G']) & (tempY['U_G'] <= tempY['U_g'])] # generalization check: G's interval is inside g's interval
    #tempY = tempY[(tempY['L_G'] <= tempY['L_g']) & (tempY['U_g'] <= tempY['U_G'])] # specialization check: G's interval contains g's interval

    # Count the surviving attributes per (G, g) pair and keep the pairs where that
    # count equals the number of consequent attributes of g (join on idRule == idRule_g).
    # NOTE(review): the original comment stated the requirement in terms of G[Att];
    # the code compares against the attribute count of g -- confirm which one is intended.
    df_lenCon = tempY.groupby(['idRule_G','idRule_g'])['V'].count().reset_index().rename(columns={'V':'lenCon_G&g'})
    df_lenCon = pd.merge(df_Origin_lenY, df_lenCon, left_on=['idRule'], right_on=['idRule_g'])
    df_Con =  df_lenCon[ df_lenCon['lenCon_Orig']==df_lenCon['lenCon_G&g']]
    #my.Display([tempY, df_lenCon, df_Con], axis=1)

    # Concrete Rules:---------------------------------------------------
    # G generalizes g if the pair (id_G, id_g) exists both in df_Ant and in df_Con
    df_idRuleGeneral = pd.merge(df_Ant, df_Con, on=['idRule_G','idRule_g'])[['idRule_G','idRule_g']] 
    return df_idRuleGeneral

## Tree Generalization Rules

In [6]:
def DFS_Closure( startNode, next_Nodes, dfAdj):
    """Collect the edges (startNode, u) for every not-yet-visited node reachable from next_Nodes.

    Parameters
    ----------
    startNode : label
        Source recorded in every returned pair.
    next_Nodes : iterable
        Nodes to start exploring from; NaN entries are ignored.
    dfAdj : pd.DataFrame
        Indexed by node, with a column 'T' holding the list of successors and
        a column 'visited' (0 / 1). 'visited' is updated in place.

    Returns
    -------
    list of tuple
        Pairs (startNode, u) in depth-first pre-order.
    """
    reached = []
    # Explicit stack of iterators: the top iterator is the successor list being explored.
    pending = [iter(next_Nodes)]
    while pending:
        for candidate in pending[-1]:
            if pd.isna(candidate) or dfAdj.at[candidate,'visited'] != 0:
                continue  # missing value or already visited
            dfAdj.at[candidate,'visited'] = 1
            reached.append((startNode, candidate))
            # Descend into the successors before resuming the current list.
            pending.append(iter(dfAdj.at[candidate,'T']))
            break
        else:
            # Current successor list exhausted: go back to the parent.
            pending.pop()
    return reached

def transitive_closure(edgeList): # meant for graphs WITHOUT cycles
    """Compute the transitive closure of a directed graph given as an edge list.

    Parameters
    ----------
    edgeList : list of (source, target) pairs, or anything accepted by
        ``pd.DataFrame(data=..., columns=['S', 'T'])``.

    Returns
    -------
    pd.DataFrame
        Columns 'S' and 'T': one row for every pair (s, t) such that t is
        reachable from s. Sources are processed in sorted order and, for each
        source, targets are listed in depth-first pre-order.

    Notes
    -----
    Node labels keep their original type: leaves are registered with an
    empty successor list rather than a NaN placeholder, so integer ids are
    not turned into floats. Intended for acyclic graphs; with a cycle the
    traversal still terminates, but a node can be listed as its own target.
    """
    dfEdges = pd.DataFrame(data=edgeList, columns=['S','T']) # Source, Target

    # Adjacency lists; every node (leaves included) gets an entry.
    adjacency = {}
    for source, target in zip(dfEdges['S'], dfEdges['T']):
        has_target = not pd.isna(target)
        if has_target:
            adjacency.setdefault(target, [])
        if not pd.isna(source):
            successors = adjacency.setdefault(source, [])
            if has_target:
                successors.append(target)

    try:
        nodes = sorted(adjacency)
    except TypeError:
        # Labels that cannot be ordered: fall back to first-seen order.
        nodes = list(adjacency)

    closureEdgeList = []
    for start in nodes:
        visited = set()  # fresh for every start node
        pending = [iter(adjacency[start])]
        while pending:
            for node in pending[-1]:
                if node in visited:
                    continue
                visited.add(node)
                closureEdgeList.append((start, node))
                pending.append(iter(adjacency[node]))
                break
            else:
                pending.pop()

    return pd.DataFrame(data=closureEdgeList, columns=['S','T'])

def pruneEdge(node, adj, edgeList_tranClosure ):
    """Drop from `adj` the successors of `node` that are also reachable through another node.

    Parameters
    ----------
    node : label
        Source node of the edges being examined.
    adj : list
        Direct successors of `node`; not modified.
    edgeList_tranClosure : edge list or DataFrame with columns 'S', 'T'
        Transitive closure of the graph.

    Returns
    -------
    Copy of `adj` without every u for which some w exists with
    (node, w) and (w, u) both in the closure.
    """
    closure = pd.DataFrame(edgeList_tranClosure, columns=['S','T'])
    # Everything that can be reached starting from `node`.
    reachable = set(closure.loc[node == closure['S'], 'T'])

    kept = adj.copy()
    for successor in adj:
        # Everything that can arrive at `successor`.
        origins = set(closure.loc[successor == closure['T'], 'S'])
        if not reachable.isdisjoint(origins):
            kept.remove(successor)
    return kept

# Tree Visualization

In [7]:
def addEdge(graph, node1, node2, color='black'):
    """Add the edge node1 -> node2 to a pydotplus graph, creating missing nodes.

    Parameters
    ----------
    graph : pydotplus graph (e.g. ``ptp.Dot()``)
        Modified in place.
    node1, node2 : any
        Node identifiers; ``str(node)`` is used both as id and as label.
    color : str, default 'black'
        Value of the edge's "color" attribute.

    Notes
    -----
    Both endpoints are looked up before being created, so calling this
    repeatedly with the same source no longer appends a duplicate node
    definition to the graph on every call.
    """
    def _get_or_create(node_id):
        # Return the node already stored in the graph, or add a new rounded box.
        name = str(node_id)  # in this case the id is the same as the label
        candidate = ptp.Node(name, shape='box', style='rounded')
        candidate.set('label', name)
        # Look up by the node's own name so that any quoting applied by the
        # library matches the key under which the graph stores the node.
        existing = graph.get_node(candidate.get_name())  # returns a list
        if existing:
            return existing[0]
        graph.add_node(candidate)
        return candidate

    source = _get_or_create(node1)
    target = _get_or_create(node2)

    # Create edge
    edge = ptp.Edge(source, target)#, label='G')
    edge.set("color", color)
    graph.add_edge(edge)
    

# gTEST = ptp.Dot()
# adj_ruleTEST.apply(lambda adj: [addEdge(gTEST, adj.name, u) for u in adj['nodeList']], axis=1)
# #gTEST.write_svg(f'adj_ruleTEST.svg')

# Load file for PLOTTING

In [8]:
def loadReport_TEST(filename):
    """Parse a report file into a per-run table and a per-sub-run table.

    The file is read as follows (layout inferred from the parsing code):
    everything up to and including the first line starting with '#=' is
    skipped; lines starting with 'Run' open a run, lines starting with
    '-->' describe a sub-run of the current run, and every other line is
    taken as the closing line of a run (run time, total time, state).
    Fields are separated by ', '. Trailing lines after the last line
    starting with 'run' (an interrupted run) are discarded.

    Parameters
    ----------
    filename : str or path-like
        Report file to read; the name is also printed.

    Returns
    -------
    (data, data_SubRun) : tuple of pd.DataFrame
        data: columns 'ID_Run', 'lenghtConstr', 'run_startTime',
        'run_time' (timedelta), 'run_totTime', 'state'.
        data_SubRun: columns 'ID_Run', 'ID_SubRun', 'subRun_time'
        (timedelta), 'subRun_totTime'.

    Raises
    ------
    ValueError
        If the file contains no completed run.
    """
    print(filename)
    with open(filename) as f:
        lines = f.readlines()

    # Skip the header: data starts after the first line beginning with '#='.
    start = 0
    for text in lines:
        if text[0:2] == '#=':
            break
        start += 1

    # Remove ALL empty lines (list.remove() only ever dropped the first one).
    rawData = [text for text in lines[start+1:] if text.strip()]

    # Remove the last part of an interrupted search.
    while rawData and rawData[-1][0:3] != 'run':
        rawData.pop()
    if not rawData:
        raise ValueError(f"no completed run found in report file: {filename}")

    # Divide in run and subrun
    data_Run_1 = []
    data_Run_2 = []
    data_SubRun = []

    idSubrun = -1  # index of the run the following '-->' lines belong to
    for text in rawData:
        if text[0:3] == 'Run':
            data_Run_1.append(text.split(', '))
            idSubrun += 1
        elif text[0:3] == '-->':
            data_SubRun.append([idSubrun] + text.split(', '))
        else:
            data_Run_2.append(text.split(', '))

    # dataframe Runs :--------------------------------------------
    data_Run_1 = pd.DataFrame(data_Run_1).drop([0,3], axis=1).rename(columns={1:'lenghtConstr', 2:'run_startTime'})
    data_Run_2 = pd.DataFrame(data_Run_2).rename(columns={ 0:'run_time', 1:'run_totTime', 2:'state'})

    # Opening and closing lines are paired by position.
    data = pd.concat([data_Run_1, data_Run_2], axis=1)
    data = data.reset_index().rename(columns={'index':'ID_Run'})

    # Refinements: keep only the value part of every "label:value" field.
    data['lenghtConstr'] = data['lenghtConstr'].map(lambda x: x.split(':')[1])
    data['run_startTime'] = data['run_startTime'].map(lambda x: x.split('d:')[1])
    data['run_time'] = data['run_time'].map(lambda x: x.split('e:')[1])
    data['run_totTime'] = data['run_totTime'].map(lambda x: x.split('e:')[1])
    data['state'] = data['state'].map(lambda x: x.split(':')[1][:-2])  # drop the last two characters (line ending included)

    # Time conversion
    zero = datetime.strptime( "00:00", "%H:%M")
    data['run_startTime'] = data['run_startTime'].map(lambda t: datetime.strptime(t,'%m-%d %H:%M:%S'))
    data['run_time'] = data['run_time'].map(lambda t: datetime.strptime(t,'%H:%M:%S'))-zero
    # Kept as a datetime (zero not subtracted): handier as an x axis.
    data['run_totTime'] = data['run_totTime'].map(lambda t: datetime.strptime(t,'%H:%M:%S'))

    # dataframe SubRuns :--------------------------------------------
    data_SubRun = pd.DataFrame(data_SubRun).drop([1,3,5], axis=1).rename(columns={0:'ID_Run', 2:'ID_SubRun', 4:'subRun_time'})
    data_SubRun['ID_SubRun'] = data_SubRun['ID_SubRun'].map(lambda x: x.split(':')[1])
    data_SubRun['subRun_time'] = data_SubRun['subRun_time'].map(lambda x: x.split('e:')[1][:-1])  # drop the trailing character

    # Time conversion
    data_SubRun['subRun_time'] = data_SubRun['subRun_time'].map(lambda t: datetime.strptime(t,'%H:%M:%S.%f'))-zero
    data_SubRun['subRun_totTime'] = data_SubRun['subRun_time'].cumsum()+zero # not very precise, but useful for now

    return data, data_SubRun

In [9]:
def loadReport_up(filename):
    """Parse a report file into a per-run table that includes a 'UD' field count.

    The file is read as follows (layout inferred from the parsing code):
    everything up to and including the first line starting with '#=' is
    skipped; lines starting with 'Run' open a run, lines starting with
    'UD,' are reduced to their number of comma-separated fields, and every
    other line is taken as the closing line of a run (run time, total
    time, state). Trailing lines after the last line starting with 'run'
    (an interrupted run) are discarded.

    The three kinds of line are paired by position, so every run is
    expected to contribute exactly one line of each kind.

    Parameters
    ----------
    filename : str or path-like
        Report file to read; the name is also printed.

    Returns
    -------
    pd.DataFrame
        Columns 'ID_Run', 'lenghtConstr', 'run_startTime', 'count_UD',
        'run_time' (timedelta), 'run_totTime', 'state'.

    Raises
    ------
    ValueError
        If the file contains no completed run.
    """
    print(filename)
    with open(filename) as f:
        lines = f.readlines()

    # Skip the header: data starts after the first line beginning with '#='.
    start = 0
    for text in lines:
        if text[0:2] == '#=':
            break
        start += 1

    # Remove ALL empty lines (list.remove() only ever dropped the first one).
    rawData = [text for text in lines[start+1:] if text.strip()]

    # Remove the last part of an interrupted search.
    while rawData and rawData[-1][0:3] != 'run':
        rawData.pop()
    if not rawData:
        raise ValueError(f"no completed run found in report file: {filename}")

    # Divide the lines by kind
    data_Run_1 = []
    data_Run_2 = []
    count_UD = []

    for text in rawData:
        if text[0:3] == 'Run':
            data_Run_1.append(text.split(', '))
        elif text[0:3] == 'UD,':
            count_UD.append([len(text.split(','))])
        else:
            data_Run_2.append(text.split(', '))

    # dataframe Runs :--------------------------------------------
    data_Run_1 = pd.DataFrame(data_Run_1).drop([0,3], axis=1).rename(columns={1:'lenghtConstr', 2:'run_startTime'})
    data_Run_2 = pd.DataFrame(data_Run_2).rename(columns={ 0:'run_time', 1:'run_totTime', 2:'state'})
    count_UD = pd.DataFrame(count_UD, columns=['count_UD'])

    data = pd.concat([data_Run_1, count_UD, data_Run_2], axis=1)
    data = data.reset_index().rename(columns={'index':'ID_Run'})

    # Refinements: keep only the value part of every "label:value" field.
    data['lenghtConstr'] = data['lenghtConstr'].map(lambda x: x.split(':')[1])
    data['run_startTime'] = data['run_startTime'].map(lambda x: x.split('d:')[1])
    data['run_time'] = data['run_time'].map(lambda x: x.split('e:')[1])
    data['run_totTime'] = data['run_totTime'].map(lambda x: x.split('e:')[1])
    data['state'] = data['state'].map(lambda x: x.split(':')[1][:-2])  # drop the last two characters (line ending included)

    # Time conversion
    zero = datetime.strptime( "00:00", "%H:%M")
    data['run_startTime'] = data['run_startTime'].map(lambda t: datetime.strptime(t,'%m-%d %H:%M:%S'))
    data['run_time'] = data['run_time'].map(lambda t: datetime.strptime(t,'%H:%M:%S'))-zero
    # Kept as a datetime (zero not subtracted): handier as an x axis.
    data['run_totTime'] = data['run_totTime'].map(lambda t: datetime.strptime(t,'%H:%M:%S'))

    return data