In [None]:
import os

import pandas as pd
import pydotplus as ptp

# myClass 
import import_ipynb
import helper_general as my
import helper_ProcessingResults as myPR

# Loading + preprocessing AirQuality Dataset

In [None]:
# Load the raw ';'-separated file, keeping only the first 9356 rows and dropping
# the first column (Date) and the last two columns.
# NOTE(review): presumably the discarded rows/columns are empty padding of the
# original CSV -- confirm that 9356 matches the number of valid rows.
airQuality = pd.read_csv("AirQualityUCI.csv", sep=';').iloc[0:9356, 1:-2]

# Numbers use ',' as decimal separator: cast every cell to str and switch to '.'.
# Column-wise `.str.replace` is used instead of the deprecated `DataFrame.applymap`.
airQuality = airQuality.astype(str).apply(
    lambda column: column.str.replace(',', '.', regex=False)
)

# Sanitize the column names to prevent grammar errors downstream:
# "(" -> "_", ")" is removed, "." -> "_"   (e.g. "CO(GT)" -> "CO_GT")
airQuality.columns = [
    colName.replace("(", "_").replace(')', '').replace('.', '_')
    for colName in airQuality.columns.to_list()
]
display(airQuality.head(3))

## Mapping values, Encoder and Decoder

In [None]:
# Build, for every column, a value -> code table (encod) and a code -> value table (decod).
# Codes are the positions of the sorted distinct values of the column.
encod, decod = {}, {}
for col in airQuality.columns.to_list():

    if col != "Time":
        # Compare as floats so that -200 and -200.0 are the same value
        dec = airQuality[col].astype(float).drop_duplicates().sort_values()
        if dec.iloc[0] == -200:
            # -200.0 stands for "None": leave it out, so only real values get a position
            dec = dec.iloc[1:]
    else:
        # "Time" values are kept as strings
        dec = airQuality[col].drop_duplicates().sort_values()
    dec = dec.reset_index(drop=True)

    # Invert position -> value into value -> position
    enc = dec.reset_index().set_index(col)['index']

    # The "None" marker -200.0 is always paired with the code -1
    # (this could be changed so that every key/value is an integer)
    encod[col] = dict(enc) | {-200.0: -1}
    decod[col] = dict(dec) | {-1: -200.0}
#my.myDisplay([encod['CO_GT'], decod['CO_GT']], axis=1)

## Encoding Dataset

In [None]:
def _encode_column(col):
    """Return column `col` of `airQuality` translated through `encod[col]`.

    "Time" values are looked up as they are; every other column is cast to
    float first, so that '2', '2.0' and 2.0 all hit the same key.
    """
    lookup = encod[col]
    if col == "Time":
        return airQuality[col].map(lambda raw: lookup[raw])
    return airQuality[col].map(lambda raw: lookup[float(raw)])


# Encoded copy of the dataset, one encoded column per original column
df_AQ = pd.DataFrame({col: _encode_column(col) for col in airQuality.columns})
df_AQ.head(3)

# Load and decode rules; compute support, confidence, and generalizations

In [None]:
# Folders: rule files are read from sourceDir, results are written to outputDir
sourceDir = "./data_raw/"
outputDir = "./processed_data/"

# Reports to process and their rule files (NB: both lists follow the same order)
report_name = ['RCE', 'RSE', 'RCA', 'RSA']
filename_path = [f"{name}_rules.csv" for name in report_name]

In [None]:
# Pick one report by its position in the config lists and load its rules file
id_file = 0
rep_name = report_name[id_file]
df_Rules = pd.read_csv(f"{sourceDir}{filename_path[id_file]}")
df_Rules

In [None]:
def _decode_bound(row, bound):
    """Translate the encoded value in column `bound` of `row` through `decod`.

    The table is selected by the row's 'V' entry (a key of `decod`).
    """
    return decod[row['V']][row[bound]]


# Copy of the rules in which the 'L' and 'U' codes are replaced by the original values
df_decodRules = df_Rules.copy()
df_decodRules['L'] = df_decodRules.apply(_decode_bound, axis=1, args=('L',))
df_decodRules['U'] = df_decodRules.apply(_decode_bound, axis=1, args=('U',))
my.Display([df_Rules, df_decodRules], names=['df_Rules', 'df_decodRules'], axis=1)

## Decode Rules: support, confidence, and state

In [None]:
# Thresholds handed to myPR.rule_State when classifying each rule
supp = 0.05
conf = 0.8

# One result per rule: the helper receives the encoded dataset plus the rows of the
# rule with Side == 'X' and with Side == 'Y'.
# NOTE(review): presumably X/Y are the two sides (antecedent/consequent) of the rule
# and the helper returns its support/confidence figures -- confirm in helper_ProcessingResults.
infoRule = df_Rules.groupby('idRule').apply(
    lambda df: myPR.rule_supp_conf(df_AQ, df[df['Side'] == 'X'], df[df['Side'] == 'Y'])
)

# Expand the per-rule results into columns. The groupby index is passed explicitly:
# without it the frame would get a fresh 0..n-1 index labelled 'idRule', which is
# wrong whenever the rule ids are not exactly 0..n-1.
infoRule = pd.DataFrame(list(infoRule), index=infoRule.index).rename_axis('idRule')
infoRule['state'] = infoRule.apply(lambda rule: myPR.rule_State(rule, supp, conf), axis=1)
infoRule

In [None]:
# One formatted entry per rule, built by the helper from the decoded
# Side == 'X' and Side == 'Y' rows of that rule
prettyRule = (
    df_decodRules
    .groupby('idRule')
    .apply(lambda grp: myPR.prettyRule(grp[grp['Side'] == 'X'], grp[grp['Side'] == 'Y']))
    .rename('Rule')
    .to_frame()
)
# States are attached by position (not by index label), in the row order of infoRule
prettyRule['state'] = infoRule['state'].values
prettyRule.head(5)

## Generalizations

In [None]:
# Pairs of rule ids (columns idRule_G / idRule_g) produced by the helper
df_idRuleGeneral = myPR.generalizations(df_decodRules)

# Attach the text and state of both rules of every pair:
# the first merge describes idRule_G, the second idRule_g (suffixes _G / _g)
df_RuleGeneral = (
    df_idRuleGeneral
    .merge(prettyRule, left_on='idRule_G', right_on='idRule')
    .merge(prettyRule, left_on='idRule_g', right_on='idRule', suffixes=('_G', '_g'))
)
df_RuleGeneral.head(5)

In [None]:
# Keep only the Confident --> Confident pairs, and among them
# only those with idRule_G greater than idRule_g
df_RuleGeneral_Conf = df_RuleGeneral.loc[
    lambda pairs: pairs['state_G'].eq('Confident')
    & pairs['state_g'].eq('Confident')
    & pairs['idRule_G'].gt(pairs['idRule_g'])
]
display(df_RuleGeneral_Conf.head(3))

## Prune adjTree 

In [None]:
# All (idRule_G, idRule_g) pairs as an array
# (original note: the merge plays the role of the transitive closure)
edjeList_tc = df_RuleGeneral_Conf[['idRule_G', 'idRule_g']].to_numpy()

# Adjacency lists: for every idRule_G, the list of its idRule_g
df_General_tree = df_RuleGeneral_Conf.groupby('idRule_G').agg({'idRule_g': list})

# Pruned adjacency list of each node, computed by the helper from the node id,
# its current list and the full set of pairs
df_General_tree['adj_pruned'] = df_General_tree.apply(
    lambda node: myPR.pruneEdge(node.name, node['idRule_g'], edjeList_tc),
    axis=1,
)
display(df_General_tree.head(3))

## Save data

In [None]:
# Make sure the output folder exists: DataFrame.to_csv does not create missing folders
os.makedirs(outputDir, exist_ok=True)

# Save the per-rule metrics and the readable rules of the current report
infoRule.to_csv(outputDir + rep_name + '_infoRule.csv')
prettyRule.to_csv(outputDir + rep_name + '_prettyRule.csv')

In [None]:
# Save the generalization pairs: all of them, and the Confident-only subset
df_RuleGeneral.to_csv(f"{outputDir}{rep_name}_General_ALL.csv")
df_RuleGeneral_Conf.to_csv(f"{outputDir}{rep_name}_General_Conf.csv")

In [None]:
# Save the pruned generalization tree as an edge list:
# one row per (idRule_G, idRule_g) pair left after pruning
edfeList_prued = (
    df_General_tree['adj_pruned']
    .explode()
    .reset_index()
    .rename(columns={'adj_pruned': 'idRule_g'})
)
edfeList_prued.to_csv(f"{outputDir}{rep_name}_edgeList_pruned_Conf.csv")

# Loop: run the whole pipeline for every rule file

In [None]:
# Folders: rule files are read from sourceDir, results are written to outputDir
sourceDir = "./data_raw/"
outputDir = "./processed_data/"

# Reports handled by the loop and their rule files (NB: same order as the loaded files)
report_name = ['RCE', 'RSE', 'RCA', 'RSA']
filename_path = [f"{name}_rules.csv" for name in report_name]

## Include/exclude RC_ud

In [None]:
# Enable/disable the analysis of RC_ud: run this cell to add it to the loop, skip it to leave it out.
# The guard keeps the cell idempotent: re-running it does not append the entry a second time.
if 'RC_ud' not in report_name:
    filename_path += ['RC_ud_rules.csv']
    report_name += ['RC_ud']

In [None]:
# Thresholds handed to myPR.rule_State; identical for every file, so set once outside the loop
supp = 0.05
conf = 0.8

# Make sure the output folder exists: DataFrame.to_csv does not create missing folders
os.makedirs(outputDir, exist_ok=True)

# Run the full pipeline (decode -> metrics -> generalizations -> pruning -> save) on every rules file
for path, rep_name in zip(filename_path, report_name):
    print(path)
    df_Rules = pd.read_csv(sourceDir + path)

    # Copy of the rules in which the 'L' and 'U' codes are replaced by the original values
    df_decodRules = df_Rules.copy()
    df_decodRules['L'] = df_decodRules.apply(lambda row: decod[row['V']][row['L']], axis=1)
    df_decodRules['U'] = df_decodRules.apply(lambda row: decod[row['V']][row['U']], axis=1)

    # Rule metrics: ------------------------------
    # Works well with the encoded AirQuality dataset (everything has the same form)
    infoRule = df_Rules.groupby('idRule').apply(
        lambda df: myPR.rule_supp_conf(df_AQ, df[df['Side'] == 'X'], df[df['Side'] == 'Y'])
    )
    # Pass the groupby index explicitly: without it the frame would get a fresh 0..n-1
    # index labelled 'idRule', which is wrong whenever the rule ids are not exactly 0..n-1
    infoRule = pd.DataFrame(list(infoRule), index=infoRule.index).rename_axis('idRule')
    infoRule['state'] = infoRule.apply(lambda rule: myPR.rule_State(rule, supp, conf), axis=1)

    # Readable rules; states are attached by position, in the row order of infoRule
    prettyRule = df_decodRules.groupby('idRule').apply(
        lambda df: myPR.prettyRule(df[df['Side'] == 'X'], df[df['Side'] == 'Y'])
    )
    prettyRule = prettyRule.rename('Rule').to_frame().assign(state=infoRule['state'].values)

    # Generalizations: ---------------------------
    # Pairs of rule ids, each enriched with the text and state of both rules (suffixes _G / _g)
    df_idRuleGeneral = myPR.generalizations(df_decodRules)
    df_RuleGeneral = pd.merge(df_idRuleGeneral, prettyRule, left_on=['idRule_G'], right_on=['idRule'])
    df_RuleGeneral = pd.merge(df_RuleGeneral, prettyRule, left_on=['idRule_g'], right_on=['idRule'], suffixes=('_G', '_g'))

    # Keep only the Confident --> Confident pairs with idRule_G greater than idRule_g
    df_RuleGeneral_Conf = df_RuleGeneral[(df_RuleGeneral['state_G'] == 'Confident') & (df_RuleGeneral['state_g'] == 'Confident')]
    df_RuleGeneral_Conf = df_RuleGeneral_Conf[df_RuleGeneral_Conf['idRule_G'] > df_RuleGeneral_Conf['idRule_g']]

    # Prune adjTree: -----------------------------------
    # All pairs as an array (original note: the merge plays the role of the transitive closure)
    edjeList_tc = df_RuleGeneral_Conf[['idRule_G', 'idRule_g']].values
    df_General_tree = df_RuleGeneral_Conf.groupby('idRule_G').agg({'idRule_g': list})
    df_General_tree['adj_pruned'] = df_General_tree.apply(lambda row: myPR.pruneEdge(row.name, row.idRule_g, edjeList_tc), axis=1)

    # Save data: -----------------------------------
    # Per-rule metrics and readable rules
    infoRule.to_csv(outputDir + rep_name + '_infoRule.csv')
    prettyRule.to_csv(outputDir + rep_name + '_prettyRule.csv')

    # Generalization pairs: all of them, and the Confident-only subset
    df_RuleGeneral.to_csv(outputDir + rep_name + '_General_ALL.csv')
    df_RuleGeneral_Conf.to_csv(outputDir + rep_name + '_General_Conf.csv')

    # Pruned generalization tree as an edge list
    edfeList_prued = df_General_tree['adj_pruned'].explode().reset_index().rename(columns={'adj_pruned': 'idRule_g'})
    edfeList_prued.to_csv(outputDir + rep_name + '_edgeList_pruned_Conf.csv')

print('END')