# Air quality: Transitive Frontier Learning example

Below is an example of how to apply Transitive Frontier Learning to the Air Quality dataset.
We first show how the dataset is preprocessed, then how to perform the analysis.

In [None]:
import pandas as pd
import datetime as dt
from math import log2  
import pydotplus as ptp 

# helper_classes 
import import_ipynb
import helper_general as my
from ConstraintDefiner import ConstraintDefiner as Lark_CD


## Loading + preprocessing Dataset

In [None]:
# Load the raw CSV, then keep rows 0..9355 and drop the first column
# (the Date, per the original note) together with the last two columns.
raw_frame = pd.read_csv("./AirQualityUCI.csv", sep=';')
airQuality = raw_frame.iloc[0:9356, 1:-2]
display(airQuality.head(3))

# Work on strings and replace every "," with "." inside the values
airQuality = airQuality.astype(str).applymap(lambda cell: cell.replace(',', '.'))

# Sanitize the column names to prevent grammar errors:
# "(" and "." become "_", while ")" is removed.
name_fixups = str.maketrans({'(': '_', ')': None, '.': '_'})
airQuality.columns = [name.translate(name_fixups) for name in airQuality.columns.to_list()]
airQuality

## Mapping values, Encoder and Decoder
Mapping

In [None]:
# Map the values of every column to integer codes.
#   encod[col]: original value -> integer code (rank among the sorted distinct values)
#   decod[col]: integer code   -> original value
# The missing-value marker -200.0 is always coded as -1, so every code is an integer.
encod, decod = {}, {}
for col in airQuality.columns.to_list():

    if col == "Time":
        # Time values are kept as strings and ranked in sorted order
        dec = airQuality[col].drop_duplicates().sort_values().reset_index(drop=True)
    else:
        # Cast to float so that '-200' and '-200.0' are the same value
        dec = airQuality[col].astype(float).drop_duplicates().sort_values()
        # Remove the -200.0 (None) marker by value rather than by position:
        # dropping the first sorted element would discard the real minimum of a
        # column that contains no missing marker.
        dec = dec[dec != -200.0].reset_index(drop=True)
    # Invert the decoder: the value becomes the key, its rank becomes the code
    enc = dec.reset_index().set_index(col)['index']

    encod[col] = dict(enc) | {-200.0: -1}
    decod[col] = dict(dec) | {-1: -200.0}
#my.myDisplay([encod['CO_GT'], decod['CO_GT']], axis=1)

Encoding Dataset

In [None]:
# Encode the dataset column by column using the per-column encoders
df_AQ = pd.DataFrame()
for col in airQuality.columns:
    column_encoder = encod[col]
    if col == "Time":
        # Time values are looked up as they are
        encoded_column = airQuality[col].map(lambda raw, table=column_encoder: table[raw])
    else:
        # Cast to float before the lookup -> '2' == '2.0' == 2.0
        encoded_column = airQuality[col].map(lambda raw, table=column_encoder: table[float(raw)])
    df_AQ[col] = encoded_column
df_AQ

Decoding Dataset

In [None]:
# Rebuild the original values from the encoded frame using the per-column decoders
df_AQ_2 = pd.DataFrame()
for col in airQuality.columns:
    column_decoder = decod[col]
    df_AQ_2[col] = df_AQ[col].map(lambda code, table=column_decoder: table[code])
df_AQ_2

## Decode Air Quality Rule

In [None]:
def AQ_decodeRule(decod, df_X, df_Y):
    """Decode an encoded rule and render it as a readable string.

    Parameters
    ----------
    decod : dict
        Maps each variable name to a mapping ``encoded value -> original value``.
    df_X, df_Y : pandas.DataFrame
        Antecedent and consequent of the rule. Each frame is indexed by
        variable name and exposes the columns ``L`` (lower bound) and ``U``
        (upper bound); every cell is decoded with the decoder of its row.

    Returns
    -------
    tuple
        ``(X, Y, s)``: the decoded antecedent, the decoded consequent and the
        text ``"var[L, U], ... --> var[L, U], ..."``.
    """
    def _decode_side(encoded):
        # After transposing, every variable is a column whose name selects its decoder
        per_variable = encoded.T
        decoded = per_variable.apply(
            lambda bounds: bounds.map(lambda code: decod[bounds.name][code])
        )
        return decoded.T

    def _render(side):
        # One "name[lower, upper]" label per variable, comma separated
        labels = side.index + '[' + side.L.astype(str) + ', ' + side.U.astype(str) + ']'
        return ', '.join(labels.tolist())

    X = _decode_side(df_X)
    Y = _decode_side(df_Y)

    #--------------------------
    # Pretty rule
    s = " --> ".join((_render(X), _render(Y)))

    return X, Y, s

# dX,dY,ds = AQ_decodeRule(decod, X,Y) 
# my.myDisplay([dX,dY], axis=1)
# ds

## Support and Confidence

In [2]:
def supp_conf(df_Coded, df_X, df_Y, tempShow=False):
    """Compute support and confidence of the rule X --> Y on an encoded dataset.

    Parameters
    ----------
    df_Coded : pandas.DataFrame
        Encoded dataset; must contain one column per variable used by the rule.
    df_X, df_Y : pandas.DataFrame
        Antecedent and consequent of the rule, indexed by variable name, with
        the columns ``L`` (lower bound) and ``U`` (upper bound). Bounds are
        inclusive.
    tempShow : bool, default False
        When true, display the intermediate frames through ``my.myDisplay``.

    Returns
    -------
    dict
        ``{'supp': ..., 'conf': ...}`` where ``supp`` is the fraction of rows
        satisfying both X and Y and ``conf`` is the fraction of rows satisfying
        X that also satisfy Y. Each is 0 when its denominator is 0.
    """
    # One boolean column per rule variable: is the row inside [L, U]?
    df_temp = pd.DataFrame()
    for bounds in (df_X, df_Y):
        for col in bounds.index:
            # Always read from df_Coded (the consequent's upper bound used to be
            # checked against the global df_AQ instead of the argument).
            values = df_Coded[col]
            lower, upper = bounds.loc[col, 'L'], bounds.loc[col, 'U']
            df_temp[col] = (lower <= values) & (values <= upper)

    #-----------------------------------------
    # Compute the frequency of X and of XY
    freqX = df_temp[df_temp[df_X.index].T.sum() == len(df_X)]  # rows matching X
    freqXY = freqX[freqX[df_Y.index].T.sum() == len(df_Y)]     # rows matching X and Y

    if tempShow:
        my.myDisplay([df_temp, freqX, freqXY], names=['Evaluation:', 'X:', 'XY:'], axis=1)

    n_rows, n_x, n_xy = len(df_Coded), len(freqX), len(freqXY)
    return {'supp': 0 if n_rows == 0 else n_xy / n_rows,
            'conf': 0 if n_x == 0 else n_xy / n_x}

# X,Y,s = c.decodeRule(res)
# suppConf = supp_conf(df_AQ, X, Y, True)
# suppConf

## Test Pipeline 

In [None]:
# Preview the first three rows of the encoded dataset obtained after preprocessing
df_AQ.head(3)

In [None]:
# Select the variables to analyse together with their maximum encoded value.
# The intermediate frame has one row per column of df_AQ: 'index' holds the
# column name and 0 holds its maximum; rows at positions 1..9 are kept.
column_maxima = df_AQ.max().to_frame().reset_index()
df_var = column_maxima.iloc[1:10, :]
df_var

###  ConstraintDefiner Initialization

This class is designed to create an interface between the user and the SAT-like core. 
It takes the variables we are interested in and their maximum values to generate constraints that guide the SAT-like search.

This class tries to find ranges in which a rule could be meaningful. For each variable, a lower bound and an upper bound are defined. The value of each variable (and of its lower and upper bounds) is represented in bits, and its evaluation is done bit by bit.

The constraints implemented are: 

    init_constr(): Describes the structure that every rule must respect and the values' interval (from zero to their maximum values).
    NotSupported(res): Describes the constraint we want to apply in the next search if the obtained rule was not supported. 
    NotConfident(res): Describes the constraint we want to apply in the next search if the obtained rule was not confident.
    Confident(res):    Describes the constraint we want to apply in the next search if the obtained rule was confident.
    reject_solut(res): Describes the constraint we want to apply in the next search if we don't want the same rule again.


In [None]:
# Build the constraint definer from the selected variable names (df_var['index'])
# and their maximum encoded values (df_var[0]).
c = Lark_CD(df_var['index'], df_var[0]) 
# Register the constraints returned by init_constr() before any search is run
c.addConstr( c.init_constr() )  
c.set_seed(2981050372)  # fixed seed, set for reproducibility

At this point, the search behaves like a normal SAT solver. We can choose to optimize the search by defining a priority among the bits (digits) of the variables themselves. 

    digitPrior 'Simple': The priority is defined only between bits that represent the same variable (ex, only between the lower bound's bits or the upper bound's bits).
    digitPrior 'Complete': The priority is defined between all the bits that represent the same range.

In [None]:
# Select the 'Complete' bit-priority mode for the search
c.set_priority(digitPrior='Complete') # options listed by the author: none, 'Simple', 'Complete'

To avoid long rules, we can enable the 'saturate' option, which tries to assign default values to each bit.
The default values chosen for each variable range are zero for the lower bound and the maximum value for the upper bound. When an attribute spans the maximum interval (0-max), that attribute is removed from the rule.

    saturate 'all': Try to saturate all the variables that are not already assigned.
    saturate 'effective': Try to saturate only those bounds that define the same range and that don't have values different from the default ones.

In [None]:
# Select the 'effective' saturation mode for the variables left unassigned
c.set_saturate_freeVars(saturate='effective') # options: None, 'all', 'effective'

In the ConstraintDefiner, each rule is represented as a path in a binary tree. We can choose how to explore this tree (in this test, we only use the option 'random')

In [None]:
# Choose how the children of a tree node are picked during exploration
c.set_childrenChoice('random') # options: 'random', 'min_avgLenPath' or 'min_power'

We can also avoid taking paths that were already discarded by previous runs (always respecting the previous constraints).

In [None]:
# NOTE(review): presumably False keeps the explored tree between runs, so that
# paths discarded earlier are not taken again — confirm against ConstraintDefiner.
c.set_resetTree(False) 

#### Start search

In [None]:
# Minimum support and confidence a rule must reach, and time budget of the test
supp = 0.05
conf = 0.8
duration = '10h' # test durations

#----------------------
# i: index of the current run; allBool: set when the solver returns an empty result
i, allBool = 0, False
# state_Sol: one summary dict per tested rule; data_Solut: the [X, Y] frames of each rule
sol, state_Sol, data_Solut= [], [], []

#--------------------
duration = pd.Timedelta(duration)
startTime = dt.datetime.now() # only for test

#---------------------------
# Keep asking the solver for new rules until the time budget is exhausted
while (dt.datetime.now() - startTime) < duration:
    start_run = dt.datetime.now()
    # Elapsed time since the start of the search (previously printed as a
    # negative timedelta because the operands were swapped).
    print(start_run - startTime)
    nChar = sum(len(constr) for constr in c.constr)
    print(f"Run:{i+1}, #char:{nChar}, Started:{start_run.strftime('%m-%d %H:%M:%S')}, ", end='')
    print("")
    #-----------------------
    # SAT
    res = c.Sat_Solver(test='speed') # only for test, else:  c.Sat_Solver()
    end_run = dt.datetime.now()

    if res is None: break    # no solution
    if len(res)==0:
        allBool=True    # empty result: per the original note, only true/false values are left
        break

    #-----------------------
    # Supp + Conf
    X,Y,s = c.decodeRule(res) # X -> antecedent, Y -> consequent, L -> lower bound, U -> upper bound
    suppConf = supp_conf(df_AQ, X, Y)

    # Classify the rule and add the matching constraint for the next searches
    sol = {'solut':s } | suppConf
    if suppConf['supp'] < supp:
        c.addConstr( c.NotSupported(res) )
        sol |= {'state':'Not supported'}

    elif suppConf['conf'] < conf:
        c.addConstr( c.NotConfident(res) )
        sol |= {'state':'Not confident'}

    else:
        c.addConstr( c.Confident(res) )
        # NOTE(review): 'Confidend' looks like a typo of 'Confident'; it is kept
        # because this value is written to the saved state file.
        sol |= {'state':'Confidend'}

    # Never return this exact rule again
    c.addConstr(  c.reject_solut(res) )
    print(f"runTime:{str(end_run - start_run)[:-7]}, TotTime:{str(end_run-startTime)[:-7]}, State:{sol['state']},")

    #-----------------------------
    # Save sol
    state_Sol.append(sol)

    # Tag every row with the side of the rule it belongs to and with the rule id
    X[['Side','idRule']], Y[['Side','idRule']] = ('X',i),('Y',i)
    data_Solut.append([X,Y])
    i = i+1
    

In [None]:
# Save the constraints accumulated in `c` through save_Constr_txt, passing the name 'AQ_contrsFile'
c.save_Constr_txt( 'AQ_contrsFile' )

In [None]:
# Save the state of the tested rules (one row per run), then show the same table
state_table = pd.DataFrame(state_Sol)
state_table.to_csv('Tfl_state.csv')
state_table

In [None]:
# Save the rules with their range attributes: stack the X and Y frames of each
# rule, index them by rule id, then concatenate all the rules into one table.
per_rule_frames = []
for rule_sides in data_Solut:
    stacked_sides = pd.concat(rule_sides).reset_index().set_index('idRule')
    per_rule_frames.append(stacked_sides)
all_rules = pd.concat(per_rule_frames).reset_index()
all_rules.to_csv('Tfl_solutions.csv',index=False)
all_rules