The code below is the metaDataFrame class for capturing the metadata from the previous loops and recording them. There will be strategies that are based on particular scenarios, i.e. High-Variance (HV), Low-Value-Low-Variance (LV2), and consistent-negative-change (CNC).

In [1]:
import numpy as np
import pandas as pd

class metaDataFrame(object):
    def __init__(self, inferIdx):
        self.df = pd.DataFrame(index=inferIdx)
        
    # Function: Assign the ground truth
    # This is only used for experimentation purposes, as the ground truth label is normally not available for training data.
    def assignGroundTruth(self, groundTruth):
        gt_idx = [row[0] for row in groundTruth]
        gt_series = pd.Series([row[1] for row in groundTruth], index=gt_idx)
        
        self.df['ground_truth'] = gt_series
        
    # Function: Record the metadata from the AL loop
    def appendMetadata(self, metaType, iterNum, inferResults):
        class_name = 'class_' + str(iterNum)
        meta_name = metaType + '_' + str(iterNum)
        
        infer_idx = [row[0] for row in inferResults]
        infer_pred = pd.Series([row[2] for row in inferResults], index=infer_idx)
        infer_prob = pd.Series([row[3] for row in inferResults], index=infer_idx)
        
        # Create df from the prediction and probability series
        frame = { class_name: infer_pred, meta_name: infer_prob } 
        infer_df = pd.DataFrame(frame)
        
        self.df = self.df.merge(infer_df, how='left', left_index=True, right_index=True)
    
    # Function: Calculate the mean of the metadata
    def calculateAverage(self, metaType, k):
        loops = [col for col in self.df.columns.tolist() if metaType in col]
        
        # Check that the meta dataframe has at least 2 metadata cols and >= k
        if len(loops) >= 2 and len(loops) >= k:
            metadata_avg = []
            
            mask = [col for col in self.df.columns.tolist() if metaType in col][-k:]
            metadata_avg = self.df[mask].apply(lambda x: x.mean(), axis=1)
        else:
            print('The meta dataframe has no iterations record to compute the average.')
        
        return metadata_avg
    
    # Function: Calculate the variance in the metadata
    def calculateVariance(self, metaType, k):
        loops = [col for col in self.df.columns.tolist() if metaType in col]
        
        # Check that the meta dataframe has at least 2 metadata cols and >= k
        if len(loops) >= 2 and len(loops) >= k:
            metadata_var = []
            
            mask = [col for col in self.df.columns.tolist() if metaType in col][-k:]
            metadata_var = self.df[mask].apply(lambda x: x.var(), axis=1)
        else:
            print('The meta dataframe has no iterations record to compute the variance.')
        
        return metadata_var
    
    # Function: Calculate the net difference in the metadata of k loops
    def calculateNetDiff(self, metaType, k):
        loops = [col for col in self.df.columns.tolist() if metaType in col]
        
        # Check that the dataframe has at least 2 metadata cols and >= k
        if len(loops) >= 2 and len(loops) >= k:
            metadata_diff = []
            
            mask = [col for col in self.df.columns.tolist() if metaType in col][-k:]            
            metadata_diff = self.df[mask].diff(axis=1)
            net_diff = metadata_diff[metadata_diff.columns[-k+1:]].values.tolist()
            
#             metadata_diff = self.df[mask].apply(lambda x: [x[i+1] - x[i] for i in range(len(x)-1)])
#             metadata_netdiff = metadata_diff.sum()
        else:
            print('The meta dataframe has no iterations record to compute the net difference.')
        
        return  net_diff
    
    # Function: Create a list of predicted classes across loops
    def createPredClassList(self, k):
        loops = [col for col in self.df.columns.tolist() if 'class' in col]
        
        # Check that the meta dataframe has at least 2 metadata cols and >= k
        if len(loops) >= 2 and len(loops) >= k:
            mask = [col for col in self.df.columns.tolist() if 'class' in col][-k:]
            pred_classes = self.df[mask].values.tolist()
        
        return pred_classes


Functions to identify the data points that fall into specific multi-gen scenarios.

In [10]:
# Function: Output the indexes of the data that pertain to the High-Variance (HV) scenario according to the set params.
def queryHV(varList, varThresh):
    hv_bool = varList >= varThresh
    hv_index = hv_bool[hv_bool].index
    
    return pd.Series(hv_index)


# Function: Output the indexes of the data that pertain to the Low-Value-Low-Variance (LV2) scenario according to the set params.
def queryLV2(avgList, varList, avgThresh, varThresh):
    lv_bool = (avgList <= avgThresh) & (varList <= varThresh)
    lv2_index = lv_bool[lv_bool].index
    
    return pd.Series(lv2_index)


# Function: Output the indexes of the data that pertain to the Consistent-Negative-Change (CNC) scenario according to the set params.
def queryCNC(net_diff, negThresh, changeThresh):
    # Logic here to differentiate negative differences
    # 1. Instead of an actual count, the threshold may have to be a ratio? > 0.6? # of diff / k
    # 2. Weight average of the differences
    # Maybe make this an if statement with choices?
    k = len(net_diff[0])
    change_list = np.array(net_diff) <= negThresh
    change_count = np.sum(change_list, 1)
    coc_bool = change_count >= changeThresh
    coc_index = np.where(coc_bool)
    
    return pd.Series(coc_index)
    

# Function: Output the indexes of the data that pertain to the Change-of-Class (CoC) scenario according to the set params.
def queryCoC(predClasses, changeThresh):
    change_list = np.array([[(row[i] != row[i+1]) for i in range(len(row)-1)] for row in predClasses])
    change_count = np.sum(change_list, 1)
    coc_bool = change_count >= changeThresh
    coc_index = list(*np.where(coc_bool))
    
    return pd.Series(coc_index)

In [3]:
# Low hanging fruit = we have experiment results, both Fashion-MNIST and facial recog.
# We don't even know whether these 4 scenarios > baseline.

# HV, LV2, CNC, CoC: high hanging fruit, nice to have.
# Build a greedy algorithm, that evaluates every one of these 4 scenarios at every loop.-> how much of an improvement in model performance.
# 1. Take the most effective approach at every loop after the evaluation
# 2. A weighted average of all the approaches, where it learns the weights over time.
# Computationally heavy. <- not sure if it can handle the load.

Test with mock up data

In [4]:
# Sample test data to test the different multi-gen scenario functions
# Sample scenarios:
# 1. Not informative
# 2. HV
# 3. LV2
# 4. CNC
# 5. CoC
meta_df = metaDataFrame([1, 2, 3, 5, 6])

meta_df.appendMetadata('confidence', 1, [[1, 'class2', 'class2', .55], [2, 'class2', 'class2', .92], [3, 'class2', 'class2', .12], [5, 'class2', 'class2', .78], [6, 'class2', 'class2', .82]])
meta_df.appendMetadata('confidence', 2, [[1, 'class1', 'class2', .68], [2, 'class1', 'class1', .30], [3, 'class1', 'class2', .22], [5, 'class1', 'class2', .75], [6, 'class1', 'class1', .68]])
meta_df.appendMetadata('confidence', 3, [[1, 'class1', 'class2', .81], [2, 'class1', 'class2', .68], [3, 'class1', 'class1', .15], [5, 'class1', 'class2', .67], [6, 'class1', 'class2', .72]])
meta_df.appendMetadata('confidence', 4, [[1, 'class2', 'class2', .90], [2, 'class2', 'class2', .72], [3, 'class2', 'class2', .19], [5, 'class2', 'class2', .51], [6, 'class2', 'class1', .88]])
meta_df.appendMetadata('confidence', 5, [[1, 'class2', 'class2', .89], [2, 'class2', 'class1', .37], [3, 'class2', 'class2', .21], [5, 'class2', 'class2', .52], [6, 'class2', 'class2', .90]])
meta_df.assignGroundTruth([[1, 'class2'], [2, 'class1'], [3, 'class1'], [5, 'class2'], [6, 'class2']])

meta_df.df

Unnamed: 0,class_1,confidence_1,class_2,confidence_2,class_3,confidence_3,class_4,confidence_4,class_5,confidence_5,ground_truth
1,class2,0.55,class2,0.68,class2,0.81,class2,0.9,class2,0.89,class2
2,class2,0.92,class1,0.3,class2,0.68,class2,0.72,class1,0.37,class1
3,class2,0.12,class2,0.22,class1,0.15,class2,0.19,class2,0.21,class1
5,class2,0.78,class2,0.75,class2,0.67,class2,0.51,class2,0.52,class2
6,class2,0.82,class1,0.68,class2,0.72,class1,0.88,class2,0.9,class2


In [5]:
# Test the different class helper functions
var_list = meta_df.calculateVariance('confidence', 3)
print(var_list)

avg_list = meta_df.calculateAverage('confidence', 3)
print(avg_list)

pred_classes = meta_df.createPredClassList(3)
print(pred_classes)

1    0.002433
2    0.036700
3    0.000933
5    0.008033
6    0.009733
dtype: float64
1    0.866667
2    0.590000
3    0.183333
5    0.566667
6    0.833333
dtype: float64
[['class2', 'class2', 'class2'], ['class2', 'class2', 'class1'], ['class1', 'class2', 'class2'], ['class2', 'class2', 'class2'], ['class2', 'class1', 'class2']]


In [9]:
# HV scenario test
HV_test = queryHV(var_list, 0.001)
print(HV_test)

# LV2 scenario test
LV2_test = queryLV2(avg_list, var_list, .4, 0.001)
print(LV2_test)

# CNC scenario test
CNC_test = queryCNC(net_diff, -.01, 2)
print(CNC_test)

# CoC scenario test
CoC_test = queryCoC(pred_classes, 1)
print(CoC_test)

0    1
1    2
2    5
3    6
dtype: int64
0    3
dtype: int64
0    [1, 3]
dtype: object
0    1
1    2
2    4
dtype: int64


In [7]:
print(meta_df.calculateNetDiff('confidence', 5))

[[0.13, 0.13, 0.08999999999999997, -0.010000000000000009], [-0.6200000000000001, 0.38000000000000006, 0.039999999999999925, -0.35], [0.1, -0.07, 0.04000000000000001, 0.01999999999999999], [-0.030000000000000027, -0.07999999999999996, -0.16000000000000003, 0.010000000000000009], [-0.1399999999999999, 0.039999999999999925, 0.16000000000000003, 0.020000000000000018]]


In [8]:
net_diff = meta_df.calculateNetDiff('confidence', 5)
print(queryCNC(net_diff, -.01, 2))

0    [1, 3]
dtype: object
