All the libraries, modules, imports and constants used in this file.

In [109]:
from json import load
import pandas as pd # type: ignore 
import mlxtend # type: ignore

# Path of the JSON dataset that main() passes to readFile.
FILE_INPUT = "modified_coco.json"
# Path of the toy dataset that main() passes to readToyDatasetFile.
FILE_INPUT_TOY = "toy_dataset.txt"
# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
THRESHOLD = 1
# When truthy, main() loads FILE_INPUT_TOY instead of FILE_INPUT.
READ_TOY_DATA = False
# Forwarded as `min_support` to fpgrowth / apriori in main().
MIN_SUPPORT = 0.02
# Forwarded as `max_len` to fpgrowth / apriori in main().
MAX_LEN_ITEMSETS = None
# When truthy, main() also runs apriori after fpgrowth.
USE_APRIORI = 0

This function reads a toy dataset from a file; the dataset is used to try out the Apriori principle.

In [110]:
def readToyDatasetFile(file: str) -> list[set]:
    """Read a comma-separated toy transaction file.

    Every non-blank line of ``file`` is one transaction whose items are
    separated by commas.

    Args:
        file: Path of the UTF-8 text file to read.

    Returns:
        A two-element list ``[transactions, singleItems]``: ``transactions``
        is a set of item tuples (identical lines collapse into one entry)
        and ``singleItems`` is the set of every distinct item seen.
    """
    transactions: set[tuple[str, ...]] = set()
    singleItems: set[str] = set()
    with open(file, "r", encoding="UTF-8") as fp:
        for line in fp:
            stripped = line.strip()
            # A blank line would otherwise turn into the bogus transaction
            # ('',) and register '' as an item.
            if not stripped:
                continue
            transaction = tuple(stripped.split(","))
            transactions.add(transaction)
            singleItems.update(transaction)

    return [transactions, singleItems]

This function reads the COCO dataset from a JSON file.

In [111]:
def readFile(file: str) -> tuple[list[dict], set[str]]:
    """Load the JSON dataset and collect every distinct annotation.

    Args:
        file: Path of the UTF-8 JSON file. Its content is iterated as a
            sequence of records, each providing an ``'annotations'``
            iterable.

    Returns:
        A ``(records, labels)`` tuple: the parsed JSON content and the set
        of all annotation values found across the records.

    Raises:
        KeyError: If a record has no ``'annotations'`` key.
    """
    with open(file, "r", encoding="UTF-8") as fp:
        records = load(fp)

    labels = set()
    for record in records:
        labels.update(record["annotations"])
    return records, labels

This function builds the boolean item matrix as a pandas data frame: one row per image (indexed by image id) and one column per item, so most entries are False.

In [112]:
def createDataFrame(data: list[dict], columnsName: list[str]) -> pd.DataFrame:
    """Build the boolean record/item matrix.

    Args:
        data: Records providing an ``'image_id'`` value and an
            ``'annotations'`` iterable of hashable labels.
        columnsName: Labels to use as columns, in the desired order.

    Returns:
        A DataFrame indexed by ``image_id`` with one column per entry of
        ``columnsName``; a cell is True when the record's annotations
        contain that label.
    """
    # Build each record's annotation set once so every membership test is a
    # hash lookup instead of a rescan of the annotation list per column.
    annotationSets = [set(element['annotations']) for element in data]
    rows = [[column in annotations for column in columnsName] for annotations in annotationSets]
    imageIds = [element['image_id'] for element in data]
    return pd.DataFrame(rows, index=imageIds, columns=columnsName)

This function counts how many records of the dataset contain each itemset whose length is at most `length`.

In [113]:
def countOccurrences(data: list[dict], items: set[tuple], length: int) -> dict[tuple, int]:
    """Count the records that contain each itemset of at most ``length`` items.

    Args:
        data: Records providing an ``'annotations'`` iterable of hashable
            labels.
        items: Candidate itemsets (any sized iterables of labels, e.g.
            tuples or frozensets).
        length: Itemsets with more than ``length`` elements are skipped.

    Returns:
        A dict mapping ``tuple(itemset)`` to the number of records whose
        annotations contain every label of that itemset. Itemsets are
        compared as sets, so repeated labels inside one itemset are ignored.
    """
    # Convert every record's annotations once instead of once per itemset.
    rowSets = [set(row['annotations']) for row in data]

    counts = {}
    for item in items:
        if len(item) > length:
            continue
        wanted = set(item)
        counts[tuple(item)] = sum(1 for annotations in rowSets if wanted <= annotations)
    return counts

This function searches for the pictures that contain all the given characteristics.

In [114]:
def searchPictureWithItemSet(data:list[dict], words:set[str])->list[int, str, str]:
    """Find the records whose annotations include every requested word.

    Args:
        data: Records providing ``'file_name'``, ``'image_id'`` and an
            ``'annotations'`` iterable.
        words: Labels that must all be present in a record's annotations.

    Returns:
        One ``[position, file_name, image_id]`` list per matching record,
        where ``position`` is the record's index in ``data``; matches keep
        the order of ``data``.
    """
    matches = []
    for position, picture in enumerate(data):
        # Guard clause: skip records missing at least one requested word.
        if not words.issubset(set(picture['annotations'])):
            continue
        matches.append([position, picture['file_name'], picture['image_id']])
    return matches

This is the main function of the program; it controls the whole flow:
<ol>
<li>Reads the toy data set or the new data set</li>
<li>Creates the data frame from the given data</li>
<li>Executes the FP-Growth algorithm</li>
<li>Executes the Apriori algorithm (only when USE_APRIORI is enabled)</li>
<li>Searches for pictures matching a given description</li>
<li>Counts and prints the occurrences in the dataset of the itemsets found by FP-Growth or Apriori (this step is currently disabled in the code)</li>
</ol>

In [115]:
def _toyRecords(transactions) -> list[dict]:
    """Wrap toy transactions into the record shape used by the pipeline."""
    # Sorting gives every transaction a stable position to use as its id;
    # the toy data has no file names, so an empty placeholder is stored.
    return [
        {'image_id': position, 'file_name': '', 'annotations': list(transaction)}
        for position, transaction in enumerate(sorted(transactions))
    ]


def main() -> None:
    """Run the frequent-itemset pipeline and print its results.

    Steps:
        1. Load the toy dataset (READ_TOY_DATA truthy) or the JSON dataset.
        2. Build the boolean record/item data frame, rows sorted by image id.
        3. Run fpgrowth and print the number of itemsets and the table.
        4. When USE_APRIORI is truthy, do the same with apriori.
        5. Print every record that contains a fixed set of labels.
    """
    # Import the algorithms from the subpackage explicitly: a bare
    # `import mlxtend` is not guaranteed to expose `mlxtend.frequent_patterns`
    # as an attribute, which makes `mlxtend.frequent_patterns.fpgrowth` fail
    # in a fresh interpreter.
    from mlxtend.frequent_patterns import apriori, fpgrowth  # type: ignore

    if READ_TOY_DATA:
        # The toy reader yields plain tuples, while the rest of the flow
        # indexes records by 'image_id' / 'annotations'.
        transactions, singleItems = readToyDatasetFile(FILE_INPUT_TOY)
        data = _toyRecords(transactions)
    else:
        data, singleItems = readFile(FILE_INPUT)

    matrix = createDataFrame(sorted(data, key=lambda record: record['image_id']), sorted(singleItems))

    fp = fpgrowth(df=matrix, min_support=MIN_SUPPORT, use_colnames=True, max_len=MAX_LEN_ITEMSETS)
    print(len(fp), "\n", fp.to_string())

    if USE_APRIORI:
        ap = apriori(df=matrix, min_support=MIN_SUPPORT, use_colnames=True, max_len=MAX_LEN_ITEMSETS)
        print(len(ap), "\n", ap.to_string())

    # The search runs on `data` in its loaded order, so the printed position
    # refers to that order, not to the sorted rows of `matrix`.
    for picture_id in searchPictureWithItemSet(data, {'baseball bat', 'baseball glove', 'bench', 'person'}):
        print(picture_id)

    # Disabled step (kept for reference). It needs `ap`, which only exists
    # when USE_APRIORI is truthy.
    #
    # occurrences = countOccurrences(data, set(ap.itemsets), length=2)
    #
    # for item in sorted(list(map(lambda x:[x, occurrences[x]], occurrences.keys())), key=lambda y:y[1], reverse=True):
    #     print(f"item {list(item[0])} : {item[1]}")
    #
    # # The same thing actually
    # print("\n")
    # for item in sorted(zip(fp.support, fp.itemsets), key=lambda x:x[0], reverse=True):
    #     print(f"item {list(item[1])} : {int(item[0]*len(data))}")


main()

144 
      support                                       itemsets
0     0.5886                                       (person)
1     0.4338                                        (bench)
2     0.0276                                          (dog)
3     0.1332                                    (stop sign)
4     0.0492                                        (train)
5     0.3230                                (traffic light)
6     0.3704                                          (car)
7     0.1286                                        (truck)
8     0.0912                                          (bus)
9     0.0554                                (parking meter)
10    0.0332                                        (clock)
11    0.0852                                     (backpack)
12    0.0762                                      (bicycle)
13    0.0386                                 (dining table)
14    0.0200                                         (bird)
15    0.0474                      