In [2]:
import h5py
import glob 
import os
import numpy as np

In [3]:
def removeDuplicates(thumbnail):
    
    seen_labels = []
    
    boxes = thumbnail["boxes"]
    scores= thumbnail["scores"]
    labels= thumbnail["labels"]
    
    new_boxes = []
    new_scores= []
    new_labels= []
    
    for idx,label in enumerate(labels):
        
        if label not in seen_labels:
            seen_labels.append(label)
            #np.append(new_boxes,boxes[idx],axis=0)
            #np.append(new_scores,scores[idx],axis=0)
            #np.append(new_labels,[label],axis=0)
            new_boxes.append(boxes[idx])
            new_scores.append(scores[idx])
            new_labels.append(label)
    
    new_boxes = np.array(new_boxes)
    new_scores= np.array(new_scores)
    new_labels= np.array(new_labels)
    
    return {"boxes":new_boxes,"scores":new_scores,"labels":new_labels}  

In [4]:
COCO_INSTANCE_CATEGORY_NAMES = [
    '_background_', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

dict_keys = ['CA','DE','FR','GB','IN','JP','KR','MX','RU','US']
dset = {}
reduced_dset = {}

for f in glob.glob('data/*.{}'.format('h5')):
    with h5py.File(f, 'r') as h5_file: 
        
        class_name = os.path.splitext(os.path.basename(f))[0]
        all_thumbnails = []
        reduced_thumbnails = []
        
        for key in h5_file.keys():
            
            boxes = h5_file[key]['boxes'][:]
            scores = h5_file[key]['scores'][:]
            labels = [COCO_INSTANCE_CATEGORY_NAMES[i] for i in h5_file[key]['labels'][:]]
            
            one_thumbnail = {"boxes":boxes,"scores":scores,"labels":labels}
            
            all_thumbnails.append(one_thumbnail)
            reduced_thumbnails.append(removeDuplicates(one_thumbnail))
            
            dset[class_name] = all_thumbnails
            reduced_dset[class_name] = reduced_thumbnails

In [5]:
#TODO: Do frequent itemset analysis on labels for each country
#Get frequent itemsets for each country?
#Maybe FP growth

In [6]:
import pandas as pd
import numpy as np

In [7]:
all_dict = []
for country in reduced_dset:
    country_dict = dict.fromkeys(COCO_INSTANCE_CATEGORY_NAMES,0)
    for datapoint in reduced_dset[country]:
        for label in datapoint["labels"]:
            country_dict[label] += 1
    all_dict.append(country_dict)

df = pd.DataFrame(all_dict,index=dict_keys)

In [8]:
df = df.transpose()

In [9]:
df

Unnamed: 0,CA,DE,FR,GB,IN,JP,KR,MX,RU,US
_background_,0,0,0,0,0,0,0,0,0,0
person,19437,22288,21604,2808,12466,6866,9083,20478,23812,5490
bicycle,399,471,617,70,128,192,133,420,470,131
car,5241,6421,6257,706,3343,2551,2798,5774,7544,1477
motorcycle,312,349,466,30,95,159,70,275,416,81
...,...,...,...,...,...,...,...,...,...,...
vase,3275,3515,3684,460,2114,1240,1290,4032,3921,884
scissors,1007,1005,1138,108,556,330,256,1263,1218,265
teddy bear,2989,3155,3333,373,1926,1332,1216,3694,3691,779
hair drier,1410,1578,1694,202,716,477,604,1649,1771,368


In [10]:
['CA','DE','FR','GB','IN','JP','KR','MX','RU','US']

videoCounts = pd.DataFrame({    'CA':[len(dset["CA"])],
                                'DE':[len(dset["DE"])],
                                'FR':[len(dset["FR"])],
                                'GB':[len(dset["GB"])],
                                'IN':[len(dset["IN"])],
                                'JP':[len(dset["JP"])],
                                'KR':[len(dset["KR"])],
                                'MX':[len(dset["MX"])],
                                'RU':[len(dset["RU"])],
                                'US':[len(dset["US"])]})

In [11]:
videoCounts

Unnamed: 0,CA,DE,FR,GB,IN,JP,KR,MX,RU,US
0,20464,23743,22954,2973,12930,7425,9645,22001,26197,5977


In [12]:
df.nlargest(20,columns=df.columns)

Unnamed: 0,CA,DE,FR,GB,IN,JP,KR,MX,RU,US
person,19437,22288,21604,2808,12466,6866,9083,20478,23812,5490
tv,16521,19131,18660,2432,10620,5977,8279,17497,20128,4801
laptop,11802,13586,13299,1628,7574,4293,6042,12582,14576,3382
chair,11148,12507,12530,1689,7096,4120,5307,11317,12966,3048
cell phone,8973,9419,9491,1141,6402,2620,3799,8979,9770,2295
dining table,7727,8415,8391,916,4533,2961,3322,7973,9822,2086
train,7537,9805,9413,1117,6817,3632,4536,9201,11374,2118
cup,7517,7974,8166,955,4657,2681,3236,7922,8353,2035
book,7174,8041,7615,895,4530,2130,2991,7520,8630,2018
cat,6735,8017,7936,1162,3559,2931,3024,7893,9176,2183


## PART 2: Frequency Itemset Analysis

In [15]:
!pip install mlxtend

Collecting mlxtend
  Using cached https://files.pythonhosted.org/packages/52/04/c362f34f666f0ddc7cf593805e64d64fa670ed96fd9302e68549dd48287d/mlxtend-0.17.0-py2.py3-none-any.whl
Installing collected packages: mlxtend
Successfully installed mlxtend-0.17.0


In [20]:
onehot_coded = {} 

from mlxtend.frequent_patterns import apriori, association_rules,fpgrowth
from tqdm import tqdm,trange

In [36]:
'''
for country in reduced_dset:
    country_DF = pd.DataFrame(0,index=np.arange(len(reduced_dset[country])),columns=COCO_INSTANCE_CATEGORY_NAMES)
    for idx in trange(len(dset[country])):
        datapoint = dset[country][idx]
        country_DF.loc[idx,datapoint['labels']] = 1
    onehot_coded[country] = country_DF
'''

100%|██████████| 20464/20464 [14:41<00:00, 23.22it/s]
100%|██████████| 23743/23743 [21:01<00:00, 18.82it/s]
100%|██████████| 22954/22954 [15:57<00:00, 23.96it/s]
100%|██████████| 2973/2973 [00:17<00:00, 165.21it/s]
100%|██████████| 12930/12930 [05:12<00:00, 41.39it/s]
100%|██████████| 7425/7425 [01:49<00:00, 67.82it/s]
100%|██████████| 9645/9645 [03:05<00:00, 51.94it/s]
100%|██████████| 22001/22001 [17:17<00:00, 21.21it/s]
100%|██████████| 26197/26197 [26:20<00:00, 16.57it/s]
100%|██████████| 5977/5977 [01:07<00:00, 88.07it/s]


In [86]:

for t in trange(len(onehot_coded)):
    country = dict_keys[t]
    onehot_coded[country].to_csv('{}.csv'.format(country),index=False)


100%|██████████| 10/10 [00:09<00:00,  1.07it/s]


In [87]:
loaded_onehot = {} 
for t in trange(len(dict_keys)):
    country = dict_keys[t]
    loaded_onehot[country] = pd.read_csv('{}.csv'.format(country))

100%|██████████| 10/10 [00:00<00:00, 15.68it/s]


In [88]:
country_itemsets = {} 

for t in trange(len(dict_keys)):
    country = dict_keys[t]

    current_df = loaded_onehot[country]
    freqLists = fpgrowth(current_df, min_support=0.1, use_colnames=True, max_len=None, verbose=0)
    country_itemsets[country] = freqLists

100%|██████████| 10/10 [00:07<00:00,  1.32it/s]


In [89]:
for country in country_itemsets.keys():
    curr = country_itemsets[country]
    name = "freqsets/{}_freqsets.csv".format(country)
    curr.to_csv(name,index=False)

In [92]:
a = country_itemsets["CA"]

In [95]:
a.loc[0,'itemsets']

frozenset({'tv'})