In [34]:
import ijson
import json
import numpy as np
import pandas as pd
import os
from pandas.io.json import json_normalize

In [35]:
# Input JSON files, one per provider; all paths are relative to the working directory.
aws_json_path = './inputs/aws_FRIENDS.json'
azure_json_path = './inputs/azure_FRIENDS.json'
google_json_path = './inputs/Google/FRIENDS - Hulu (Japan)_celebrity.json'

In [36]:
class AmazonJSONParser:
    """Flatten an AWS video-analysis JSON file into pandas DataFrames.

    The file is read for three top-level arrays: ``Celebrities``, ``Labels``
    and ``ModerationLabels``. Each ``get_*_info`` method loads one of them and
    returns one row per detection with ``source`` set to ``'amazon'`` and a
    ``timestamp`` equal to the raw ``Timestamp`` divided by 1000
    (presumably milliseconds to seconds -- confirm against the AWS docs).

    When an array is missing or empty, the corresponding method returns an
    empty DataFrame that still carries the expected column names.
    """

    # Column layouts used for the empty-result frames.
    _CELEBRITY_COLUMNS = ['url', 'name', 'id', 'confidence', 'bounding_box', 'timestamp', 'source']
    _LABEL_COLUMNS = ['name', 'parents', 'source', 'timestamp', 'confidence', 'bounding_box']
    _MODERATION_COLUMNS = ['timestamp', 'confidence', 'name', 'parent_category', 'source']

    # Columns contributed by the per-instance records of a label.
    _INSTANCE_COLUMNS = ['Confidence', 'BoundingBox.Left', 'BoundingBox.Width',
                         'BoundingBox.Top', 'BoundingBox.Height']

    def __init__(self, aws_json_path):
        """Remember the path of the AWS JSON file; nothing is read yet."""
        self.aws_json_path = aws_json_path

    def _load_items(self, prefix):
        """Return all values found under the ijson ``prefix`` as a list."""
        # ijson parses bytes; a text-mode handle forces a re-encode and a warning.
        with open(self.aws_json_path, 'rb') as f:
            return list(ijson.items(f, prefix))

    @staticmethod
    def _bounding_box(row, prefix):
        """Convert Left/Top/Width/Height fields into left/right/top/bottom edges.

        ``prefix`` is the column-name prefix (including the trailing dot) under
        which the four fields are stored in ``row``.
        """
        left = row[prefix + 'Left']
        top = row[prefix + 'Top']
        return {'left': left,
                'right': left + row[prefix + 'Width'],
                'top': top,
                'bottom': top + row[prefix + 'Height']}

    def __add_bounding_box_celebrity(self, row):
        """Build the edge dictionary for one celebrity row."""
        return self._bounding_box(row, 'Celebrity.Face.BoundingBox.')

    def get_celebrity_info(self):
        """Return one row per celebrity detection.

        Columns: ``url``, ``name``, ``id``, ``confidence``, ``bounding_box``
        (dict of edges), ``timestamp`` and ``source``.
        """
        items = self._load_items('Celebrities.item')
        if not items:
            return pd.DataFrame(columns=self._CELEBRITY_COLUMNS)

        df = pd.json_normalize(items)
        df['bounding_box'] = df.apply(self.__add_bounding_box_celebrity, axis=1)
        df['timestamp'] = df['Timestamp'] / 1000
        df['source'] = 'amazon'

        # The face-detail columns are never used, so a record lacking one of
        # them must not abort the parse; the other columns were read above.
        df = df.drop(columns=['Celebrity.Face.Landmarks', 'Celebrity.Face.Pose.Roll',
                              'Celebrity.Face.Pose.Yaw', 'Celebrity.Face.Pose.Pitch',
                              'Celebrity.Face.Quality.Brightness',
                              'Celebrity.Face.Quality.Sharpness',
                              'Celebrity.Face.Confidence',
                              'Celebrity.Face.BoundingBox.Left',
                              'Celebrity.Face.BoundingBox.Width',
                              'Celebrity.Face.BoundingBox.Top',
                              'Celebrity.Face.BoundingBox.Height',
                              'Timestamp'],
                     errors='ignore')
        return df.rename(columns={"Celebrity.Urls": "url", "Celebrity.Name": "name",
                                  "Celebrity.Id": "id", "Celebrity.Confidence": "confidence"})

    def __handle_bounding_box_labels(self, row):
        """Build the edge dictionary for one label row, or None if it has no box."""
        if pd.isna(row['BoundingBox.Left']):
            return None
        return self._bounding_box(row, 'BoundingBox.')

    def get_label_info(self):
        """Return one row per label instance.

        A label with several instances yields one row per instance, each with
        that instance's confidence and bounding box. A label without
        instances yields a single row whose ``bounding_box`` is None and whose
        ``confidence`` falls back to the label-level value.
        """
        items = self._load_items('Labels.item')
        if not items:
            return pd.DataFrame(columns=self._LABEL_COLUMNS)

        df = pd.json_normalize(items)
        df['source'] = 'amazon'
        df['timestamp'] = df['Timestamp'] / 1000

        # One row per instance; an empty instance list explodes to NaN, which
        # is replaced by {} so the records can be flattened uniformly.
        df = df.explode('Label.Instances').reset_index(drop=True)
        instances = [x if isinstance(x, dict) else {} for x in df['Label.Instances']]
        df['Label.Instances'] = instances
        df = df.join(pd.json_normalize(instances))

        # If no label carried any instance, the flattened frame has no columns.
        for col in self._INSTANCE_COLUMNS:
            if col not in df.columns:
                df[col] = float('nan')

        # Prefer the instance confidence, else the label-level one.
        df['confidence'] = df['Confidence'].combine_first(df['Label.Confidence'])
        df['bounding_box'] = df.apply(self.__handle_bounding_box_labels, axis=1)

        df = df.drop(columns=['Timestamp', 'Label.Confidence', 'Label.Instances']
                     + self._INSTANCE_COLUMNS)
        return df.rename(columns={"Label.Name": "name", "Label.Parents": "parents"})

    def get_logo_info(self):
        """Unsupported for this source; returns None."""
        #unsupported
        return None

    def get_text_info(self):
        """Unsupported for this source; returns None."""
        #unsupported
        return None

    def get_speech_info(self):
        """Unsupported for this source; returns None."""
        #unsupported
        return None

    def get_shots_info(self):
        """Unsupported for this source; returns None."""
        #unsupported
        return None

    def get_content_moderation_info(self):
        """Return one row per moderation label.

        Columns: ``timestamp``, ``confidence``, ``name``, ``parent_category``
        and ``source``.
        """
        items = self._load_items('ModerationLabels.item')
        if not items:
            return pd.DataFrame(columns=self._MODERATION_COLUMNS)

        df = pd.json_normalize(items)
        df['source'] = 'amazon'
        df['Timestamp'] = df['Timestamp'] / 1000
        return df.rename(columns={"Timestamp": "timestamp", "ModerationLabel.Name": "name",
                                  "ModerationLabel.Confidence": "confidence",
                                  "ModerationLabel.ParentName": "parent_category"})


In [37]:
# Build the AWS parser once; later cells reuse `amazonParser`.
amazonParser = AmazonJSONParser(aws_json_path)
aws_celeb_info = amazonParser.get_celebrity_info()
# Bare last expression so the notebook renders the frame.
aws_celeb_info

Unnamed: 0,url,name,id,confidence,bounding_box,timestamp,source
0,[www.imdb.com/name/nm0001612],Matthew Perry,2i3Ga9e,100,"{'left': 0.4437499940395355, 'right': 0.533333...",0.0,amazon
1,[www.imdb.com/name/nm0001455],Matt LeBlanc,43be2M,50,"{'left': 0.5604166388511658, 'right': 0.649999...",0.0,amazon
2,[www.imdb.com/name/nm0001612],Matthew Perry,2i3Ga9e,98,"{'left': 0.42500001192092896, 'right': 0.51041...",0.458,amazon
3,[www.imdb.com/name/nm0001455],Matt LeBlanc,43be2M,100,"{'left': 0.5572916865348816, 'right': 0.650000...",0.458,amazon
4,[www.imdb.com/name/nm0001612],Matthew Perry,2i3Ga9e,89,"{'left': 0.4333333373069763, 'right': 0.529166...",0.959,amazon
5,[www.imdb.com/name/nm0001455],Matt LeBlanc,43be2M,98,"{'left': 0.5541666746139526, 'right': 0.648958...",0.959,amazon
6,[www.imdb.com/name/nm0001435],Lisa Kudrow,2aM9z3i,76,"{'left': 0.684374988079071, 'right': 0.8281249...",3.962,amazon
7,[www.imdb.com/name/nm0001435],Lisa Kudrow,2aM9z3i,50,"{'left': 0.7552083134651184, 'right': 0.889583...",4.462,amazon
8,[www.imdb.com/name/nm0001435],Lisa Kudrow,2aM9z3i,95,"{'left': 0.7916666865348816, 'right': 0.928125...",4.963,amazon
9,[www.imdb.com/name/nm0001455],Matt LeBlanc,43be2M,97,"{'left': 0.41458332538604736, 'right': 0.57812...",10.468,amazon


In [38]:
# Label detections; relies on `amazonParser` created in an earlier cell.
aws_label_info = amazonParser.get_label_info()
aws_label_info

Unnamed: 0,name,parents,source,timestamp,confidence,bounding_box
0,Dungeon,[],amazon,0.000,53.042518615722656,
1,Human,[],amazon,0.000,98.46746063232422,
2,Person,[],amazon,0.000,98.38488006591797,"{'left': 0.3312615156173706, 'right': 0.523564..."
3,Person,[],amazon,0.000,98.06590270996094,"{'left': 0.6007581353187561, 'right': 0.791674..."
4,Person,[],amazon,0.000,96.37962341308594,"{'left': 0.38436955213546753, 'right': 0.55190..."
...,...,...,...,...,...,...
985,Logo,[{'Name': 'Symbol'}],amazon,29.863,64.84605407714844,
986,Symbol,[],amazon,29.863,71.2292709350586,
987,Text,[],amazon,29.863,87.5086669921875,
988,Trademark,[{'Name': 'Symbol'}],amazon,29.863,64.84605407714844,


In [39]:
# Content-moderation detections; relies on `amazonParser` created in an earlier cell.
aws_moderation_info = amazonParser.get_content_moderation_info()
aws_moderation_info

Unnamed: 0,timestamp,confidence,name,parent_category,source
0,6.965,67.12017059326172,Violence,,amazon
1,6.965,67.12017059326172,Weapon Violence,Violence,amazon
2,7.465,74.79823303222656,Violence,,amazon
3,7.465,71.15692901611328,Weapon Violence,Violence,amazon
4,22.981,66.28107452392578,Violence,,amazon


In [40]:
# Round-trip each frame through pandas' JSON encoder so the values handed to
# json.dump are plain Python types, then write all sections to a single file.
restructured_json = {
    section: json.loads(frame.to_json(orient='records'))
    for section, frame in (
        ('celebrities', aws_celeb_info),
        ('labels', aws_label_info),
        ('content_moderation', aws_moderation_info),
    )
}

with open('restructured_aws.json', "w") as write_file:
    json.dump(restructured_json, write_file, indent=4)