In [None]:
import json
import pandas as pd
import numpy as np
import os

In [None]:
import logging

# Route INFO-and-above records to run.log, each line prefixed with a timestamp
# and the level name. basicConfig only takes effect when the root logger has
# no handlers configured yet.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='run.log',
)


In [1]:
def files_to_df(dir_name):
    """Load every file in ``dir_name`` into a single combined DataFrame.

    Each file is read with ``json.loads(json.load(f))``, i.e. it must contain a
    JSON-encoded *string* whose content is itself a JSON object with a
    ``'data'`` key. The value under ``'data'`` is turned into a DataFrame.

    Parameters
    ----------
    dir_name : str
        Directory whose entries are all read as JSON files.

    Returns
    -------
    pandas.DataFrame
        Rows of all files stacked with a fresh 0..n-1 index, plus an ``image``
        column filled with NaN as a placeholder for image paths. An empty
        directory yields an empty frame that still has the ``image`` column.

    Raises
    ------
    OSError
        If ``dir_name`` or one of its entries cannot be read.
    json.JSONDecodeError, KeyError, TypeError
        If a file does not have the double-encoded ``{'data': ...}`` layout.
    """
    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the result is reproducible between runs and machines.
    files = sorted(os.listdir(dir_name))
    logging.info(f"{len(files)} files found in {dir_name}")

    frames = []
    for name in files:
        filename = os.path.join(dir_name, name)
        with open(filename, 'r') as f:
            data = json.loads(json.load(f))['data']
        frames.append(pd.DataFrame(data))

    # Concatenate once instead of re-copying the accumulated frame on every
    # iteration (quadratic). pd.concat raises on an empty list, so fall back to
    # an empty frame when the directory has no files.
    full_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # add a column of `null` values for where images will be stored
    full_df['image'] = np.nan

    return full_df

def check_for_images(df, query):
    """Fill ``df['image']`` with the image path of each row whose file exists.

    For every row the candidate path is
    ``full_data/images/query_{query}/{hash_id}.jpg`` (relative to the current
    working directory). Rows whose file exists get that path; all other rows
    keep whatever value they already had.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``hash_id`` column. Modified in place.
    query : str
        Query identifier used in the image directory name.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    image_dir = f"full_data/images/query_{query}"

    # Work on a plain list and assign the column in one step. The previous
    # `df['image'][i] = ...` was chained assignment, which triggers
    # SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
    if 'image' in df.columns:
        images = df['image'].tolist()
    else:
        images = [float('nan')] * len(df)

    found_any = False
    # Iterate positionally so the frame's index labels do not matter.
    for pos, hash_id in enumerate(df['hash_id']):
        filename = f"{image_dir}/{hash_id}.jpg"  # name the image using its hash_id
        if os.path.exists(filename):
            images[pos] = filename
            found_any = True

    # Only rewrite the column when something changed, so a frame with no
    # matching images keeps its original column and dtype untouched.
    if found_any:
        df['image'] = images
    return df

def df_to_json(query):
    """Build the combined DataFrame for ``query`` and export it as JSON.

    Reads every file under ``raw_data/query_{query}`` via ``files_to_df``,
    fills the ``image`` column via ``check_for_images``, and writes the result
    to ``full_data/query_{query}.json`` (creating ``full_data`` if needed).

    Parameters
    ----------
    query : str
        Query identifier used in the input and output paths.

    Returns
    -------
    pandas.DataFrame
        The frame that was exported.
    """
    logging.info(f"Converting json files in raw_data/query_{query} to dataframe.")
    df = files_to_df(f'raw_data/query_{query}')

    # Populate image paths before exporting. Without this step the exported
    # `image` column is always null, because files_to_df only creates the
    # placeholder column.
    df = check_for_images(df, query)
    logging.debug(f"Shape of dataframe: {df.shape}")

    # convert dataframe back to json and export it
    os.makedirs('full_data', exist_ok=True)
    df.to_json(f"full_data/query_{query}.json")  # returns None when given a path
    logging.info("Export complete.")

    return df

In [None]:
# Read the query identifier from query.txt; it selects the raw_data/ and
# full_data/ sub-paths used below.
with open('query.txt', 'r') as f:
    query = f.read()
query = query.strip()  # drop surrounding whitespace such as a trailing newline

In [None]:
# Build the combined frame and attach image paths so `df` is available in the
# notebook namespace. Note that df_to_json(query) re-reads raw_data itself; it
# does not receive this `df`.
df = check_for_images(files_to_df(f'raw_data/query_{query}'), query)
df_to_json(query)