# Document Model JSON Files Creation

- First of all we need to import the necessary libraries.

In [1]:
import json

import pandas as pd

- We will read the stores datasets.

In [2]:
# Load the three size variants of the stores dataset (50k, 100k and full).
stores_df_50k, stores_df_100k, stores_df = map(
    pd.read_csv,
    (
        './datasets/stores_50k.csv',
        './datasets/stores_100k.csv',
        './datasets/stores_full.csv',
    ),
)

- We will create a file containing a JSON array that will be used by `mongoimport` to insert the stores into the `Stores` collection in the database.

In [3]:
def create_stores_json_array(df, dataset_size):
    """Write every store in ``df`` to ``./json_data/{dataset_size}/stores.json`` as one JSON array.

    Each row becomes an object with ``id``, ``name``, ``description``, ``address``,
    ``city``, ``state``, ``postal_code`` and a GeoJSON-style ``location`` point whose
    coordinates are ``[longitude, latitude]``.

    Args:
        df: DataFrame providing the columns ``id``, ``name``, ``description``,
            ``address``, ``city``, ``state``, ``postal_code``, ``longitude`` and
            ``latitude``.
        dataset_size: Name of the sub-directory of ``./json_data`` to write into
            (the directory must already exist).

    Raises:
        FileNotFoundError: If ``./json_data/{dataset_size}`` does not exist.
        KeyError: If one of the expected columns is missing from ``df``.

    Note:
        Numeric fields (``id`` and the coordinates) are interpolated as-is, so a
        missing value (NaN) in one of them still yields a non-JSON ``nan`` token.
    """
    def quote(value):
        # json.dumps escapes quotes, backslashes and control characters; raw
        # interpolation would copy them verbatim and corrupt the JSON document.
        # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file.
        return json.dumps(str(value), ensure_ascii=False)

    last_position = len(df) - 1
    with open(f'./json_data/{dataset_size}/stores.json', 'w', encoding='utf-8') as stores_file:
        stores_file.write('[\n')
        # Decide the separator from the row *position*: index labels are not
        # guaranteed to be 0..n-1 (e.g. after filtering or sampling the frame).
        for position, (_, store) in enumerate(df.iterrows()):
            separator = ',' if position != last_position else ''
            stores_file.write(
                '  {\n'
                f'    "id": {store["id"]},\n'
                f'    "name": {quote(store["name"])},\n'
                f'    "description": {quote(store["description"])},\n'
                f'    "address": {quote(store["address"])},\n'
                f'    "city": {quote(store["city"])},\n'
                f'    "state": {quote(store["state"])},\n'
                f'    "postal_code": {quote(store["postal_code"])},\n'
                '    "location": {\n'
                '      "type": "Point",\n'
                f'      "coordinates": [{store["longitude"]}, {store["latitude"]}]\n'
                '    }\n'
                f'  }}{separator}\n'
            )
        stores_file.write(']')

In [4]:
# Export one stores.json per dataset size.
for size_label, stores_frame in (
    ('50k', stores_df_50k),
    ('100k', stores_df_100k),
    ('full', stores_df),
):
    create_stores_json_array(stores_frame, dataset_size=size_label)

- We will read the products datasets.

In [5]:
# Load the three size variants of the products dataset (50k, 100k and full).
products_df_50k, products_df_100k, products_df = map(
    pd.read_csv,
    (
        './datasets/products_50k.csv',
        './datasets/products_100k.csv',
        './datasets/products_full.csv',
    ),
)

- We will create 4 roughly equal-sized files (the last file also receives any remainder rows), each containing a JSON array that will be used by `mongoimport` to insert the products into the `Products` collection in the database.

In [6]:
def create_products_json_arrays(df, dataset_size, num_files=4):
    """Split ``df`` into ``num_files`` consecutive chunks and write each as a JSON array.

    Chunk ``i`` (1-based) is written to
    ``./json_data/{dataset_size}/products_{i}.json``. Every chunk holds
    ``len(df) // num_files`` rows, except the last one, which also receives the
    remainder rows.

    Args:
        df: DataFrame providing the columns ``id``, ``name``, ``description``,
            ``price``, ``calories``, ``protein``, ``carbs``, ``fat`` and ``store_id``.
        dataset_size: Name of the sub-directory of ``./json_data`` to write into
            (the directory must already exist).
        num_files: Number of output files to produce; must be non-zero.

    Raises:
        ZeroDivisionError: If ``num_files`` is 0.
        FileNotFoundError: If ``./json_data/{dataset_size}`` does not exist.
        KeyError: If one of the expected columns is missing from ``df``.

    Note:
        Numeric fields are interpolated as-is, so a missing value (NaN) in one of
        them still yields a non-JSON ``nan`` token.
    """
    def quote(value):
        # json.dumps escapes quotes, backslashes and control characters; raw
        # interpolation would copy them verbatim and corrupt the JSON document.
        # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file.
        return json.dumps(str(value), ensure_ascii=False)

    chunk_size = len(df) // num_files
    for i in range(num_files):
        start = i * chunk_size
        # The final chunk absorbs the rows left over by the integer division.
        end = (i + 1) * chunk_size if i < num_files - 1 else len(df)
        chunk = df.iloc[start:end]
        last_position = len(chunk) - 1
        with open(f'./json_data/{dataset_size}/products_{i+1}.json', 'w', encoding='utf-8') as products_file:
            products_file.write('[\n')
            # Decide the separator from the row *position* inside the chunk: index
            # labels are not guaranteed to match positions in ``df``.
            for position, (_, product) in enumerate(chunk.iterrows()):
                separator = ',' if position != last_position else ''
                products_file.write(
                    '  {\n'
                    f'    "id": {product["id"]},\n'
                    f'    "name": {quote(product["name"])},\n'
                    f'    "description": {quote(product["description"])},\n'
                    f'    "price": {product["price"]},\n'
                    f'    "calories": {product["calories"]},\n'
                    f'    "protein": {product["protein"]},\n'
                    f'    "carbs": {product["carbs"]},\n'
                    f'    "fat": {product["fat"]},\n'
                    f'    "store_id": {product["store_id"]}\n'
                    f'  }}{separator}\n'
                )
            products_file.write(']')

In [7]:
# Export the product JSON files for each dataset size.
for size_label, products_frame in (
    ('50k', products_df_50k),
    ('100k', products_df_100k),
    ('full', products_df),
):
    create_products_json_arrays(products_frame, dataset_size=size_label)