# First: Review Existing Unstructured Data and Diagram a New Structured Relational Data Model
This script reviews the unstructured JSON data and then it flattens each JSON file to understand the structured relationship between them. An Entity-Relationship (ER) diagram has been drawn for this model and is attched in the pdf format.

## Import Libraries

In [1]:
import pandas as pd
import json

## Load the JSON data

In [4]:
# Function to load a JSON file into a list
def load_json_file(file_path):
    
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = [json.loads(line) for line in file]  # Read each line and parse as JSON
        print(f"JSON Loaded Successfully from {file_path}")
        return data
    except json.JSONDecodeError as e:
        print(f"Error loading JSON from {file_path}: {e}")  # Handle JSON formatting errors
        return None
    except FileNotFoundError:
        print(f"File not found: {file_path}")   # Handle missing files
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}") # Catch any other unexpected errors
        return None
    
# Load JSON files into Python lists
data_users = load_json_file("users.json")
data_brands = load_json_file("brands.json")
data_receipts = load_json_file("receipts.json")

JSON Loaded Successfully from users.json
JSON Loaded Successfully from brands.json
JSON Loaded Successfully from receipts.json


## Print the preview of JSON data and then convert into Pandas dataframe

In [21]:
# # Print the loaded JSON data to verify contents
print(data_users[:5])  # Prints list of user objects

[{'_id': {'$oid': '5ff1e194b6a9d73a3a9f1052'}, 'active': True, 'createdDate': {'$date': 1609687444800}, 'lastLogin': {'$date': 1609687537858}, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI'}, {'_id': {'$oid': '5ff1e194b6a9d73a3a9f1052'}, 'active': True, 'createdDate': {'$date': 1609687444800}, 'lastLogin': {'$date': 1609687537858}, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI'}, {'_id': {'$oid': '5ff1e194b6a9d73a3a9f1052'}, 'active': True, 'createdDate': {'$date': 1609687444800}, 'lastLogin': {'$date': 1609687537858}, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI'}, {'_id': {'$oid': '5ff1e1eacfcf6c399c274ae6'}, 'active': True, 'createdDate': {'$date': 1609687530554}, 'lastLogin': {'$date': 1609687530597}, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI'}, {'_id': {'$oid': '5ff1e194b6a9d73a3a9f1052'}, 'active': True, 'createdDate': {'$date': 1609687444800}, 'lastLogin': {'$date': 1609687537858}, 'role': 'consumer', 'signUpSource': 'Email', 

In [6]:
## Convert JSON files into Pandas DataFrames for analysis

df_users = pd.read_json("users.json", lines=True) 
print(df_users.head()) # Display the first few rows of each DataFrame

                                    _id  active               createdDate  \
0  {'$oid': '5ff1e194b6a9d73a3a9f1052'}    True  {'$date': 1609687444800}   
1  {'$oid': '5ff1e194b6a9d73a3a9f1052'}    True  {'$date': 1609687444800}   
2  {'$oid': '5ff1e194b6a9d73a3a9f1052'}    True  {'$date': 1609687444800}   
3  {'$oid': '5ff1e1eacfcf6c399c274ae6'}    True  {'$date': 1609687530554}   
4  {'$oid': '5ff1e194b6a9d73a3a9f1052'}    True  {'$date': 1609687444800}   

                  lastLogin      role signUpSource state  
0  {'$date': 1609687537858}  consumer        Email    WI  
1  {'$date': 1609687537858}  consumer        Email    WI  
2  {'$date': 1609687537858}  consumer        Email    WI  
3  {'$date': 1609687530597}  consumer        Email    WI  
4  {'$date': 1609687537858}  consumer        Email    WI  


In [22]:
print(data_brands[:5])  # Prints list of JSON objects

[{'_id': {'$oid': '601ac115be37ce2ead437551'}, 'barcode': '511111019862', 'category': 'Baking', 'categoryCode': 'BAKING', 'cpg': {'$id': {'$oid': '601ac114be37ce2ead437550'}, '$ref': 'Cogs'}, 'name': 'test brand @1612366101024', 'topBrand': False}, {'_id': {'$oid': '601c5460be37ce2ead43755f'}, 'barcode': '511111519928', 'brandCode': 'STARBUCKS', 'category': 'Beverages', 'categoryCode': 'BEVERAGES', 'cpg': {'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, '$ref': 'Cogs'}, 'name': 'Starbucks', 'topBrand': False}, {'_id': {'$oid': '601ac142be37ce2ead43755d'}, 'barcode': '511111819905', 'brandCode': 'TEST BRANDCODE @1612366146176', 'category': 'Baking', 'categoryCode': 'BAKING', 'cpg': {'$id': {'$oid': '601ac142be37ce2ead437559'}, '$ref': 'Cogs'}, 'name': 'test brand @1612366146176', 'topBrand': False}, {'_id': {'$oid': '601ac142be37ce2ead43755a'}, 'barcode': '511111519874', 'brandCode': 'TEST BRANDCODE @1612366146051', 'category': 'Baking', 'categoryCode': 'BAKING', 'cpg': {'$id': {'$oid': '6

In [23]:
df_brands = pd.read_json("brands.json", lines=True)  # Handles line-by-line JSON
print(df_brands.head(5))

                                    _id       barcode        category  \
0  {'$oid': '601ac115be37ce2ead437551'}  511111019862          Baking   
1  {'$oid': '601c5460be37ce2ead43755f'}  511111519928       Beverages   
2  {'$oid': '601ac142be37ce2ead43755d'}  511111819905          Baking   
3  {'$oid': '601ac142be37ce2ead43755a'}  511111519874          Baking   
4  {'$oid': '601ac142be37ce2ead43755e'}  511111319917  Candy & Sweets   

       categoryCode                                                cpg  \
0            BAKING  {'$id': {'$oid': '601ac114be37ce2ead437550'}, ...   
1         BEVERAGES  {'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, ...   
2            BAKING  {'$id': {'$oid': '601ac142be37ce2ead437559'}, ...   
3            BAKING  {'$id': {'$oid': '601ac142be37ce2ead437559'}, ...   
4  CANDY_AND_SWEETS  {'$id': {'$oid': '5332fa12e4b03c9a25efd1e7'}, ...   

                        name  topBrand                      brandCode  
0  test brand @1612366101024       0.0      

In [25]:
print(data_receipts[:2])  # Prints list of JSON objects

[{'_id': {'$oid': '5ff1e1eb0a720f0523000575'}, 'bonusPointsEarned': 500, 'bonusPointsEarnedReason': 'Receipt number 2 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)', 'createDate': {'$date': 1609687531000}, 'dateScanned': {'$date': 1609687531000}, 'finishedDate': {'$date': 1609687531000}, 'modifyDate': {'$date': 1609687536000}, 'pointsAwardedDate': {'$date': 1609687531000}, 'pointsEarned': '500.0', 'purchaseDate': {'$date': 1609632000000}, 'purchasedItemCount': 5, 'rewardsReceiptItemList': [{'barcode': '4011', 'description': 'ITEM NOT FOUND', 'finalPrice': '26.00', 'itemPrice': '26.00', 'needsFetchReview': False, 'partnerItemId': '1', 'preventTargetGapPoints': True, 'quantityPurchased': 5, 'userFlaggedBarcode': '4011', 'userFlaggedNewItem': True, 'userFlaggedPrice': '26.00', 'userFlaggedQuantity': 5}], 'rewardsReceiptStatus': 'FINISHED', 'totalSpent': '26.00', 'userId': '5ff1e1eacfcf6c399c274ae6'}, {'_id': {'$oid': '5ff1e1bb0a720f052300056b'}, 'bonusPointsEarned': 1

In [26]:
df_receipts = pd.read_json("receipts.json", lines=True)  # Handles line-by-line JSON
print(df_receipts.head(5))

                                    _id  bonusPointsEarned  \
0  {'$oid': '5ff1e1eb0a720f0523000575'}              500.0   
1  {'$oid': '5ff1e1bb0a720f052300056b'}              150.0   
2  {'$oid': '5ff1e1f10a720f052300057a'}                5.0   
3  {'$oid': '5ff1e1ee0a7214ada100056f'}                5.0   
4  {'$oid': '5ff1e1d20a7214ada1000561'}                5.0   

                             bonusPointsEarnedReason  \
0  Receipt number 2 completed, bonus point schedu...   
1  Receipt number 5 completed, bonus point schedu...   
2                         All-receipts receipt bonus   
3                         All-receipts receipt bonus   
4                         All-receipts receipt bonus   

                 createDate               dateScanned  \
0  {'$date': 1609687531000}  {'$date': 1609687531000}   
1  {'$date': 1609687483000}  {'$date': 1609687483000}   
2  {'$date': 1609687537000}  {'$date': 1609687537000}   
3  {'$date': 1609687534000}  {'$date': 1609687534000}   
4  {'

## Script to get the Metadata for each file, to understand the details about data

In [11]:
import os

# Function to get metadata description of the JSON file
def get_metadata(file_path, expected_keys=None):
   
    # Get the file size
    file_size = os.path.getsize(file_path) # Get file size in bytes
    
    # Get number of records 
    num_records = len(data_users)
    
    # Get keys from all records to understand the structure
    all_keys_found = set()
    
    # Extract all unique keys from the JSON objects
    for record in data_users:
        all_keys_found.update(record.keys())
         
    # Store metadata details in a dictionary
    metadata = {
        'file_size_bytes': file_size,
        'num_records': num_records,
        'all_keys_found': list(all_keys_found),
    }

    return metadata

# Path to your JSON file
file_path = 'users.json'  # Modify with the correct file path

# Get and print the metadata description
metadata = get_metadata(file_path)
print(metadata)

{'file_size_bytes': 88257, 'num_records': 495, 'all_keys_found': ['state', '_id', 'active', 'createdDate', 'role', 'signUpSource', 'lastLogin']}


In [12]:
df_users.head()

Unnamed: 0,_id,active,createdDate,lastLogin,role,signUpSource,state
0,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
1,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
2,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
3,{'$oid': '5ff1e1eacfcf6c399c274ae6'},True,{'$date': 1609687530554},{'$date': 1609687530597},consumer,Email,WI
4,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI


In [13]:
# Function to get metadata description of the JSON file
def get_metadata(file_path, expected_keys=None):
   
    # Get the file size
    file_size = os.path.getsize(file_path)
    
    # Get number of records (assuming the file is an array of objects)
    num_records = len(data_brands)
    
    # Get keys from all records to understand the structure
    all_keys_found = set()
    for record in data_users:
        all_keys_found.update(record.keys())
         
    metadata = {
        'file_size_bytes': file_size,
        'num_records': num_records,
        'all_keys_found': list(all_keys_found),
    }

    return metadata

# Path to your JSON file
file_path = 'brands.json'  # Modify with the correct file path

# Get and print the metadata description
metadata = get_metadata(file_path)
print(metadata)

{'file_size_bytes': 265784, 'num_records': 1167, 'all_keys_found': ['state', '_id', 'active', 'createdDate', 'role', 'signUpSource', 'lastLogin']}


In [14]:
df_brands.head()

Unnamed: 0,_id,barcode,category,categoryCode,cpg,name,topBrand,brandCode
0,{'$oid': '601ac115be37ce2ead437551'},511111019862,Baking,BAKING,"{'$id': {'$oid': '601ac114be37ce2ead437550'}, ...",test brand @1612366101024,0.0,
1,{'$oid': '601c5460be37ce2ead43755f'},511111519928,Beverages,BEVERAGES,"{'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, ...",Starbucks,0.0,STARBUCKS
2,{'$oid': '601ac142be37ce2ead43755d'},511111819905,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146176,0.0,TEST BRANDCODE @1612366146176
3,{'$oid': '601ac142be37ce2ead43755a'},511111519874,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146051,0.0,TEST BRANDCODE @1612366146051
4,{'$oid': '601ac142be37ce2ead43755e'},511111319917,Candy & Sweets,CANDY_AND_SWEETS,"{'$id': {'$oid': '5332fa12e4b03c9a25efd1e7'}, ...",test brand @1612366146827,0.0,TEST BRANDCODE @1612366146827


In [15]:
# Function to get metadata description of the JSON file
def get_metadata(file_path, expected_keys=None):
   
    # Get the file size
    file_size = os.path.getsize(file_path)
    
    # Get number of records (assuming the file is an array of objects)
    num_records = len(data_receipts)
    
    # Get keys from all records to understand the structure
    all_keys_found = set()
    for record in data_users:
        all_keys_found.update(record.keys())
         
    metadata = {
        'file_size_bytes': file_size,
        'num_records': num_records,
        'all_keys_found': list(all_keys_found),
    }

    return metadata

# Path to your JSON file
file_path = 'receipts.json'  # Modify with the correct file path

# Get and print the metadata description
metadata = get_metadata(file_path)
print(metadata)

{'file_size_bytes': 2414680, 'num_records': 1119, 'all_keys_found': ['state', '_id', 'active', 'createdDate', 'role', 'signUpSource', 'lastLogin']}


In [16]:
df_receipts.head()

Unnamed: 0,_id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId
0,{'$oid': '5ff1e1eb0a720f0523000575'},500.0,"Receipt number 2 completed, bonus point schedu...",{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687536000},{'$date': 1609687531000},500.0,{'$date': 1609632000000},5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6
1,{'$oid': '5ff1e1bb0a720f052300056b'},150.0,"Receipt number 5 completed, bonus point schedu...",{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687488000},{'$date': 1609687483000},150.0,{'$date': 1609601083000},2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052
2,{'$oid': '5ff1e1f10a720f052300057a'},5.0,All-receipts receipt bonus,{'$date': 1609687537000},{'$date': 1609687537000},,{'$date': 1609687542000},,5.0,{'$date': 1609632000000},1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b
3,{'$oid': '5ff1e1ee0a7214ada100056f'},5.0,All-receipts receipt bonus,{'$date': 1609687534000},{'$date': 1609687534000},{'$date': 1609687534000},{'$date': 1609687539000},{'$date': 1609687534000},5.0,{'$date': 1609632000000},4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.0,5ff1e1eacfcf6c399c274ae6
4,{'$oid': '5ff1e1d20a7214ada1000561'},5.0,All-receipts receipt bonus,{'$date': 1609687506000},{'$date': 1609687506000},{'$date': 1609687511000},{'$date': 1609687511000},{'$date': 1609687506000},5.0,{'$date': 1609601106000},2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052


## Normalize the JSON file and Print preview

In [17]:
# Normalize JSON to flatten nested structures into tabular format
users_df = pd.json_normalize(data_users)

# Display the normalized DataFrame
users_df.head()  # Show the first few rows of the dataframe

Unnamed: 0,active,role,signUpSource,state,_id.$oid,createdDate.$date,lastLogin.$date
0,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0
1,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0
2,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0
3,True,consumer,Email,WI,5ff1e1eacfcf6c399c274ae6,1609687530554,1609688000000.0
4,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0


In [18]:
# Normalize the nested objects and arrays
brands_df = pd.json_normalize(data_brands)

# Display the normalized DataFrame
brands_df.head()  # Show the first few rows of the dataframe

Unnamed: 0,barcode,category,categoryCode,name,topBrand,_id.$oid,cpg.$id.$oid,cpg.$ref,brandCode
0,511111019862,Baking,BAKING,test brand @1612366101024,False,601ac115be37ce2ead437551,601ac114be37ce2ead437550,Cogs,
1,511111519928,Beverages,BEVERAGES,Starbucks,False,601c5460be37ce2ead43755f,5332f5fbe4b03c9a25efd0ba,Cogs,STARBUCKS
2,511111819905,Baking,BAKING,test brand @1612366146176,False,601ac142be37ce2ead43755d,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146176
3,511111519874,Baking,BAKING,test brand @1612366146051,False,601ac142be37ce2ead43755a,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146051
4,511111319917,Candy & Sweets,CANDY_AND_SWEETS,test brand @1612366146827,False,601ac142be37ce2ead43755e,5332fa12e4b03c9a25efd1e7,Cogs,TEST BRANDCODE @1612366146827


In [19]:
# Normalize the nested objects and arrays
receipts_df = pd.json_normalize(data_receipts)

# Display the normalized DataFrame
receipts_df.head()  # Show the first few rows of the dataframe

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,_id.$oid,createDate.$date,dateScanned.$date,finishedDate.$date,modifyDate.$date,pointsAwardedDate.$date,purchaseDate.$date
0,500.0,"Receipt number 2 completed, bonus point schedu...",500.0,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6,5ff1e1eb0a720f0523000575,1609687531000,1609687531000,1609688000000.0,1609687536000,1609688000000.0,1609632000000.0
1,150.0,"Receipt number 5 completed, bonus point schedu...",150.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052,5ff1e1bb0a720f052300056b,1609687483000,1609687483000,1609687000000.0,1609687488000,1609687000000.0,1609601000000.0
2,5.0,All-receipts receipt bonus,5.0,1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b,5ff1e1f10a720f052300057a,1609687537000,1609687537000,,1609687542000,,1609632000000.0
3,5.0,All-receipts receipt bonus,5.0,4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.0,5ff1e1eacfcf6c399c274ae6,5ff1e1ee0a7214ada100056f,1609687534000,1609687534000,1609688000000.0,1609687539000,1609688000000.0,1609632000000.0
4,5.0,All-receipts receipt bonus,5.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052,5ff1e1d20a7214ada1000561,1609687506000,1609687506000,1609688000000.0,1609687511000,1609688000000.0,1609601000000.0
