## In this codebase, I will transform raw files from a MongoDB collection by flattening the data, normalizing it, and structuring it into relational tables.

In [1]:
import pandas as pd
import json

# 1. Users Raw File

In [2]:
# Load the users export, which holds one JSON document per line.
# - encoding is pinned to UTF-8 so decoding does not depend on the
#   platform's default locale encoding.
# - blank lines (e.g. a trailing newline at the end of the file) are skipped,
#   because json.loads raises on an empty string.

users_file_path = "../raw_files/users.json"
with open(users_file_path, "r", encoding="utf-8") as file:
    users_data = [json.loads(row) for row in file if row.strip()]

In [3]:
# Flatten every raw user document into a flat dict, one per user.
# Optional fields fall back to None when the document does not carry them.


def _ms_field_to_date(record, field):
    """Return record[field]["$date"] (epoch milliseconds) as a date, or None if the field is absent."""
    if field not in record:
        return None
    return pd.to_datetime(record[field]["$date"], unit='ms').date()


users_cleansed_data = [
    {
        "userID": entry["_id"]["$oid"],
        "signUpSource": entry.get("signUpSource"),
        "state": entry.get("state"),
        "active": entry.get("active"),
        "role": entry.get("role"),
        "createdDate": _ms_field_to_date(entry, "createdDate"),
        "lastLogin": _ms_field_to_date(entry, "lastLogin"),
    }
    for entry in users_data
]

In [4]:
# build the users DataFrame from the flattened records (one row per record)
users_df = pd.DataFrame(data=users_cleansed_data)

In [5]:
# counting rows that repeat an earlier row in every column
duplicate_mask = users_df.duplicated()
duplicated_entries = duplicate_mask.sum()
print(duplicated_entries)

283


In [6]:
# removing fully duplicated rows, keeping the first occurrence of each
users_df = users_df.drop_duplicates(keep="first")

In [7]:
# arranging the columns in the order used for the output table
users_column_order = [
    "userID",
    "signUpSource",
    "state",
    "active",
    "role",
    "createdDate",
    "lastLogin",
]
users_df = users_df.reindex(columns=users_column_order)

In [8]:
# writing the cleansed users table to CSV, without the DataFrame index
users_output_path = "../cleansed_file/users_cleansed_data.csv"
users_df.to_csv(users_output_path, index=False)

# 2. Brands Raw File

In [9]:
# Load the brands export, which holds one JSON document per line.
# - encoding is pinned to UTF-8 so decoding does not depend on the
#   platform's default locale encoding.
# - blank lines (e.g. a trailing newline at the end of the file) are skipped,
#   because json.loads raises on an empty string.
brands_file_path = "../raw_files/brands.json"

with open(brands_file_path, "r", encoding="utf-8") as file:
    brands_data = [json.loads(row) for row in file if row.strip()]

In [10]:
# Flatten every raw brand document into a flat dict, one per brand.
# Fields that are absent from a document are stored as None.


def _str_or_none(value):
    """Return str(value), but keep None as None.

    A plain str() call would turn a missing value into the literal text
    "None", which would then be written to the CSV as if it were real data.
    """
    return None if value is None else str(value)


brands_cleansed_data = []
for entry in brands_data:
    # "cpg" is a nested reference object holding "$id" -> "$oid" and "$ref"
    cpg = entry.get("cpg", {})
    brands_cleaned_entry = {
        "brandID": entry.get("_id", {}).get("$oid"),
        "brandName": entry.get("name"),
        # barcode / brandCode are kept as text, but only when actually present
        "barcode": _str_or_none(entry.get("barcode")),
        "brandCode": _str_or_none(entry.get("brandCode")),
        "category": entry.get("category"),
        "categoryCode": entry.get("categoryCode"),
        "cpgID": cpg.get("$id", {}).get("$oid"),
        "cpgRef": cpg.get("$ref"),
        "topBrand": entry.get("topBrand"),
    }
    brands_cleansed_data.append(brands_cleaned_entry)

In [11]:
# build the brands DataFrame from the flattened records (one row per record)
brands_df = pd.DataFrame(data=brands_cleansed_data)

In [12]:
# counting rows that repeat an earlier row in every column
duplicate_mask = brands_df.duplicated()
duplicated_entries = duplicate_mask.sum()
print(duplicated_entries)

0


In [13]:
# arranging the columns in the order used for the output table
brands_column_order = [
    "brandID",
    "barcode",
    "brandName",
    "brandCode",
    "category",
    "categoryCode",
    "cpgID",
    "cpgRef",
    "topBrand",
]
brands_df = brands_df.reindex(columns=brands_column_order)

In [14]:
# writing the cleansed brands table to CSV, without the DataFrame index
brands_output_path = "../cleansed_file/brands_cleansed_data.csv"
brands_df.to_csv(brands_output_path, index=False)

# 3. Receipt Raw File

In [15]:
# Load the receipts export, which holds one JSON document per line.
# - encoding is pinned to UTF-8 so decoding does not depend on the
#   platform's default locale encoding.
# - blank lines (e.g. a trailing newline at the end of the file) are skipped,
#   because json.loads raises on an empty string.
receipts_file_path = "../raw_files/receipts.json"

with open(receipts_file_path, "r", encoding="utf-8") as file:
    receipts_data = [json.loads(row) for row in file if row.strip()]

#### ***Note***: Each receipt has a rewardsReceiptItemList column that stores all the items on that receipt. When flattened, this column has item-level granularity (one row per item within a receipt), whereas the receipts table has receipt-level granularity (one row per receipt). The rewardsReceiptItemList is therefore separated into its own DataFrame, with the receipt ID as the foreign key back to the receipts table.

#### Because the keys in rewardsReceiptItemList are dynamic (each item carries only its own subset of keys), we first collect every unique key. This set is used later to populate the item rows, so that every row has the same columns and any key an item lacks is filled with None.

In [16]:
# gathering the union of field names used by any item in any receipt's
# rewardsReceiptItemList (receipts without the list contribute nothing)
all_item_keys = {
    field
    for receipt in receipts_data
    if "rewardsReceiptItemList" in receipt
    for item in receipt["rewardsReceiptItemList"]
    for field in item.keys()
}

In [17]:
# number of distinct item-level fields found across all receipts
item_key_count = len(all_item_keys)
print(item_key_count)

34


In [18]:
# Flatten the receipts into two row sets:
#   - receipts_cleansed_data: one dict per receipt (receipt-level fields)
#   - items_cleansed_data:    one dict per entry of rewardsReceiptItemList,
#                             carrying the receiptID to link back to its receipt
# Fields that are absent from a document are stored as None.


def _mongo_date_to_timestamp(record, field):
    """Return record[field]["$date"] (epoch milliseconds) as a pandas Timestamp, or None if the field is absent."""
    if field not in record:
        return None
    return pd.to_datetime(record[field]["$date"], unit='ms')


receipts_cleansed_data = []
items_cleansed_data = []

# all_item_keys is a set of strings, whose iteration order can differ from one
# interpreter run to the next (string hash randomisation). Sorting it once
# gives the item table a stable, reproducible column order.
item_columns = sorted(all_item_keys)

for entry in receipts_data:
    receipt_id = entry["_id"]["$oid"]

    # flattening receipt-level data
    receipt_cleaned_entry = {
        "receiptID": receipt_id,
        "bonusPointsEarned": entry.get("bonusPointsEarned"),
        "bonusPointsEarnedReason": entry.get("bonusPointsEarnedReason"),
        "createDate": _mongo_date_to_timestamp(entry, "createDate"),
        "dateScanned": _mongo_date_to_timestamp(entry, "dateScanned"),
        "finishedDate": _mongo_date_to_timestamp(entry, "finishedDate"),
        "modifyDate": _mongo_date_to_timestamp(entry, "modifyDate"),
        "pointsAwardedDate": _mongo_date_to_timestamp(entry, "pointsAwardedDate"),
        "pointsEarned": entry.get("pointsEarned"),
        "purchaseDate": _mongo_date_to_timestamp(entry, "purchaseDate"),
        "purchasedItemCount": entry.get("purchasedItemCount"),
        "rewardsReceiptStatus": entry.get("rewardsReceiptStatus"),
        "totalSpent": entry.get("totalSpent"),
        # the raw documents spell this key "userId"; the output column is "userID"
        "userID": entry.get("userId"),
    }
    receipts_cleansed_data.append(receipt_cleaned_entry)

    # flattening item-level data; a missing (or null) item list yields no rows
    for item in entry.get("rewardsReceiptItemList") or []:
        item_cleaned_entry = {"receiptID": receipt_id}  # link item to receipt
        for key in item_columns:
            item_cleaned_entry[key] = item.get(key)  # None when the item lacks this key
        items_cleansed_data.append(item_cleaned_entry)

In [19]:
# build one DataFrame for receipt-level rows and one for item-level rows
receipts_df, items_df = (
    pd.DataFrame(rows)
    for rows in (receipts_cleansed_data, items_cleansed_data)
)

In [20]:
# counting receipt rows that repeat an earlier row in every column
duplicate_mask = receipts_df.duplicated()
duplicated_entries = duplicate_mask.sum()
print(duplicated_entries)

0


In [21]:
# writing the receipt-level and item-level tables to CSV, without the index
for frame, destination in (
    (receipts_df, "../cleansed_file/cleansed_receipts.csv"),
    (items_df, "../cleansed_file/cleansed_receipt_items.csv"),
):
    frame.to_csv(destination, index=False)