# Third: Evaluate Data Quality Issues in the Data Provided
The purpose of this script is to uncover potential data issues related to missing values, incorrect data formats, inconsistencies, out-of-range values, and logical errors. The script performs several checks on the data and provides insights into where and how data quality issues might be impacting analysis.

## Import Libraries

In [2]:
import pandas as pd
import json

## Load the JSON data

In [3]:
def load_json_file(file_path):
    """Load a line-delimited JSON file into a list of parsed records.

    Each non-blank line of the file is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of being handed to ``json.loads``, which would reject them and
    cause the whole file to be reported as a failure.

    Args:
        file_path: Path of the file to read (opened as UTF-8 text).

    Returns:
        A list with one parsed object per non-blank line, or ``None`` when the
        file is missing, a line is not valid JSON, or any other error occurs
        while reading. A message describing the outcome is printed either way.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = [json.loads(line) for line in file if line.strip()]
    except json.JSONDecodeError as e:
        print(f"Error loading JSON from {file_path}: {e}")
        return None
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        # Deliberate catch-all: callers rely on getting None rather than an
        # exception for any read problem (permissions, bad encoding, ...).
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        print(f"JSON Loaded Successfully from {file_path}")
        return data

# Read the three source files; each name holds the list returned by
# load_json_file, or None when that file could not be loaded.
users_data, brands_data, receipts_data = [
    load_json_file(source_path)
    for source_path in ("users.json", "brands.json", "receipts.json")
]

JSON Loaded Successfully from users.json
JSON Loaded Successfully from brands.json
JSON Loaded Successfully from receipts.json


## Print the preview of JSON data

In [25]:
# Show the first five parsed user records as raw Python objects
users_preview = users_data[:5]
print(users_preview)

[{'_id': {'$oid': '5ff1e194b6a9d73a3a9f1052'}, 'active': True, 'createdDate': {'$date': 1609687444800}, 'lastLogin': {'$date': 1609687537858}, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI'}, {'_id': {'$oid': '5ff1e194b6a9d73a3a9f1052'}, 'active': True, 'createdDate': {'$date': 1609687444800}, 'lastLogin': {'$date': 1609687537858}, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI'}, {'_id': {'$oid': '5ff1e194b6a9d73a3a9f1052'}, 'active': True, 'createdDate': {'$date': 1609687444800}, 'lastLogin': {'$date': 1609687537858}, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI'}, {'_id': {'$oid': '5ff1e1eacfcf6c399c274ae6'}, 'active': True, 'createdDate': {'$date': 1609687530554}, 'lastLogin': {'$date': 1609687530597}, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI'}, {'_id': {'$oid': '5ff1e194b6a9d73a3a9f1052'}, 'active': True, 'createdDate': {'$date': 1609687444800}, 'lastLogin': {'$date': 1609687537858}, 'role': 'consumer', 'signUpSource': 'Email', 

In [26]:
# Show the first five parsed brand records as raw Python objects
brands_preview = brands_data[:5]
print(brands_preview)

[{'_id': {'$oid': '601ac115be37ce2ead437551'}, 'barcode': '511111019862', 'category': 'Baking', 'categoryCode': 'BAKING', 'cpg': {'$id': {'$oid': '601ac114be37ce2ead437550'}, '$ref': 'Cogs'}, 'name': 'test brand @1612366101024', 'topBrand': False}, {'_id': {'$oid': '601c5460be37ce2ead43755f'}, 'barcode': '511111519928', 'brandCode': 'STARBUCKS', 'category': 'Beverages', 'categoryCode': 'BEVERAGES', 'cpg': {'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, '$ref': 'Cogs'}, 'name': 'Starbucks', 'topBrand': False}, {'_id': {'$oid': '601ac142be37ce2ead43755d'}, 'barcode': '511111819905', 'brandCode': 'TEST BRANDCODE @1612366146176', 'category': 'Baking', 'categoryCode': 'BAKING', 'cpg': {'$id': {'$oid': '601ac142be37ce2ead437559'}, '$ref': 'Cogs'}, 'name': 'test brand @1612366146176', 'topBrand': False}, {'_id': {'$oid': '601ac142be37ce2ead43755a'}, 'barcode': '511111519874', 'brandCode': 'TEST BRANDCODE @1612366146051', 'category': 'Baking', 'categoryCode': 'BAKING', 'cpg': {'$id': {'$oid': '6

In [27]:
# Show the first two parsed receipt records as raw Python objects
receipts_preview = receipts_data[:2]
print(receipts_preview)

[{'_id': {'$oid': '5ff1e1eb0a720f0523000575'}, 'bonusPointsEarned': 500, 'bonusPointsEarnedReason': 'Receipt number 2 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)', 'createDate': {'$date': 1609687531000}, 'dateScanned': {'$date': 1609687531000}, 'finishedDate': {'$date': 1609687531000}, 'modifyDate': {'$date': 1609687536000}, 'pointsAwardedDate': {'$date': 1609687531000}, 'pointsEarned': '500.0', 'purchaseDate': {'$date': 1609632000000}, 'purchasedItemCount': 5, 'rewardsReceiptItemList': [{'barcode': '4011', 'description': 'ITEM NOT FOUND', 'finalPrice': '26.00', 'itemPrice': '26.00', 'needsFetchReview': False, 'partnerItemId': '1', 'preventTargetGapPoints': True, 'quantityPurchased': 5, 'userFlaggedBarcode': '4011', 'userFlaggedNewItem': True, 'userFlaggedPrice': '26.00', 'userFlaggedQuantity': 5}], 'rewardsReceiptStatus': 'FINISHED', 'totalSpent': '26.00', 'userId': '5ff1e1eacfcf6c399c274ae6'}, {'_id': {'$oid': '5ff1e1bb0a720f052300056b'}, 'bonusPointsEarned': 1

## Normalize the JSON files and print a preview

In [7]:
def load_json_file(filename):
    """Load a line-delimited JSON file and flatten it into a DataFrame.

    Every non-blank line is parsed as one JSON document; nested objects are
    flattened by ``pd.json_normalize`` into dotted column names (for example
    ``_id.$oid``). Blank or whitespace-only lines are skipped so that a
    trailing empty line does not abort the load with a ``JSONDecodeError``.

    Note: this definition reuses the name of the list-returning loader defined
    earlier in this file and replaces it once executed.

    Args:
        filename: Path of the file to read (opened as UTF-8 text).

    Returns:
        A pandas DataFrame with one row per JSON document.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, "r", encoding="utf-8") as file:
        data = [json.loads(line) for line in file if line.strip()]
    return pd.json_normalize(data)  # Flatten nested structures

# Build one flattened DataFrame per source file
users_df, brands_df, receipts_df = [
    load_json_file(json_path)
    for json_path in ("users.json", "brands.json", "receipts.json")
]

def make_hashable(df):
    """Return a copy of ``df`` in which list and dict cells are stringified.

    Lists and dicts are unhashable, which breaks operations such as
    ``DataFrame.duplicated()``. Every cell holding a list or dict is replaced
    by its ``str()`` representation; all other cells are passed through
    unchanged.

    Args:
        df: The DataFrame to convert. It is not modified.

    Returns:
        A new DataFrame of the same shape with list/dict cells as strings.
    """
    def _stringify(value):
        return str(value) if isinstance(value, (list, dict)) else value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. The check is made on the class, not the instance, so a
    # column that happens to be called "map" cannot be mistaken for the method
    # on older pandas versions.
    if hasattr(type(df), "map"):
        return df.map(_stringify)
    return df.applymap(_stringify)

# Replace each frame with its hashable counterpart (list/dict cells as strings)
users_df, brands_df, receipts_df = [
    make_hashable(frame) for frame in (users_df, brands_df, receipts_df)
]

In [8]:
# Show the first five rows of the flattened users table
print(users_df.head(n=5))

   active      role signUpSource state                  _id.$oid  \
0    True  consumer        Email    WI  5ff1e194b6a9d73a3a9f1052   
1    True  consumer        Email    WI  5ff1e194b6a9d73a3a9f1052   
2    True  consumer        Email    WI  5ff1e194b6a9d73a3a9f1052   
3    True  consumer        Email    WI  5ff1e1eacfcf6c399c274ae6   
4    True  consumer        Email    WI  5ff1e194b6a9d73a3a9f1052   

   createdDate.$date  lastLogin.$date  
0      1609687444800     1.609688e+12  
1      1609687444800     1.609688e+12  
2      1609687444800     1.609688e+12  
3      1609687530554     1.609688e+12  
4      1609687444800     1.609688e+12  


In [9]:
# Show the first five rows of the flattened brands table
print(brands_df.head(n=5))

        barcode        category      categoryCode                       name  \
0  511111019862          Baking            BAKING  test brand @1612366101024   
1  511111519928       Beverages         BEVERAGES                  Starbucks   
2  511111819905          Baking            BAKING  test brand @1612366146176   
3  511111519874          Baking            BAKING  test brand @1612366146051   
4  511111319917  Candy & Sweets  CANDY_AND_SWEETS  test brand @1612366146827   

  topBrand                  _id.$oid              cpg.$id.$oid cpg.$ref  \
0    False  601ac115be37ce2ead437551  601ac114be37ce2ead437550     Cogs   
1    False  601c5460be37ce2ead43755f  5332f5fbe4b03c9a25efd0ba     Cogs   
2    False  601ac142be37ce2ead43755d  601ac142be37ce2ead437559     Cogs   
3    False  601ac142be37ce2ead43755a  601ac142be37ce2ead437559     Cogs   
4    False  601ac142be37ce2ead43755e  5332fa12e4b03c9a25efd1e7     Cogs   

                       brandCode  
0                            NaN 

In [10]:
# Show the first five rows of the flattened receipts table
print(receipts_df.head(n=5))

   bonusPointsEarned                            bonusPointsEarnedReason  \
0              500.0  Receipt number 2 completed, bonus point schedu...   
1              150.0  Receipt number 5 completed, bonus point schedu...   
2                5.0                         All-receipts receipt bonus   
3                5.0                         All-receipts receipt bonus   
4                5.0                         All-receipts receipt bonus   

  pointsEarned  purchasedItemCount  \
0        500.0                 5.0   
1        150.0                 2.0   
2            5                 1.0   
3          5.0                 4.0   
4          5.0                 2.0   

                              rewardsReceiptItemList rewardsReceiptStatus  \
0  [{'barcode': '4011', 'description': 'ITEM NOT ...             FINISHED   
1  [{'barcode': '4011', 'description': 'ITEM NOT ...             FINISHED   
2  [{'needsFetchReview': False, 'partnerItemId': ...             REJECTED   
3  [{'barcod

## Inspect data types and null counts

In [22]:
# Report column dtypes and non-null counts for each DataFrame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line after
# every report.
users_df.info()
brands_df.info()
receipts_df.info()

# Most columns are reported as 'object' because they hold Python strings
# parsed from the JSON records rather than native numeric types.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495 entries, 0 to 494
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   active             495 non-null    bool          
 1   role               495 non-null    object        
 2   signUpSource       447 non-null    object        
 3   state              439 non-null    object        
 4   _id.$oid           495 non-null    object        
 5   createdDate.$date  495 non-null    datetime64[ns]
 6   lastLogin.$date    433 non-null    float64       
dtypes: bool(1), datetime64[ns](1), float64(1), object(4)
memory usage: 23.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167 entries, 0 to 1166
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   barcode       1167 non-null   object
 1   category      1012 non-null   object
 2   categoryCode  517 non-null    object

In [12]:
# Summarise the brands table to spot unusual values that could indicate errors.
# For this frame describe() reports count / unique / top / freq per column.
brands_summary = brands_df.describe()
print(brands_summary)

             barcode category categoryCode     name topBrand  \
count           1167     1012          517     1167      555   
unique          1160       23           14     1156        2   
top     511111305125   Baking       BAKING  Huggies    False   
freq               2      369          359        2      524   

                        _id.$oid              cpg.$id.$oid cpg.$ref brandCode  
count                       1167                      1167     1167       933  
unique                      1167                       196        2       897  
top     601ac115be37ce2ead437551  559c2234e4b06aca36af13c6     Cogs            
freq                           1                        98     1020        35  


## Check for missing values

In [13]:
# Count missing (NaN/None) entries in every users column
print(users_df.isna().sum())

active                0
role                  0
signUpSource         48
state                56
_id.$oid              0
createdDate.$date     0
lastLogin.$date      62
dtype: int64


In [14]:
# Count missing (NaN/None) entries in every brands column
print(brands_df.isna().sum())

barcode           0
category        155
categoryCode    650
name              0
topBrand        612
_id.$oid          0
cpg.$id.$oid      0
cpg.$ref          0
brandCode       234
dtype: int64


In [15]:
# Count missing (NaN/None) entries in every receipts column
print(receipts_df.isna().sum())

bonusPointsEarned          575
bonusPointsEarnedReason    575
pointsEarned               510
purchasedItemCount         484
rewardsReceiptItemList     440
rewardsReceiptStatus         0
totalSpent                 435
userId                       0
_id.$oid                     0
createDate.$date             0
dateScanned.$date            0
finishedDate.$date         551
modifyDate.$date             0
pointsAwardedDate.$date    582
purchaseDate.$date         448
dtype: int64


## Check for duplicate values

In [16]:
# Print the number of fully duplicated rows, one count per line, in the order
# users, brands, receipts
for frame in (users_df, brands_df, receipts_df):
    duplicate_row_count = frame.duplicated().sum()
    print(duplicate_row_count)

283
0
0


## Checking data inconsistencies

In [17]:
# Check whether any date values fail to convert to datetimes.
# The raw values are epoch timestamps in milliseconds (e.g. 1609687444800).
# pd.to_datetime treats bare numbers as nanoseconds unless a unit is given,
# which would silently place every date in January 1970, so unit='ms' is
# passed explicitly.
users_created_present = users_df['createdDate.$date'].notna()
receipts_scanned_present = receipts_df['dateScanned.$date'].notna()

users_df['createdDate.$date'] = pd.to_datetime(users_df['createdDate.$date'], unit='ms', errors='coerce')
receipts_df['dateScanned.$date'] = pd.to_datetime(receipts_df['dateScanned.$date'], unit='ms', errors='coerce')

# A conversion failure is a value that was present before conversion but is
# NaT afterwards; values that were missing to begin with are not format errors.
invalid_users_dates = users_df[users_created_present & users_df['createdDate.$date'].isna()]
invalid_receipts_dates = receipts_df[receipts_scanned_present & receipts_df['dateScanned.$date'].isna()]

# Print results
if invalid_users_dates.empty:
    print("No inconsistent date formats found in users")
else:
    print("Inconsistent date formats found in users")
    print(invalid_users_dates)

if invalid_receipts_dates.empty:
    print("No inconsistent date formats found in receipts")
else:
    print("Inconsistent date formats found in receipts")
    print(invalid_receipts_dates)

No inconsistent date formats found in users
No inconsistent date formats found in receipts


In [23]:
# Look for receipts whose totalSpent or pointsEarned is below zero.
# Both columns are parsed as numbers first; entries that cannot be parsed
# become NaN and therefore never compare as negative.
total_spent_numeric = pd.to_numeric(receipts_df['totalSpent'], errors='coerce')
points_earned_numeric = pd.to_numeric(receipts_df['pointsEarned'], errors='coerce')

invalid_total_spent = receipts_df.loc[total_spent_numeric < 0]
invalid_points_earned = receipts_df.loc[points_earned_numeric < 0]

if not invalid_total_spent.empty:
    print("Negative values found for totalSpent:")
    print(invalid_total_spent)
else:
    print("No negative values found for totalSpent")

if not invalid_points_earned.empty:
    print("Negative values found for pointsEarned:")
    print(invalid_points_earned)
else:
    print("No negative values found for pointsEarned")

No negative values found for totalSpent
No negative values found for pointsEarned


## Checking integrity constraints such as foreign keys

In [19]:
# Referential check: every receipt's userId should match an _id.$oid in users.
known_user_ids = users_df['_id.$oid']
has_known_user = receipts_df['userId'].isin(known_user_ids)
invalid_user_ids = receipts_df[~has_known_user]

# Any row left in invalid_user_ids references a user that is not in users_df
if not invalid_user_ids.empty:
    print("Missing foreign key relationships")
else:
    print("No missing foreign key relationships")

Missing foreign key relationships


## Checking appropriate categorical value distributions

In [20]:
# List the distinct receipt status values to spot inconsistent categories
status_values = receipts_df['rewardsReceiptStatus'].unique()
print(status_values)

['FINISHED' 'REJECTED' 'FLAGGED' 'SUBMITTED' 'PENDING']


## Checking Logical consistency

In [24]:
# Ensure logical consistency for date fields: a receipt must not be finished
# before it was created.
# The raw values are epoch timestamps in milliseconds, so unit='ms' is passed;
# without it pd.to_datetime reads the numbers as nanoseconds and produces
# dates in January 1970.
receipts_df['createDate.$date'] = pd.to_datetime(receipts_df['createDate.$date'], unit='ms', errors='coerce')
receipts_df['finishedDate.$date'] = pd.to_datetime(receipts_df['finishedDate.$date'], unit='ms', errors='coerce')

# Rows with a missing finishedDate compare as False and are therefore not flagged
logical_inconsistencies = receipts_df[receipts_df['finishedDate.$date'] < receipts_df['createDate.$date']]

if logical_inconsistencies.empty:
    print("No logical inconsistencies found")
else:
    print("Logical inconsistencies found")

No logical inconsistencies found
