# Data Cleaning Before Loading

This notebook is for exploring the dataset before loading to the database for better organizing the data.

In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Importing the dataset
products = pd.read_csv('../data/raw/products.csv')
transactions = pd.read_csv('../data/raw/transactions.csv')
users = pd.read_csv('../data/raw/users.csv')

## Explore users

In [3]:
users

Unnamed: 0,ID,CREATED_DATE,BIRTH_DATE,STATE,LANGUAGE,GENDER
0,5ef3b4f17053ab141787697d,2020-06-24 20:17:54.000 Z,2000-08-11 00:00:00.000 Z,CA,es-419,female
1,5ff220d383fcfc12622b96bc,2021-01-03 19:53:55.000 Z,2001-09-24 04:00:00.000 Z,PA,en,female
2,6477950aa55bb77a0e27ee10,2023-05-31 18:42:18.000 Z,1994-10-28 00:00:00.000 Z,FL,es-419,female
3,658a306e99b40f103b63ccf8,2023-12-26 01:46:22.000 Z,,NC,en,
4,653cf5d6a225ea102b7ecdc2,2023-10-28 11:51:50.000 Z,1972-03-19 00:00:00.000 Z,PA,en,female
...,...,...,...,...,...,...
99995,61fc06d41febf771966da8fa,2022-02-03 16:46:12.000 Z,1992-03-16 08:00:00.000 Z,CA,en,female
99996,6391e7ef90ad5449ec5f782d,2022-12-08 13:34:39.000 Z,1993-09-23 05:00:00.000 Z,MO,en,female
99997,637d5efdd6f2a49c49934dcb,2022-11-22 23:45:05.000 Z,1983-04-19 00:00:00.000 Z,RI,en,female
99998,5f0de23b05d8a6147dc0cafa,2020-07-14 16:50:04.000 Z,1995-06-09 04:00:00.000 Z,DE,en,female


In [None]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   ID            100000 non-null  object
 1   CREATED_DATE  100000 non-null  object
 2   BIRTH_DATE    96325 non-null   object
 3   STATE         95188 non-null   object
 4   LANGUAGE      69492 non-null   object
 5   GENDER        94108 non-null   object
dtypes: object(6)
memory usage: 4.6+ MB


In [5]:
# Remove " Z" from end of date
users['CREATED_DATE'] = users['CREATED_DATE'].str.replace(' Z', '')
users['BIRTH_DATE'] = users['BIRTH_DATE'].str.replace(' Z', '')

# Convert to datetime
users['CREATED_DATE'] = pd.to_datetime(users['CREATED_DATE'])
users['BIRTH_DATE'] = pd.to_datetime(users['BIRTH_DATE'])

users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   ID            100000 non-null  object        
 1   CREATED_DATE  100000 non-null  datetime64[ns]
 2   BIRTH_DATE    96325 non-null   datetime64[ns]
 3   STATE         95188 non-null   object        
 4   LANGUAGE      69492 non-null   object        
 5   GENDER        94108 non-null   object        
dtypes: datetime64[ns](2), object(4)
memory usage: 4.6+ MB


In [6]:
# Change "NaN" or "nan" to np.nan
users = users.replace('NaN', np.nan)
users = users.replace('nan', np.nan)

# Check for missing values
users.isnull().sum()

ID                  0
CREATED_DATE        0
BIRTH_DATE       3675
STATE            4812
LANGUAGE        30508
GENDER           5892
dtype: int64

In [7]:
# Check for duplicates
users.duplicated().sum()

np.int64(0)

In [8]:
# Check duplicated id in users
users['ID'].duplicated().sum()

np.int64(0)

In [9]:
# Check null values in id
users['ID'].isnull().sum()

np.int64(0)

In [10]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   ID            100000 non-null  object        
 1   CREATED_DATE  100000 non-null  datetime64[ns]
 2   BIRTH_DATE    96325 non-null   datetime64[ns]
 3   STATE         95188 non-null   object        
 4   LANGUAGE      69492 non-null   object        
 5   GENDER        94108 non-null   object        
dtypes: datetime64[ns](2), object(4)
memory usage: 4.6+ MB


In [11]:
users.head()

Unnamed: 0,ID,CREATED_DATE,BIRTH_DATE,STATE,LANGUAGE,GENDER
0,5ef3b4f17053ab141787697d,2020-06-24 20:17:54,2000-08-11 00:00:00,CA,es-419,female
1,5ff220d383fcfc12622b96bc,2021-01-03 19:53:55,2001-09-24 04:00:00,PA,en,female
2,6477950aa55bb77a0e27ee10,2023-05-31 18:42:18,1994-10-28 00:00:00,FL,es-419,female
3,658a306e99b40f103b63ccf8,2023-12-26 01:46:22,NaT,NC,en,
4,653cf5d6a225ea102b7ecdc2,2023-10-28 11:51:50,1972-03-19 00:00:00,PA,en,female


In [12]:
# Check for uniqueness in ID
users['ID'].nunique()

100000

In [13]:
# Save to users_cleaned.csv
users.to_csv('../data/processed/users_cleaned.csv', index=False)

## Explore products

In [14]:
products

Unnamed: 0,CATEGORY_1,CATEGORY_2,CATEGORY_3,CATEGORY_4,MANUFACTURER,BRAND,BARCODE
0,Health & Wellness,Sexual Health,Conductivity Gels & Lotions,,,,7.964944e+11
1,Snacks,Puffed Snacks,Cheese Curls & Puffs,,,,2.327801e+10
2,Health & Wellness,Hair Care,Hair Care Accessories,,PLACEHOLDER MANUFACTURER,ELECSOP,4.618178e+11
3,Health & Wellness,Oral Care,Toothpaste,,COLGATE-PALMOLIVE,COLGATE,3.500047e+10
4,Health & Wellness,Medicines & Treatments,Essential Oils,,MAPLE HOLISTICS AND HONEYDEW PRODUCTS INTERCHA...,MAPLE HOLISTICS,8.068109e+11
...,...,...,...,...,...,...,...
845547,Health & Wellness,Topical Muscle & Joint Relief Treatments,Braces & Wraps,,,,7.223016e+11
845548,Snacks,Cookies,,,"TREEHOUSE FOODS, INC.",LOFTHOUSE,4.182082e+10
845549,Snacks,Candy,Confection Candy,,HARIBO GMBH & CO KG,HARIBO,1.001672e+11
845550,Snacks,Nuts & Seeds,Hazelnuts,,DOUBLE-COLA CO,JUMBO,7.539076e+10


In [15]:
# Checking the data
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 845552 entries, 0 to 845551
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   CATEGORY_1    845441 non-null  object 
 1   CATEGORY_2    844128 non-null  object 
 2   CATEGORY_3    784986 non-null  object 
 3   CATEGORY_4    67459 non-null   object 
 4   MANUFACTURER  619078 non-null  object 
 5   BRAND         619080 non-null  object 
 6   BARCODE       841527 non-null  float64
dtypes: float64(1), object(6)
memory usage: 45.2+ MB


In [16]:
# Change data type of barcode to string
products['BARCODE'] = products['BARCODE'].astype(str)

# Checking the data
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 845552 entries, 0 to 845551
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   CATEGORY_1    845441 non-null  object
 1   CATEGORY_2    844128 non-null  object
 2   CATEGORY_3    784986 non-null  object
 3   CATEGORY_4    67459 non-null   object
 4   MANUFACTURER  619078 non-null  object
 5   BRAND         619080 non-null  object
 6   BARCODE       845552 non-null  object
dtypes: object(7)
memory usage: 45.2+ MB


In [17]:
# Checking the shape of the data
products.shape

(845552, 7)

In [18]:
# Change all value as "nan" or "NaN" to null
products = products.replace(['nan', 'NaN'], np.nan)

# Remove duplicates
products = products.drop_duplicates()

# Checking the data
products.info()

<class 'pandas.core.frame.DataFrame'>
Index: 845337 entries, 0 to 845551
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   CATEGORY_1    845226 non-null  object
 1   CATEGORY_2    843915 non-null  object
 2   CATEGORY_3    784774 non-null  object
 3   CATEGORY_4    67453 non-null   object
 4   MANUFACTURER  618873 non-null  object
 5   BRAND         618875 non-null  object
 6   BARCODE       841369 non-null  object
dtypes: object(7)
memory usage: 51.6+ MB


In [19]:
products.shape

(845337, 7)

In [20]:
# Descriptive statistics
products.describe()

Unnamed: 0,CATEGORY_1,CATEGORY_2,CATEGORY_3,CATEGORY_4,MANUFACTURER,BRAND,BARCODE
count,845226,843915,784774,67453,618873,618875,841369.0
unique,27,121,344,127,4354,8122,841342.0
top,Health & Wellness,Candy,Confection Candy,Lip Balms,PLACEHOLDER MANUFACTURER,REM BRAND,40111216.0
freq,512686,120898,56951,9737,86900,20813,2.0


In [21]:
# Checking for missing values
products.isnull().sum()

CATEGORY_1         111
CATEGORY_2        1422
CATEGORY_3       60563
CATEGORY_4      777884
MANUFACTURER    226464
BRAND           226462
BARCODE           3968
dtype: int64

In [22]:
# Checking for uniqueness of each column
products.nunique()

CATEGORY_1          27
CATEGORY_2         121
CATEGORY_3         344
CATEGORY_4         127
MANUFACTURER      4354
BRAND             8122
BARCODE         841342
dtype: int64

In [23]:
# Checking for duplicated barcodes
products['BARCODE'].duplicated().sum()

np.int64(3994)

In [24]:
# List the duplicated barcodes order by barcode
products[products['BARCODE'].duplicated(keep=False)].sort_values('BARCODE')

Unnamed: 0,CATEGORY_1,CATEGORY_2,CATEGORY_3,CATEGORY_4,MANUFACTURER,BRAND,BARCODE
428256,Health & Wellness,Skin Care,Facial Lotion & Moisturizer,,"R.M. PALMER COMPANY, LLC",PALMER,1018158.0
123194,Health & Wellness,Skin Care,Lip Balms & Treatments,Medicated Lip Treatments,"E.T. BROWNE DRUG CO., INC.",PALMER'S SKIN & HAIR CARE,1018158.0
304021,Health & Wellness,Hair Care,Hair Color,,HENKEL,GÖT2B,17000329260.0
213340,Health & Wellness,Hair Care,Hair Color,,HENKEL,SCHWARZKOPF,17000329260.0
783021,Snacks,Cookies,,,PLACEHOLDER MANUFACTURER,PRIVATE LABEL,20031077.0
...,...,...,...,...,...,...,...
845070,Health & Wellness,Medicines & Treatments,,,RECKITT BENCKISER,CEPACOL,
845187,Health & Wellness,Bath & Body,,,RECKITT BENCKISER,QUEEN V,
845215,Snacks,,,,PEPSICO,IMAGINE,
845234,Health & Wellness,Oral Care,,,BOIRON,BOIRON,


In [25]:
# List the duplicated barcodes order by barcode where barcode is not null
products[products['BARCODE'].notnull() & products['BARCODE'].duplicated(keep=False)].sort_values('BARCODE')

Unnamed: 0,CATEGORY_1,CATEGORY_2,CATEGORY_3,CATEGORY_4,MANUFACTURER,BRAND,BARCODE
428256,Health & Wellness,Skin Care,Facial Lotion & Moisturizer,,"R.M. PALMER COMPANY, LLC",PALMER,1018158.0
123194,Health & Wellness,Skin Care,Lip Balms & Treatments,Medicated Lip Treatments,"E.T. BROWNE DRUG CO., INC.",PALMER'S SKIN & HAIR CARE,1018158.0
304021,Health & Wellness,Hair Care,Hair Color,,HENKEL,GÖT2B,17000329260.0
213340,Health & Wellness,Hair Care,Hair Color,,HENKEL,SCHWARZKOPF,17000329260.0
783021,Snacks,Cookies,,,PLACEHOLDER MANUFACTURER,PRIVATE LABEL,20031077.0
132547,Snacks,Cookies,,,,,20031077.0
56987,Snacks,Nuts & Seeds,Almonds,,PLACEHOLDER MANUFACTURER,BRAND NOT KNOWN,20159078.0
776953,Snacks,Nuts & Seeds,Covered Nuts,,PLACEHOLDER MANUFACTURER,BRAND NOT KNOWN,20159078.0
127335,Snacks,Nuts & Seeds,Pistachios,,"LIDL US, LLC",LIDL,20522445.0
260690,Snacks,Candy,Chocolate Candy,,"LIDL US, LLC",LIDL,20522445.0


In [26]:
products.info()

<class 'pandas.core.frame.DataFrame'>
Index: 845337 entries, 0 to 845551
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   CATEGORY_1    845226 non-null  object
 1   CATEGORY_2    843915 non-null  object
 2   CATEGORY_3    784774 non-null  object
 3   CATEGORY_4    67453 non-null   object
 4   MANUFACTURER  618873 non-null  object
 5   BRAND         618875 non-null  object
 6   BARCODE       841369 non-null  object
dtypes: object(7)
memory usage: 51.6+ MB


In [27]:
# Remove ".0" from end of barcode
products['BARCODE'] = products['BARCODE'].str.replace('.0', '')

products

Unnamed: 0,CATEGORY_1,CATEGORY_2,CATEGORY_3,CATEGORY_4,MANUFACTURER,BRAND,BARCODE
0,Health & Wellness,Sexual Health,Conductivity Gels & Lotions,,,,796494407820
1,Snacks,Puffed Snacks,Cheese Curls & Puffs,,,,23278011028
2,Health & Wellness,Hair Care,Hair Care Accessories,,PLACEHOLDER MANUFACTURER,ELECSOP,461817824225
3,Health & Wellness,Oral Care,Toothpaste,,COLGATE-PALMOLIVE,COLGATE,35000466815
4,Health & Wellness,Medicines & Treatments,Essential Oils,,MAPLE HOLISTICS AND HONEYDEW PRODUCTS INTERCHA...,MAPLE HOLISTICS,806810850459
...,...,...,...,...,...,...,...
845547,Health & Wellness,Topical Muscle & Joint Relief Treatments,Braces & Wraps,,,,722301569399
845548,Snacks,Cookies,,,"TREEHOUSE FOODS, INC.",LOFTHOUSE,41820818468
845549,Snacks,Candy,Confection Candy,,HARIBO GMBH & CO KG,HARIBO,100167154940
845550,Snacks,Nuts & Seeds,Hazelnuts,,DOUBLE-COLA CO,JUMBO,75390755960


In [28]:
# Save the products data to products_cleaned.csv
products.to_csv('../data/processed/products_cleaned.csv', index=False)

## Explore transactions

In [29]:
transactions

Unnamed: 0,RECEIPT_ID,PURCHASE_DATE,SCAN_DATE,STORE_NAME,USER_ID,BARCODE,FINAL_QUANTITY,FINAL_SALE
0,0000d256-4041-4a3e-adc4-5623fb6e0c99,2024-08-21,2024-08-21 14:19:06.539 Z,WALMART,63b73a7f3d310dceeabd4758,1.530001e+10,1.00,
1,0001455d-7a92-4a7b-a1d2-c747af1c8fd3,2024-07-20,2024-07-20 09:50:24.206 Z,ALDI,62c08877baa38d1a1f6c211a,,zero,1.49
2,00017e0a-7851-42fb-bfab-0baa96e23586,2024-08-18,2024-08-19 15:38:56.813 Z,WALMART,60842f207ac8b7729e472020,7.874223e+10,1.00,
3,000239aa-3478-453d-801e-66a82e39c8af,2024-06-18,2024-06-19 11:03:37.468 Z,FOOD LION,63fcd7cea4f8442c3386b589,7.833997e+11,zero,3.49
4,00026b4c-dfe8-49dd-b026-4c2f0fd5c6a1,2024-07-04,2024-07-05 15:56:43.549 Z,RANDALLS,6193231ae9b3d75037b0f928,4.790050e+10,1.00,
...,...,...,...,...,...,...,...,...
49995,b5cd61a9-8033-4913-a5c4-fb3f65e3a321,2024-08-21,2024-08-31 14:13:08.634 Z,TARGET,6154bcf098f885648de2f299,8.523911e+10,2.00,1.18
49996,e1b2f634-c9ad-4152-b662-4b22efc25862,2024-08-11,2024-08-11 18:15:56.736 Z,STOP & SHOP,60aa809f188b926b2244c974,4.610040e+10,1.00,2.00
49997,b07ef8dd-e444-40a2-819b-f74a3e5f1ae7,2024-07-11,2024-07-11 08:03:25.816 Z,WALMART,60bd26e83dc3b13a15c5f4e7,6.466300e+11,1.00,20.96
49998,42475141-bef4-4df2-aa37-72577e2512bb,2024-06-18,2024-06-18 19:57:32.211 Z,MARKET BASKET,6169912fac47744405af62b7,4.180050e+10,1.00,3.00


In [30]:
# Remove " Z" from end of date
transactions['SCAN_DATE'] = transactions['SCAN_DATE'].str.replace(' Z', '')

# Convert to datetime
transactions['SCAN_DATE'] = pd.to_datetime(transactions['SCAN_DATE'])

# Convert purchase_date to datetime
transactions['PURCHASE_DATE'] = pd.to_datetime(transactions['PURCHASE_DATE'], format='%Y-%m-%d')

In [31]:
# Replace "NaN" or "nan" with np.nan
transactions = transactions.replace('NaN', np.nan)
transactions = transactions.replace('nan', np.nan)

# Change data type of barcode to string
transactions['BARCODE'] = transactions['BARCODE'].astype(str)

# Remove ".0" from end of barcode
transactions['BARCODE'] = transactions['BARCODE'].str.replace('.0', '')

# Change "zero" in final_quantity to 0
transactions['FINAL_QUANTITY'] = transactions['FINAL_QUANTITY'].replace('zero', 0)

transactions

Unnamed: 0,RECEIPT_ID,PURCHASE_DATE,SCAN_DATE,STORE_NAME,USER_ID,BARCODE,FINAL_QUANTITY,FINAL_SALE
0,0000d256-4041-4a3e-adc4-5623fb6e0c99,2024-08-21,2024-08-21 14:19:06.539,WALMART,63b73a7f3d310dceeabd4758,15300014978,1.00,
1,0001455d-7a92-4a7b-a1d2-c747af1c8fd3,2024-07-20,2024-07-20 09:50:24.206,ALDI,62c08877baa38d1a1f6c211a,,0,1.49
2,00017e0a-7851-42fb-bfab-0baa96e23586,2024-08-18,2024-08-19 15:38:56.813,WALMART,60842f207ac8b7729e472020,78742229751,1.00,
3,000239aa-3478-453d-801e-66a82e39c8af,2024-06-18,2024-06-19 11:03:37.468,FOOD LION,63fcd7cea4f8442c3386b589,783399746536,0,3.49
4,00026b4c-dfe8-49dd-b026-4c2f0fd5c6a1,2024-07-04,2024-07-05 15:56:43.549,RANDALLS,6193231ae9b3d75037b0f928,47900501183,1.00,
...,...,...,...,...,...,...,...,...
49995,b5cd61a9-8033-4913-a5c4-fb3f65e3a321,2024-08-21,2024-08-31 14:13:08.634,TARGET,6154bcf098f885648de2f299,85239110669,2.00,1.18
49996,e1b2f634-c9ad-4152-b662-4b22efc25862,2024-08-11,2024-08-11 18:15:56.736,STOP & SHOP,60aa809f188b926b2244c974,46100400555,1.00,2.00
49997,b07ef8dd-e444-40a2-819b-f74a3e5f1ae7,2024-07-11,2024-07-11 08:03:25.816,WALMART,60bd26e83dc3b13a15c5f4e7,646630019670,1.00,20.96
49998,42475141-bef4-4df2-aa37-72577e2512bb,2024-06-18,2024-06-18 19:57:32.211,MARKET BASKET,6169912fac47744405af62b7,41800501519,1.00,3.00


In [32]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   RECEIPT_ID      50000 non-null  object        
 1   PURCHASE_DATE   50000 non-null  datetime64[ns]
 2   SCAN_DATE       50000 non-null  datetime64[ns]
 3   STORE_NAME      50000 non-null  object        
 4   USER_ID         50000 non-null  object        
 5   BARCODE         50000 non-null  object        
 6   FINAL_QUANTITY  50000 non-null  object        
 7   FINAL_SALE      50000 non-null  object        
dtypes: datetime64[ns](2), object(6)
memory usage: 3.1+ MB


In [33]:
# Change ' ' to np.nan
transactions = transactions.replace(' ', np.nan)

# Change the data type of final_quantity and final_sale to float
transactions['FINAL_QUANTITY'] = transactions['FINAL_QUANTITY'].astype(float)
transactions['FINAL_SALE'] = transactions['FINAL_SALE'].astype(float)

transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   RECEIPT_ID      50000 non-null  object        
 1   PURCHASE_DATE   50000 non-null  datetime64[ns]
 2   SCAN_DATE       50000 non-null  datetime64[ns]
 3   STORE_NAME      50000 non-null  object        
 4   USER_ID         50000 non-null  object        
 5   BARCODE         50000 non-null  object        
 6   FINAL_QUANTITY  50000 non-null  float64       
 7   FINAL_SALE      37500 non-null  float64       
dtypes: datetime64[ns](2), float64(2), object(4)
memory usage: 3.1+ MB


In [34]:
# Check Top 5 rows
transactions.head()

Unnamed: 0,RECEIPT_ID,PURCHASE_DATE,SCAN_DATE,STORE_NAME,USER_ID,BARCODE,FINAL_QUANTITY,FINAL_SALE
0,0000d256-4041-4a3e-adc4-5623fb6e0c99,2024-08-21,2024-08-21 14:19:06.539,WALMART,63b73a7f3d310dceeabd4758,15300014978.0,1.0,
1,0001455d-7a92-4a7b-a1d2-c747af1c8fd3,2024-07-20,2024-07-20 09:50:24.206,ALDI,62c08877baa38d1a1f6c211a,,0.0,1.49
2,00017e0a-7851-42fb-bfab-0baa96e23586,2024-08-18,2024-08-19 15:38:56.813,WALMART,60842f207ac8b7729e472020,78742229751.0,1.0,
3,000239aa-3478-453d-801e-66a82e39c8af,2024-06-18,2024-06-19 11:03:37.468,FOOD LION,63fcd7cea4f8442c3386b589,783399746536.0,0.0,3.49
4,00026b4c-dfe8-49dd-b026-4c2f0fd5c6a1,2024-07-04,2024-07-05 15:56:43.549,RANDALLS,6193231ae9b3d75037b0f928,47900501183.0,1.0,


In [35]:
# Save the transactions data to transactions_cleaned.csv
transactions.to_csv('../data/processed/transactions_cleaned.csv', index=False)