#  Cleaning Data
### Today, we will be cleaning up some data.
### We will be using the `pandas` library to do this.

_first we start by importing the necessary libraries_

In [1]:
import pandas as pd
import numpy as np

_then we load the data_

In [2]:
data = pd.read_excel("../data/raw/messy_supermarket_sales.xlsx")

***Now we get some summaries of the data***

In [3]:
data.describe()

Unnamed: 0,Date,Quantity,Unit_Price,Total_Sales,Customer_ID
count,6300,6300.0,6300.0,6300.0,5663.0
mean,2023-07-02 09:41:56.571428608,4.913175,5.78378,19.127521,49865.525517
min,2023-01-01 00:00:00,-4.0,0.5,0.53,17.0
25%,2023-04-04 18:00:00,3.0,1.85,6.52,25328.0
50%,2023-07-01 00:00:00,5.0,2.89,13.16,49553.0
75%,2023-10-02 00:00:00,7.0,4.7,24.39,75034.5
max,2023-12-31 00:00:00,9.0,198.385505,134.64,99994.0
std,,2.716115,16.380023,19.176558,28804.80171


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6300 entries, 0 to 6299
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            6300 non-null   datetime64[ns]
 1   Product_ID      6300 non-null   object        
 2   Product         6300 non-null   object        
 3   Category        6300 non-null   object        
 4   Quantity        6300 non-null   int64         
 5   Unit_Price      6300 non-null   float64       
 6   Total_Sales     6300 non-null   float64       
 7   Customer_ID     5663 non-null   float64       
 8   Store_Location  6300 non-null   object        
 9   Payment_Method  6300 non-null   object        
dtypes: datetime64[ns](1), float64(3), int64(1), object(5)
memory usage: 492.3+ KB


In [17]:
data.sample(10)

Unnamed: 0,Date,Product_ID,Product,Category,Quantity,Unit_Price,Total_Sales,Customer_ID,Store_Location,Payment_Method
4821,2023-10-26,CAN786,Beans,Canned Goods,1,0.99,0.99,55494.0,Online,Debit Card
6224,2023-11-17,HOU885,Toilet Paper,Household,7,6.88,48.16,27255.0,Online,Cash
5361,2023-09-24,BEV838,Water,Beverages,7,3.7,25.9,36881.0,Online,Debit Card
2597,2023-10-15,SNA707,Granola Bars,Snacks,5,3.73,18.65,10711.0,Suburb,Cash
6279,2023-09-24,MEA963,Pork,Meat,6,14.62,87.72,79660.0,Downtown,Cash
294,2023-06-12,BAK65,Cakes,Bakery,5,3.57,17.85,15833.0,Suburb,Mobile App
13,2023-12-26,SNA293,Nuts,Snacks,7,2.14,14.98,44155.0,Suburb,Credit Card
2676,2023-09-18,SNA833,Nuts,Snacks,1,2.08,2.08,76653.0,Suburb,Credit Card
4250,2023-01-22,DAI149,Milk,Dairy,9,3.26,29.34,31453.0,Online,Debit Card
1303,2023-12-21,CAN573,Soup,Canned Goods,9,1.6,14.4,76785.0,Mall,Credit Card


### Firstly we make the location consistent by making non online purchases identified as physical.

In [19]:
data.Store_Location.unique()
data.replace({"Store_Location": {"Online": "Online", "Suburb": "Physical", "Downtown": "Physical", "Mall": "Physical"}}, inplace=True)

**next we handle missing data**

In [50]:
#customers with no ID are likely walk-ins, we will drop them.
data.dropna(subset=["Customer_ID"], inplace=True)

In [None]:
#further drop unnecessary missing data as the dataset is large enough
data.dropna(inplace=True)

***Next we fix up the formats***

In [84]:
# Make sure date is in datetime format
data["Date"] = pd.to_datetime(data["Date"], errors='coerce')

***Next we drop duplicates***

In [91]:
data.drop_duplicates(inplace=True)

***Next we standardize our data***

In [108]:
# make sure that things add up
data.Total_Sales = data.Quantity * data.Unit_Price
data.Total_Sales = data.Total_Sales.round(2)

In [112]:
#make sure all text is Pacals case
data.Category = data.Category.str.title()
data.Payment_Method = data.Payment_Method.str.replace(' ','')
data.Store_Location = data.Store_Location.str.title()
data.Product = data.Product.str.title()
data.Product = data.Product.str.replace(' ','_')
data.Unit_Price = data.Unit_Price.round(2)
data.sample(10)


Unnamed: 0,Date,Product_ID,Product,Category,Quantity,Unit_Price,Total_Sales,Customer_ID,Store_Location,Payment_Method
967,2023-03-08,BEV815,Water,Beverages,1,2.85,2.85,76129.0,Physical,CreditCard
765,2023-02-22,MEA223,Sausage,Meat,9,5.09,45.81,50211.0,Physical,CreditCard
3536,2023-08-25,PRO189,Apples,Produce,2,2.24,4.48,25901.0,Physical,DebitCard
13,2023-01-01,SNA930,Granola_Bars,Snacks,3,1.01,3.03,73559.0,Physical,MobileApp
3232,2023-08-05,PRO600,Lettuce,Produce,4,2.73,10.92,75599.0,Physical,MobileApp
5384,2023-12-29,HOU898,Detergent,Household,1,4.7,4.7,73261.0,Online,DebitCard
4435,2023-10-27,BEV840,Juice,Beverages,4,3.09,12.36,89636.0,Online,CreditCard
5188,2023-12-15,BEV353,Coffee,Beverages,2,2.79,5.58,61366.0,Physical,Cash
2189,2023-05-28,DAI526,Cheese,Dairy,4,4.35,17.4,10156.0,Physical,DebitCard
5199,2023-12-16,BEV162,Soda,Beverages,4,0.79,3.16,19312.0,Physical,DebitCard


**Finally, we arrange the data according to date**

In [113]:
data.sort_values(by="Date", inplace=True)
data.reset_index(drop=True, inplace=True)

**And we export our data**

In [114]:
data.to_csv("../data/cleaned/supermarket_sales_cleaned.csv", index=False)
data.to_excel("../data/cleaned/supermarket_sales_cleaned.xlsx", index=False, sheet_name='Cleaned Data')