# E-commerce Business Transaction Data Analysis
Data source: https://www.kaggle.com/datasets/gabrielramos87/an-online-shop-business

### Step 1: Setting up the environment

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

### Step 2: Loading and Inspecting the Data

In [43]:
file = "/Users/ramilojr.subiate/Documents/FILES/etl-to-dashboard/kaggle_data/sales_transaction.csv"
df = pd.read_csv(file)

In [44]:
df.head()

Unnamed: 0,TransactionNo,Date,ProductNo,ProductName,Price,Quantity,CustomerNo,Country
0,581482,12/9/2019,22485,Set Of 2 Wooden Market Crates,21.47,12,17490.0,United Kingdom
1,581475,12/9/2019,22596,Christmas Star Wish List Chalkboard,10.65,36,13069.0,United Kingdom
2,581475,12/9/2019,23235,Storage Tin Vintage Leaf,11.53,12,13069.0,United Kingdom
3,581475,12/9/2019,23272,Tree T-Light Holder Willie Winkie,10.65,12,13069.0,United Kingdom
4,581475,12/9/2019,23239,Set Of 4 Knick Knack Tins Poppies,11.94,6,13069.0,United Kingdom


In [45]:
df.shape

(536350, 8)

In [46]:
df.columns

Index(['TransactionNo', 'Date', 'ProductNo', 'ProductName', 'Price',
       'Quantity', 'CustomerNo', 'Country'],
      dtype='object')

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536350 entries, 0 to 536349
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   TransactionNo  536350 non-null  object 
 1   Date           536350 non-null  object 
 2   ProductNo      536350 non-null  object 
 3   ProductName    536350 non-null  object 
 4   Price          536350 non-null  float64
 5   Quantity       536350 non-null  int64  
 6   CustomerNo     536295 non-null  float64
 7   Country        536350 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 32.7+ MB


### Step 3: Cleaning and Preparing the Data

3.1 Renaming columns for clarity

In [48]:
df.rename(columns={
    "TransactionNo": "transaction_num",
    "Date": "date",
    "ProductNo": "product_num",
    "ProductName": "product_name",
    "Price": "price",
    "Quantity": "quantity",
    "CustomerNo": "customer_num",
    "Country": "country"
}, inplace=True)

df.columns

Index(['transaction_num', 'date', 'product_num', 'product_name', 'price',
       'quantity', 'customer_num', 'country'],
      dtype='object')

3.2 Dropping duplicates

In [49]:
df.duplicated().sum()

np.int64(5200)

In [50]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

np.int64(0)

3.3 Fixing Data types

In [None]:
df["date"] = pd.to_datetime(df["date"])
df["customer_num"] = df["customer_num"].astype("Int64").astype("str")

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 531150 entries, 0 to 536349
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   transaction_num  531150 non-null  object        
 1   date             531150 non-null  datetime64[ns]
 2   product_num      531150 non-null  object        
 3   product_name     531150 non-null  object        
 4   price            531150 non-null  float64       
 5   quantity         531150 non-null  int64         
 6   customer_num     531150 non-null  object        
 7   country          531150 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 36.5+ MB


3.4 Handling Missing Values

In [59]:
df.isna().sum()

transaction_num    0
date               0
product_num        0
product_name       0
price              0
quantity           0
customer_num       0
country            0
dtype: int64

3.5 Dropping cancelled orders

In [63]:
df.tail(30)

Unnamed: 0,transaction_num,date,product_num,product_name,price,quantity,customer_num,country
536320,536585,2018-12-01,37449,Ceramic Cake Stand + Hanging Cakes,20.45,2,17460,United Kingdom
536321,536590,2018-12-01,22776,Sweetheart 3 Tier Cake Stand,20.45,1,13065,United Kingdom
536322,536590,2018-12-01,22622,Box Of Vintage Alphabet Blocks,20.45,2,13065,United Kingdom
536323,536591,2018-12-01,37449,Ceramic Cake Stand + Hanging Cakes,20.45,1,14606,United Kingdom
536324,536597,2018-12-01,22220,Cake Stand Lovebird 2 Tier White,20.45,1,18011,United Kingdom
536325,C536383,2018-12-01,35004C,Set Of 3 Coloured Flying Ducks,15.02,-1,15311,United Kingdom
536326,C536391,2018-12-01,22556,Plasters In Tin Circus Parade,11.94,-12,17548,United Kingdom
536327,C536391,2018-12-01,21984,Pack Of 12 Pink Paisley Tissues,10.55,-24,17548,United Kingdom
536328,C536391,2018-12-01,21983,Pack Of 12 Blue Paisley Tissues,10.55,-24,17548,United Kingdom
536329,C536391,2018-12-01,21980,Pack Of 12 Red Retrospot Tissues,10.55,-24,17548,United Kingdom


In [67]:

df.tail(30)
        

Unnamed: 0,transaction_num,date,product_num,product_name,price,quantity,customer_num,country
536320,536585,2018-12-01,37449,Ceramic Cake Stand + Hanging Cakes,20.45,2,17460,United Kingdom
536321,536590,2018-12-01,22776,Sweetheart 3 Tier Cake Stand,20.45,1,13065,United Kingdom
536322,536590,2018-12-01,22622,Box Of Vintage Alphabet Blocks,20.45,2,13065,United Kingdom
536323,536591,2018-12-01,37449,Ceramic Cake Stand + Hanging Cakes,20.45,1,14606,United Kingdom
536324,536597,2018-12-01,22220,Cake Stand Lovebird 2 Tier White,20.45,1,18011,United Kingdom
536325,C536383,2018-12-01,35004C,Set Of 3 Coloured Flying Ducks,15.02,-1,15311,United Kingdom
536326,C536391,2018-12-01,22556,Plasters In Tin Circus Parade,11.94,-12,17548,United Kingdom
536327,C536391,2018-12-01,21984,Pack Of 12 Pink Paisley Tissues,10.55,-24,17548,United Kingdom
536328,C536391,2018-12-01,21983,Pack Of 12 Blue Paisley Tissues,10.55,-24,17548,United Kingdom
536329,C536391,2018-12-01,21980,Pack Of 12 Red Retrospot Tissues,10.55,-24,17548,United Kingdom
