# Metadata and Data Overview

- E-commerce Customer Segmentation & Prediction

In [36]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [37]:
# Location of the raw Online Retail CSV.
# The default is the original author's local path. To run the notebook on
# another machine, set the ONLINE_RETAIL_CSV environment variable to the file's
# location instead of editing this cell.
RAW_CSV_PATH = os.environ.get(
    "ONLINE_RETAIL_CSV",
    r"E:\BIA\Capstone Project\E-commerce_customer_segementation_and_prediction\data\raw\online_retail.csv",
)

# The file is decoded as ISO-8859-1, and CustomerID is read as text so the
# identifiers are not parsed as numbers.
data = pd.read_csv(
    RAW_CSV_PATH,
    encoding="ISO-8859-1",
    dtype={"CustomerID": str},
)

# Explore a copy so `data` stays exactly as it was loaded.
df = data.copy()
df.head(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,12/1/2010 8:26,7.65,17850,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,12/1/2010 8:26,4.25,17850,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,12/1/2010 8:28,1.85,17850,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,12/1/2010 8:28,1.85,17850,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,12/1/2010 8:34,1.69,13047,United Kingdom


In [38]:
# Dataset dimensions: unpack (rows, columns) from the frame's shape.
row_count, column_count = df.shape

print("The number of rows in the data: ", row_count)
print("The number of columns in the data: ", column_count)

The number of rows in the data:  541909
The number of columns in the data:  8


In [39]:
# Column inventory: all columns first, then the numeric and object-dtype subsets.
# Object-dtype columns are the ones reported under the "categorical" label.
all_columns = df.columns.to_list()
numeric_columns = df.select_dtypes(include="number").columns.to_list()
object_columns = df.select_dtypes(include="object").columns.to_list()

print("Name of the columns in my data: ", all_columns)
print("\nName of the numeric columns: \n", numeric_columns)
print("\nName of the categorical columns: \n", object_columns)

Name of the columns in my data:  ['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']

Name of the numeric columns: 
 ['Quantity', 'UnitPrice']

Name of the categorical columns: 
 ['InvoiceNo', 'StockCode', 'Description', 'InvoiceDate', 'CustomerID', 'Country']


In [40]:
# Data type of each column as inferred when the CSV was loaded.
column_dtypes = df.dtypes

print("The data types of the columns: \n", column_dtypes)

The data types of the columns: 
 InvoiceNo       object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
UnitPrice      float64
CustomerID      object
Country         object
dtype: object


In [41]:
# Missing values: number of nulls in each column.
null_counts = df.isna().sum()

print("The null values in the columns are: \n", null_counts)

The null values in the columns are: 
 InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64


In [42]:
# Duplicates: rows whose values in every column repeat an earlier row.
duplicate_count = df.duplicated().sum()

print("The duplicate values: ", duplicate_count)

The duplicate values:  5268


In [43]:
# Structural overview of the frame: index range, per-column non-null counts,
# dtypes and memory usage. `info()` writes its report directly to stdout.
print("The info of the dataset: \n")
df.info()

The info of the dataset: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  object 
 7   Country      541909 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 33.1+ MB


In [44]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
numeric_summary = df.describe()
numeric_summary

Unnamed: 0,Quantity,UnitPrice
count,541909.0,541909.0
mean,9.55225,4.611114
std,218.081158,96.759853
min,-80995.0,-11062.06
25%,1.0,1.25
50%,3.0,2.08
75%,10.0,4.13
max,80995.0,38970.0


In [45]:
# Descriptive summary (count, unique, top, freq) for the object-dtype columns.
object_summary = df.describe(include="object")
object_summary

Unnamed: 0,InvoiceNo,StockCode,Description,InvoiceDate,CustomerID,Country
count,541909,541909,540455,541909,406829,541909
unique,25900,4070,4223,23260,4372,38
top,573585,85123A,WHITE HANGING HEART T-LIGHT HOLDER,10/31/2011 14:41,17841,United Kingdom
freq,1114,2313,2369,1114,7983,495478


In [46]:
# Cancelled orders: rows whose invoice number begins with "C".
# na=False makes missing invoice numbers count as non-matches.
is_cancelled = df["InvoiceNo"].str.startswith("C", na=False)
cancelled = df[is_cancelled]

cancelled_count = len(cancelled)
print(f"Total Cancelled Orders : {cancelled_count:,}")

Total Cancelled Orders : 9,288


In [47]:
# Rows with a quantity below zero.
has_negative_qty = df["Quantity"].lt(0)
neg_qty = df.loc[has_negative_qty]

print(f"Negative Quantity Rows : {neg_qty.shape[0]:,}")

Negative Quantity Rows : 10,624


In [48]:
# Rows whose unit price is zero or below.
has_bad_price = df["UnitPrice"].le(0)
bad_price = df.loc[has_bad_price]

print(f"Zero/Negative Price Rows : {bad_price.shape[0]:,}")

Zero/Negative Price Rows : 2,517


In [49]:
# Country distribution: tally rows per country (most frequent first) and show
# the ten largest.
country_counts = df["Country"].value_counts()

print("Top 10 Countries by Transactions:")
country_counts.head(10)

Top 10 Countries by Transactions:


Country
United Kingdom    495478
Germany             9495
France              8557
EIRE                8196
Spain               2533
Netherlands         2371
Belgium             2069
Switzerland         2002
Portugal            1519
Australia           1259
Name: count, dtype: int64

In [50]:
# Summary
#
# Every headline figure is recomputed from `df`, so this cell does not rely on
# intermediate variables created by earlier cells.
total_rows, total_columns = df.shape
missing_customer_ids = df["CustomerID"].isnull().sum()
missing_descriptions = df["Description"].isnull().sum()

# Boolean masks: summing a mask counts the rows it selects.
cancelled_orders = int(df["InvoiceNo"].str.startswith("C", na=False).sum())
negative_quantity_rows = int((df["Quantity"] < 0).sum())
bad_price_rows = int((df["UnitPrice"] <= 0).sum())
duplicate_rows = df.duplicated().sum()

print("METADATA SUMMARY\n\n")

print(f"Total Rows               : {total_rows:,}")
print(f"Total Columns            : {total_columns}")
print(f"Missing CustomerIDs      : {missing_customer_ids:,}")
print(f"Missing Descriptions     : {missing_descriptions:,}")
print(f"Cancelled Orders         : {cancelled_orders:,}")
print(f"Negative Quantity Rows   : {negative_quantity_rows:,}")
print(f"Zero/Negative Price Rows : {bad_price_rows:,}")
print(f"Duplicate Rows           : {duplicate_rows:,}")

print("Metadata complete — proceed to 02_data_cleaning.ipynb")

METADATA SUMMARY


Total Rows               : 541,909
Total Columns            : 8
Missing CustomerIDs      : 135,080
Missing Descriptions     : 1,454
Cancelled Orders         : 9,288
Negative Quantity Rows   : 10,624
Zero/Negative Price Rows : 2,517
Duplicate Rows           : 5,268
Metadata complete — proceed to 02_data_cleaning.ipynb
