## Online Retail Dataset [(source link)](https://archive.ics.uci.edu/dataset/352/online+retail)

In [1]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
online_retail = fetch_ucirepo(id=352) 
  
# data (as pandas dataframes) 
X = online_retail.data.features 
y = online_retail.data.targets 
  
# metadata 
print(online_retail.metadata) 
  
# variable information 
print(online_retail.variables) 


{'uci_id': 352, 'name': 'Online Retail', 'repository_url': 'https://archive.ics.uci.edu/dataset/352/online+retail', 'data_url': 'https://archive.ics.uci.edu/static/public/352/data.csv', 'abstract': 'This is a transactional data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail.', 'area': 'Business', 'tasks': ['Classification', 'Clustering'], 'characteristics': ['Multivariate', 'Sequential', 'Time-Series'], 'num_instances': 541909, 'num_features': 6, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': None, 'index_col': ['InvoiceNo', 'StockCode'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2015, 'last_updated': 'Mon Oct 21 2024', 'dataset_doi': '10.24432/C5BW33', 'creators': ['Daqing Chen'], 'intro_paper': {'ID': 361, 'type': 'NATIVE', 'title': 'Data mining for the online retail industry: A case study of RFM model-based customer segmenta

In [103]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio

plt.style.use("dark_background")
pio.templates.default = "plotly_dark"

pd.options.display.float_format = "{:,.2f}".format

In [104]:
online_retail.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,InvoiceNo,ID,Categorical,,a 6-digit integral number uniquely assigned to...,,no
1,StockCode,ID,Categorical,,a 5-digit integral number uniquely assigned to...,,no
2,Description,Feature,Categorical,,product name,,no
3,Quantity,Feature,Integer,,the quantities of each product (item) per tran...,,no
4,InvoiceDate,Feature,Date,,the day and time when each transaction was gen...,,no
5,UnitPrice,Feature,Continuous,,product price per unit,sterling,no
6,CustomerID,Feature,Categorical,,a 5-digit integral number uniquely assigned to...,,no
7,Country,Feature,Categorical,,the name of the country where each customer re...,,no


In [105]:

customer_id = 'CustomerID'
description = 'Description'
invoice_date = 'InvoiceDate'
quantity = 'Quantity'
unit_price = 'UnitPrice'
country = 'Country'

total_price = 'TotalPrice'

In [170]:

data = X.dropna()

data[invoice_date] = pd.to_datetime(data[invoice_date]).dt.date
data[invoice_date] = pd.to_datetime(data[invoice_date])

data[total_price] = data[quantity] * data[unit_price]
data = data[data[quantity] >= 0]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [171]:
data.shape

(397924, 7)

In [172]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 397924 entries, 0 to 541908
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Description  397924 non-null  object        
 1   Quantity     397924 non-null  int64         
 2   InvoiceDate  397924 non-null  datetime64[ns]
 3   UnitPrice    397924 non-null  float64       
 4   CustomerID   397924 non-null  float64       
 5   Country      397924 non-null  object        
 6   TotalPrice   397924 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(2)
memory usage: 24.3+ MB


In [173]:
retail_perday = data.groupby(invoice_date)[total_price].sum().reset_index()
retail_perday[total_price + "Rolling"] = retail_perday.rolling(10)[total_price].mean()
retail_perday.sample(5)

Unnamed: 0,InvoiceDate,TotalPrice,TotalPriceRolling
70,2011-03-03,32855.05,22096.83
141,2011-06-01,15390.89,23028.32
111,2011-04-20,25718.95,23334.17
237,2011-09-22,57869.36,45558.34
149,2011-06-10,58245.12,28391.88


In [174]:
px.line(
    retail_perday,
    x=invoice_date,
    y=[total_price, total_price + "Rolling"],
    width=2000,
)

In [175]:
retail_perday[total_price].describe()

count       305.00
mean     29,217.73
std      17,855.00
min       3,457.11
25%      18,021.48
50%      25,489.66
75%      36,751.25
max     184,349.28
Name: TotalPrice, dtype: float64

In [176]:
px.histogram(
    retail_perday,
    x=total_price,
    nbins=40,
    color_discrete_sequence=['tomato']
)

In [192]:
country_wise = data.groupby(country)[total_price].mean().reset_index()

px.bar(
    # country_wise[country_wise[country] != 'United Kingdom'],
    country_wise,
    x=country,
    y=total_price,
)

In [194]:
px.box(country_wise, x=total_price)

In [198]:
data.groupby(country)[total_price].sum().describe()

count          37.00
mean      240,848.86
std     1,196,639.36
min           145.92
25%         2,667.07
50%        13,590.38
75%        38,378.33
max     7,308,391.55
Name: TotalPrice, dtype: float64