# Retail Store Analysis

In [1]:
import pandas as pd
# Path of the source workbook, kept in one named constant so it can be changed
# in a single place; a relative path is resolved against the working directory.
DATA_PATH = "Online Retail.xlsx"
data = pd.read_excel(DATA_PATH)

*Dataset Overview*

In [2]:
# Row count of the freshly loaded dataset
len(data)

541909

In [3]:
# The column labels in the dataset
data.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [4]:
# Summary of the frame: per-column non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [5]:
# Preview the first five rows
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


## Data Cleaning and Preprocessing 

*Removing Duplicate Values*

In [6]:
# Count rows that repeat an earlier row across every column
data.duplicated().sum()

np.int64(5268)

In [7]:
# Drop the duplicated rows, then re-count to confirm none remain
data = data.drop_duplicates()
data.duplicated().sum()

np.int64(0)

In [8]:
# Row count after de-duplication
len(data)

536641

*Handling Null Values*

In [9]:
# Missing values per column
data.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135037
Country             0
dtype: int64

In [13]:
# Replace missing descriptions with a placeholder, then drop every row whose
# CustomerID is null.
data = (
    data
    .assign(Description=data['Description'].fillna('No Description'))
    .dropna(subset=['CustomerID'])
)
# Re-check: every column should now report zero nulls
data.isna().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

In [12]:
# Row count after the null handling
len(data)

401604

*Data Type Conversion*

In [15]:
# Column dtypes before any conversion
data.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

In [16]:
# CustomerID has no nulls left at this point, so it can be cast to an integer dtype
data = data.astype({'CustomerID': int})
data.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID              int64
Country                object
dtype: object

## Data Aggregation

In [30]:
# Units sold per product: sum Quantity within each StockCode
product_sales = data.groupby('StockCode', as_index=False)['Quantity'].sum()
product_sales.head()

Unnamed: 0,StockCode,Quantity
0,10002,823
1,10080,291
2,10120,192
3,10125,1225
4,10133,2363


In [33]:
# Total Revenue per Product

# Line-level revenue: quantity on the row multiplied by its unit price
data = data.assign(TotalPrice=data['Quantity'].mul(data['UnitPrice']))

# Sum the line-level revenue within each StockCode, rounded to 2 decimals
revenue_per_product = (
    data.groupby('StockCode')['TotalPrice']
    .sum()
    .round(2)
    .reset_index()
)
revenue_per_product.head()

Unnamed: 0,StockCode,TotalPrice
0,10002,699.55
1,10080,114.41
2,10120,40.32
3,10125,929.45
4,10133,1134.79


In [38]:
# Total Revenue by Country

# Sum TotalPrice within each Country, rounded to 2 decimals
revenue_by_country = (
    data.groupby('Country', as_index=False)['TotalPrice']
    .sum()
    .round(2)
)
revenue_by_country.head()

Unnamed: 0,Country,TotalPrice
0,Australia,137009.77
1,Austria,10154.32
2,Bahrain,548.4
3,Belgium,40910.96
4,Brazil,1143.6


In [45]:
# Monthly revenue: tag each row with the calendar month of its InvoiceDate,
# then total TotalPrice per month (rounded to 2 decimals).
data = data.assign(Month=data['InvoiceDate'].dt.to_period('M'))
monthly_sales = pd.pivot_table(
    data,
    index='Month',
    values='TotalPrice',
    aggfunc='sum',
).round(2)
monthly_sales

Unnamed: 0_level_0,TotalPrice
Month,Unnamed: 1_level_1
2010-12,552372.86
2011-01,473731.9
2011-02,435534.07
2011-03,578576.21
2011-04,425222.67
2011-05,647011.67
2011-06,606862.52
2011-07,573112.32
2011-08,615078.09
2011-09,929356.23


In [57]:
# Total spend per customer: sum of TotalPrice over all of a customer's rows,
# rounded to 2 decimals like the other revenue aggregates in this notebook.
# NOTE(review): rows with a negative Quantity (if any) lower these totals —
# confirm that net spend is what is intended here.
customer_spending = data.pivot_table(values='TotalPrice', index='CustomerID', aggfunc='sum').round(2)
customer_spending.head()

Unnamed: 0_level_0,TotalPrice
CustomerID,Unnamed: 1_level_1
12346,0.0
12347,4310.0
12348,1797.24
12349,1757.55
12350,334.4
