In [2]:
import datetime as dt
import os

import numpy as np
import pandas as pd
# Display settings: never truncate the column list, and render every float
# with exactly five digits after the decimal point.
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", "{:.5f}".format)
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

### Read data

In [5]:
pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.10-py2.py3-none-any.whl (242 kB)
     -------------------------------------- 242.1/242.1 kB 1.3 MB/s eta 0:00:00
Collecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.10
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Location of the Online Retail workbook. Set the ONLINE_RETAIL_XLSX
# environment variable to point at the file on another machine; when it is
# unset, the original local path is used.
DATA_FILE = os.environ.get(
    "ONLINE_RETAIL_XLSX",
    r'C:\GITHUB\RFM Analysis\Data\online_retail.xlsx',
)

# df_ keeps the sheet exactly as loaded; df is the working copy used by the
# cells below, so the workbook does not have to be re-read to start over.
df_ = pd.read_excel(DATA_FILE, sheet_name="Year 2009-2010")
df = df_.copy()
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


## Getting to know the variables:
Invoice: The invoice number, unique to each purchase. Refund invoice numbers contain the letter "C".

StockCode: Unique code per each item

Description: Name of the item

Quantity: The number of items within the invoice

InvoiceDate: Date and time of purchase

Price: Price of a single item, in pounds sterling

Customer ID: Unique ID number for each customer

Country: The country where the customer lives

### Data Understanding

In [5]:
# Summarise the frame: number of rows, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      525461 non-null  object        
 1   StockCode    525461 non-null  object        
 2   Description  522533 non-null  object        
 3   Quantity     525461 non-null  int64         
 4   InvoiceDate  525461 non-null  datetime64[ns]
 5   Price        525461 non-null  float64       
 6   Customer ID  417534 non-null  float64       
 7   Country      525461 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 32.1+ MB


Number of distinct countries in df:

In [6]:
# Count the distinct (non-null) values in the Country column.
df.loc[:, "Country"].nunique()

40

In [7]:
# Number of rows per country, most frequent country first.
df.loc[:, "Country"].value_counts()

United Kingdom          485852
EIRE                      9670
Germany                   8129
France                    5772
Netherlands               2769
Spain                     1278
Switzerland               1187
Portugal                  1101
Belgium                   1054
Channel Islands            906
Sweden                     902
Italy                      731
Australia                  654
Cyprus                     554
Austria                    537
Greece                     517
United Arab Emirates       432
Denmark                    428
Norway                     369
Finland                    354
Unspecified                310
USA                        244
Japan                      224
Poland                     194
Malta                      172
Lithuania                  154
Singapore                  117
RSA                        111
Bahrain                    107
Canada                      77
Hong Kong                   76
Thailand                    76
Israel  

The most expensive products:

In [9]:
# Order rows from the highest Price downwards and show the first five.
df.sort_values("Price", ascending=False).head(5)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
241824,C512770,M,Manual,-1,2010-06-17 16:52:00,25111.09,17399.0,United Kingdom
241827,512771,M,Manual,1,2010-06-17 16:53:00,25111.09,,United Kingdom
320581,C520667,BANK CHARGES,Bank Charges,-1,2010-08-27 13:42:00,18910.69,,United Kingdom
517953,C537630,AMAZONFEE,AMAZON FEE,-1,2010-12-07 15:04:00,13541.33,,United Kingdom
519294,C537651,AMAZONFEE,AMAZON FEE,-1,2010-12-07 15:49:00,13541.33,,United Kingdom


Number of unique products:

In [10]:
# Distinct product descriptions; missing descriptions are not counted.
df.loc[:, "Description"].nunique(dropna=True)

4681

Most purchased items:

In [11]:
# Total Quantity per product description, largest totals first (top five rows).
(
    df.groupby("Description")[["Quantity"]]
    .sum()
    .sort_values(by="Quantity", ascending=False)
    .head()
)

Unnamed: 0_level_0,Quantity
Description,Unnamed: 1_level_1
WHITE HANGING HEART T-LIGHT HOLDER,57733
WORLD WAR 2 GLIDERS ASSTD DESIGNS,54698
BROCADE RING PURSE,47647
PACK OF 72 RETRO SPOT CAKE CASES,46106
ASSORTED COLOUR BIRD ORNAMENT,44925


Check the number of uniques for StockCode and Description variables:

In [12]:
# Report the distinct-value count of both columns, one line per column.
print(
    "\n".join(
        f"Number of uniques in {name}: {df[name].nunique()}"
        for name in ("StockCode", "Description")
    )
)

Number of uniques in StockCode: 4632
Number of uniques in Description: 4681


The two counts were expected to be equal, so at least one StockCode must be associated with more than one unique Description. Let's group by StockCode, count the unique Description values for each code, and collect every StockCode that has more than one unique Description into a list.

In [14]:
# For every StockCode, count its distinct non-null Description values, then
# turn StockCode back into an ordinary column.
a = (
    df.groupby("StockCode")
    .agg({"Description": "nunique"})
    .reset_index()
)
a.head()

Unnamed: 0,StockCode,Description
0,10002,1
1,10080,1
2,10109,1
3,10120,2
4,10125,1


In [21]:
# StockCodes that are associated with more than one distinct Description.
b = a.loc[a["Description"].gt(1), "StockCode"].tolist()


In [23]:
# For each flagged StockCode, print every distinct Description recorded for it
# (missing values included).
for dup in b:
    matching_rows = df["StockCode"] == dup
    descriptions = df.loc[matching_rows, "Description"].unique()
    print(f"dup = {dup} {descriptions}")

dup = 10120 ['DOGGY RUBBER' 'Zebra invcing error']
dup = 16011 [' ANIMAL STICKERS' 'ANIMAL STICKERS']
dup = 16012 ['FOOD/DRINK SPUNGE STICKERS' 'FOOD/DRINK SPONGE STICKERS']
dup = 16235 ['RECYCLED PENCIL WITH RABBIT ERASER' '?']
dup = 17033 ['BROCADE PURSE,SMALL ARCH BUTTON' 'sold as 17003?' 'Sold as 17003?' nan]
dup = 20615 ['BLUE SPOTTY PASSPORT COVER' 'BLUE POLKADOT PASSPORT COVER']
dup = 20652 ['BLUE SPOTTY LUGGAGE TAG ' nan 'BLUE POLKADOT LUGGAGE TAG ']
dup = 20658 ['RED SPOTTY LUGGAGE TAG' 'RED RETROSPOT LUGGAGE TAG']
dup = 20661 ['BLUE SPOTTY PURSE ' 'BLUE POLKADOT PURSE ']
dup = 20665 ['RED SPOTTY PURSE ' 'RED RETROSPOT PURSE ']
dup = 20674 ['GREEN SPOTTY BOWL' 'GREEN POLKADOT BOWL']
dup = 20675 ['BLUE SPOTTY BOWL' 'BLUE POLKADOT BOWL']
dup = 20676 ['RED SPOTTY BOWL' 'RED RETROSPOT BOWL']
dup = 20677 ['PINK SPOTTY BOWL' 'PINK POLKADOT BOWL']
dup = 20679 ['EDWARDIAN PARASOL RED' nan '?']
dup = 20681 ['PINK SPOTTY CHILDS UMBRELLA' 'PINK POLKADOT CHILDRENS UMBRELLA']
dup = 20682 [