In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw retail transactions from the working directory and preview the first rows
df=pd.read_csv("retail.csv")
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,1.12.2009 07:45,695,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,1.12.2009 07:45,675,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,1.12.2009 07:45,675,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,1.12.2009 07:45,21,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,1.12.2009 07:45,125,13085.0,United Kingdom


### Data Cleaning

In [3]:
df.info() # Column dtypes and non-null counts for the raw frame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Invoice      1048575 non-null  object 
 1   StockCode    1048575 non-null  object 
 2   Description  1044203 non-null  object 
 3   Quantity     1048575 non-null  int64  
 4   InvoiceDate  1048575 non-null  object 
 5   Price        1048575 non-null  object 
 6   Customer ID  811893 non-null   float64
 7   Country      1048575 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 64.0+ MB


In [4]:
df.isna().sum() # Number of missing values per column

Invoice             0
StockCode           0
Description      4372
Quantity            0
InvoiceDate         0
Price               0
Customer ID    236682
Country             0
dtype: int64

In [5]:
# (rows, columns) of the frame at this point, for later comparison
df.shape

(1048575, 8)

In [6]:
# Drop every row that has a null in any column; customer IDs and descriptions cannot be imputed.
# This mutates df in place, so cells above that displayed df now describe the pre-drop frame.
df.dropna(inplace=True)

In [7]:
df.shape # (rows, columns) remaining after the null rows were dropped

(811893, 8)

In [8]:
# Confirm that no missing values remain in any column
df.isna().sum() 

Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64

In [9]:
# Customer IDs are identifiers, not measurements: cast float -> int to shed the
# trailing ".0", then int -> object so the column is treated as categorical.
df['Customer ID'] = df['Customer ID'].astype(int).astype(object)
df['Customer ID'].head()

0    13085
1    13085
2    13085
3    13085
4    13085
Name: Customer ID, dtype: object

In [10]:
# Price was loaded as text because it uses ',' as the decimal separator.
# Replace the comma with a dot and cast to float in a single vectorized pass
# rather than looping over every row in Python.
# The leading astype(str) keeps this cell safe to re-run: once Price is already
# float, the .str accessor would otherwise raise an AttributeError.
df['Price'] = (
    df['Price']
    .astype(str)
    .str.replace(',', '.', regex=False)  # literal comma, not a regex pattern
    .astype(float)
)
df.head()
   

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,1.12.2009 07:45,6.95,13085,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,1.12.2009 07:45,6.75,13085,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,1.12.2009 07:45,6.75,13085,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,1.12.2009 07:45,2.1,13085,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,1.12.2009 07:45,1.25,13085,United Kingdom


In [11]:
# Re-check dtypes after the Customer ID and Price conversions
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 811893 entries, 0 to 1048574
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      811893 non-null  object 
 1   StockCode    811893 non-null  object 
 2   Description  811893 non-null  object 
 3   Quantity     811893 non-null  int64  
 4   InvoiceDate  811893 non-null  object 
 5   Price        811893 non-null  float64
 6   Customer ID  811893 non-null  object 
 7   Country      811893 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 55.7+ MB


In [12]:
# Parse InvoiceDate from text into datetime64; real timestamps are needed to calculate recency.
# The explicit day.month.year format prevents day/month being swapped during parsing.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%d.%m.%Y %H:%M')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 811893 entries, 0 to 1048574
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      811893 non-null  object        
 1   StockCode    811893 non-null  object        
 2   Description  811893 non-null  object        
 3   Quantity     811893 non-null  int64         
 4   InvoiceDate  811893 non-null  datetime64[ns]
 5   Price        811893 non-null  float64       
 6   Customer ID  811893 non-null  object        
 7   Country      811893 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 55.7+ MB


In [13]:
# Spot-check the first two rows to see the parsed InvoiceDate values
df.head(2)

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom


In [14]:
# List the distinct Quantity values to check whether the column is usable as-is
df['Quantity'].unique()

array([    12,     48,     24,     10,     18,      3,     16,      4,
            2,      6,      8,      1,     28,     30,     60,     32,
           56,      9,     25,     36,     20,    -12,     -6,     -4,
          -24,     -3,     -2,    576,    288,     27,     64,    160,
            5,     -1,     72,     50,     54,      7,    800,     11,
           58,    192,     96,    144,    100,    180,    216,     15,
          240,    108,     33,     90,     81,    -81,    -48,     17,
           -5,     40,     -8,     -9,     13,     21,    120,    200,
          168,     19,    280,    128,   -150,    -18,    -23,     80,
          300,    450,     14,     61,    480,     -7,     46,    504,
           45,   4320,   5184,   4008,     22,    600,    505,    500,
           23,     47,    432,    252,     75,    -16,   -504,   -600,
         -252,   -246,    -36,    360,    107,     49,     42,     66,
          -10,     34,     52,    400,     70,    936,    -32,    -60,
      

In [15]:
# Make every Quantity non-negative by taking its absolute value, then show the
# first 50 distinct values as a sanity check.
# NOTE(review): negative quantities may represent returns/cancellations; abs()
# would then count them as purchases — confirm this is the intended treatment.
df['Quantity'] = df['Quantity'].abs()
df['Quantity'].unique()[:50]

array([ 12,  48,  24,  10,  18,   3,  16,   4,   2,   6,   8,   1,  28,
        30,  60,  32,  56,   9,  25,  36,  20, 576, 288,  27,  64, 160,
         5,  72,  50,  54,   7, 800,  11,  58, 192,  96, 144, 100, 180,
       216,  15, 240, 108,  33,  90,  81,  17,  40,  13,  21], dtype=int64)

In [16]:
# List the distinct Price values
df['Price'].unique()

array([6.95, 6.75, 2.1 , ..., 2.42, 3.16, 4.7 ])

In [17]:
# Make every Price non-negative (a no-op if the column has no negative values)
df['Price'] = df['Price'].abs()

In [18]:
# Final preview of the cleaned frame before it is saved
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085,United Kingdom


In [19]:
# Persist the cleaned frame to the working directory.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets an
# extra unnamed first column holding the (now non-contiguous) original row labels;
# consider index=False if downstream readers do not rely on that column.
df.to_csv("cleaned_retail.csv")