# RMF模型-数据清洗

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
path = 'Online Retail.xlsx'
sales = pd.read_excel(path, sheet_name='Online Retail', dtype=str)

## 数据理解

In [4]:
sales

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
...,...,...,...,...,...,...,...,...
541903,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680,France
541904,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.1,12680,France
541905,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680,France
541906,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680,France


In [5]:
sales.head(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,17850,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,17850,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,13047,United Kingdom


In [6]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541908 entries, 0 to 541907
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   InvoiceNo    541908 non-null  object
 1   StockCode    541908 non-null  object
 2   Description  540454 non-null  object
 3   Quantity     541908 non-null  object
 4   InvoiceDate  541908 non-null  object
 5   UnitPrice    541908 non-null  object
 6   CustomerID   406828 non-null  object
 7   Country      541908 non-null  object
dtypes: object(8)
memory usage: 33.1+ MB


### 数据类型转换

In [7]:
sales.dtypes

InvoiceNo      object
StockCode      object
Description    object
Quantity       object
InvoiceDate    object
UnitPrice      object
CustomerID     object
Country        object
dtype: object

In [8]:
sales.InvoiceNo = sales.InvoiceNo.astype(str)
sales.StockCode = sales.StockCode.astype(str)
sales.Description = sales.Description.astype(str)
sales.Quantity = sales.Quantity.astype(int)
sales.UnitPrice = sales.UnitPrice.astype(float)
sales.CustomerID = sales.CustomerID.astype(str)
sales.Country = sales.Country.astype(str)

In [9]:
sales.dtypes

InvoiceNo       object
StockCode       object
Description     object
Quantity         int32
InvoiceDate     object
UnitPrice      float64
CustomerID      object
Country         object
dtype: object

In [10]:
sales.rename(columns={'InvoiceDate':'InvoiceTime'}, inplace=True)

In [11]:
sales.dtypes

InvoiceNo       object
StockCode       object
Description     object
Quantity         int32
InvoiceTime     object
UnitPrice      float64
CustomerID      object
Country         object
dtype: object

In [12]:
sales.InvoiceTime = pd.to_datetime(sales.InvoiceTime)

In [13]:
sales.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int32
InvoiceTime    datetime64[ns]
UnitPrice             float64
CustomerID             object
Country                object
dtype: object

In [14]:
sales.InvoiceTime.dt.year

0         2010
1         2010
2         2010
3         2010
4         2010
          ... 
541903    2011
541904    2011
541905    2011
541906    2011
541907    2011
Name: InvoiceTime, Length: 541908, dtype: int64

### 字符串类型转换

In [15]:
sales.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


In [16]:
sales.Description = sales.Description.str.lower()

In [17]:
sales.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


### 货币单位转换

In [18]:
def pound_to_yuan(x, exchange_rate=8.44):
    return x * exchange_rate

In [19]:
sales['UnitPrice_RMB'] = sales.UnitPrice.map(pound_to_yuan)

In [20]:
sales

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,21.5220
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,23.2100
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116
...,...,...,...,...,...,...,...,...,...
541903,581587,22613,pack of 20 spaceboy napkins,12,2011-12-09 12:50:00,0.85,12680,France,7.1740
541904,581587,22899,children's apron dolly girl,6,2011-12-09 12:50:00,2.10,12680,France,17.7240
541905,581587,23254,childrens cutlery dolly girl,4,2011-12-09 12:50:00,4.15,12680,France,35.0260
541906,581587,23255,childrens cutlery circus parade,4,2011-12-09 12:50:00,4.15,12680,France,35.0260


### 单位小计

In [21]:
sales['SumPrice'] = sales.UnitPrice_RMB * sales.Quantity

In [22]:
sales

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,21.5220,129.1320
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,23.2100,185.6800
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
...,...,...,...,...,...,...,...,...,...,...
541903,581587,22613,pack of 20 spaceboy napkins,12,2011-12-09 12:50:00,0.85,12680,France,7.1740,86.0880
541904,581587,22899,children's apron dolly girl,6,2011-12-09 12:50:00,2.10,12680,France,17.7240,106.3440
541905,581587,23254,childrens cutlery dolly girl,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040
541906,581587,23255,childrens cutlery circus parade,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040


## 重复值

### 查找

In [23]:
sales.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
541903    False
541904    False
541905    False
541906    False
541907    False
Length: 541908, dtype: bool

In [24]:
sales[sales.duplicated()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice
516,536409,21866,union jack flag luggage tag,1,2010-12-01 11:45:00,1.25,17908,United Kingdom,10.5500,10.5500
526,536409,22866,hand warmer scotty dog design,1,2010-12-01 11:45:00,2.10,17908,United Kingdom,17.7240,17.7240
536,536409,22900,set 2 tea towels i love london,1,2010-12-01 11:45:00,2.95,17908,United Kingdom,24.8980,24.8980
538,536409,22111,scottie dog hot water bottle,1,2010-12-01 11:45:00,4.95,17908,United Kingdom,41.7780,41.7780
554,536412,22327,round snack boxes set of 4 skulls,1,2010-12-01 11:49:00,2.95,17920,United Kingdom,24.8980,24.8980
...,...,...,...,...,...,...,...,...,...,...
541674,581538,22068,black pirate treasure chest,1,2011-12-09 11:34:00,0.39,14446,United Kingdom,3.2916,3.2916
541688,581538,23318,box of 6 mini vintage crackers,1,2011-12-09 11:34:00,2.49,14446,United Kingdom,21.0156,21.0156
541691,581538,22992,revolver wooden ruler,1,2011-12-09 11:34:00,1.95,14446,United Kingdom,16.4580,16.4580
541698,581538,22694,wicker star,1,2011-12-09 11:34:00,2.10,14446,United Kingdom,17.7240,17.7240


### 删除

In [25]:
row_before = sales.shape[0]
row_before

541908

In [26]:
sales.drop_duplicates(inplace=True)

In [27]:
row_after = sales.shape[0]
row_after

536640

In [28]:
row_before - row_after

5268

In [29]:
sales

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,21.5220,129.1320
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,23.2100,185.6800
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
...,...,...,...,...,...,...,...,...,...,...
541903,581587,22613,pack of 20 spaceboy napkins,12,2011-12-09 12:50:00,0.85,12680,France,7.1740,86.0880
541904,581587,22899,children's apron dolly girl,6,2011-12-09 12:50:00,2.10,12680,France,17.7240,106.3440
541905,581587,23254,childrens cutlery dolly girl,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040
541906,581587,23255,childrens cutlery circus parade,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040


In [30]:
sales.reset_index(drop=True, inplace=True)

In [31]:
sales

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,21.5220,129.1320
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,23.2100,185.6800
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
...,...,...,...,...,...,...,...,...,...,...
536635,581587,22613,pack of 20 spaceboy napkins,12,2011-12-09 12:50:00,0.85,12680,France,7.1740,86.0880
536636,581587,22899,children's apron dolly girl,6,2011-12-09 12:50:00,2.10,12680,France,17.7240,106.3440
536637,581587,23254,childrens cutlery dolly girl,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040
536638,581587,23255,childrens cutlery circus parade,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040


## 缺失值

In [32]:
sales

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,21.5220,129.1320
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,23.2100,185.6800
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
...,...,...,...,...,...,...,...,...,...,...
536635,581587,22613,pack of 20 spaceboy napkins,12,2011-12-09 12:50:00,0.85,12680,France,7.1740,86.0880
536636,581587,22899,children's apron dolly girl,6,2011-12-09 12:50:00,2.10,12680,France,17.7240,106.3440
536637,581587,23254,childrens cutlery dolly girl,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040
536638,581587,23255,childrens cutlery circus parade,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040


### 查找缺失值

In [33]:
sales.isnull().sum()

InvoiceNo        0
StockCode        0
Description      0
Quantity         0
InvoiceTime      0
UnitPrice        0
CustomerID       0
Country          0
UnitPrice_RMB    0
SumPrice         0
dtype: int64

In [34]:
sales[sales.isnull().values==True]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice


### 处理

In [35]:
sales.dtypes

InvoiceNo                object
StockCode                object
Description              object
Quantity                  int32
InvoiceTime      datetime64[ns]
UnitPrice               float64
CustomerID               object
Country                  object
UnitPrice_RMB           float64
SumPrice                float64
dtype: object

#### CustomerID

In [36]:
sales[sales.CustomerID == '0']

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice


In [37]:
sales.CustomerID.fillna('0', inplace=True)

In [38]:
sales.isnull().sum()

InvoiceNo        0
StockCode        0
Description      0
Quantity         0
InvoiceTime      0
UnitPrice        0
CustomerID       0
Country          0
UnitPrice_RMB    0
SumPrice         0
dtype: int64

#### Description

In [39]:
sales.Description.fillna('null', inplace=True)

In [40]:
sales.isnull().sum()

InvoiceNo        0
StockCode        0
Description      0
Quantity         0
InvoiceTime      0
UnitPrice        0
CustomerID       0
Country          0
UnitPrice_RMB    0
SumPrice         0
dtype: int64

### 一致化

### 异常值

In [41]:
sales.describe()

Unnamed: 0,Quantity,UnitPrice,UnitPrice_RMB,SumPrice
count,536640.0,536640.0,536640.0,536640.0
mean,9.620002,4.632661,39.099656,152.9649
std,219.130359,97.233208,820.648277,3212.742
min,-80995.0,-11062.06,-93363.7864,-1421883.0
25%,1.0,1.25,10.55,31.65
50%,3.0,2.08,17.5552,83.3028
75%,10.0,4.13,34.8572,146.856
max,80995.0,38970.0,328906.8,1421883.0


#### Quantity == 0

In [42]:
sales[sales.Quantity == 0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice


#### Quantity < 0

In [43]:
sales[sales.Quantity < 0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice
140,C536379,D,discount,-1,2010-12-01 09:41:00,27.50,14527,United Kingdom,232.1000,-232.1000
153,C536383,35004C,set of 3 coloured flying ducks,-1,2010-12-01 09:49:00,4.65,15311,United Kingdom,39.2460,-39.2460
234,C536391,22556,plasters in tin circus parade,-12,2010-12-01 10:24:00,1.65,17548,United Kingdom,13.9260,-167.1120
235,C536391,21984,pack of 12 pink paisley tissues,-24,2010-12-01 10:24:00,0.29,17548,United Kingdom,2.4476,-58.7424
236,C536391,21983,pack of 12 blue paisley tissues,-24,2010-12-01 10:24:00,0.29,17548,United Kingdom,2.4476,-58.7424
...,...,...,...,...,...,...,...,...,...,...
535187,C581490,23144,zinc t-light holder stars small,-11,2011-12-09 09:57:00,0.83,14397,United Kingdom,7.0052,-77.0572
536279,C581499,M,manual,-1,2011-12-09 10:28:00,224.69,15498,United Kingdom,1896.3836,-1896.3836
536446,C581568,21258,victorian sewing box large,-5,2011-12-09 11:57:00,10.95,15311,United Kingdom,92.4180,-462.0900
536447,C581569,84978,hanging heart jar t-light holder,-1,2011-12-09 11:58:00,1.25,17315,United Kingdom,10.5500,-10.5500


In [44]:
query_cancel = sales.InvoiceNo.str.contains('C')
query_cancel

0         False
1         False
2         False
3         False
4         False
          ...  
536635    False
536636    False
536637    False
536638    False
536639    False
Name: InvoiceNo, Length: 536640, dtype: bool

In [45]:
sales_cancel = sales.loc[query_cancel].copy()
sales_cancel

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice
140,C536379,D,discount,-1,2010-12-01 09:41:00,27.50,14527,United Kingdom,232.1000,-232.1000
153,C536383,35004C,set of 3 coloured flying ducks,-1,2010-12-01 09:49:00,4.65,15311,United Kingdom,39.2460,-39.2460
234,C536391,22556,plasters in tin circus parade,-12,2010-12-01 10:24:00,1.65,17548,United Kingdom,13.9260,-167.1120
235,C536391,21984,pack of 12 pink paisley tissues,-24,2010-12-01 10:24:00,0.29,17548,United Kingdom,2.4476,-58.7424
236,C536391,21983,pack of 12 blue paisley tissues,-24,2010-12-01 10:24:00,0.29,17548,United Kingdom,2.4476,-58.7424
...,...,...,...,...,...,...,...,...,...,...
535187,C581490,23144,zinc t-light holder stars small,-11,2011-12-09 09:57:00,0.83,14397,United Kingdom,7.0052,-77.0572
536279,C581499,M,manual,-1,2011-12-09 10:28:00,224.69,15498,United Kingdom,1896.3836,-1896.3836
536446,C581568,21258,victorian sewing box large,-5,2011-12-09 11:57:00,10.95,15311,United Kingdom,92.4180,-462.0900
536447,C581569,84978,hanging heart jar t-light holder,-1,2011-12-09 11:58:00,1.25,17315,United Kingdom,10.5500,-10.5500


In [46]:
sales_success = sales.loc[-query_cancel].copy()
sales_success

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,21.5220,129.1320
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,23.2100,185.6800
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
...,...,...,...,...,...,...,...,...,...,...
536635,581587,22613,pack of 20 spaceboy napkins,12,2011-12-09 12:50:00,0.85,12680,France,7.1740,86.0880
536636,581587,22899,children's apron dolly girl,6,2011-12-09 12:50:00,2.10,12680,France,17.7240,106.3440
536637,581587,23254,childrens cutlery dolly girl,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040
536638,581587,23255,childrens cutlery circus parade,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040


#### UnitPrice == 0

In [47]:
sales_success[sales_success.UnitPrice == 0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice
604,536414,22139,,56,2010-12-01 11:52:00,0.0,,United Kingdom,0.0,0.0
1933,536545,21134,,1,2010-12-01 14:32:00,0.0,,United Kingdom,0.0,0.0
1934,536546,22145,,1,2010-12-01 14:33:00,0.0,,United Kingdom,0.0,0.0
1935,536547,37509,,1,2010-12-01 14:33:00,0.0,,United Kingdom,0.0,0.0
1950,536549,85226A,,1,2010-12-01 14:34:00,0.0,,United Kingdom,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
531752,581234,72817,,27,2011-12-08 10:33:00,0.0,,United Kingdom,0.0,0.0
533258,581406,46000M,polyester filler pad 45x45cm,240,2011-12-08 13:58:00,0.0,,United Kingdom,0.0,0.0
533259,581406,46000S,polyester filler pad 40x40cm,300,2011-12-08 13:58:00,0.0,,United Kingdom,0.0,0.0
533308,581408,85175,,20,2011-12-08 14:06:00,0.0,,United Kingdom,0.0,0.0


In [48]:
query_free = sales_success.UnitPrice == 0

In [49]:
sales_free = sales_success.loc[query_free].copy()
sales_free

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice
604,536414,22139,,56,2010-12-01 11:52:00,0.0,,United Kingdom,0.0,0.0
1933,536545,21134,,1,2010-12-01 14:32:00,0.0,,United Kingdom,0.0,0.0
1934,536546,22145,,1,2010-12-01 14:33:00,0.0,,United Kingdom,0.0,0.0
1935,536547,37509,,1,2010-12-01 14:33:00,0.0,,United Kingdom,0.0,0.0
1950,536549,85226A,,1,2010-12-01 14:34:00,0.0,,United Kingdom,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
531752,581234,72817,,27,2011-12-08 10:33:00,0.0,,United Kingdom,0.0,0.0
533258,581406,46000M,polyester filler pad 45x45cm,240,2011-12-08 13:58:00,0.0,,United Kingdom,0.0,0.0
533259,581406,46000S,polyester filler pad 40x40cm,300,2011-12-08 13:58:00,0.0,,United Kingdom,0.0,0.0
533308,581408,85175,,20,2011-12-08 14:06:00,0.0,,United Kingdom,0.0,0.0


In [50]:
sales_success = sales_success.loc[-query_free]
sales_success

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,21.5220,129.1320
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,23.2100,185.6800
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
...,...,...,...,...,...,...,...,...,...,...
536635,581587,22613,pack of 20 spaceboy napkins,12,2011-12-09 12:50:00,0.85,12680,France,7.1740,86.0880
536636,581587,22899,children's apron dolly girl,6,2011-12-09 12:50:00,2.10,12680,France,17.7240,106.3440
536637,581587,23254,childrens cutlery dolly girl,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040
536638,581587,23255,childrens cutlery circus parade,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040


#### UnitPrice < 0 

In [51]:
sales_success[sales_success.UnitPrice < 0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice
297645,A563186,B,adjust bad debt,1,2011-08-12 14:51:00,-11062.06,,United Kingdom,-93363.7864,-93363.7864
297646,A563187,B,adjust bad debt,1,2011-08-12 14:52:00,-11062.06,,United Kingdom,-93363.7864,-93363.7864


In [52]:
query_debt = sales_success.UnitPrice < 0

In [53]:
sales_debt = sales_success.loc[query_debt].copy()
sales_debt

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice
297645,A563186,B,adjust bad debt,1,2011-08-12 14:51:00,-11062.06,,United Kingdom,-93363.7864,-93363.7864
297646,A563187,B,adjust bad debt,1,2011-08-12 14:52:00,-11062.06,,United Kingdom,-93363.7864,-93363.7864


In [54]:
sales_success = sales_success.loc[-query_debt]
sales_success

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,21.5220,129.1320
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,23.2100,185.6800
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
...,...,...,...,...,...,...,...,...,...,...
536635,581587,22613,pack of 20 spaceboy napkins,12,2011-12-09 12:50:00,0.85,12680,France,7.1740,86.0880
536636,581587,22899,children's apron dolly girl,6,2011-12-09 12:50:00,2.10,12680,France,17.7240,106.3440
536637,581587,23254,childrens cutlery dolly girl,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040
536638,581587,23255,childrens cutlery circus parade,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040


In [55]:
sales_success.describe()

Unnamed: 0,Quantity,UnitPrice,UnitPrice_RMB,SumPrice
count,524877.0,524877.0,524877.0,524877.0
mean,10.616575,3.922576,33.106544,171.1239
std,156.280179,36.093062,304.625442,2293.096
min,1.0,0.001,0.00844,0.00844
25%,1.0,1.25,10.55,32.916
50%,4.0,2.08,17.5552,83.7248
75%,11.0,4.13,34.8572,149.388
max,80995.0,13541.33,114288.8252,1421883.0


In [56]:
sales_success.reset_index(drop=True, inplace=True)

In [57]:
sales_success

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceTime,UnitPrice,CustomerID,Country,UnitPrice_RMB,SumPrice
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,21.5220,129.1320
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,23.2100,185.6800
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,28.6116,171.6696
...,...,...,...,...,...,...,...,...,...,...
524872,581587,22613,pack of 20 spaceboy napkins,12,2011-12-09 12:50:00,0.85,12680,France,7.1740,86.0880
524873,581587,22899,children's apron dolly girl,6,2011-12-09 12:50:00,2.10,12680,France,17.7240,106.3440
524874,581587,23254,childrens cutlery dolly girl,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040
524875,581587,23255,childrens cutlery circus parade,4,2011-12-09 12:50:00,4.15,12680,France,35.0260,140.1040


In [58]:
path = 'Online Retail Clean1.xlsx'
sales_success.to_excel(path, sheet_name='Online Retail Clean')