## Imports

In [1]:
import os

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Handy configuration values
nombre_archivo = "../Data/Online_Retail.csv"
# The connection string (which embeds user and password) is taken from the
# OLTP_RETAIL_DB_URL environment variable so real credentials do not have to be
# committed with the notebook; the local development default is kept as fallback.
conexion_postgres = os.environ.get(
    "OLTP_RETAIL_DB_URL",
    "postgresql://postgres:postgres@localhost:5432/oltpretail",
)
nombre_tabla_staging = "online_retail"

In [3]:
# Load the raw dataset from the semicolon-separated CSV file
df = pd.read_csv(filepath_or_buffer=nombre_archivo, sep=";")

## Configuraciones

In [4]:
# Keep DataFrame previews short: frames longer than max_rows are truncated,
# and min_rows controls how many rows the truncated preview shows.
pd.set_option("display.max_rows", 10)
pd.set_option("display.min_rows", 10)

## Inserto en la BD de Staging original

En esta etapa, los datos se insertan como están, sin analizar

In [5]:
# Check the dtype pandas inferred for each column of the raw frame
print(df.dtypes)

InvoiceNo       object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
UnitPrice       object
CustomerID     float64
Country         object
dtype: object


In [6]:
# Convert UnitPrice from decimal-comma text to float so it fits the table column.
# Casting to str first keeps the cell re-runnable: once the column is already
# float, calling .str on it directly would raise AttributeError on a second run.
df["UnitPrice"] = (
    df["UnitPrice"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .astype(float)
)

In [7]:
# Re-check the dtypes after the UnitPrice conversion
print(df.dtypes)

InvoiceNo       object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
UnitPrice      float64
CustomerID     float64
Country         object
dtype: object


In [8]:
# Preview the first rows before loading them into the database
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,1/12/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,1/12/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,1/12/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,1/12/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,1/12/2010 8:26,3.39,17850.0,United Kingdom


In [9]:
# Build the SQLAlchemy engine from the PostgreSQL connection string
engine = create_engine(conexion_postgres)

In [10]:
# Write the frame to the staging table in schema "oltp", replacing the table
# if it already exists; the DataFrame index is not written as a column.
df.to_sql(
    name=nombre_tabla_staging,
    con=engine,
    schema="oltp",
    if_exists="replace",
    index=False,
)

909

## Exploración de datos

In [11]:
# Preview the first rows again as the starting point of the exploration
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,1/12/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,1/12/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,1/12/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,1/12/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,1/12/2010 8:26,3.39,17850.0,United Kingdom


In [12]:
# Number of rows per StockCode, most frequent first
df['StockCode'].value_counts()

StockCode
85123A    2313
22423     2203
85099B    2159
47566     1727
20725     1639
          ... 
85179a       1
23617        1
90214U       1
47591b       1
72802c       1
Name: count, Length: 4070, dtype: int64

In [13]:
# Basic size figures: total rows, distinct stock codes, non-null values per column
noNulos_por_columna = df.count()
cant_registros_totales = df.shape[0]
cant_StockCode_distintos = df["StockCode"].nunique(dropna=False)

print(f"""
Registros Totales: {cant_registros_totales}\n
Distintos StockCodes: {cant_StockCode_distintos}\n
Cantidad de no nulos por columna:\n{noNulos_por_columna}
""")

# Takeaway: Description and CustomerID are the only columns with missing values
# (roughly 1,450 and 135,000 rows respectively in the recorded run).
# The next step is to investigate those missing values.


Registros Totales: 541909

Distintos StockCodes: 4070

Cantidad de no nulos por columna:
InvoiceNo      541909
StockCode      541909
Description    540455
Quantity       541909
InvoiceDate    541909
UnitPrice      541909
CustomerID     406829
Country        541909
dtype: int64



In [14]:
# Distinct stock codes with the number of rows each one appears in,
# ordered by StockCode
conteo = (
    df.groupby(["StockCode"])
    .size()
    .reset_index(name="Cantidad")
    .sort_values(by="StockCode")
)
print(conteo)

         StockCode  Cantidad
0            10002        73
1            10080        24
2            10120        30
3           10123C         4
4           10123G         1
...            ...       ...
4065  gift_0001_20        10
4066  gift_0001_30         8
4067  gift_0001_40         3
4068  gift_0001_50         4
4069             m         1

[4070 rows x 2 columns]


In [15]:
# Row count for every (StockCode, Description) pair, ordered by StockCode
conteo = (
    df.groupby(["StockCode", "Description"])
    .size()
    .reset_index(name="Cantidad")
    .sort_values(by="StockCode")
)
print(conteo)

         StockCode                          Description  Cantidad
0            10002          INFLATABLE POLITICAL GLOBE         71
1            10080             GROOVY CACTUS INFLATABLE        22
2            10080                                check         1
3            10120                         DOGGY RUBBER        30
4           10123C                HEARTS WRAPPING TAPE          3
...            ...                                  ...       ...
4787  gift_0001_20  to push order througha s stock was          1
4788  gift_0001_30   Dotcomgiftshop Gift Voucher £30.00         7
4789  gift_0001_40   Dotcomgiftshop Gift Voucher £40.00         3
4790  gift_0001_50   Dotcomgiftshop Gift Voucher £50.00         4
4791             m                               Manual         1

[4792 rows x 3 columns]


In [16]:
# Among the stock codes that start with a digit, look for the smallest and the largest
df[df["StockCode"].str.match(r'^\d', na=False)].sort_values(by="StockCode")

# In the recorded output the first code is 10002 and the last one is 90214Z.
# StockCode is an object (text) column, so this ordering is a string sort,
# not a numeric one.

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
52333,540732,10002,INFLATABLE POLITICAL GLOBE,12,11/1/2011 10:19,0.85,16122.0,United Kingdom
93152,544278,10002,INFLATABLE POLITICAL GLOBE,12,17/2/2011 12:01,0.85,15382.0,United Kingdom
77513,542735,10002,INFLATABLE POLITICAL GLOBE,12,31/1/2011 15:36,0.85,12681.0,France
88536,543806,10002,INFLATABLE POLITICAL GLOBE,1,13/2/2011 12:48,0.85,17085.0,United Kingdom
75792,542610,10002,INFLATABLE POLITICAL GLOBE,14,30/1/2011 14:05,0.85,13148.0,United Kingdom
...,...,...,...,...,...,...,...,...
368235,568949,90214Z,"LETTER ""Z"" BLING KEY RING",1,29/9/2011 15:13,0.83,,United Kingdom
107874,545464,90214Z,"LETTER ""Z"" BLING KEY RING",1,3/3/2011 9:10,0.83,,United Kingdom
527069,580691,90214Z,"LETTER ""Z"" BLING KEY RING",12,5/12/2011 15:48,0.29,13790.0,United Kingdom
278756,561217,90214Z,"LETTER ""Z"" BLING KEY RING",1,25/7/2011 17:09,0.83,,United Kingdom


In [17]:
# Number of distinct descriptions per StockCode.
# nunique() skips nulls by default, so a code whose rows all have a null
# Description gets a count of 0.
desc_por_stock = (
    df
    .groupby('StockCode')['Description']
    .nunique()
    .reset_index(name='Cant_Descriptions')
)
print(desc_por_stock)

         StockCode  Cant_Descriptions
0            10002                  1
1            10080                  2
2            10120                  1
3           10123C                  1
4           10123G                  0
...            ...                ...
4065  gift_0001_20                  2
4066  gift_0001_30                  1
4067  gift_0001_40                  1
4068  gift_0001_50                  1
4069             m                  1

[4070 rows x 2 columns]


### Analizando cantidad de stock Codes según la cantidad de descripciones diferentes asociadas

En teoría, tenemos tres opciones:
- Una sola descripción asociada (el caso feliz)
- Más de una descripción asociada (hay que analizar más a fondo)
- Ninguna descripción asociada (solo valores nulos en el campo "description")

Como vimos que hay cerca de 1500 registros con descripción en null (541909 − 540455 = 1454), este caso es relevante.
Sabemos que existen 4070 StockCodes distintos, por lo que la suma de estos tres casos debería ser 4070.

In [18]:
# Stock codes associated with more than one distinct description
stock_con_multiples = pd.DataFrame(desc_por_stock[desc_por_stock['Cant_Descriptions'] > 1])
print(stock_con_multiples)

         StockCode  Cant_Descriptions
1            10080                  2
8            10133                  2
26          15058A                  2
28          15058C                  2
31           16008                  2
...            ...                ...
3972        90195A                  2
4008        90210D                  2
4043      DCGS0003                  2
4050      DCGS0069                  2
4065  gift_0001_20                  2

[650 rows x 2 columns]


In [19]:
# Stock codes with exactly one distinct description, each paired with that
# description (taken from the first non-null row found for the code).
descripcion_unica = (
    df[["StockCode", "Description"]]
    .dropna()
    .drop_duplicates(subset="StockCode")
)
stock_con_unaDesc = pd.merge(
    desc_por_stock[desc_por_stock["Cant_Descriptions"] == 1],
    descripcion_unica,
    on="StockCode",
    how="left",
)

print(stock_con_unaDesc)

         StockCode  Cant_Descriptions                         Description
0            10002                  1         INFLATABLE POLITICAL GLOBE 
1            10120                  1                        DOGGY RUBBER
2           10123C                  1               HEARTS WRAPPING TAPE 
3           10124A                  1         SPOTS ON RED BOOKCOVER TAPE
4           10124G                  1            ARMY CAMO BOOKCOVER TAPE
...            ...                ...                                 ...
3303  gift_0001_10                  1  Dotcomgiftshop Gift Voucher £10.00
3304  gift_0001_30                  1  Dotcomgiftshop Gift Voucher £30.00
3305  gift_0001_40                  1  Dotcomgiftshop Gift Voucher £40.00
3306  gift_0001_50                  1  Dotcomgiftshop Gift Voucher £50.00
3307             m                  1                              Manual

[3308 rows x 3 columns]


In [20]:
# Stock codes with no description at all (every row has a null Description)
stock_sin_Descripcion = pd.DataFrame(desc_por_stock[desc_por_stock['Cant_Descriptions'] == 0])
print(stock_sin_Descripcion)

      StockCode  Cant_Descriptions
4        10123G                  0
9         10134                  0
46        16053                  0
85       17011A                  0
157       20689                  0
...         ...                ...
4046   DCGS0057                  0
4047  DCGS0066P                  0
4052   DCGS0071                  0
4053   DCGS0072                  0
4055   DCGS0074                  0

[112 rows x 2 columns]


### StockCodes sin descripción válida asociada

Vemos que la cantidad de stockCodes tiene sentido
112+3308+650 = 4070, nuestra cantidad de StockCodes distintos original

En esta sección, evaluaremos qué hacer con aquellos stockCodes que no tienen ninguna descripción válida asociada
A priori, son solamente alrededor de 112 registros en total. Esto, en un universo de más de 540.000, es alrededor de un 0.02%, por lo que podríamos simplemente descartarlos sin más. Sin embargo, se eligió analizar primero esos registros para saber si se pueden rescatar algunos.

In [21]:
# Keep only the rows of df whose StockCode never has a description
# (merge defaults to an inner join on StockCode)
df_DescripcionNula = df.merge(stock_sin_Descripcion[['StockCode']], on='StockCode')
df_DescripcionNula

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536545,21134,,1,1/12/2010 14:32,0.0,,United Kingdom
1,536549,85226A,,1,1/12/2010 14:34,0.0,,United Kingdom
2,536550,85044,,1,1/12/2010 14:34,0.0,,United Kingdom
3,536552,20950,,1,1/12/2010 14:34,0.0,,United Kingdom
4,536554,84670,,23,1/12/2010 14:35,0.0,,United Kingdom
...,...,...,...,...,...,...,...,...
107,561498,21610,,-14,27/7/2011 14:10,0.0,,United Kingdom
108,561555,37477B,,-11,28/7/2011 10:21,0.0,,United Kingdom
109,561557,37477C,,-31,28/7/2011 10:21,0.0,,United Kingdom
110,567207,35592T,,4,19/9/2011 11:01,0.0,,United Kingdom


In [22]:
# Of those rows, the ones with a unit price of zero
df_DescripcionNula[df_DescripcionNula["UnitPrice"] == 0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536545,21134,,1,1/12/2010 14:32,0.0,,United Kingdom
1,536549,85226A,,1,1/12/2010 14:34,0.0,,United Kingdom
2,536550,85044,,1,1/12/2010 14:34,0.0,,United Kingdom
3,536552,20950,,1,1/12/2010 14:34,0.0,,United Kingdom
4,536554,84670,,23,1/12/2010 14:35,0.0,,United Kingdom
...,...,...,...,...,...,...,...,...
107,561498,21610,,-14,27/7/2011 14:10,0.0,,United Kingdom
108,561555,37477B,,-11,28/7/2011 10:21,0.0,,United Kingdom
109,561557,37477C,,-31,28/7/2011 10:21,0.0,,United Kingdom
110,567207,35592T,,4,19/9/2011 11:01,0.0,,United Kingdom


In [23]:
# Of those rows, the ones with no CustomerID
df_DescripcionNula[df_DescripcionNula["CustomerID"].isna()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536545,21134,,1,1/12/2010 14:32,0.0,,United Kingdom
1,536549,85226A,,1,1/12/2010 14:34,0.0,,United Kingdom
2,536550,85044,,1,1/12/2010 14:34,0.0,,United Kingdom
3,536552,20950,,1,1/12/2010 14:34,0.0,,United Kingdom
4,536554,84670,,23,1/12/2010 14:35,0.0,,United Kingdom
...,...,...,...,...,...,...,...,...
107,561498,21610,,-14,27/7/2011 14:10,0.0,,United Kingdom
108,561555,37477B,,-11,28/7/2011 10:21,0.0,,United Kingdom
109,561557,37477C,,-31,28/7/2011 10:21,0.0,,United Kingdom
110,567207,35592T,,4,19/9/2011 11:01,0.0,,United Kingdom


In [24]:
# Of those rows, the ones whose Country is United Kingdom
df_DescripcionNula[df_DescripcionNula["Country"] == "United Kingdom"]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536545,21134,,1,1/12/2010 14:32,0.0,,United Kingdom
1,536549,85226A,,1,1/12/2010 14:34,0.0,,United Kingdom
2,536550,85044,,1,1/12/2010 14:34,0.0,,United Kingdom
3,536552,20950,,1,1/12/2010 14:34,0.0,,United Kingdom
4,536554,84670,,23,1/12/2010 14:35,0.0,,United Kingdom
...,...,...,...,...,...,...,...,...
107,561498,21610,,-14,27/7/2011 14:10,0.0,,United Kingdom
108,561555,37477B,,-11,28/7/2011 10:21,0.0,,United Kingdom
109,561557,37477C,,-31,28/7/2011 10:21,0.0,,United Kingdom
110,567207,35592T,,4,19/9/2011 11:01,0.0,,United Kingdom


Como último paso, se validará si todos estos casos pertenecen a facturas de una única fila, o si algunas de estas facturas tienen más de una fila

In [25]:
# Invoice numbers that appear in the null-description frame
invoices_con_solo_nulos = df_DescripcionNula["InvoiceNo"].unique()

# Every row of the original frame that belongs to one of those invoices
facturas_en_df = df.loc[df["InvoiceNo"].isin(invoices_con_solo_nulos)]

# Number of lines per invoice
cantidad_por_factura = (
    facturas_en_df
    .groupby("InvoiceNo")
    .size()
    .reset_index(name="Cant_Lineas")
)

# Invoices with more than one line, if there are any
facturas_con_mas_de_una_linea = cantidad_por_factura.loc[
    cantidad_por_factura["Cant_Lineas"] > 1
]

print(cantidad_por_factura)
print(facturas_con_mas_de_una_linea)

    InvoiceNo  Cant_Lineas
0      536545            1
1      536549            1
2      536550            1
3      536552            1
4      536554            1
..        ...          ...
107    561498            1
108    561555            1
109    561557            1
110    567207            1
111    578360            1

[112 rows x 2 columns]
Empty DataFrame
Columns: [InvoiceNo, Cant_Lineas]
Index: []


**Conclusión final**

- Estas 112 filas representan un 0.02% del universo total de filas
- Todas estas filas tienen un precio unitario igual a cero
- Todas además, tienen un CustomerID desconocido
- Todas relacionadas a Reino Unido
- Todas estas facturas tienen solo una línea

Puede asumirse sin riesgo, que estas filas son producto de tests, o errores de facturación, o de carga de datos
Por otro lado, resulta imposible determinar alguno de los datos en null, puesto que ninguna de las variables que podrían servirnos, efectivamente están cargadas
Además, esas filas están aisladas a una sola factura por fila

Por lo tanto, resulta seguro descartar estas filas. Sin embargo, para mantener la trazabilidad, se creará un dataframe aparte donde se incluirán estas filas para luego guardarlas en la BD. Esto permitirá confirmar que no se perdió ninguna fila en el proceso de ETL

### StockCodes con más de una descripción asociada

In [26]:
# Detail of the stock codes that have more than one description:
# every distinct (StockCode, Description) pair, restricted to those codes
detalle = df[['StockCode', 'Description']].drop_duplicates()
detalle_filtrado = detalle.merge(stock_con_multiples[['StockCode']], on='StockCode')
print(detalle_filtrado.sort_values(['StockCode', 'Description']))

         StockCode                          Description
575          10080             GROOVY CACTUS INFLATABLE
1473         10080                                check
1018         10080                                  NaN
87           10133         COLOURING PENCILS BROWN TUBE
1530         10133                              damaged
...            ...                                  ...
1017      DCGS0003                                 ebay
542       DCGS0069                OOH LA LA DOGS COLLAR
1016      DCGS0069                                 ebay
499   gift_0001_20   Dotcomgiftshop Gift Voucher £20.00
935   gift_0001_20  to push order througha s stock was 

[1658 rows x 2 columns]


## Limpieza de datos

En esta etapa, generamos una nueva tabla Staging pero con datos limpios
En primer lugar, se generará un nuevo DF que será el staging limpio. En este DF se incluirán, primero, aquellas ventas cuyos stockCodes no presenten problemas. Esto es, StockCodes válidos que, además, no estén repetidos ni tengan más de una descripción asociada. Para todos los demás, hay que revisar caso a caso porque se presentan distintas realidades

En el análisis anterior, se desprende que hay varios casos a tener en cuenta:

- El de los stockCodes válidos, donde un stockCode siempre corresponde a una descripción, y su formato es numérico (ej: 10255)
- El de los stockCodes válidos pero que no cumplen el formato numérico (ej, gift_1). Estos códigos son válidos pero amerita revisarlos aparte
- El de los stockCodes que, aunque cumplen la primera condición, también tienen códigos "hermanos" del tipo 10255 y 10255C. Estos casos deben filtrarse aparte para revisarlos
- Los stockCodes que presentan más de una descripción. En algunos casos es solamente un error de tipeo en el nombre, en otros representan devoluciones u otras cosas que hay que revisar

### Empezando a limpiar

El primer paso será crear nuestros dos dataframes finales. Uno con las filas descartadas, y otro con las filas limpias
Como se vio en el análisis anterior, existe un conjunto de filas con descripción null que no se pueden recuperar

El primer paso será crear un DF temporal sin estas filas, y un DF de descarte solamente con estas filas (por ahora)

In [27]:
# Working copy of df without the invoices that only contain unrecoverable
# null-description rows
df_tmp = df[~df['InvoiceNo'].isin(invoices_con_solo_nulos)].copy()
print(len(df_tmp))

541797


In [28]:
# Discard frame: exactly the rows excluded from df_tmp, kept for traceability.
# .copy() mirrors how df_tmp is built, so df_discard is an independent frame
# that can be modified later without pandas chained-assignment warnings.
df_discard = df[df['InvoiceNo'].isin(invoices_con_solo_nulos)].copy()
print(len(df_discard))

112


Antes de arrancar a limpiar, necesitamos resolver el problema de los nulos

En este paso, voy a buscar, para todos los códigos que aparecen con solo una descripción, si tienen nulos en la descripción asociados a ese código.

Si los hay, sustituimos la descripción

In [29]:
# Array with the StockCodes that have exactly one associated description.
# (stock_con_unaDesc was built from non-null descriptions only.)
stockCodes_con_unaDesc = stock_con_unaDesc['StockCode'].unique()

# Rows of the working frame whose StockCode is one of those codes
b = df_tmp[df_tmp['StockCode'].isin(stockCodes_con_unaDesc)]

# Of those, the rows whose Description is null
b[b["Description"].isna()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
1971,536546,22145,,1,1/12/2010 14:33,0.0,,United Kingdom
1972,536547,37509,,1,1/12/2010 14:33,0.0,,United Kingdom
2025,536553,37461,,3,1/12/2010 14:35,0.0,,United Kingdom
2406,536589,21777,,-10,1/12/2010 16:50,0.0,,United Kingdom
4347,536764,84952C,,-38,2/12/2010 14:42,0.0,,United Kingdom
...,...,...,...,...,...,...,...,...
519967,580197,22696,,6,2/12/2011 12:04,0.0,,United Kingdom
521333,580359,20775,,-35,2/12/2011 16:11,0.0,,United Kingdom
522160,580379,72225C,,-144,2/12/2011 17:57,0.0,,United Kingdom
522162,580381,21758,,-9,2/12/2011 17:58,0.0,,United Kingdom


Para estos casos, se utilizará un diccionario que permita validar a qué descripción de producto pertenece cada registro en null

#### Llenando nulos

In [30]:
# Lookup from StockCode to its single known description
diccionario_descripciones = stock_con_unaDesc.set_index('StockCode')['Description'].to_dict()

# Helper series, aligned with df_tmp, holding the description to fill in for each row
# (NaN where the StockCode is not in the lookup)
descripcion_faltante = df_tmp['StockCode'].map(diccionario_descripciones)

In [31]:
# Count nulls before and after the fill to see how many were filled in.
# The difference should match the number of null rows found in the previous check.

nulos_antes = df_tmp['Description'].isna().sum()
df_tmp['Description'] = df_tmp['Description'].fillna(descripcion_faltante)
nulos_despues = df_tmp['Description'].isna().sum()

print(nulos_antes - nulos_despues)

1033


In [32]:
# The counts match. Double-check that no null description is left for these codes.

# Rows of the working frame whose StockCode has exactly one description
b = df_tmp[df_tmp['StockCode'].isin(stockCodes_con_unaDesc)]

# Rows that still have a null Description (expected to be empty)
b[b["Description"].isna()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


### Primer paso
Crearemos cuatro nuevos DFs

- Uno para los códigos con una sola descripción, que inicien con un número
- Otro para los códigos con una sola descripción, pero que inicien con letras
- El tercero, para los códigos asociados a más de una descripción, y que inician con un número
- Finalmente, los códigos asociados a más de una descripción, pero que inicien con letras

Cada uno de estos DF se analizará por separado

In [33]:
# Split df_tmp along two axes: whether the StockCode starts with a digit, and
# whether the code has exactly one description (stock_con_unaDesc) or several
# (stock_con_multiples).
# The regex mask is evaluated once and reused instead of re-scanning the whole
# StockCode column for each of the four frames.
# Note: the inner merges return frames with a fresh 0..n-1 index, so the
# original df row labels are not preserved in the results.
empieza_con_digito = df_tmp['StockCode'].str.match(r'^\d', na=False)
codigos_unaDesc = stock_con_unaDesc[['StockCode']]
codigos_multiDesc = stock_con_multiples[['StockCode']]

df_codigos_numericos_single = df_tmp[empieza_con_digito].merge(codigos_unaDesc, on='StockCode')
df_codigos_no_numericos_single = df_tmp[~empieza_con_digito].merge(codigos_unaDesc, on='StockCode')

df_codigos_numericos_multi = df_tmp[empieza_con_digito].merge(codigos_multiDesc, on='StockCode')
df_codigos_no_numericos_multi = df_tmp[~empieza_con_digito].merge(codigos_multiDesc, on='StockCode')

In [34]:
print(cant_registros_totales)

print(len(df_codigos_numericos_single))
print(len(df_codigos_no_numericos_single))
print(len(df_codigos_numericos_multi))
print(len(df_codigos_no_numericos_multi))

# Actually verify that no row was lost in the split instead of eyeballing the
# printed numbers. The four frames partition df_tmp (not the full dataset),
# and df_tmp plus the discarded rows must add up to the original row count.
# These checks describe the state right after the split cell; they are meant
# for a top-to-bottom run, before later cells move rows out of these frames.
total_particionado = (
    len(df_codigos_numericos_single)
    + len(df_codigos_no_numericos_single)
    + len(df_codigos_numericos_multi)
    + len(df_codigos_no_numericos_multi)
)
assert total_particionado == len(df_tmp), (
    f"split lost or duplicated rows: {total_particionado} != {len(df_tmp)}"
)
assert len(df_tmp) + len(df_discard) == cant_registros_totales, (
    "working rows plus discarded rows do not add up to the original total"
)

541909
425360
2972
113448
17


### Segundo paso
Códigos con una sola descripción, que inicien con un número

Estos códigos son válidos en general, y no hay mucho análisis para hacer

No obstante, es interesante verificar que todas las descripciones sean válidas

Se vio que las descripciones válidas están en mayúsculas. Entonces, una forma útil de encontrar descripciones no válidas es buscar las que empiecen con minúsculas



In [35]:
# Valid descriptions were observed to be upper-case, so a description that
# starts with a lower-case letter is treated as a note, not a product name.
# The mask is built once so both frames are guaranteed to use the same
# selection, and with na=False (as in the later verification cell) so a null
# Description counts as "no match" instead of breaking the boolean indexing.
es_descripcion_minuscula = df_codigos_numericos_single['Description'].str.match(r'[a-z].*$', na=False)

df_noDescripcion = df_codigos_numericos_single[es_descripcion_minuscula].copy()

# Take those rows out of the working frame so they are not loaded twice
df_codigos_numericos_single = df_codigos_numericos_single[~es_descripcion_minuscula].copy()
df_noDescripcion

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
40490,540638,72038P,damages,-990,10/1/2011 12:14,0.0,,United Kingdom
66294,543257,84611B,thrown away,-1430,4/2/2011 16:06,0.0,,United Kingdom
66295,543258,84611B,thrown away,1287,4/2/2011 16:06,0.0,,United Kingdom
66296,543259,84612B,thrown away,-162,4/2/2011 16:07,0.0,,United Kingdom
92540,546126,35611B,thrown away,-27,9/3/2011 14:52,0.0,,United Kingdom
92801,546152,72140F,throw away,-5368,9/3/2011 17:25,0.0,,United Kingdom
105326,547559,72759,thrown away-can't sell.,-524,23/3/2011 17:27,0.0,,United Kingdom
105327,547560,72732,thrown away-can't sell,-2472,23/3/2011 17:28,0.0,,United Kingdom


In [36]:
# Confirm no row in the working frame has a null Description
df_codigos_numericos_single[df_codigos_numericos_single["Description"].isna()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


In [37]:
# Row count left after moving the lower-case-description rows out
len(df_codigos_numericos_single)

425352

Existen 8 filas donde la descripción indica devoluciones o correcciones de stock
Estos registros deben ser tratados aparte, a fin de poder mostrar luego datos consistentes en los informes

Un problema con estos datos, es que el stockCode presente solo está relacionado a esta descripción. Es decir, nos es imposible relacionar estos stockCodes con un producto real, y de esta manera, saber qué producto es el que se devolvió o descartó. Sin embargo, estos datos pueden ser útiles para algún reporte donde queramos saber cantidades de devoluciones, o productos rotos, o similar. Por tanto, se decidió almacenarlos

Sin embargo, para estos casos se decidió crear una nueva columna "RecordNote" para guardar estas descripciones espurias. Esto se hará también con los demás casos en los otros tres DFs que aún no se analizaron
Para estos 8 casos, la descripción será NULL debido a que no es posible rescatar la información vinculada


In [38]:
# Move the non-product text into the new RecordNote column, then blank out Description
df_noDescripcion["RecordNote"] = df_noDescripcion["Description"]
df_noDescripcion["Description"] = np.nan

In [39]:
# Start the clean output frame with the note-only rows
df_Final = df_noDescripcion.copy()
df_Final

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
40490,540638,72038P,,-990,10/1/2011 12:14,0.0,,United Kingdom,damages
66294,543257,84611B,,-1430,4/2/2011 16:06,0.0,,United Kingdom,thrown away
66295,543258,84611B,,1287,4/2/2011 16:06,0.0,,United Kingdom,thrown away
66296,543259,84612B,,-162,4/2/2011 16:07,0.0,,United Kingdom,thrown away
92540,546126,35611B,,-27,9/3/2011 14:52,0.0,,United Kingdom,thrown away
92801,546152,72140F,,-5368,9/3/2011 17:25,0.0,,United Kingdom,throw away
105326,547559,72759,,-524,23/3/2011 17:27,0.0,,United Kingdom,thrown away-can't sell.
105327,547560,72732,,-2472,23/3/2011 17:28,0.0,,United Kingdom,thrown away-can't sell


Finalmente, se cargarán en df_Final los registros que sí tienen descripción válida

In [40]:
# Append the rows that do have a valid description.
# NOTE(review): df_Final is both input and output here, so re-running this
# cell on its own appends the same rows a second time.
df_Final = pd.concat([df_Final, df_codigos_numericos_single])
df_Final

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
40490,540638,72038P,,-990,10/1/2011 12:14,0.00,,United Kingdom,damages
66294,543257,84611B,,-1430,4/2/2011 16:06,0.00,,United Kingdom,thrown away
66295,543258,84611B,,1287,4/2/2011 16:06,0.00,,United Kingdom,thrown away
66296,543259,84612B,,-162,4/2/2011 16:07,0.00,,United Kingdom,thrown away
92540,546126,35611B,,-27,9/3/2011 14:52,0.00,,United Kingdom,thrown away
...,...,...,...,...,...,...,...,...,...
425355,581587,22629,SPACEBOY LUNCH BOX,12,9/12/2011 12:50,1.95,12680.0,France,
425356,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,9/12/2011 12:50,0.85,12680.0,France,
425357,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,9/12/2011 12:50,2.10,12680.0,France,
425358,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,9/12/2011 12:50,4.15,12680.0,France,


In [41]:
# Check that no description in the clean frame starts with a lower-case letter
# (null descriptions count as non-matching because of na=False)
df_Final[df_Final['Description'].str.match(r'[a-z].*$', na=False)]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote


### Tercer paso
Códigos con una sola descripción, pero que inicien con letras


In [42]:
# Rows whose StockCode does not start with a digit and has a single description
df_codigos_no_numericos_single

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536370,POST,POSTAGE,3,1/12/2010 8:45,18.00,12583.0,France
1,C536379,D,Discount,-1,1/12/2010 9:41,27.50,14527.0,United Kingdom
2,536403,POST,POSTAGE,1,1/12/2010 11:27,15.00,12791.0,Netherlands
3,536527,POST,POSTAGE,1,1/12/2010 13:04,18.00,12662.0,Germany
4,536540,C2,CARRIAGE,1,1/12/2010 14:05,50.00,14911.0,EIRE
...,...,...,...,...,...,...,...,...
2967,581498,DOT,DOTCOM POSTAGE,1,9/12/2011 10:26,1714.17,,United Kingdom
2968,C581499,M,Manual,-1,9/12/2011 10:28,224.69,15498.0,United Kingdom
2969,581570,POST,POSTAGE,1,9/12/2011 11:59,18.00,12662.0,Germany
2970,581574,POST,POSTAGE,2,9/12/2011 12:09,18.00,12526.0,Germany


In [43]:
# Add the notes column. An empty string-dtype Series is assigned, so every
# existing row starts with a missing note.
df_codigos_no_numericos_single["RecordNote"] = pd.Series(dtype="string")

In [44]:
# Distinct stock codes in this frame (the Series length is the number of codes)
codigos_no_numericos = df_codigos_no_numericos_single["StockCode"].drop_duplicates()
codigos_no_numericos

0           POST
1              D
4             C2
5            DOT
6              M
          ...   
1723    DCGS0073
1725    DCGS0068
1726    DCGS0067
1829           B
1936        CRUK
Name: StockCode, Length: 24, dtype: object

#### Algunos análisis rápidos

In [45]:
# Rows with a negative quantity
df_codigos_no_numericos_single[df_codigos_no_numericos_single["Quantity"] < 0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
1,C536379,D,Discount,-1,1/12/2010 9:41,27.50,14527.0,United Kingdom,
31,C537164,D,Discount,-1,5/12/2010 13:21,29.29,14527.0,United Kingdom,
48,C537414,POST,POSTAGE,-1,6/12/2010 15:09,4.41,16861.0,United Kingdom,
54,C537572,BANK CHARGES,Bank Charges,-1,7/12/2010 12:00,95.38,,United Kingdom,
55,C537581,S,SAMPLES,-1,7/12/2010 12:03,12.95,,United Kingdom,
...,...,...,...,...,...,...,...,...,...
2927,C580726,CRUK,CRUK Commission,-1,5/12/2011 17:17,1100.44,14096.0,United Kingdom,
2938,C580957,POST,POSTAGE,-1,6/12/2011 14:23,4.50,12839.0,United Kingdom,
2946,C581009,M,Manual,-1,7/12/2011 9:15,125.00,16971.0,United Kingdom,
2949,C581145,M,Manual,-1,7/12/2011 13:48,9.95,17490.0,United Kingdom,


In [46]:
# Rows with no associated customer (null CustomerID)
df_codigos_no_numericos_single[df_codigos_no_numericos_single["CustomerID"].isna()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
5,536544,DOT,DOTCOM POSTAGE,1,1/12/2010 14:32,569.77,,United Kingdom,
8,536592,DOT,DOTCOM POSTAGE,1,1/12/2010 17:06,607.49,,United Kingdom,
14,536862,DOT,DOTCOM POSTAGE,1,3/12/2010 11:13,254.43,,United Kingdom,
15,536864,DOT,DOTCOM POSTAGE,1,3/12/2010 11:27,121.06,,United Kingdom,
16,536865,M,Manual,1,3/12/2010 11:28,2.55,,United Kingdom,
...,...,...,...,...,...,...,...,...,...
2957,581219,DOT,DOTCOM POSTAGE,1,8/12/2011 9:28,1008.96,,United Kingdom,
2959,581238,DOT,DOTCOM POSTAGE,1,8/12/2011 10:53,1683.75,,United Kingdom,
2963,581439,DOT,DOTCOM POSTAGE,1,8/12/2011 16:30,938.59,,United Kingdom,
2964,581492,DOT,DOTCOM POSTAGE,1,9/12/2011 10:03,933.17,,United Kingdom,


In [47]:
# Rows with a unit price of zero
df_codigos_no_numericos_single[df_codigos_no_numericos_single["UnitPrice"] == 0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
244,540699,POST,POSTAGE,1000,11/1/2011 9:32,0.0,,United Kingdom,
799,547966,DOT,DOTCOM POSTAGE,1000,28/3/2011 15:49,0.0,,United Kingdom,
1039,552230,DOT,DOTCOM POSTAGE,1,6/5/2011 15:43,0.0,,United Kingdom,
1239,554857,POST,POSTAGE,800,27/5/2011 10:08,0.0,,United Kingdom,
1723,561249,DCGS0073,ebay,-4,26/7/2011 11:51,0.0,,United Kingdom,
...,...,...,...,...,...,...,...,...,...
2612,575505,POST,POSTAGE,800,10/11/2011 10:29,0.0,,United Kingdom,
2613,575506,C2,CARRIAGE,150,10/11/2011 10:30,0.0,,United Kingdom,
2731,577168,M,Manual,1,18/11/2011 10:42,0.0,12603.0,Germany,
2766,577696,M,Manual,1,21/11/2011 11:57,0.0,16406.0,United Kingdom,


#### Limpieza

Se vio que existen ciertos tipos de códigos que podrían agruparse. Es el caso por ejemplo de los códigos que comienzan con "gift" (son giftcards, se pueden tratar como productos normales) o los que comienzan con "DCGS"

En esta sección, se irán agrupando y agregando al df_final según corresponda, intentando agrupar de la forma más coherente posible

##### Gift

In [48]:
# Rows whose StockCode (not the description) starts with "gift", case-insensitive
df_codigos_no_numericos_single[df_codigos_no_numericos_single['StockCode'].str.contains(r'^gift', case=False, na=False)]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
157,539492,gift_0001_40,Dotcomgiftshop Gift Voucher £40.00,1,20/12/2010 10:14,34.04,,United Kingdom,
194,539958,gift_0001_50,Dotcomgiftshop Gift Voucher £50.00,1,23/12/2010 13:26,42.55,,United Kingdom,
210,540238,gift_0001_30,Dotcomgiftshop Gift Voucher £30.00,1,5/1/2011 14:44,25.53,,United Kingdom,
485,544323,gift_0001_30,Dotcomgiftshop Gift Voucher £30.00,1,17/2/2011 15:51,25.00,,United Kingdom,
492,544434,gift_0001_30,Dotcomgiftshop Gift Voucher £30.00,1,18/2/2011 16:12,25.00,,United Kingdom,
...,...,...,...,...,...,...,...,...,...
1933,564761,gift_0001_30,Dotcomgiftshop Gift Voucher £30.00,30,30/8/2011 10:48,0.00,,United Kingdom,
1934,564762,gift_0001_10,Dotcomgiftshop Gift Voucher £10.00,30,30/8/2011 10:48,0.00,,United Kingdom,
1965,564974,gift_0001_10,Dotcomgiftshop Gift Voucher £10.00,2,31/8/2011 15:32,8.33,,United Kingdom,
1975,565231,gift_0001_30,Dotcomgiftshop Gift Voucher £30.00,1,2/9/2011 9:26,25.00,,United Kingdom,


Los códigos gift son pocos casos, y todos son productos válidos. Entonces, estas filas se pasarán tal como están al df_final (y se eliminarán del df de códigos no numéricos para ir filtrando)


In [49]:
# Append the gift-voucher rows, unchanged, to df_Final.
df_Final = pd.concat(
    [
        df_Final,
        df_codigos_no_numericos_single.loc[
            lambda d: d['StockCode'].str.contains(r'^gift', case=False, na=False)
        ],
    ]
)

In [50]:
# Keep every row whose StockCode does NOT start with "gift" (case-insensitive).
# .copy() detaches the result from the frame it was filtered from, so that the
# later `.loc[...] = value` assignments on this frame write to a standalone
# object instead of a slice (avoids SettingWithCopyWarning).
df_codigos_no_numericos_single = df_codigos_no_numericos_single[
    ~df_codigos_no_numericos_single['StockCode'].str.contains(r'^gift', case=False, na=False)
].copy()
df_codigos_no_numericos_single

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
0,536370,POST,POSTAGE,3,1/12/2010 8:45,18.00,12583.0,France,
1,C536379,D,Discount,-1,1/12/2010 9:41,27.50,14527.0,United Kingdom,
2,536403,POST,POSTAGE,1,1/12/2010 11:27,15.00,12791.0,Netherlands,
3,536527,POST,POSTAGE,1,1/12/2010 13:04,18.00,12662.0,Germany,
4,536540,C2,CARRIAGE,1,1/12/2010 14:05,50.00,14911.0,EIRE,
...,...,...,...,...,...,...,...,...,...
2967,581498,DOT,DOTCOM POSTAGE,1,9/12/2011 10:26,1714.17,,United Kingdom,
2968,C581499,M,Manual,-1,9/12/2011 10:28,224.69,15498.0,United Kingdom,
2969,581570,POST,POSTAGE,1,9/12/2011 11:59,18.00,12662.0,Germany,
2970,581574,POST,POSTAGE,2,9/12/2011 12:09,18.00,12526.0,Germany,


##### DCGS

In [51]:
# Rows whose StockCode starts with "DCGS" (case-insensitive; missing codes never match).
df_codigos_no_numericos_single.loc[
    lambda d: d['StockCode'].str.contains(r'^DCGS', case=False, na=False)
]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
92,538071,DCGS0076,SUNJAR LED NIGHT NIGHT LIGHT,1,9/12/2010 14:09,16.13,,United Kingdom,
163,539631,DCGS0076,SUNJAR LED NIGHT NIGHT LIGHT,2,20/12/2010 15:03,16.13,,United Kingdom,
171,539718,DCGS0070,CAMOUFLAGE DOG COLLAR,1,21/12/2010 13:06,12.72,,United Kingdom,
414,543358,DCGSSBOY,BOYS PARTY BAG,1,7/2/2011 14:04,3.29,,United Kingdom,
415,543358,DCGSSGIRL,GIRLS PARTY BAG,3,7/2/2011 14:04,3.29,,United Kingdom,
...,...,...,...,...,...,...,...,...,...
1945,564825,DCGSSGIRL,GIRLS PARTY BAG,1,30/8/2011 12:26,3.29,,United Kingdom,
2196,568716,DCGSSBOY,BOYS PARTY BAG,2,28/9/2011 16:13,3.29,,United Kingdom,
2406,571931,DCGSSGIRL,GIRLS PARTY BAG,1,19/10/2011 16:59,3.29,,United Kingdom,
2699,576840,DCGSSGIRL,GIRLS PARTY BAG,1,16/11/2011 15:23,3.29,,United Kingdom,


Se etiquetan los registros que comienzan por "DCGS" y se cargan al DF Final

In [52]:
# Tag every row whose StockCode starts with "DCGS".
# The mask is the same case-insensitive, NaN-safe pattern used to preview these
# rows; a plain str.startswith('DCGS') would skip lower-case codes and would
# yield NaN (an invalid boolean mask) for missing StockCode values.
df_codigos_no_numericos_single.loc[
    df_codigos_no_numericos_single['StockCode'].str.contains(r'^DCGS', case=False, na=False),
    'RecordNote',
] = 'InternalCode_DCGS'

In [53]:
# Append the rows tagged 'InternalCode_DCGS' to df_Final.
df_Final = pd.concat(
    [
        df_Final,
        df_codigos_no_numericos_single.loc[
            lambda d: d['RecordNote'].eq('InternalCode_DCGS')
        ],
    ]
)

In [54]:
# Keep only the rows that have not been tagged yet (RecordNote still missing).
df_codigos_no_numericos_single = df_codigos_no_numericos_single.loc[
    lambda d: d['RecordNote'].isna()
]
df_codigos_no_numericos_single

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
0,536370,POST,POSTAGE,3,1/12/2010 8:45,18.00,12583.0,France,
1,C536379,D,Discount,-1,1/12/2010 9:41,27.50,14527.0,United Kingdom,
2,536403,POST,POSTAGE,1,1/12/2010 11:27,15.00,12791.0,Netherlands,
3,536527,POST,POSTAGE,1,1/12/2010 13:04,18.00,12662.0,Germany,
4,536540,C2,CARRIAGE,1,1/12/2010 14:05,50.00,14911.0,EIRE,
...,...,...,...,...,...,...,...,...,...
2967,581498,DOT,DOTCOM POSTAGE,1,9/12/2011 10:26,1714.17,,United Kingdom,
2968,C581499,M,Manual,-1,9/12/2011 10:28,224.69,15498.0,United Kingdom,
2969,581570,POST,POSTAGE,1,9/12/2011 11:59,18.00,12662.0,Germany,
2970,581574,POST,POSTAGE,2,9/12/2011 12:09,18.00,12526.0,Germany,


##### POST

El código POST define líneas que no son ítems físicos, sino gastos de envío

Existen dos códigos, POST (envíos físicos) y DOT (envíos de compras web)

Separo las líneas con código POST o DOT, y las excluyo del DF original

In [55]:
# Independent copy of the rows whose StockCode starts with "POST" (case-insensitive).
df_post = df_codigos_no_numericos_single.loc[
    lambda d: d['StockCode'].str.contains(r'^POST', case=False, na=False)
].copy()
df_post

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
0,536370,POST,POSTAGE,3,1/12/2010 8:45,18.0,12583.0,France,
2,536403,POST,POSTAGE,1,1/12/2010 11:27,15.0,12791.0,Netherlands,
3,536527,POST,POSTAGE,1,1/12/2010 13:04,18.0,12662.0,Germany,
10,536840,POST,POSTAGE,1,2/12/2010 18:27,18.0,12738.0,Germany,
11,536852,POST,POSTAGE,1,3/12/2010 9:51,18.0,12686.0,France,
...,...,...,...,...,...,...,...,...,...
2965,581493,POST,POSTAGE,1,9/12/2011 10:10,15.0,12423.0,Belgium,
2966,581494,POST,POSTAGE,2,9/12/2011 10:13,18.0,12518.0,Germany,
2969,581570,POST,POSTAGE,1,9/12/2011 11:59,18.0,12662.0,Germany,
2970,581574,POST,POSTAGE,2,9/12/2011 12:09,18.0,12526.0,Germany,


In [56]:
# Extend df_post with the rows whose StockCode starts with "DOT" (case-insensitive).
df_post = pd.concat(
    [
        df_post,
        df_codigos_no_numericos_single.loc[
            lambda d: d['StockCode'].str.contains(r'^DOT', case=False, na=False)
        ],
    ]
)
df_post

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
0,536370,POST,POSTAGE,3,1/12/2010 8:45,18.00,12583.0,France,
2,536403,POST,POSTAGE,1,1/12/2010 11:27,15.00,12791.0,Netherlands,
3,536527,POST,POSTAGE,1,1/12/2010 13:04,18.00,12662.0,Germany,
10,536840,POST,POSTAGE,1,2/12/2010 18:27,18.00,12738.0,Germany,
11,536852,POST,POSTAGE,1,3/12/2010 9:51,18.00,12686.0,France,
...,...,...,...,...,...,...,...,...,...
2957,581219,DOT,DOTCOM POSTAGE,1,8/12/2011 9:28,1008.96,,United Kingdom,
2959,581238,DOT,DOTCOM POSTAGE,1,8/12/2011 10:53,1683.75,,United Kingdom,
2963,581439,DOT,DOTCOM POSTAGE,1,8/12/2011 16:30,938.59,,United Kingdom,
2964,581492,DOT,DOTCOM POSTAGE,1,9/12/2011 10:03,933.17,,United Kingdom,


In [57]:
# Remove the POST / DOT rows that were copied into df_post.
# The match is case-insensitive and NaN-safe, exactly like the selection used to
# build df_post, so the removed rows are the same rows that were extracted.
# A case-sensitive match would leave lower-case codes behind (duplicating them),
# and a missing StockCode would make the negated mask invalid.
# The group is non-capturing so pandas does not warn about match groups.
df_codigos_no_numericos_single = df_codigos_no_numericos_single.loc[
    lambda d: ~d["StockCode"].str.contains(r'^(?:POST|DOT)', case=False, na=False)
]
df_codigos_no_numericos_single

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
1,C536379,D,Discount,-1,1/12/2010 9:41,27.50,14527.0,United Kingdom,
4,536540,C2,CARRIAGE,1,1/12/2010 14:05,50.00,14911.0,EIRE,
6,536569,M,Manual,1,1/12/2010 15:35,1.25,16274.0,United Kingdom,
7,536569,M,Manual,1,1/12/2010 15:35,18.95,16274.0,United Kingdom,
9,536779,BANK CHARGES,Bank Charges,1,2/12/2010 15:08,15.00,15823.0,United Kingdom,
...,...,...,...,...,...,...,...,...,...
2946,C581009,M,Manual,-1,7/12/2011 9:15,125.00,16971.0,United Kingdom,
2948,581127,BANK CHARGES,Bank Charges,1,7/12/2011 12:45,15.00,16271.0,United Kingdom,
2949,C581145,M,Manual,-1,7/12/2011 13:48,9.95,17490.0,United Kingdom,
2962,581405,M,Manual,3,8/12/2011 13:50,0.42,13521.0,United Kingdom,


In [58]:
# Postage rows with a quantity of exactly zero.
df_post.loc[lambda d: d["Quantity"].eq(0)]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote


In [59]:
# Postage rows with a unit price of exactly zero.
df_post.loc[lambda d: d["UnitPrice"].eq(0)]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
244,540699,POST,POSTAGE,1000,11/1/2011 9:32,0.0,,United Kingdom,
1239,554857,POST,POSTAGE,800,27/5/2011 10:08,0.0,,United Kingdom,
2001,565556,POST,POSTAGE,750,5/9/2011 12:14,0.0,,United Kingdom,
2612,575505,POST,POSTAGE,800,10/11/2011 10:29,0.0,,United Kingdom,
799,547966,DOT,DOTCOM POSTAGE,1000,28/3/2011 15:49,0.0,,United Kingdom,
1039,552230,DOT,DOTCOM POSTAGE,1,6/5/2011 15:43,0.0,,United Kingdom,
2902,580366,DOT,DOTCOM POSTAGE,1,2/12/2011 16:38,0.0,,United Kingdom,


In [60]:
# Postage rows with a negative quantity.
df_post.loc[lambda d: d["Quantity"].lt(0)]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
48,C537414,POST,POSTAGE,-1,6/12/2010 15:09,4.41,16861.0,United Kingdom,
129,C539063,POST,POSTAGE,-1,15/12/2010 16:50,12.34,15107.0,United Kingdom,
130,C539073,POST,POSTAGE,-1,15/12/2010 17:08,3.50,12971.0,United Kingdom,
143,C539409,POST,POSTAGE,-1,17/12/2010 12:58,18.00,12720.0,Germany,
170,C539712,POST,POSTAGE,-1,21/12/2010 12:40,5.75,15602.0,United Kingdom,
...,...,...,...,...,...,...,...,...,...
2852,C579366,POST,POSTAGE,-1,29/11/2011 11:56,9.72,14205.0,United Kingdom,
2862,C579532,POST,POSTAGE,-1,30/11/2011 9:21,18.00,12494.0,France,
2897,C580161,POST,POSTAGE,-2,2/12/2011 10:49,18.00,12700.0,France,
2938,C580957,POST,POSTAGE,-1,6/12/2011 14:23,4.50,12839.0,United Kingdom,


In [61]:
# Postage rows with no CustomerID.
df_post.loc[lambda d: d["CustomerID"].isna()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
216,C540266,POST,POSTAGE,-1,6/1/2011 11:05,35.09,,United Kingdom,
244,540699,POST,POSTAGE,1000,11/1/2011 9:32,0.00,,United Kingdom,
306,541607,POST,POSTAGE,1,20/1/2011 9:53,29.43,,United Kingdom,
362,C542540,POST,POSTAGE,-1,28/1/2011 14:20,4.41,,United Kingdom,
649,546303,POST,POSTAGE,1,10/3/2011 15:30,8.62,,United Kingdom,
...,...,...,...,...,...,...,...,...,...
2957,581219,DOT,DOTCOM POSTAGE,1,8/12/2011 9:28,1008.96,,United Kingdom,
2959,581238,DOT,DOTCOM POSTAGE,1,8/12/2011 10:53,1683.75,,United Kingdom,
2963,581439,DOT,DOTCOM POSTAGE,1,8/12/2011 16:30,938.59,,United Kingdom,
2964,581492,DOT,DOTCOM POSTAGE,1,9/12/2011 10:03,933.17,,United Kingdom,


In [62]:
# Tag zero-priced postage rows as errors, then display the tagged rows.
df_post.loc[df_post["UnitPrice"].eq(0), 'RecordNote'] = "Postage error"
df_post.loc[lambda d: d["RecordNote"].eq('Postage error')]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
244,540699,POST,POSTAGE,1000,11/1/2011 9:32,0.0,,United Kingdom,Postage error
1239,554857,POST,POSTAGE,800,27/5/2011 10:08,0.0,,United Kingdom,Postage error
2001,565556,POST,POSTAGE,750,5/9/2011 12:14,0.0,,United Kingdom,Postage error
2612,575505,POST,POSTAGE,800,10/11/2011 10:29,0.0,,United Kingdom,Postage error
799,547966,DOT,DOTCOM POSTAGE,1000,28/3/2011 15:49,0.0,,United Kingdom,Postage error
1039,552230,DOT,DOTCOM POSTAGE,1,6/5/2011 15:43,0.0,,United Kingdom,Postage error
2902,580366,DOT,DOTCOM POSTAGE,1,2/12/2011 16:38,0.0,,United Kingdom,Postage error


Existen casos donde el precio unitario es cero, y el cliente es null. Estos casos se descartarán como errores

In [63]:
# Append the rows tagged 'Postage error' to the discard frame.
df_discard = pd.concat(
    [
        df_discard,
        df_post.loc[lambda d: d['RecordNote'].eq('Postage error')],
    ]
)

In [64]:
# Keep only the postage rows that are still untagged (RecordNote missing).
# .copy() makes the result a standalone frame, so the following
# `df_post.loc[...] = value` assignment does not write into a slice
# (avoids SettingWithCopyWarning).
df_post = df_post[df_post['RecordNote'].isna()].copy()
df_post

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
0,536370,POST,POSTAGE,3,1/12/2010 8:45,18.00,12583.0,France,
2,536403,POST,POSTAGE,1,1/12/2010 11:27,15.00,12791.0,Netherlands,
3,536527,POST,POSTAGE,1,1/12/2010 13:04,18.00,12662.0,Germany,
10,536840,POST,POSTAGE,1,2/12/2010 18:27,18.00,12738.0,Germany,
11,536852,POST,POSTAGE,1,3/12/2010 9:51,18.00,12686.0,France,
...,...,...,...,...,...,...,...,...,...
2957,581219,DOT,DOTCOM POSTAGE,1,8/12/2011 9:28,1008.96,,United Kingdom,
2959,581238,DOT,DOTCOM POSTAGE,1,8/12/2011 10:53,1683.75,,United Kingdom,
2963,581439,DOT,DOTCOM POSTAGE,1,8/12/2011 16:30,938.59,,United Kingdom,
2964,581492,DOT,DOTCOM POSTAGE,1,9/12/2011 10:03,933.17,,United Kingdom,


Los casos de código "POST" y Quantity negativa se asume que son devoluciones por concepto de envío
Como tales, se carga el RecordNote y se agregan al df_Final

In [65]:
# Postage rows with a negative quantity are tagged as refunds.
df_post.loc[df_post["Quantity"].lt(0), 'RecordNote'] = "Postage refund"

In [66]:
# Append the rows tagged 'Postage refund' to df_Final.
df_Final = pd.concat(
    [
        df_Final,
        df_post.loc[lambda d: d['RecordNote'].eq('Postage refund')],
    ]
)

In [67]:
# Keep only the postage rows that are still untagged (RecordNote missing).
# .copy() makes the result a standalone frame, so the later
# `df_post.loc[...] = value` assignment does not write into a slice
# (avoids SettingWithCopyWarning).
df_post = df_post[df_post['RecordNote'].isna()].copy()
df_post

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
0,536370,POST,POSTAGE,3,1/12/2010 8:45,18.00,12583.0,France,
2,536403,POST,POSTAGE,1,1/12/2010 11:27,15.00,12791.0,Netherlands,
3,536527,POST,POSTAGE,1,1/12/2010 13:04,18.00,12662.0,Germany,
10,536840,POST,POSTAGE,1,2/12/2010 18:27,18.00,12738.0,Germany,
11,536852,POST,POSTAGE,1,3/12/2010 9:51,18.00,12686.0,France,
...,...,...,...,...,...,...,...,...,...
2957,581219,DOT,DOTCOM POSTAGE,1,8/12/2011 9:28,1008.96,,United Kingdom,
2959,581238,DOT,DOTCOM POSTAGE,1,8/12/2011 10:53,1683.75,,United Kingdom,
2963,581439,DOT,DOTCOM POSTAGE,1,8/12/2011 16:30,938.59,,United Kingdom,
2964,581492,DOT,DOTCOM POSTAGE,1,9/12/2011 10:03,933.17,,United Kingdom,


In [68]:
# Remaining postage rows with no CustomerID.
df_post.loc[lambda d: d["CustomerID"].isna()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
306,541607,POST,POSTAGE,1,20/1/2011 9:53,29.43,,United Kingdom,
649,546303,POST,POSTAGE,1,10/3/2011 15:30,8.62,,United Kingdom,
812,548219,POST,POSTAGE,1,30/3/2011 9:46,3.95,,United Kingdom,
1300,555869,POST,POSTAGE,1,7/6/2011 14:54,5.15,,United Kingdom,
1356,556514,POST,POSTAGE,1,13/6/2011 10:45,3.30,,United Kingdom,
...,...,...,...,...,...,...,...,...,...
2957,581219,DOT,DOTCOM POSTAGE,1,8/12/2011 9:28,1008.96,,United Kingdom,
2959,581238,DOT,DOTCOM POSTAGE,1,8/12/2011 10:53,1683.75,,United Kingdom,
2963,581439,DOT,DOTCOM POSTAGE,1,8/12/2011 16:30,938.59,,United Kingdom,
2964,581492,DOT,DOTCOM POSTAGE,1,9/12/2011 10:03,933.17,,United Kingdom,


A los casos de customerID en null se les agregará un comentario y se sumarán al df_Final
Esto es porque estos registros tienen valores de UnitPrice y Quantity válidos, por lo que podrían ser de utilidad en un reporte

In [69]:
# Postage rows with no CustomerID get an explanatory note.
df_post.loc[df_post["CustomerID"].isnull(), 'RecordNote'] = "Postage without customer"

In [70]:
# Append the rows tagged 'Postage without customer' to df_Final.
df_Final = pd.concat(
    [
        df_Final,
        df_post.loc[lambda d: d['RecordNote'].eq('Postage without customer')],
    ]
)

In [71]:
# Independent copy of the postage rows that are still untagged.
df_post = df_post.loc[lambda d: d['RecordNote'].isna()].copy()
df_post

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
0,536370,POST,POSTAGE,3,1/12/2010 8:45,18.00,12583.0,France,
2,536403,POST,POSTAGE,1,1/12/2010 11:27,15.00,12791.0,Netherlands,
3,536527,POST,POSTAGE,1,1/12/2010 13:04,18.00,12662.0,Germany,
10,536840,POST,POSTAGE,1,2/12/2010 18:27,18.00,12738.0,Germany,
11,536852,POST,POSTAGE,1,3/12/2010 9:51,18.00,12686.0,France,
...,...,...,...,...,...,...,...,...,...
2618,575607,DOT,DOTCOM POSTAGE,1,10/11/2011 12:37,908.16,14096.0,United Kingdom,
2670,576339,DOT,DOTCOM POSTAGE,1,14/11/2011 15:27,1500.36,14096.0,United Kingdom,
2801,578270,DOT,DOTCOM POSTAGE,1,23/11/2011 13:39,1270.06,14096.0,United Kingdom,
2844,579196,DOT,DOTCOM POSTAGE,1,28/11/2011 15:54,1526.76,14096.0,United Kingdom,


Finalmente, el resto de los registros POST son costos de envío válidos. Por lo tanto, se agrega un comentario para trazabilidad y se suman al df_Final

In [72]:
# Every row still in df_post gets the same note: the whole RecordNote column is overwritten.
df_post['RecordNote'] = "Postage charge"

In [73]:
# Append all remaining df_post rows to df_Final.
df_Final = pd.concat(
    objs=[
        df_Final,
        df_post,
    ]
)

#### Bank Charges

Quedan 950 filas para revisar. Existen algunos registros con el stockCode "BANK CHARGES" que se pueden analizar y extraer
Los montos positivos son reembolsos o devoluciones de comisiones
Los montos negativos son cargos por comisiones bancarias

In [74]:
# Independent copy of the rows whose StockCode is exactly "BANK CHARGES".
df_Bank = df_codigos_no_numericos_single.loc[
    lambda d: d["StockCode"].eq("BANK CHARGES")
].copy()
df_Bank

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
9,536779,BANK CHARGES,Bank Charges,1,2/12/2010 15:08,15.00,15823.0,United Kingdom,
54,C537572,BANK CHARGES,Bank Charges,-1,7/12/2010 12:00,95.38,,United Kingdom,
111,C538680,BANK CHARGES,Bank Charges,-1,13/12/2010 17:10,966.92,,United Kingdom,
295,541505,BANK CHARGES,Bank Charges,1,18/1/2011 15:58,15.00,15939.0,United Kingdom,
310,C541653,BANK CHARGES,Bank Charges,-1,20/1/2011 11:50,1050.15,,United Kingdom,
...,...,...,...,...,...,...,...,...,...
2568,574546,BANK CHARGES,Bank Charges,1,4/11/2011 14:59,15.00,13651.0,United Kingdom,
2746,C577342,BANK CHARGES,Bank Charges,-1,18/11/2011 15:11,490.06,,United Kingdom,
2747,C577343,BANK CHARGES,Bank Charges,-1,18/11/2011 15:13,27.21,,United Kingdom,
2837,579137,BANK CHARGES,Bank Charges,1,28/11/2011 12:51,15.00,14704.0,United Kingdom,


In [75]:
# Remove the rows that were extracted into df_Bank.
# An exact comparison mirrors the `== "BANK CHARGES"` selection, so only the
# extracted rows are dropped; a substring match could also drop rows whose code
# merely contains that text, and it fails on missing StockCode values.
df_codigos_no_numericos_single = df_codigos_no_numericos_single[
    df_codigos_no_numericos_single["StockCode"] != "BANK CHARGES"
]
df_codigos_no_numericos_single

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
1,C536379,D,Discount,-1,1/12/2010 9:41,27.50,14527.0,United Kingdom,
4,536540,C2,CARRIAGE,1,1/12/2010 14:05,50.00,14911.0,EIRE,
6,536569,M,Manual,1,1/12/2010 15:35,1.25,16274.0,United Kingdom,
7,536569,M,Manual,1,1/12/2010 15:35,18.95,16274.0,United Kingdom,
16,536865,M,Manual,1,3/12/2010 11:28,2.55,,United Kingdom,
...,...,...,...,...,...,...,...,...,...
2937,580956,M,Manual,4,6/12/2011 14:23,1.25,17841.0,United Kingdom,
2946,C581009,M,Manual,-1,7/12/2011 9:15,125.00,16971.0,United Kingdom,
2949,C581145,M,Manual,-1,7/12/2011 13:48,9.95,17490.0,United Kingdom,
2962,581405,M,Manual,3,8/12/2011 13:50,0.42,13521.0,United Kingdom,


In [76]:
# Bank-charge rows with no CustomerID receive a RecordNote.
df_Bank.loc[df_Bank["CustomerID"].isnull(), 'RecordNote'] = "Bank Charge adjustment"

In [77]:
# Bank-charge rows that do have a CustomerID.
df_Bank.loc[lambda d: d["CustomerID"].notna()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
9,536779,BANK CHARGES,Bank Charges,1,2/12/2010 15:08,15.0,15823.0,United Kingdom,
295,541505,BANK CHARGES,Bank Charges,1,18/1/2011 15:58,15.0,15939.0,United Kingdom,
889,549717,BANK CHARGES,Bank Charges,1,11/4/2011 14:56,15.0,14606.0,United Kingdom,
1025,551945,BANK CHARGES,Bank Charges,1,5/5/2011 11:09,15.0,16714.0,United Kingdom,
2007,565735,BANK CHARGES,Bank Charges,1,6/9/2011 12:25,15.0,16904.0,United Kingdom,
...,...,...,...,...,...,...,...,...,...
2402,571900,BANK CHARGES,Bank Charges,1,19/10/2011 14:26,15.0,13263.0,United Kingdom,
2527,573586,BANK CHARGES,Bank Charges,1,31/10/2011 14:48,15.0,14704.0,United Kingdom,
2568,574546,BANK CHARGES,Bank Charges,1,4/11/2011 14:59,15.0,13651.0,United Kingdom,
2837,579137,BANK CHARGES,Bank Charges,1,28/11/2011 12:51,15.0,14704.0,United Kingdom,


In [78]:
# Bank-charge rows that have a CustomerID AND a strictly positive unit price.
# The comparison must be parenthesized: `&` binds tighter than `>`, so
# `a & b > 0` is evaluated as `(a & b) > 0`, which does not test the sign
# of the price.
df_Bank[df_Bank["CustomerID"].notna() & (df_Bank["UnitPrice"] > 0)]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
9,536779,BANK CHARGES,Bank Charges,1,2/12/2010 15:08,15.0,15823.0,United Kingdom,
295,541505,BANK CHARGES,Bank Charges,1,18/1/2011 15:58,15.0,15939.0,United Kingdom,
889,549717,BANK CHARGES,Bank Charges,1,11/4/2011 14:56,15.0,14606.0,United Kingdom,
1025,551945,BANK CHARGES,Bank Charges,1,5/5/2011 11:09,15.0,16714.0,United Kingdom,
2007,565735,BANK CHARGES,Bank Charges,1,6/9/2011 12:25,15.0,16904.0,United Kingdom,
...,...,...,...,...,...,...,...,...,...
2402,571900,BANK CHARGES,Bank Charges,1,19/10/2011 14:26,15.0,13263.0,United Kingdom,
2527,573586,BANK CHARGES,Bank Charges,1,31/10/2011 14:48,15.0,14704.0,United Kingdom,
2568,574546,BANK CHARGES,Bank Charges,1,4/11/2011 14:59,15.0,13651.0,United Kingdom,
2837,579137,BANK CHARGES,Bank Charges,1,28/11/2011 12:51,15.0,14704.0,United Kingdom,


In [79]:
# Per the preceding check, the rows with a non-null customer all carry positive prices,
# so they are tagged here and later appended to df_Final.
df_Bank.loc[df_Bank["CustomerID"].notna(), 'RecordNote'] = "Bank Charge related to customer"

In [80]:
# Any bank-charge rows still without a RecordNote.
df_Bank.loc[lambda d: d["RecordNote"].isna()]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote


In [81]:
# Append every df_Bank row to df_Final.
df_Final = pd.concat(
    objs=[
        df_Final,
        df_Bank,
    ]
)

#### Manual

In [84]:
# Independent copy of the rows whose StockCode is exactly "M".
df_Manual = df_codigos_no_numericos_single.loc[
    lambda d: d["StockCode"].eq("M")
].copy()
df_Manual

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
6,536569,M,Manual,1,1/12/2010 15:35,1.25,16274.0,United Kingdom,
7,536569,M,Manual,1,1/12/2010 15:35,18.95,16274.0,United Kingdom,
16,536865,M,Manual,1,3/12/2010 11:28,2.55,,United Kingdom,
21,536981,M,Manual,2,3/12/2010 14:26,0.85,14723.0,United Kingdom,
27,537077,M,Manual,12,5/12/2010 11:59,0.42,17062.0,United Kingdom,
...,...,...,...,...,...,...,...,...,...
2937,580956,M,Manual,4,6/12/2011 14:23,1.25,17841.0,United Kingdom,
2946,C581009,M,Manual,-1,7/12/2011 9:15,125.00,16971.0,United Kingdom,
2949,C581145,M,Manual,-1,7/12/2011 13:48,9.95,17490.0,United Kingdom,
2962,581405,M,Manual,3,8/12/2011 13:50,0.42,13521.0,United Kingdom,


In [85]:
# Display the rows still pending classification (nothing is modified here).
df_codigos_no_numericos_single

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
1,C536379,D,Discount,-1,1/12/2010 9:41,27.50,14527.0,United Kingdom,
4,536540,C2,CARRIAGE,1,1/12/2010 14:05,50.00,14911.0,EIRE,
6,536569,M,Manual,1,1/12/2010 15:35,1.25,16274.0,United Kingdom,
7,536569,M,Manual,1,1/12/2010 15:35,18.95,16274.0,United Kingdom,
16,536865,M,Manual,1,3/12/2010 11:28,2.55,,United Kingdom,
...,...,...,...,...,...,...,...,...,...
2937,580956,M,Manual,4,6/12/2011 14:23,1.25,17841.0,United Kingdom,
2946,C581009,M,Manual,-1,7/12/2011 9:15,125.00,16971.0,United Kingdom,
2949,C581145,M,Manual,-1,7/12/2011 13:48,9.95,17490.0,United Kingdom,
2962,581405,M,Manual,3,8/12/2011 13:50,0.42,13521.0,United Kingdom,


In [86]:
# Remove the rows that were extracted into df_Manual.
# The filter uses the same criterion as the extraction (StockCode == "M").
# Filtering on Description instead would silently drop rows that mention
# "Manual" under a different StockCode (they would end up in neither frame),
# would keep "M" rows with a different description (duplicating them), and
# would fail on missing Description values.
df_codigos_no_numericos_single = df_codigos_no_numericos_single[
    df_codigos_no_numericos_single["StockCode"] != "M"
]
df_codigos_no_numericos_single

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
1,C536379,D,Discount,-1,1/12/2010 9:41,27.50,14527.0,United Kingdom,
4,536540,C2,CARRIAGE,1,1/12/2010 14:05,50.00,14911.0,EIRE,
31,C537164,D,Discount,-1,5/12/2010 13:21,29.29,14527.0,United Kingdom,
43,537368,C2,CARRIAGE,1,6/12/2010 12:40,50.00,14911.0,EIRE,
45,537378,C2,CARRIAGE,1,6/12/2010 13:06,50.00,14911.0,EIRE,
...,...,...,...,...,...,...,...,...,...
2890,580127,C2,CARRIAGE,1,1/12/2011 17:51,50.00,14911.0,EIRE,
2913,580555,C2,CARRIAGE,1,5/12/2011 10:18,50.00,14911.0,EIRE,
2914,C580604,AMAZONFEE,AMAZON FEE,-1,5/12/2011 11:35,11586.50,,United Kingdom,
2915,C580605,AMAZONFEE,AMAZON FEE,-1,5/12/2011 11:36,17836.46,,United Kingdom,


In [89]:
# Every row in df_Manual gets the same note: the whole RecordNote column is overwritten.
df_Manual['RecordNote'] = "Manual entry - likely stock adjustment or return"

In [90]:
# Append every df_Manual row to df_Final.
df_Final = pd.concat(
    objs=[
        df_Final,
        df_Manual,
    ]
)

#### Discount

In [91]:
# Independent copy of the rows whose StockCode is exactly "D".
df_Discount = df_codigos_no_numericos_single.loc[
    lambda d: d["StockCode"].eq("D")
].copy()
df_Discount

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,RecordNote
1,C536379,D,Discount,-1,1/12/2010 9:41,27.50,14527.0,United Kingdom,
31,C537164,D,Discount,-1,5/12/2010 13:21,29.29,14527.0,United Kingdom,
58,C537597,D,Discount,-1,7/12/2010 12:34,281.00,15498.0,United Kingdom,
81,C537857,D,Discount,-1,8/12/2010 16:00,267.12,17340.0,United Kingdom,
124,C538897,D,Discount,-1,15/12/2010 9:14,5.76,16422.0,United Kingdom,
...,...,...,...,...,...,...,...,...,...
2736,C577227,D,Discount,-1,18/11/2011 12:06,19.82,14527.0,United Kingdom,
2737,C577227,D,Discount,-1,18/11/2011 12:06,16.76,14527.0,United Kingdom,
2800,C578239,D,Discount,-1,23/11/2011 12:29,26.33,14912.0,Italy,
2877,C579884,D,Discount,-1,30/11/2011 17:34,20.53,14527.0,United Kingdom,
