# Limpieza y transformación del Dataset

In [36]:
import pandas as pd
# Cargamos el dataset
df = pd.read_csv('../../data/dataset_original.csv', encoding='ISO-8859-1')
# Mostramos las primeras filas
df.head(4)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


************************** TRATAR VALORES ERRONEAOS ****************************


In [37]:
# Las facturas que tienen UnitPrice a 0, no aportan nada. FUERA!
df = df[df['UnitPrice'] != 0]
df[df['UnitPrice']==0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


In [38]:
# Quitar y guardar los de Quantity y UnitPrice negativos
# tratamiento de negativos, los guardo en otro df, y me los quito del principal
mask = df[['UnitPrice']] < 0
#df_negativos = df[mask.any(axis=1)]
df = df[~mask.any(axis=1)]

print("\nNumero de Quantity y UnitPrice negativos antes:")
print(mask.sum())

print("\nNumero de Quantity y UnitPrice negativos ahora:")
print(df[df[['UnitPrice']] < 0].sum())


Numero de Quantity y UnitPrice negativos antes:
UnitPrice    2
dtype: int64

Numero de Quantity y UnitPrice negativos ahora:
InvoiceNo        0
StockCode        0
Description      0
Quantity       0.0
InvoiceDate      0
UnitPrice      0.0
CustomerID     0.0
Country          0
dtype: object


************************** TRATAR LOS DUPLICADOS ****************************

In [39]:
print("\nNumero de duplicados antes:")
print(df.duplicated().sum())

df = df.drop_duplicates()

print("\nNumero de duplicados despues:")
print(df.duplicated().sum())



Numero de duplicados antes:
5263

Numero de duplicados despues:
0


*************** NULOS EN CUSTOMERID, DESCRIPTION *********************
Son los únicos que tienen nulos

In [40]:
# tratamiento CustomerID null
print("\nAntes:")
print(df['CustomerID'].isnull().sum())

# agrupamos por InvoiceNo , y usamos ffill y bfill para rellenar, los valores nulls que no se rellenen les da un valor nuevo unico
#* Si no se usa transform, se inventa el valor si no lo encuentra
df['CustomerID'] = df.groupby(['InvoiceNo'])['CustomerID'].ffill().bfill()

#! Cuando se usa transform, si no encuentra un valor, lo deja en nulo.
#df['CustomerID'] = df.groupby(['InvoiceNo'])['CustomerID'].transform(lambda group: group.ffill().bfill())

# df['CustomerID'] = df['CustomerID'].apply(lambda x: np.random.randint(10000, 99999) if pd.isnull(x) else x)

print("\nDespues:")
print(df['CustomerID'].isnull().sum())


Antes:
132565

Despues:
0


In [41]:
# tratamiento Description null
print("\nAntes:")
print(df['Description'].isnull().sum())

# agrupamos por stockcode, y usamos ffill para rellenar con el anterior valor dentro del grupo, para los primer valor en null tambien uso bfill
df['Description'] = df.groupby(['StockCode'])['Description'].transform(lambda group: group.ffill().bfill())
df['Description'] = df['Description'].fillna('No description')

print("\nDespues:")
print(df['Description'].isnull().sum())



Antes:
0

Despues:
0


In [42]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# sort y quitar datos para la prediccion
df = df.sort_values(by='InvoiceDate')

fecha_limite = pd.to_datetime("11/08/2011 00:00", format="%m/%d/%Y %H:%M")

df_prediction = df[df['InvoiceDate'] > fecha_limite]
df = df[df['InvoiceDate']<=fecha_limite]

df_prediction

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
447063,574943,21108,FAIRY CAKE FLANNEL ASSORTED COLOUR,36,2011-11-08 07:52:00,0.79,13026.0,United Kingdom
447065,574943,22268,DECORATION SITTING BUNNY,48,2011-11-08 07:52:00,0.19,13026.0,United Kingdom
447064,574943,17091J,VANILLA INCENSE IN TIN,36,2011-11-08 07:52:00,0.38,13026.0,United Kingdom
447062,574943,21992,VINTAGE PAISLEY STATIONERY SET,12,2011-11-08 07:52:00,1.25,13026.0,United Kingdom
447058,574943,17012F,ORIGAMI SANDLEWOOD INCENSE/CAND SET,24,2011-11-08 07:52:00,0.85,13026.0,United Kingdom
...,...,...,...,...,...,...,...,...
541896,581587,22555,PLASTERS IN TIN STRONGMAN,12,2011-12-09 12:50:00,1.65,12680.0,France
541895,581587,22556,PLASTERS IN TIN CIRCUS PARADE,12,2011-12-09 12:50:00,1.65,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
541900,581587,22730,ALARM CLOCK BAKELIKE IVORY,4,2011-12-09 12:50:00,3.75,12680.0,France


In [43]:
# para los outliers, usamos el rango intercuartil (IQR)
total_sales = df['Quantity'] * df['UnitPrice']

# para los outliers, usamos el rango intercuartil (IQR)
q1 = total_sales.quantile(0.25)
q3 = total_sales.quantile(0.75)
iqr = q3 - q1

#ajuste de rango para 90%//95%
rango=2.5
limiteInferior = q1 - (rango * iqr)
limiteSuperior = q3 + (rango * iqr)

outliersBoolean = (total_sales >= limiteInferior) & (total_sales <= limiteSuperior)
df_filtrado = df[outliersBoolean]

# porcentaje filtarado
print(f"Porcentaje de datos retenidos: {len(df_filtrado) / len(df) * 100:.2f}%")

df = df_filtrado


Porcentaje de datos retenidos: 94.19%


# Transformación

In [44]:
df['TotalSales'] = df['Quantity'] * df['UnitPrice']
df_prediction['TotalSales'] = df_prediction['Quantity'] * df_prediction['UnitPrice']

In [45]:
def separate_date(df: pd.DataFrame):
  df['Year'] = df['InvoiceDate'].dt.year
  df['Month'] = df['InvoiceDate'].dt.month
  df['Day'] = df['InvoiceDate'].dt.day
  df['DayOfWeek'] = df['InvoiceDate'].dt.dayofweek
  # TODO: O por trimestre
  df['Season'] = df['Month'].apply(
    lambda x: 'Winter' if x in [12, 1, 2] else
      ('Spring' if x in [3, 4, 5] else
      ('Summer' if x in [6, 7, 8] else 'Fall'))
  )
  return df


df = separate_date(df)
df_prediction = separate_date(df_prediction)

df_prediction



Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalSales,Year,Month,Day,DayOfWeek,Season
447063,574943,21108,FAIRY CAKE FLANNEL ASSORTED COLOUR,36,2011-11-08 07:52:00,0.79,13026.0,United Kingdom,28.44,2011,11,8,1,Fall
447065,574943,22268,DECORATION SITTING BUNNY,48,2011-11-08 07:52:00,0.19,13026.0,United Kingdom,9.12,2011,11,8,1,Fall
447064,574943,17091J,VANILLA INCENSE IN TIN,36,2011-11-08 07:52:00,0.38,13026.0,United Kingdom,13.68,2011,11,8,1,Fall
447062,574943,21992,VINTAGE PAISLEY STATIONERY SET,12,2011-11-08 07:52:00,1.25,13026.0,United Kingdom,15.00,2011,11,8,1,Fall
447058,574943,17012F,ORIGAMI SANDLEWOOD INCENSE/CAND SET,24,2011-11-08 07:52:00,0.85,13026.0,United Kingdom,20.40,2011,11,8,1,Fall
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541896,581587,22555,PLASTERS IN TIN STRONGMAN,12,2011-12-09 12:50:00,1.65,12680.0,France,19.80,2011,12,9,4,Winter
541895,581587,22556,PLASTERS IN TIN CIRCUS PARADE,12,2011-12-09 12:50:00,1.65,12680.0,France,19.80,2011,12,9,4,Winter
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,16.60,2011,12,9,4,Winter
541900,581587,22730,ALARM CLOCK BAKELIKE IVORY,4,2011-12-09 12:50:00,3.75,12680.0,France,15.00,2011,12,9,4,Winter


In [46]:
df.to_csv('../../data/dataset_training.csv')
df_prediction.to_csv('../../data/dataset_prediction.csv')