# Limpieza y preprocesamiento de datos con pandas

## 1. Cargamos el dataset

In [None]:
import pandas as pd

# Read the raw CSV, decoding it as ISO-8859-1 (Latin-1)
df = pd.read_csv(
    '../data.csv',
    encoding='ISO-8859-1',
)

# Preview the first rows
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


## 2. Exploración inicial de los datos

### Dimensiones del dataset

In [2]:
# Report the (rows, columns) size of the frame as loaded
raw_shape = df.shape
print("\nDimensiones del dataset:", raw_shape)


Dimensiones del dataset: (541909, 8)


### Información sobre columnas y tipos de datos

In [3]:
# Print column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


### Descripción estadística de los datos numéricos

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


### Verificación de valores nulos

In [5]:
# Count the missing values in each column
null_counts = df.isna().sum()
print(null_counts)

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64


## 3. Limpieza y procesado de los datos

### Procesar filas con valores nulos
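En el punto 2 hemos visto que las columnas con valores nulos son "CustomerID" y "Description". Para tratarlos, los valores nulos de "Description" se rellenan con la descripción de otras filas que comparten el mismo "StockCode", y los "CustomerID" nulos se sustituyen por un identificador numérico generado aleatoriamente que no coincide con el de ningún cliente existente.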

In [6]:
import random

# Seed the generator so the synthetic customer IDs are identical on every
# Restart & Run All.
random.seed(42)

# Fill a missing description from other rows of the same product (StockCode).
df['Description'] = df.groupby('StockCode')['Description'].transform(lambda x: x.ffill().bfill())
# A product whose rows all lack a description is still null after the
# group-wise fill, so label those rows explicitly.
df['Description'] = df['Description'].fillna('No description')


def generate_unique_customer_id(current_ids, low=10000, high=99999):
  """Return a random integer ID in [low, high] that is not in ``current_ids``.

  The chosen ID is added to ``current_ids`` before returning, so successive
  calls with the same set never hand out the same ID twice.

  Args:
    current_ids: set of IDs already in use; it is mutated in place.
    low: smallest ID that may be generated (inclusive).
    high: largest ID that may be generated (inclusive).

  Returns:
    The newly reserved integer ID.

  Raises:
    ValueError: if every ID in [low, high] is already taken.
  """
  span = high - low + 1
  # With fewer taken IDs than slots a free slot must exist; only when the set
  # is at least as large as the range is the exact (slower) scan needed.
  if len(current_ids) >= span and all(i in current_ids for i in range(low, high + 1)):
    raise ValueError(f"No free customer ID left in [{low}, {high}]")
  while True:
    new_id = random.randint(low, high)
    if new_id not in current_ids:
      # Reserve the ID so it cannot be returned again.
      current_ids.add(new_id)
      return new_id


existing_ids = set(df['CustomerID'].dropna().astype(int))

# Rows without a customer get one synthetic ID per invoice rather than one per
# row: exact duplicate rows then stay identical (so a later de-duplication can
# still remove them) and far fewer IDs are needed.
missing_customer = df['CustomerID'].isnull()
invoices_without_customer = df.loc[missing_customer, 'InvoiceNo'].unique()

# Keep the five-digit range when it is large enough; otherwise widen it so a
# free ID is guaranteed for every invoice.
id_low = 10000
id_high = max(99999, id_low + len(existing_ids) + len(invoices_without_customer) - 1)

synthetic_id_by_invoice = {
  invoice: generate_unique_customer_id(existing_ids, id_low, id_high)
  for invoice in invoices_without_customer
}
df.loc[missing_customer, 'CustomerID'] = (
  df.loc[missing_customer, 'InvoiceNo'].map(synthetic_id_by_invoice)
)

df.isnull().sum()

### Manejo de Outliers

In [7]:
# Replace Quantity and UnitPrice by their absolute values so no negatives remain.
# NOTE(review): abs() keeps every row and only flips the sign; if negative
# quantities represent returns/cancellations they will now count as sales — confirm.
for numeric_col in ('Quantity', 'UnitPrice'):
    df[numeric_col] = df[numeric_col].abs()

# Sanity check: both counts are expected to be 0 after the step above
print("Valores negativos en 'Quantity':", df['Quantity'].lt(0).sum())
print("Valores negativos en 'UnitPrice':", df['UnitPrice'].lt(0).sum())

Valores negativos en 'Quantity': 0
Valores negativos en 'UnitPrice': 0


### Eliminar duplicados

In [8]:
# Drop rows that are exact copies of an earlier row (the first occurrence is kept)
df = df.drop_duplicates(keep='first')

deduped_shape = df.shape
print("\nDespués de eliminar duplicados, dimensiones del dataset:", deduped_shape)


Después de eliminar duplicados, dimensiones del dataset: (536641, 8)


## 4. Completando los datos 

### Calcular las ventas totales por cada transacción

In [9]:
# Line total for each row: units sold times price per unit
df['TotalSales'] = df['Quantity'].mul(df['UnitPrice'])

### Dividir la fecha en varias columnas

In [10]:
# Parse `InvoiceDate` into a datetime column
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Derive calendar columns: year, month, day of month and day of week
invoice_dt = df['InvoiceDate'].dt
df['Year'] = invoice_dt.year
df['Month'] = invoice_dt.month
df['Day'] = invoice_dt.day
df['DayOfWeek'] = invoice_dt.dayofweek

# Month-number -> season lookup (Dec-Feb Winter, Mar-May Spring,
# Jun-Aug Summer, Sep-Nov Fall)
season_by_month = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall',
}
df['Season'] = df['Month'].map(season_by_month)

## 5. Exploración final de los datos

### Dimensiones del dataset

In [11]:
# Report the (rows, columns) size of the frame after cleaning and feature creation
clean_shape = df.shape
print("\nDimensiones del dataset:", clean_shape)


Dimensiones del dataset: (536641, 13)


### Información sobre columnas y tipos de datos

In [12]:
# Print column names, non-null counts, dtypes and memory usage of the processed frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 536641 entries, 0 to 541908
Data columns (total 13 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    536641 non-null  object        
 1   StockCode    536641 non-null  object        
 2   Description  536641 non-null  object        
 3   Quantity     536641 non-null  int64         
 4   InvoiceDate  536641 non-null  datetime64[ns]
 5   UnitPrice    536641 non-null  float64       
 6   CustomerID   536641 non-null  object        
 7   Country      536641 non-null  object        
 8   TotalSales   536641 non-null  float64       
 9   Year         536641 non-null  int32         
 10  Month        536641 non-null  int32         
 11  Day          536641 non-null  int32         
 12  Season       536641 non-null  object        
dtypes: datetime64[ns](1), float64(2), int32(3), int64(1), object(6)
memory usage: 51.2+ MB


### Descripción estadística de los datos numéricos

In [13]:
# Summary statistics of the numeric and datetime columns after processing
df.describe()

Unnamed: 0,Quantity,InvoiceDate,UnitPrice,TotalSales,Year,Month,Day
count,536641.0,536641,536641.0,536641.0,536641.0,536641.0,536641.0
mean,11.418315,2011-07-04 08:57:06.087421952,4.71511,21.538076,2010.921771,7.54482,15.02464
min,1.0,2010-12-01 08:26:00,0.0,0.0,2010.0,1.0,1.0
25%,1.0,2011-03-28 10:52:00,1.25,3.75,2011.0,5.0,7.0
50%,3.0,2011-07-19 14:04:00,2.08,9.9,2011.0,8.0,15.0
75%,11.0,2011-10-18 17:05:00,4.13,17.7,2011.0,11.0,22.0
max,80995.0,2011-12-09 12:50:00,38970.0,168469.6,2011.0,12.0,31.0
std,219.043814,,97.229154,380.478352,0.268532,3.508696,8.663351


### Verificación de valores nulos

In [14]:
# Count the missing values left in each column after processing
remaining_nulls = df.isna().sum()
print(remaining_nulls)

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
TotalSales     0
Year           0
Month          0
Day            0
Season         0
dtype: int64


## 6. Guardamos el dataset limpio en CSV

In [15]:
# Write the cleaned frame to CSV, leaving out the index column
df.to_csv(
    '../data_clean.csv',
    index=False,
)