# ETL - Limpieza general del dataset


#### _Librerías utilizadas_

In [1]:
## Traemos las librerías empleadas
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Carga inicial


In [2]:
# Load the raw sales export into `df`.
# The CSV location can be overridden with the SALES_CSV_PATH environment
# variable so the notebook can run on machines other than the author's.
# When the variable is not set, the original local path is used unchanged.
SALES_CSV_PATH = os.environ.get(
    "SALES_CSV_PATH",
    r"C:\Users\pablo\OneDrive\Documentos\HENRY\Proyecto\Inventory\SalesFINAL12312016.csv",
)

# Parse the SalesDate column as datetime while reading.
df = pd.read_csv(SALES_CSV_PATH, parse_dates=['SalesDate'])

# Quick sanity check: report the frame's shape and show the first three rows.
print("Dimensiones iniciales:", df.shape)
print("\nVista previa:")
display(df.head(3))


Dimensiones iniciales: (1048575, 14)

Vista previa:


Unnamed: 0,InventoryId,Store,Brand,Description,Size,SalesQuantity,SalesDollars,SalesPrice,SalesDate,Volume,Classification,ExciseTax,VendorNo,VendorName
0,1_HARDERSFIELD_1004,1,1004,Jim Beam w/2 Rocks Glasses,750mL,1,16.49,16.49,2016-01-01,750,1,0.79,12546,JIM BEAM BRANDS COMPANY
1,1_HARDERSFIELD_1004,1,1004,Jim Beam w/2 Rocks Glasses,750mL,2,32.98,16.49,2016-01-02,750,1,1.57,12546,JIM BEAM BRANDS COMPANY
2,1_HARDERSFIELD_1004,1,1004,Jim Beam w/2 Rocks Glasses,750mL,1,16.49,16.49,2016-01-03,750,1,0.79,12546,JIM BEAM BRANDS COMPANY


## Renombrar columnas a snake_case


In [8]:
# Convert the column labels of `df` to snake_case.
# The literal replacements pass regex=False explicitly because the default of
# `regex` in `.str.replace` changed in pandas 2.0; being explicit keeps the
# result (and the absence of warnings) the same on every pandas version.
df.columns = (
    df.columns
      .str.strip()                                # drop leading/trailing whitespace
      .str.replace(" ", "_", regex=False)         # spaces -> underscores (literal match)
      .str.replace("-", "_", regex=False)         # hyphens -> underscores (literal match)
      .str.replace(r'([a-z])([A-Z])', r'\1_\2', regex=True)  # lowercase followed by uppercase: "aB" -> "a_B"
      .str.lower()                                # lowercase everything
)

# Show the resulting labels so the renaming can be checked by eye.
print("\nColumnas renombradas:")
print(df.columns.tolist())


Columnas renombradas:
['inventory_id', 'store', 'brand', 'description', 'size', 'sales_quantity', 'sales_dollars', 'sales_price', 'sales_date', 'volume', 'classification', 'excise_tax', 'vendor_no', 'vendor_name']


In [9]:
# Preview the first three rows of `df` (rendered as the cell's output).
df.head(n=3)

Unnamed: 0,inventory_id,store,brand,description,size,sales_quantity,sales_dollars,sales_price,sales_date,volume,classification,excise_tax,vendor_no,vendor_name
0,1_hardersfield_1004,1,1004,jim beam w/2 rocks glasses,750ml,1,16.49,16.49,2016-01-01,750,1,0.79,12546,jim beam brands company
1,1_hardersfield_1004,1,1004,jim beam w/2 rocks glasses,750ml,2,32.98,16.49,2016-01-02,750,1,1.57,12546,jim beam brands company
2,1_hardersfield_1004,1,1004,jim beam w/2 rocks glasses,750ml,1,16.49,16.49,2016-01-03,750,1,0.79,12546,jim beam brands company


## Normalización de texto en columnas tipo string


In [5]:
# Normalize every object-dtype column of `df` in place:
# trim the ends, lowercase, then collapse whitespace runs to a single space.
text_columns = df.select_dtypes(include="object").columns.tolist()
for name in text_columns:
    cleaned = df[name].str.strip()       # remove leading/trailing whitespace
    cleaned = cleaned.str.lower()        # lowercase
    df[name] = cleaned.str.replace(r"\s+", " ", regex=True)  # squeeze internal whitespace

## Validación de duplicados


In [6]:
# Flag rows that are exact repeats of an earlier row, then count them.
duplicate_mask = df.duplicated()
duplicados = duplicate_mask.sum()

# Report the count; nothing is removed here.
print(f"\nFilas duplicadas encontradas: {duplicados}")



Filas duplicadas encontradas: 0


## Guardar CSV limpio


In [7]:
# Write the cleaned frame to the working directory without the index column.
df.to_csv(path_or_buf="sales_clean.csv", index=False)

# Confirm the export and report the final shape.
final_shape = df.shape
print("\nDataset limpio guardado como 'sales_clean.csv'")
print("Dimensiones finales:", final_shape)



Dataset limpio guardado como 'sales_clean.csv'
Dimensiones finales: (1048575, 14)
