# ETL

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Location of the raw purchase-price extract, named so the path is edited in
# exactly one place.
RAW_CSV_PATH = "C:/Users/pablo/Desktop/proyecto_final/2017PurchasePricesDec.csv"

purchases_raw = pd.read_csv(RAW_CSV_PATH)

# Only the last expression of a cell is rendered, so the per-column null
# counts are printed explicitly instead of being computed and discarded.
print(purchases_raw.isnull().sum())

# Build `df` without mutating the frame that was just inspected:
#   * drop every row that has a missing value in any column,
#   * lower-case the size labels so differently-cased variants compare equal.
df = (
    purchases_raw
    .dropna(how='any')
    .assign(Size=lambda frame: frame['Size'].str.lower())
)

# Distinct size labels that remain after cleaning.
df['Size'].unique()


array(['750ml', '1000ml', '1750ml', '50ml', '375ml', '100ml 4 pk',
       '50ml 5 pk', '100ml', '200ml', '50ml 4 pk', '50ml 3 pk', '300ml',
       '200ml 4 pk', 'unknown', '750ml 2 pk', '250ml 4 pk', '1500ml',
       '3000ml', '5000ml', '4000ml', '187ml 4 pk', '150ml', '187ml',
       '500ml', '720ml', '650ml', '200ml 3 pk', '187ml 3 pk', '330ml',
       '250ml', '750ml + 3/', '18000ml', '180ml', '750ml + 4/', '6000ml',
       '1000ml 2 pk', '750ml 12 p', '750ml 6 pk', '20ml 5 pk',
       '375ml 2 pk', '20000ml', '50ml 12 pk', '750ml 3 pk', '375ml 3 pk',
       '750ml + 1/', '200ml 5 pk', '162.5ml', '400ml', '1100ml', '600ml',
       '19500ml', '560ml', '3750ml', '750ml  3', '9000ml'], dtype=object)

In [35]:
# Work on a copy so the frame `df` is left untouched by the steps below.
df_work = df.copy()

# Normalise the free-text size labels before they are parsed:
#   1. lower-case and trim;
#   2. rewrite a standalone "pk" token, together with an optional trailing
#      dot, as " pk". The dot is matched *after* the word boundary: with the
#      boundary placed after the dot, "pk." followed by a space or the end of
#      the string could never consume the dot and it was left in the label;
#   3. complete a pack suffix cut down to a single letter ("12 p" -> "12 pk");
#   4. collapse whitespace runs last, so the space inserted by steps 2-3 does
#      not leave double spaces behind.
s = (
    df_work['Size'].astype('string').str.lower().str.strip()
    .str.replace(r'\bpk\b\.?', ' pk', regex=True)
    .str.replace(r'(?<=\d)\s+p\b', ' pk', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)


In [36]:
# Anchored pattern for one normalised size label, listed piece by piece and
# joined into a single regular expression.
_size_pattern_parts = (
    r'^\s*',
    r'(?P<ml_val>\d+(?:\.\d+)?)\s*ml',      # volume in ml: 50ml | 162.5ml | 1750ml
    r'(?:\s*(?P<qty>\d{1,3})(?:\s*pk)?)?',  # optional count: "12 pk", or a bare number after "ml"
    r'(?:\s*\+\s*(?P<free>\d+)\s*/)?',      # optional trailing "+ 3/" fragment
    r'\s*$',
)
pat = ''.join(_size_pattern_parts)

# One row per label with the named groups (ml_val, qty, free) as columns;
# labels that do not match the pattern produce a row of missing values.
m = s.str.extract(pat, expand=True)


In [37]:
# Build the derived size columns, starting with the volume.
# ml_num: volume rounded to a whole number of millilitres (labels such as
# "162.5ml" carry a fraction); anything that is not numeric becomes <NA>.
ml_parsed = pd.to_numeric(m['ml_val'], errors='coerce')
ml_num = ml_parsed.round().astype('Int64')

# size_ml: the same value as a text label "<N>ml"; rows without a parsed
# volume stay <NA>.
ml_label = ml_num.astype('string') + 'ml'
size_ml = ml_label.mask(ml_num.isna(), pd.NA)

In [38]:
# Turn the two optional captures into nullable integers: the pack count
# ("qty") and the number taken from a "+ N/" fragment ("free").
# Missing or non-numeric captures become <NA>.
qty_pk, free = (
    pd.to_numeric(m[group], errors='coerce').astype('Int64')
    for group in ('qty', 'free')
)


In [39]:
# Quantity per label, resolved in priority order.
size_qty = (
    qty_pk
    .fillna(free)                 # no pack count -> use the "+ N/" number
    .fillna(1)                    # neither is present -> 1
    .mask(ml_num.isna(), pd.NA)   # no parsed volume -> quantity is <NA>
)

In [40]:
# Attach the two derived columns to the working frame (no other columns are
# added here).
df_work = df_work.assign(size_ml=size_ml, size_qty=size_qty)




#### elimino columnas redundantes

In [41]:
# Remove the raw size label and the derived "<N>ml" label; a name that is
# already absent is skipped instead of raising.
redundant_columns = ['Size', 'size_ml']
df_work = df_work.drop(columns=redundant_columns, errors='ignore')

#### Rellenar con 0 los valores faltantes (NA) de `Volume`

In [42]:
# Fill unknown volumes with 0. The original author notes that only four rows
# are affected, so this does not disturb the analysis.
#
# The column is coerced to numeric *before* filling. A plain fillna(0) only
# replaces real NaN values, so a non-numeric text placeholder would survive it
# and turn into a missing value at the later numeric conversion, leaving the
# row unfilled.
# NOTE(review): presumably the raw file marks unknown volumes with text such
# as "Unknown" (the Size column contains 'unknown'); confirm against the CSV.
df_work['Volume'] = pd.to_numeric(df_work['Volume'], errors='coerce').fillna(0)

In [44]:
# Make sure the price columns are numeric (they may have been read as text);
# values that cannot be parsed become missing.
for price_column in ('Price', 'PurchasePrice'):
    df_work[price_column] = pd.to_numeric(df_work[price_column], errors='coerce')

# Volume is additionally rounded and stored as a nullable integer.
volume_numeric = pd.to_numeric(df_work['Volume'], errors='coerce')
df_work['Volume'] = volume_numeric.round().astype('Int64')


# gross_margin: selling price minus purchase price, row by row.
df_work['gross_margin'] = df_work['Price'].sub(df_work['PurchasePrice'])





#### Pasar los nombres de columna a snake_case

In [None]:

# Convert every column header to snake_case by applying the substitutions in
# order, then trimming stray underscores and lower-casing.
snake_names = df_work.columns.str.strip()
for regex, replacement in (
    (r'(.)([A-Z][a-z]+)', r'\1_\2'),    # underscore before a capitalised word: "VendorName" -> "Vendor_Name"
    (r'([a-z0-9])([A-Z])', r'\1_\2'),   # underscore between a lower-case letter/digit and a capital
    (r'[^0-9A-Za-z]+', '_'),            # any run of other characters collapses to one "_"
):
    snake_names = snake_names.str.replace(regex, replacement, regex=True)
df_work.columns = snake_names.str.strip('_').str.lower()


# Final naming adjustments. The first three keys only match a header that the
# pass above left un-split; 'size_qty' is published as 'sales_quantity'.
column_aliases = {
    'purchaseprice': 'purchase_price',
    'vendornumber':  'vendor_number',
    'vendorname':    'vendor_name',
    'size_qty': 'sales_quantity',
}
df_work = df_work.rename(columns=column_aliases)

# Product descriptions are stored in upper case.
df_work['description'] = df_work['description'].str.upper()



Unnamed: 0,brand,description,price,volume,classification,purchase_price,vendor_number,vendor_name,sales_quantity,gross_margin
0,58,GEKKEIKAN BLACK & GOLD SAKE,12.99,750,1,9.28,8320,SHAW ROSS INT L IMP LTD,1,3.71
1,62,HERRADURA SILVER TEQUILA,36.99,750,1,28.67,1128,BROWN-FORMAN CORP,1,8.32
2,63,HERRADURA REPOSADO TEQUILA,38.99,750,1,30.46,1128,BROWN-FORMAN CORP,1,8.53
3,72,NO. 3 LONDON DRY GIN,34.99,750,1,26.11,9165,ULTRA BEVERAGE COMPANY LLP,1,8.88
4,75,THREE OLIVES TOMATO VODKA,14.99,750,1,10.94,7245,PROXIMO SPIRITS INC.,1,4.05


#### agregar columna margin_percent


In [47]:
# margin_percent: the difference between price and purchase_price expressed as
# a percentage of price, rounded to two decimals.
# A price of exactly 0 has no meaningful percentage and would otherwise put
# +/-inf into the column, so those prices are masked and the result is missing.
selling_price = df_work['price'].where(df_work['price'] != 0)
df_work['margin_percent'] = (
    (selling_price - df_work['purchase_price']) / selling_price * 100
).round(2)

df_work.head()

Unnamed: 0,brand,description,price,volume,classification,purchase_price,vendor_number,vendor_name,sales_quantity,gross_margin,margin_percent
0,58,GEKKEIKAN BLACK & GOLD SAKE,12.99,750,1,9.28,8320,SHAW ROSS INT L IMP LTD,1,3.71,28.56
1,62,HERRADURA SILVER TEQUILA,36.99,750,1,28.67,1128,BROWN-FORMAN CORP,1,8.32,22.49
2,63,HERRADURA REPOSADO TEQUILA,38.99,750,1,30.46,1128,BROWN-FORMAN CORP,1,8.53,21.88
3,72,NO. 3 LONDON DRY GIN,34.99,750,1,26.11,9165,ULTRA BEVERAGE COMPANY LLP,1,8.88,25.38
4,75,THREE OLIVES TOMATO VODKA,14.99,750,1,10.94,7245,PROXIMO SPIRITS INC.,1,4.05,27.02


In [48]:
# Write the cleaned frame next to the raw extract.
# NOTE(review): the row index is written too (index=True is the default), so
# the file starts with an unnamed column; pass index=False if consumers of
# this file do not need it.
CLEAN_CSV_PATH = "C:/Users/pablo/Desktop/proyecto_final/2017PurchasePricesDec_limpio.csv"
df_work.to_csv(CLEAN_CSV_PATH, index=True)