# ETL

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Load the raw purchase-price table.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
df = pd.read_csv("C:/Users/pablo/Desktop/proyecto_final/2017PurchasePricesDec.csv")

# Missing values per column. Because this is not the last expression of the
# cell, the result is computed but not rendered.
df.isnull().sum()


# Drop every row that has at least one missing value, modifying `df` in place.
df.dropna(how='any', inplace=True)

# Lower-case the size labels so differently-cased spellings compare equal.
df['Size'] = df['Size'].str.lower()

# Distinct size labels that remain (rendered as the cell output).
df['Size'].unique()


array(['750ml', '1000ml', '1750ml', '50ml', '375ml', '100ml 4 pk',
       '50ml 5 pk', '100ml', '200ml', '50ml 4 pk', '50ml 3 pk', '300ml',
       '200ml 4 pk', 'unknown', '750ml 2 pk', '250ml 4 pk', '1500ml',
       '3000ml', '5000ml', '4000ml', '187ml 4 pk', '150ml', '187ml',
       '500ml', '720ml', '650ml', '200ml 3 pk', '187ml 3 pk', '330ml',
       '250ml', '750ml + 3/', '18000ml', '180ml', '750ml + 4/', '6000ml',
       '1000ml 2 pk', '750ml 12 p', '750ml 6 pk', '20ml 5 pk',
       '375ml 2 pk', '20000ml', '50ml 12 pk', '750ml 3 pk', '375ml 3 pk',
       '750ml + 1/', '200ml 5 pk', '162.5ml', '400ml', '1100ml', '600ml',
       '19500ml', '560ml', '3750ml', '750ml  3', '9000ml'], dtype=object)

In [3]:
# Work on a copy so the loaded frame `df` is left untouched.
df_work = df.copy()

# Normalise the free-text size labels one step at a time.
s = df_work['Size'].astype('string')
s = s.str.lower().str.strip()
# Collapse any run of whitespace into a single space.
s = s.str.replace(r'\s+', ' ', regex=True)
# Put a space in front of the "pk" token. The optional dot is only consumed
# when a word character follows it (because of the trailing \b).
s = s.str.replace(r'\bpk\.?\b', ' pk', regex=True)
# A lone "p" after a number is a truncated "pk": "12 p" -> "12 pk".
s = s.str.replace(r'(?<=\d)\s+p\b', ' pk', regex=True)


In [4]:
# Pieces of the size-label grammar, in the order they appear in a label.
ml_part = r'(?P<ml_val>\d+(?:\.\d+)?)\s*ml'          # volume: 50ml | 162.5ml | 1750ml
qty_part = r'(?:\s*(?P<qty>\d{1,3})(?:\s*pk)?)?'     # optional count: "12 pk" or a bare "12"
free_part = r'(?:\s*\+\s*(?P<free>\d+)\s*/)?'        # optional "+ 3/" suffix

# The whole label must match; surrounding whitespace is tolerated.
pat = r'^\s*' + ml_part + qty_part + free_part + r'\s*$'

# One column per named group (ml_val, qty, free); labels that do not match
# the pattern produce missing values in every column.
m = s.str.extract(pat, expand=True)


In [5]:
# Build the first of the two derived columns.
# 'size_ml' is the volume rendered as the string "<N>ml"; the number is rounded
# to an integer first because decimal volumes such as 162.5ml can occur.
ml_parsed = pd.to_numeric(m['ml_val'], errors='coerce')
ml_num = ml_parsed.round().astype('Int64')

ml_label = ml_num.astype('string') + 'ml'
# Keep the label only where a volume was actually parsed.
size_ml = ml_label.where(ml_num.notna(), pd.NA)

In [6]:
# Convert the captured pack count ("qty") and the "+ N/" number ("free") to
# nullable integers; anything that is not a number becomes <NA>.
qty_pk, free = (
    pd.to_numeric(m[group_name], errors='coerce').astype('Int64')
    for group_name in ('qty', 'free')
)


In [7]:
# Resolve the quantity in order of preference.
size_qty = qty_pk.mask(qty_pk.isna(), free)        # no pack count -> use the "+ N/" number
size_qty = size_qty.mask(size_qty.isna(), 1)       # neither present -> 1
size_qty = size_qty.mask(ml_num.isna(), pd.NA)     # volume not parsed -> leave missing

In [8]:
# 4) Attach the two derived columns to the working frame (the only new columns).
df_work = df_work.assign(size_ml=size_ml, size_qty=size_qty)




#### elimino columnas redundantes

In [9]:
# Remove the raw label and the derived "<N>ml" string.
# errors='ignore' skips columns that are already gone, so the cell can be re-run.
# NOTE(review): size_ml is dropped right after being built — presumably because
# 'Volume' already carries that information; confirm.
df_work = df_work.drop(columns=['Size', 'size_ml'], errors='ignore')

#### relleno con 0 los na en volume

In [10]:
# Fill missing volumes with 0: only 4 rows are affected, which does not disturb the analysis.
# NOTE(review): rows with any missing value were already removed by
# dropna(how='any') right after loading, so this is expected to change nothing.
df_work['Volume'] = df_work['Volume'].fillna(value=0)

In [11]:
# Make sure the monetary columns are numeric (they may have been read as text);
# values that cannot be parsed become NaN.
for money_col in ('Price', 'PurchasePrice'):
    df_work[money_col] = pd.to_numeric(df_work[money_col], errors='coerce')

# Volume becomes a nullable integer: parse, round, then cast.
volume_parsed = pd.to_numeric(df_work['Volume'], errors='coerce')
df_work['Volume'] = volume_parsed.round().astype('Int64')

# 'gross_margin' is the selling price minus the purchase price.
df_work['gross_margin'] = df_work['Price'].sub(df_work['PurchasePrice'])





#### pasar cabecera de columna a snake_case

In [12]:

# Convert the column headers to snake_case, one transformation per step.
header = df_work.columns.str.strip()
# Underscore before a capitalised word: "PurchasePrice" -> "Purchase_Price".
header = header.str.replace(r'(.)([A-Z][a-z]+)', r'\1_\2', regex=True)
# Underscore between a lowercase letter or digit and a following capital.
header = header.str.replace(r'([a-z0-9])([A-Z])', r'\1_\2', regex=True)
# Any run of non-alphanumeric characters becomes a single underscore.
header = header.str.replace(r'[^0-9A-Za-z]+', '_', regex=True)
df_work.columns = header.str.strip('_').str.lower()

# Final name adjustments. Keys that are not present are ignored by rename();
# the un-split spellings only match if the conversion above left a header
# unsplit (e.g. 'PurchasePrice' is already 'purchase_price' at this point).
column_aliases = {
    'purchaseprice': 'purchase_price',
    'vendornumber':  'vendor_number',
    'vendorname':    'vendor_name',
    'size_qty': 'sales_quantity'
}
df_work.rename(columns=column_aliases, inplace=True)

# Descriptions are stored in upper case.
df_work['description'] = df_work['description'].str.upper()



#### agregar columna margin_percent


In [13]:
# Margin as a percentage of the selling price:
#     margin_percent = (price - purchase_price) / price * 100
#
# A price of 0 would make the division yield +/-inf (or NaN for 0/0). inf is
# not treated as missing by isna()/fillna(), so it would skip the median
# imputation below, distort the medians it takes part in, and end up in the
# exported file. Zero prices are therefore masked to NaN before dividing, so
# those rows are imputed like any other row without a usable margin.
nonzero_price = df_work['price'].where(df_work['price'] != 0)
df_work['margin_percent'] = (
    (df_work['price'] - df_work['purchase_price']) / nonzero_price * 100
).round(2)

# Not used in this cell.
cols = ['brand','description','price','volume','classification','purchase_price','vendor_number','vendor_name','sales_quantity','gross_margin','margin_percent']

# No formula-based retry is attempted for rows that are still missing:
# gross_margin is itself price - purchase_price, so recomputing the margin from
# price/purchase_price or from gross_margin/price gives exactly the value
# already obtained above.

# Impute what is still missing: first the median of the (brand, volume) group,
# then the overall median for rows whose group has no margin at all.
group_median = df_work.groupby(['brand', 'volume'])['margin_percent'].transform('median')
df_work['margin_percent'] = df_work['margin_percent'].fillna(group_median)
df_work['margin_percent'] = df_work['margin_percent'].fillna(df_work['margin_percent'].median())

# Two decimals for both margin columns.
df_work['margin_percent'] = df_work['margin_percent'].round(2)
df_work['gross_margin'] = df_work['gross_margin'].round(2)


# --- 2) Imputar volume y sales_quantity con MODA por grupo -------------------
def fill_mode_group(df, col, by):
    """Fill missing values of ``col`` with the most frequent value of its group.

    Missing entries are first replaced by the mode of ``col`` within the group
    defined by ``by``; entries that are still missing afterwards are replaced
    by the mode of the whole column. When several values tie, the first one
    returned by ``Series.mode`` is used.

    Parameters
    ----------
    df : DataFrame to fill. The column is reassigned on this object, so the
        caller's frame is modified.
    col : name of the column to fill.
    by : column name or list of names that define the groups.

    Returns
    -------
    The same ``df`` object that was passed in.
    """
    def _first_mode(values):
        # mode() is empty when the group has no non-missing value at all.
        modes = values.mode(dropna=True)
        if modes.empty:
            return np.nan
        return modes.iloc[0]

    # Per-row mode of the row's own group (NaN where the group has none).
    per_group = df.groupby(by)[col].transform(_first_mode)
    df[col] = df[col].fillna(per_group)

    # Last resort: the mode of the whole column, if there is one.
    overall = df[col].mode(dropna=True)
    if overall.empty:
        return df
    df[col] = df[col].fillna(overall.iloc[0])
    return df


# Impute the remaining gaps with the group mode: volume per (brand,
# classification), sales_quantity per brand.
imputation_plan = (
    ('volume', ['brand', 'classification']),
    ('sales_quantity', ['brand']),
)
for target_col, group_keys in imputation_plan:
    df_work = fill_mode_group(df_work, target_col, group_keys)

# --- 3) Enforce dtypes (nullable integers) -----------------------------------
for target_col in ('volume', 'sales_quantity'):
    df_work[target_col] = pd.to_numeric(df_work[target_col], errors='coerce').astype('Int64')

# --- 4) Quick check: remaining nulls and dtype per column --------------------
for c in ('volume', 'sales_quantity', 'margin_percent'):
    print(f"{c}: nulos={df_work[c].isna().sum()}  dtype={df_work[c].dtype}")

volume: nulos=0  dtype=Int64
sales_quantity: nulos=0  dtype=Int64
margin_percent: nulos=0  dtype=float64


In [16]:
# Write the cleaned table to disk without the index column.
# NOTE(review): absolute, machine-specific output path.
df_work.to_csv(
    "C:/Users/pablo/Desktop/proyecto_final/2017PurchasePricesDec_limpio.csv",
    index=False,
)