In [105]:
import numpy as np
import pandas as pd

In [106]:
# Cargar el archivo CSV
df = pd.read_csv("PurchasesFINAL12312016.csv", encoding="utf-8", low_memory=False)

In [107]:
df.head()

Unnamed: 0,InventoryId,Store,Brand,Description,Size,VendorNumber,VendorName,PONumber,PODate,ReceivingDate,InvoiceDate,PayDate,PurchasePrice,Quantity,Dollars,Classification
0,69_MOUNTMEND_8412,69,8412,Tequila Ocho Plata Fresno,750mL,105,ALTAMAR BRANDS LLC,8124,2015-12-21,2016-01-02,2016-01-04,2016-02-16,35.71,6,214.26,1
1,30_CULCHETH_5255,30,5255,TGI Fridays Ultimte Mudslide,1.75L,4466,AMERICAN VINTAGE BEVERAGE,8137,2015-12-22,2016-01-01,2016-01-07,2016-02-21,9.35,4,37.4,1
2,34_PITMERDEN_5215,34,5215,TGI Fridays Long Island Iced,1.75L,4466,AMERICAN VINTAGE BEVERAGE,8137,2015-12-22,2016-01-02,2016-01-07,2016-02-21,9.41,5,47.05,1
3,1_HARDERSFIELD_5255,1,5255,TGI Fridays Ultimte Mudslide,1.75L,4466,AMERICAN VINTAGE BEVERAGE,8137,2015-12-22,2016-01-01,2016-01-07,2016-02-21,9.35,6,56.1,1
4,76_DONCASTER_2034,76,2034,Glendalough Double Barrel,750mL,388,ATLANTIC IMPORTING COMPANY,8169,2015-12-24,2016-01-02,2016-01-09,2016-02-16,21.32,5,106.6,1


In [108]:
#Revisar estructura inicial
print("Filias y Columnas:", df.shape)
print("\nColumnas del dataset:\n", df.columns)


Filias y Columnas: (2372474, 16)

Columnas del dataset:
 Index(['InventoryId', 'Store', 'Brand', 'Description', 'Size', 'VendorNumber',
       'VendorName', 'PONumber', 'PODate', 'ReceivingDate', 'InvoiceDate',
       'PayDate', 'PurchasePrice', 'Quantity', 'Dollars', 'Classification'],
      dtype='object')


In [109]:
print("\nTipos de datos:") 
print(df.dtypes)


Tipos de datos:
InventoryId        object
Store               int64
Brand               int64
Description        object
Size               object
VendorNumber        int64
VendorName         object
PONumber            int64
PODate             object
ReceivingDate      object
InvoiceDate        object
PayDate            object
PurchasePrice     float64
Quantity            int64
Dollars           float64
Classification      int64
dtype: object


In [110]:
print("\nValores nulos por columna:") 
print(df.isnull().sum())


Valores nulos por columna:
InventoryId       0
Store             0
Brand             0
Description       0
Size              3
VendorNumber      0
VendorName        0
PONumber          0
PODate            0
ReceivingDate     0
InvoiceDate       0
PayDate           0
PurchasePrice     0
Quantity          0
Dollars           0
Classification    0
dtype: int64


In [111]:
# Eliminar duplicados 
df = df.drop_duplicates()


In [112]:
# Manejo de valores nulos
# Completar o reemplazar valores faltantes según el tipo de columna
if 'vendorname' in df.columns:
    df['vendorname'] = df['vendorname'].fillna('Desconocido')

if 'price' in df.columns:
    df['price'] = df['price'].replace(0, np.nan)
    df['price'] = df['price'].fillna(df['price'].mean())

if 'quantity' in df.columns:
    df['quantity'] = df['quantity'].replace(0, np.nan)
    df['quantity'] = df['quantity'].fillna(df['quantity'].mean())

In [113]:
# --- Convertir a tipo fecha real ---
fechas = ['PODate', 'ReceivingDate', 'InvoiceDate', 'PayDate']
for col in fechas:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# --- Mostrar en formato dd-mm-aaaa (solo visualmente) ---
pd.options.display.date_dayfirst = True  # cambia el orden de visualización en el entorno

# Ver las primeras filas con formato visual europeo (día primero)
print(df[fechas].head().apply(lambda x: x.dt.strftime('%d-%m-%Y')))

       PODate ReceivingDate InvoiceDate     PayDate
0  21-12-2015    02-01-2016  04-01-2016  16-02-2016
1  22-12-2015    01-01-2016  07-01-2016  21-02-2016
2  22-12-2015    02-01-2016  07-01-2016  21-02-2016
3  22-12-2015    01-01-2016  07-01-2016  21-02-2016
4  24-12-2015    02-01-2016  09-01-2016  16-02-2016


In [114]:
df.head()

Unnamed: 0,InventoryId,Store,Brand,Description,Size,VendorNumber,VendorName,PONumber,PODate,ReceivingDate,InvoiceDate,PayDate,PurchasePrice,Quantity,Dollars,Classification
0,69_MOUNTMEND_8412,69,8412,Tequila Ocho Plata Fresno,750mL,105,ALTAMAR BRANDS LLC,8124,2015-12-21,2016-01-02,2016-01-04,2016-02-16,35.71,6,214.26,1
1,30_CULCHETH_5255,30,5255,TGI Fridays Ultimte Mudslide,1.75L,4466,AMERICAN VINTAGE BEVERAGE,8137,2015-12-22,2016-01-01,2016-01-07,2016-02-21,9.35,4,37.4,1
2,34_PITMERDEN_5215,34,5215,TGI Fridays Long Island Iced,1.75L,4466,AMERICAN VINTAGE BEVERAGE,8137,2015-12-22,2016-01-02,2016-01-07,2016-02-21,9.41,5,47.05,1
3,1_HARDERSFIELD_5255,1,5255,TGI Fridays Ultimte Mudslide,1.75L,4466,AMERICAN VINTAGE BEVERAGE,8137,2015-12-22,2016-01-01,2016-01-07,2016-02-21,9.35,6,56.1,1
4,76_DONCASTER_2034,76,2034,Glendalough Double Barrel,750mL,388,ATLANTIC IMPORTING COMPANY,8169,2015-12-24,2016-01-02,2016-01-09,2016-02-16,21.32,5,106.6,1


In [115]:
print(df.dtypes)

InventoryId               object
Store                      int64
Brand                      int64
Description               object
Size                      object
VendorNumber               int64
VendorName                object
PONumber                   int64
PODate            datetime64[ns]
ReceivingDate     datetime64[ns]
InvoiceDate       datetime64[ns]
PayDate           datetime64[ns]
PurchasePrice            float64
Quantity                   int64
Dollars                  float64
Classification             int64
dtype: object


In [116]:
# Convertir columnas de texto a string
text_cols = ["Description", "Size", "VendorName"]
df[text_cols] = df[text_cols].astype("string")

In [117]:
print(df.dtypes)

InventoryId               object
Store                      int64
Brand                      int64
Description       string[python]
Size              string[python]
VendorNumber               int64
VendorName        string[python]
PONumber                   int64
PODate            datetime64[ns]
ReceivingDate     datetime64[ns]
InvoiceDate       datetime64[ns]
PayDate           datetime64[ns]
PurchasePrice            float64
Quantity                   int64
Dollars                  float64
Classification             int64
dtype: object


In [119]:

# 8️⃣ Limpiar texto (quitar espacios y poner en formato título)
text_cols = df.select_dtypes(include='object').columns
for col in text_cols:
    df[col] = df[col].astype(str).str.strip().str.title()


In [120]:
# 9️⃣ Resumen final
print("\nDataset limpio:")
print(df.info())
print("\nPrimeras filas limpias:")
print(df.head())



Dataset limpio:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2372474 entries, 0 to 2372473
Data columns (total 16 columns):
 #   Column          Dtype         
---  ------          -----         
 0   InventoryId     object        
 1   Store           int64         
 2   Brand           int64         
 3   Description     string        
 4   Size            string        
 5   VendorNumber    int64         
 6   VendorName      string        
 7   PONumber        int64         
 8   PODate          datetime64[ns]
 9   ReceivingDate   datetime64[ns]
 10  InvoiceDate     datetime64[ns]
 11  PayDate         datetime64[ns]
 12  PurchasePrice   float64       
 13  Quantity        int64         
 14  Dollars         float64       
 15  Classification  int64         
dtypes: datetime64[ns](4), float64(2), int64(6), object(1), string(3)
memory usage: 289.6+ MB
None

Primeras filas limpias:
           InventoryId  Store  Brand                   Description   Size  \
0    69_Mountmend_841

In [121]:
df.head()

Unnamed: 0,InventoryId,Store,Brand,Description,Size,VendorNumber,VendorName,PONumber,PODate,ReceivingDate,InvoiceDate,PayDate,PurchasePrice,Quantity,Dollars,Classification
0,69_Mountmend_8412,69,8412,Tequila Ocho Plata Fresno,750mL,105,ALTAMAR BRANDS LLC,8124,2015-12-21,2016-01-02,2016-01-04,2016-02-16,35.71,6,214.26,1
1,30_Culcheth_5255,30,5255,TGI Fridays Ultimte Mudslide,1.75L,4466,AMERICAN VINTAGE BEVERAGE,8137,2015-12-22,2016-01-01,2016-01-07,2016-02-21,9.35,4,37.4,1
2,34_Pitmerden_5215,34,5215,TGI Fridays Long Island Iced,1.75L,4466,AMERICAN VINTAGE BEVERAGE,8137,2015-12-22,2016-01-02,2016-01-07,2016-02-21,9.41,5,47.05,1
3,1_Hardersfield_5255,1,5255,TGI Fridays Ultimte Mudslide,1.75L,4466,AMERICAN VINTAGE BEVERAGE,8137,2015-12-22,2016-01-01,2016-01-07,2016-02-21,9.35,6,56.1,1
4,76_Doncaster_2034,76,2034,Glendalough Double Barrel,750mL,388,ATLANTIC IMPORTING COMPANY,8169,2015-12-24,2016-01-02,2016-01-09,2016-02-16,21.32,5,106.6,1


In [122]:
# 🔟 Guardar el archivo limpio
df.to_csv("PurchasesFINAL_clean.csv", index=False)
