### Carga y transformación de los datos

#### _Librerías utilizadas_

In [10]:
# Import the libraries used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Carga y análisis de datos

In [11]:
# Load the sales dataset, parsing 'SalesDate' as a datetime column
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\pablo\OneDrive\Documentos\HENRY\Proyecto\Inventory\SalesFINAL12312016.csv",
    parse_dates=['SalesDate'],
)

In [12]:
df.shape # Inspect the number of rows and columns in the dataset

(1048575, 14)

In [13]:
df.info() # General overview: column names, dtypes and non-null counts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 14 columns):
 #   Column          Non-Null Count    Dtype         
---  ------          --------------    -----         
 0   InventoryId     1048575 non-null  object        
 1   Store           1048575 non-null  int64         
 2   Brand           1048575 non-null  int64         
 3   Description     1048575 non-null  object        
 4   Size            1048575 non-null  object        
 5   SalesQuantity   1048575 non-null  int64         
 6   SalesDollars    1048575 non-null  float64       
 7   SalesPrice      1048575 non-null  float64       
 8   SalesDate       1048575 non-null  datetime64[ns]
 9   Volume          1048575 non-null  int64         
 10  Classification  1048575 non-null  int64         
 11  ExciseTax       1048575 non-null  float64       
 12  VendorNo        1048575 non-null  int64         
 13  VendorName      1048575 non-null  object        
dtypes: datetime64[ns](

In [14]:
# List the column names
df.columns

Index(['InventoryId', 'Store', 'Brand', 'Description', 'Size', 'SalesQuantity',
       'SalesDollars', 'SalesPrice', 'SalesDate', 'Volume', 'Classification',
       'ExciseTax', 'VendorNo', 'VendorName'],
      dtype='object')

In [15]:
# Preview the first 10 records
df.head(10)

Unnamed: 0,InventoryId,Store,Brand,Description,Size,SalesQuantity,SalesDollars,SalesPrice,SalesDate,Volume,Classification,ExciseTax,VendorNo,VendorName
0,1_HARDERSFIELD_1004,1,1004,Jim Beam w/2 Rocks Glasses,750mL,1,16.49,16.49,2016-01-01,750,1,0.79,12546,JIM BEAM BRANDS COMPANY
1,1_HARDERSFIELD_1004,1,1004,Jim Beam w/2 Rocks Glasses,750mL,2,32.98,16.49,2016-01-02,750,1,1.57,12546,JIM BEAM BRANDS COMPANY
2,1_HARDERSFIELD_1004,1,1004,Jim Beam w/2 Rocks Glasses,750mL,1,16.49,16.49,2016-01-03,750,1,0.79,12546,JIM BEAM BRANDS COMPANY
3,1_HARDERSFIELD_1004,1,1004,Jim Beam w/2 Rocks Glasses,750mL,1,14.49,14.49,2016-01-08,750,1,0.79,12546,JIM BEAM BRANDS COMPANY
4,1_HARDERSFIELD_1005,1,1005,Maker's Mark Combo Pack,375mL 2 Pk,2,69.98,34.99,2016-01-09,375,1,0.79,12546,JIM BEAM BRANDS COMPANY
5,1_HARDERSFIELD_1005,1,1005,Maker's Mark Combo Pack,375mL 2 Pk,1,34.99,34.99,2016-01-15,375,1,0.39,12546,JIM BEAM BRANDS COMPANY
6,1_HARDERSFIELD_1005,1,1005,Maker's Mark Combo Pack,375mL 2 Pk,1,34.99,34.99,2016-01-22,375,1,0.39,12546,JIM BEAM BRANDS COMPANY
7,1_HARDERSFIELD_1005,1,1005,Maker's Mark Combo Pack,375mL 2 Pk,1,34.99,34.99,2016-01-30,375,1,0.39,12546,JIM BEAM BRANDS COMPANY
8,1_HARDERSFIELD_10058,1,10058,F Coppola Dmd Ivry Cab Svgn,750mL,4,59.96,14.99,2016-01-05,750,2,0.45,2000,SOUTHERN WINE & SPIRITS NE
9,1_HARDERSFIELD_10058,1,10058,F Coppola Dmd Ivry Cab Svgn,750mL,1,14.99,14.99,2016-01-06,750,2,0.11,2000,SOUTHERN WINE & SPIRITS NE


#### Revisión de nulos y negativos

In [16]:
# Count missing values per column and keep only the columns
# that contain at least one missing value
nulos_por_columna = (
    df.isna()
    .sum()
    .loc[lambda conteo: conteo > 0]
)

print(nulos_por_columna)

Series([], dtype: int64)


In [17]:
# Count negative values in every numeric column and keep only the
# columns that contain at least one negative value
negativos_por_columna = (
    df.select_dtypes(include=[np.number])
    .lt(0)
    .sum()
    .loc[lambda conteo: conteo > 0]
)

print(negativos_por_columna)


Series([], dtype: int64)


#### Revisión de espacios

In [18]:
# Flag, for each object-dtype column, the values that start or end with a
# whitespace character; every other column is marked False throughout.
espacios = df.apply(
    lambda serie: (
        serie.str.contains(r"^\s|\s$", regex=True)
        if serie.dtype == "object"
        else False
    )
)
# Number of flagged values per column
print(espacios.sum())

InventoryId            0
Store                  0
Brand                  0
Description            0
Size                   0
SalesQuantity          0
SalesDollars           0
SalesPrice             0
SalesDate              0
Volume                 0
Classification         0
ExciseTax              0
VendorNo               0
VendorName        957234
dtype: int64


_Transformamos los textos en las columnas object a minúscula_

In [19]:
# Normalize every object-dtype (text) column: strip leading/trailing whitespace
# and lower-case the values. The whitespace review above reported 957,234
# affected values in 'VendorName'; without the strip they would be written to
# the cleaned file unchanged. Non-object columns pass through untouched.
df = df.apply(
    lambda col: col.str.strip().str.lower() if col.dtype == "object" else col
)

### Revisiones específicas de columnas

_Analizando las columnas, parece que la columna 'Size' solo combina información ya presente en las columnas 'Volume' y 'SalesQuantity', por lo que resulta redundante. Se decide eliminar esta columna._

In [20]:
# Remove the 'Size' column from the DataFrame.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it after the column is gone no longer raises KeyError.
df = df.drop(columns=['Size'], errors='ignore')

In [21]:
df.info() # Verify that the 'Size' column has been removed

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 13 columns):
 #   Column          Non-Null Count    Dtype         
---  ------          --------------    -----         
 0   InventoryId     1048575 non-null  object        
 1   Store           1048575 non-null  int64         
 2   Brand           1048575 non-null  int64         
 3   Description     1048575 non-null  object        
 4   SalesQuantity   1048575 non-null  int64         
 5   SalesDollars    1048575 non-null  float64       
 6   SalesPrice      1048575 non-null  float64       
 7   SalesDate       1048575 non-null  datetime64[ns]
 8   Volume          1048575 non-null  int64         
 9   Classification  1048575 non-null  int64         
 10  ExciseTax       1048575 non-null  float64       
 11  VendorNo        1048575 non-null  int64         
 12  VendorName      1048575 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(6), object(3)
memory usage: 104.

_La columna 'Brand' parece ser realmente el 'product_id' para la columna 'Description', la cual parece ser realmente 'product'_

In [22]:
# Number of distinct 'Description' values associated with each 'Brand'
chequeo = df['Description'].groupby(df['Brand']).nunique()

# Keep only the brands linked to more than one description
inconsistentes = chequeo.loc[chequeo.gt(1)]

# An empty result means no brand maps to more than one description
print(inconsistentes)

Series([], Name: Description, dtype: int64)


_Los valores coinciden en el 100% de los casos, por lo que asumimos que lo planteado es correcto_

### Guardado de DataFrame limpio

In [23]:
# Write the cleaned DataFrame to CSV, omitting the index column
df.to_csv(
    path_or_buf=r"C:\Users\pablo\OneDrive\Documentos\HENRY\Proyecto\Inventory\SalesFINAL_Limpio.csv",
    index=False,
)