# MERGE DE TABLAS sales.csv y product_description.csv

## Importar librerías necesarias.

In [43]:
import os
import pandas as pd
import numpy as np

## Obtener dataframes.

In [44]:
# Resolve the data folder: the .csv files live in a 'data' directory that sits
# next to (not inside) the notebook's working directory.
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]  # one level above the working directory
DATA_PATH = os.path.join(parent_dir, 'data')  # folder holding every .csv file

# Input file names and their full paths.
FILE_NAME1 = 'sales.csv'
FILE_NAME2 = 'product_description.csv'
FILE1, FILE2 = (os.path.join(DATA_PATH, file_name) for file_name in (FILE_NAME1, FILE_NAME2))

# Echo the resolved locations so a wrong working directory is easy to spot.
for _label, _path in (
    ('current path:', current_dir),
    ('parent path:', parent_dir),
    ('file 1:', FILE1),
    ('file 2:', FILE2),
):
    print(_label, _path)

current path: c:\Users\odoto\OneDrive\Documentos\EasyMoney\TFM-EasyMoney\sales
parent path: c:\Users\odoto\OneDrive\Documentos\EasyMoney\TFM-EasyMoney
file 1: c:\Users\odoto\OneDrive\Documentos\EasyMoney\TFM-EasyMoney\data\sales.csv
file 2: c:\Users\odoto\OneDrive\Documentos\EasyMoney\TFM-EasyMoney\data\product_description.csv


In [45]:
# Load both tables with identical options: index_col=0 turns the first column of
# each file into the DataFrame index (presumably a saved row index — verify against the files).
csv_read_options = {'index_col': 0}
df_sales = pd.read_csv(FILE1, **csv_read_options)
df_prod_desc = pd.read_csv(FILE2, **csv_read_options)

## Merge de dataframes.

In [46]:
# Left join: keep every sale and attach the matching product columns.
# validate='many_to_one' makes pandas raise MergeError if 'pk_product_ID' is not
# unique in df_prod_desc, instead of silently duplicating sales rows.
df_merged = pd.merge(
    df_sales,
    df_prod_desc,
    left_on='product_ID',
    right_on='pk_product_ID',
    how='left',
    validate='many_to_one',
)

In [47]:
# Print a summary of the merged frame: row count, columns, non-null counts and dtypes.
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240773 entries, 0 to 240772
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   pk_sale         240773 non-null  int64  
 1   cid             240773 non-null  int64  
 2   month_sale      240773 non-null  object 
 3   product_ID      240773 non-null  int64  
 4   net_margin      240773 non-null  float64
 5   pk_product_ID   240773 non-null  int64  
 6   product_desc    240773 non-null  object 
 7   family_product  240773 non-null  object 
dtypes: float64(1), int64(4), object(3)
memory usage: 14.7+ MB


In [48]:
# Display the merged frame to inspect the product columns next to the sales columns.
df_merged

Unnamed: 0,pk_sale,cid,month_sale,product_ID,net_margin,pk_product_ID,product_desc,family_product
0,6666,33620,2018-05-01,2335,952.9,2335,short_term_deposit,investment
1,6667,35063,2018-06-01,2335,1625.2,2335,short_term_deposit,investment
2,6668,37299,2018-02-01,2335,1279.7,2335,short_term_deposit,investment
3,6669,39997,2018-02-01,2335,1511.9,2335,short_term_deposit,investment
4,6670,44012,2018-02-01,2335,1680.3,2335,short_term_deposit,investment
...,...,...,...,...,...,...,...,...
240768,247434,1553456,2019-05-01,4657,56.7,4657,em_acount,account
240769,247435,1553541,2019-05-01,4657,66.5,4657,em_acount,account
240770,247436,1553559,2019-05-01,4657,73.0,4657,em_acount,account
240771,247437,1553565,2019-05-01,4657,82.3,4657,em_acount,account


## Comprobación de variables.

In [49]:
# Show every row that is an exact duplicate of another row (all copies, keep=False).
# An empty result means the merged frame has no duplicated rows.
duplicated_mask = df_merged.duplicated(keep=False)
df_merged.loc[duplicated_mask]

Unnamed: 0,pk_sale,cid,month_sale,product_ID,net_margin,pk_product_ID,product_desc,family_product


In [50]:
# Count missing values per column; all zeros means there are no nulls.
df_merged.isnull().sum()


pk_sale           0
cid               0
month_sale        0
product_ID        0
net_margin        0
pk_product_ID     0
product_desc      0
family_product    0
dtype: int64

In [51]:
# Drop 'pk_product_ID': after the merge it carries the same information as 'product_ID'.
# Reassign rather than mutate with inplace=True; errors='ignore' keeps the cell
# safe to re-run once the column is already gone.
df_merged = df_merged.drop(columns=['pk_product_ID'], errors='ignore')

In [52]:
# Convert the 'month_sale' column to datetime values.
month_sale_parsed = pd.to_datetime(df_merged['month_sale'])
df_merged['month_sale'] = month_sale_parsed

In [53]:
# Re-check the frame summary after dropping 'pk_product_ID' and converting 'month_sale'.
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240773 entries, 0 to 240772
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   pk_sale         240773 non-null  int64         
 1   cid             240773 non-null  int64         
 2   month_sale      240773 non-null  datetime64[ns]
 3   product_ID      240773 non-null  int64         
 4   net_margin      240773 non-null  float64       
 5   product_desc    240773 non-null  object        
 6   family_product  240773 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(2)
memory usage: 12.9+ MB


## Exportamos a un nuevo fichero .csv

In [54]:
# Write the cleaned, merged table into the data folder, without the index column.
# os.path.join picks the separator for the current OS; a hard-coded '\\' would only
# produce the intended path on Windows.
df_merged.to_csv(os.path.join(DATA_PATH, 'merge_sales_prod_desc_clean.csv'), index=False)