## Importar Librerías

In [None]:
from psycopg2 import connect, Error
import pandas as pd
import csv
from io import StringIO
import glob
import functools

## Conexión

In [None]:
def getConnection():
    """Open and return a new psycopg2 connection to the target database.

    Connection settings default to the values this notebook has always used,
    but each one can be overridden through the standard libpq environment
    variables (PGHOST, PGDATABASE, PGUSER, PGPASSWORD, PGPORT) so that real
    credentials do not have to live in the notebook.

    Returns:
        The connection object returned by ``psycopg2.connect``. The caller
        owns it and is responsible for closing it.

    Raises:
        psycopg2.Error: if the connection cannot be established. The error is
            propagated unchanged; there is no connection to roll back or
            return at that point, so handling it here would only hide it.
    """
    import os  # local import: keeps this cell runnable without touching the imports cell

    return connect(
        host=os.environ.get('PGHOST', 'localhost'),
        database=os.environ.get('PGDATABASE', 'amazon'),
        user=os.environ.get('PGUSER', 'postgres'),
        password=os.environ.get('PGPASSWORD', '1234'),
        port=os.environ.get('PGPORT', '5432'),
    )

## Función para cargar tabla a la base de datos

In [None]:
def to_database(dataframe,table):
    """Bulk-load a DataFrame into a database table using COPY.

    The frame is serialised in memory as tab-separated text (no header, no
    index, no quoting, backslash as escape character) and streamed to the
    server with ``cursor.copy_from``. Empty fields are loaded as NULL.

    Args:
        dataframe: pandas DataFrame whose columns are written in their
            current order.
        table: name of the destination table passed to ``copy_from``.

    Returns:
        None. Failures are reported by printing ``Error: <message>``; the
        function does not raise, so a failed load does not stop the notebook.
    """
    connection = None
    try:
        output = StringIO()
        dataframe.to_csv(output, sep='\t', index = False, header = False, quoting=csv.QUOTE_NONE,escapechar='\\')
        output.seek(0)
        connection = getConnection()
        cursor = connection.cursor()
        try:
            cursor.copy_from(output,table,null='')
        finally:
            # Release the cursor whether or not the COPY succeeded.
            cursor.close()
        connection.commit()
    except Exception as error:
        # Exception already covers the database driver's errors.
        if connection is not None:
            # Undo the partial COPY so the failed transaction is not left open.
            connection.rollback()
        print("Error: %s" % error)
    finally:
        # Always hand the connection back; one is opened per call.
        if connection is not None:
            connection.close()


## Crear DataFrame de los archivos

Se crea el DataFrame con los datos de todos los archivos terminados en ".tsv" del directorio de trabajo actual.

In [4]:
# Build one DataFrame from every ".tsv" file in the current working directory.
# glob.glob: finds the names of all files ending in .tsv; its result order is
#            arbitrary, so the list is sorted to make the row order (and thus
#            which duplicate survives later drop_duplicates calls) reproducible.
# functools.partial(pd.read_csv, ...): a TSV reader that skips malformed lines
#            and treats quote characters as ordinary text.
# map: applies that reader to every file name.
# pd.concat: concatenates the per-file DataFrames; each file keeps its own
#            row index, so index labels repeat across files.

tsv_files = sorted(glob.glob('*.tsv'))
if not tsv_files:
    # Fail with a clear message instead of pd.concat's generic "No objects to concatenate".
    raise FileNotFoundError("No se encontraron archivos .tsv en el directorio actual")

read_tsv = functools.partial(pd.read_csv, sep='\t', on_bad_lines='skip', quoting=csv.QUOTE_NONE)
df = pd.concat(map(read_tsv, tsv_files))

## Cargar tabla Category

In [5]:
# Take the counters from the column as a Series so they are plain scalars;
# calling int() on a one-element Series is deprecated in recent pandas.
category_column = df["product_category"]

cantTotal = int(category_column.count())
cantNull = int(category_column.isnull().sum())

# One row per category. Underscores are turned into spaces after the first
# de-duplication (cheap: few rows remain), then de-duplicated once more in
# case two raw names collapse to the same normalised name.
df_category = (
    df[["product_category"]]
    .drop_duplicates(subset="product_category")
    .replace('_',' ',regex = True)
    .drop_duplicates(subset="product_category")
)

cantUnicos = int(df_category["product_category"].count())

to_database(df_category,"category")

print("Cantidad total: ",cantTotal,"\nCantidad nulos: ",cantNull,"\nCantidad únicos: ",cantUnicos)

df_category

Cantidad total:  39798923 
Cantidad nulos:  0 
Cantidad únicos:  24


Unnamed: 0,product_category
0,Automotive
0,Baby
0,Camera
0,Digital Music Purchase
0,Digital Software
0,Digital Video Download
0,Digital Video Games
0,Furniture
0,Gift Card
0,Grocery


## Cargar tabla Customer

In [6]:
# Take the counters from the column as a Series so they are plain scalars;
# calling int() on a one-element Series is deprecated in recent pandas.
customer_column = df["customer_id"]

cantTotal = int(customer_column.count())
cantNull = int(customer_column.isnull().sum())

# One row per distinct customer id (first occurrence kept).
df_customers_id = df[["customer_id"]].drop_duplicates(subset="customer_id")

cantUnicos = int(df_customers_id["customer_id"].count())

to_database(df_customers_id,"customer")

print("Cantidad total: ",cantTotal,"\nCantidad nulos: ",cantNull,"\nCantidad únicos: ",cantUnicos)

df_customers_id

Cantidad total:  39798923 
Cantidad nulos:  0 
Cantidad únicos:  14572243


Unnamed: 0,customer_id
0,36075342
1,42462164
2,21241933
3,52570308
4,38200102
...,...
960864,39578015
960867,39508379
960868,44704251
960869,40571775


## Cargar tabla Time

In [15]:
# Distinct review dates, with missing dates removed.
unique_dates = df[["review_date"]].drop_duplicates(subset="review_date")
df_time = unique_dates.dropna()

# Split each date string on "-" into three parts stored as year, month and day.
# The parts stay as strings (e.g. "08"), exactly as produced by the split.
date_parts = df_time["review_date"].str.split("-", expand = True)
df_time[[ "year","month", "day"]] = date_parts

to_database(df_time,"time")

df_time

Unnamed: 0,review_date,year,month,day
0,2015-08-31,2015,08,31
6507,2015-08-30,2015,08,30
11892,2015-08-29,2015,08,29
17486,2015-08-28,2015,08,28
21935,2015-08-27,2015,08,27
...,...,...,...,...
380599,1996-08-17,1996,08,17
380600,1996-06-11,1996,06,11
380601,1996-06-05,1996,06,05
380602,1995-12-29,1995,12,29


## Cargar tabla Product

In [8]:
# Columns that describe a product.
product_columns = ["product_id", "product_parent", "product_title", "product_category"]

# One row per product_id (first occurrence kept), with underscores in the
# category name replaced by spaces.
df_products = (
    df[product_columns]
    .drop_duplicates(subset="product_id")
    .assign(product_category=lambda frame: frame["product_category"].replace('_',' ',regex = True))
)

print("Cantidad total: ", df_products["product_id"].count())

to_database(df_products,"product")

df_products

Cantidad total:  6074782


Unnamed: 0,product_id,product_parent,product_title,product_category
0,B00LPRXQ4Y,339193102,"17"" 2003-2006 Ford EXPEDITION Ford F150 2004-2...",Automotive
1,B000C7S0TO,907684644,Spectra Premium CU1909 Complete Radiator for T...,Automotive
2,B000CO9WE4,752246352,K&N E-4665 High Performance Replacement Indust...,Automotive
3,B000GKD5NI,105401756,Suncutters Rear Window Shade,Automotive
4,B009SDA7TE,728471129,Lug Nuts Landcruiser Tundra OEM Mag 14x1.5 Thr...,Automotive
...,...,...,...,...
960819,B00008IM8V,967363878,Timex Dress Metals 23001,Watches
960829,B00006J6TZ,110036990,,Watches
960834,B000066OSE,384427633,Timex Women's Fashion Watch 17961,Watches
960836,B00006441M,126100690,Fossil - FS2857 (Size: men),Watches


## Cargar tabla Review

In [16]:
# Columns loaded into the review table, in the order they are written out.
review_columns = [
    "marketplace",
    "customer_id",
    "review_id",
    "product_id",
    "star_rating",
    "helpful_votes",
    "total_votes",
    "vine",
    "verified_purchase",
    "review_headline",
    "review_body",
    "review_date",
]

# Every row of the source frame is kept; only the columns are narrowed.
df_review = df.loc[:, review_columns]

to_database(df_review,"review")

df_review