## Importar Librerías

In [1]:
from psycopg2 import connect, Error
import pandas as pd
import csv
from io import StringIO
import glob
import functools

## Conexión

In [2]:
def getConnection():
    try:
        connection = connect(host='localhost',database='amazon',user='postgres', password='postgres', port='5432')   
    except(Exception, Error) as error:
        connection.rollback()
        print("Error: %s" % error)
    return connection

## Función para cargar tabla a la base de datos

In [3]:
def to_database(dataframe,table):
    try:
        output = StringIO()
        dataframe.to_csv(output, sep='\t', index = False, header = False, quoting=csv.QUOTE_NONE,escapechar='\\')
        output.seek(0)
        connection = getConnection()
        cursor = connection.cursor()
        cursor.copy_from(output,table,null='')
        connection.commit()
        cursor.close()
    except(Exception, Error) as error:
        print("Error: %s" % error)


## Crear DataFrame de los archivos

Se crea el DataFrame con los datos de todos los archivos terminados en ".tsv" del directorio

In [4]:
#pd.concat: Concatena todos los dataFrame creados, uno por cada archivo
#map: Iterador que aplica la funcion a todos los parametros entregados (Los nombres de los archivos)
#pd.read_csv: Lee el archivo .tsv
    #parametros:
        
#glob.glob: Encuentra el nombre de todos los archivos terminados en .tsv

df = pd.concat(map(functools.partial(pd.read_csv, sep='\t',on_bad_lines='skip', quoting = 3), glob.glob('*.tsv')))

## Cargar tabla Category

In [5]:
df_category = df[["product_category"]]

cantTotal = df_category.count()
cantNull = df_category.isnull().sum()

df_category = df_category.drop_duplicates(subset="product_category")

df_category = df_category.replace('_',' ',regex = True)

cantUnicos = df_category.count()

to_database(df_category,"category")

print("Cantidad total: ",int(cantTotal),"\nCantidad nulos: ",int(cantNull),"\nCantidad únicos: ",int(cantUnicos))

df_category

Cantidad total:  684458 
Cantidad nulos:  0 
Cantidad únicos:  6


Unnamed: 0,product_category
0,Digital Software
0,Digital Video Games
0,Gift Card
0,Major Appliances
0,Mobile Electronics
0,Personal Care Appliances


## Cargar tabla Customer

In [6]:
df_customers_id = df[["customer_id"]]

cantTotal = df_customers_id.count()
cantNull = df_customers_id.isnull().sum()

df_customers_id = df_customers_id.drop_duplicates(subset="customer_id")

cantUnicos = df_customers_id.count()

to_database(df_customers_id,"customer")

print("Cantidad total: ",int(cantTotal),"\nCantidad nulos: ",int(cantNull),"\nCantidad únicos: ",int(cantUnicos))

df_customers_id

Cantidad total:  684458 
Cantidad nulos:  0 
Cantidad únicos:  606828


Unnamed: 0,customer_id
0,17747349
1,10956619
2,13132245
3,35717248
4,17710652
...,...
85976,52536938
85977,47435897
85978,48581323
85979,51085800


## Cargar tabla Time

In [7]:
df_time = df[["review_date"]].drop_duplicates(subset="review_date").dropna()

print("Rango de fechas: ", df_time["review_date"].iloc[0], " - ", df_time["review_date"].iloc[df_time.size-1])

to_database(df_time,"time")

df_time

Rango de fechas:  2015-08-31  -  2000-10-29


Unnamed: 0,review_date
0,2015-08-31
70,2015-08-30
152,2015-08-29
233,2015-08-28
318,2015-08-27
...,...
85975,2000-11-16
85976,2000-11-12
85978,2000-11-11
85979,2000-11-08


## Cargar tabla Product

In [8]:
df_products = df[["product_id", "product_parent", "product_title", "product_category"]].drop_duplicates(subset="product_id")

print("Cantidad total: ", df_products["product_id"].count())

df_products["product_category"] = df_products["product_category"].replace('_',' ',regex = True)

to_database(df_products,"product")

df_products

Cantidad total:  67781


Unnamed: 0,product_id,product_parent,product_title,product_category
0,B00U7LCE6A,106182406,CCleaner Free [Download],Digital Software
1,B00HRJMOM4,162269768,ResumeMaker Professional Deluxe 18,Digital Software
2,B00P31G9PQ,831433899,Amazon Drive Desktop [PC],Digital Software
3,B00FGDEPDY,991059534,Norton Internet Security 1 User 3 Licenses,Digital Software
4,B00FZ0FK0U,574904556,SecureAnywhere Intermet Security Complete 5 De...,Digital Software
...,...,...,...,...
85855,B000056J7U,901342774,The First Years: Underarm Thermometer,Personal Care Appliances
85862,B00005EB7C,366331529,Tanita BF-541 Body Fat Monitor and Scale,Personal Care Appliances
85873,B00004YMB6,364721162,Digital Alcohol Detector BT3300,Personal Care Appliances
85966,B000050FEY,367384967,HoMedics FS-1H-K Foot Salon Ultra Foot Massager,Personal Care Appliances


## Cargar tabla Review

In [9]:
#LOS BOLEANOS EN LA REVIEW HAY QUE CAMBIARLOS

df_review = df[["marketplace", "customer_id", "review_id", "product_id", "star_rating", "helpful_votes", "total_votes", "vine", "verified_purchase", "review_headline", "review_body","review_date"]]

to_database(df_review,"review")

df_review

Unnamed: 0,marketplace,customer_id,review_id,product_id,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,17747349,R2EI7QLPK4LF7U,B00U7LCE6A,4,0,0,N,Y,Four Stars,So far so good,2015-08-31
1,US,10956619,R1W5OMFK1Q3I3O,B00HRJMOM4,3,0,0,N,Y,Three Stars,Needs a little more work.....,2015-08-31
2,US,13132245,RPZWSYWRP92GI,B00P31G9PQ,1,1,2,N,Y,One Star,Please cancel.,2015-08-31
3,US,35717248,R2WQWM04XHD9US,B00FGDEPDY,5,0,0,N,Y,Works as Expected!,Works as Expected!,2015-08-31
4,US,17710652,R1WSPK2RA2PDEF,B00FZ0FK0U,4,1,2,N,Y,Great antivirus. Worthless customer support,I've had Webroot for a few years. It expired a...,2015-08-31
...,...,...,...,...,...,...,...,...,...,...,...,...
85976,US,52536938,R1NUYN39WEVD9X,B000050FDR,5,96,98,N,N,Awesome Shaver!,This shaver gives a very percise shave and is ...,2000-11-12
85977,US,47435897,RD17SQQ58L34O,B000050FDP,5,46,48,N,N,This shaver rocks!,Talk about a smooth shave. The blades are clo...,2000-11-12
85978,US,48581323,R30DX2RCMIKQ90,B000050FDN,5,31,34,N,N,The Braun Rechargeable Shaver,This shaver was amazing! Not only did I get on...,2000-11-11
85979,US,51085800,R1O8C9XEYHQUIH,B000050G03,2,38,41,N,N,Not high quality you would expect from,I have the rowenta iron and its great so i tho...,2000-11-08
