## Importar Librerías

In [1]:
from psycopg2 import connect, Error
import pandas as pd
import csv
from io import StringIO
import glob
import functools

## Conexión

In [2]:
def getConnection():
    try:
        connection = connect(host='localhost',database='amazon',user='postgres', password='postgres', port='5432')   
    except(Exception, Error) as error:
        connection.rollback()
        print("Error: %s" % error)
    return connection

## Función para cargar tabla a la base de datos

In [3]:
def to_database(dataframe,table):
    try:
        output = StringIO()
        dataframe.to_csv(output, sep='\t', index = False, header = False, quoting=csv.QUOTE_NONE,escapechar='\\')
        output.seek(0)
        connection = getConnection()
        cursor = connection.cursor()
        cursor.copy_from(output,table,null='')
        connection.commit()
        cursor.close()
    except(Exception, Error) as error:
        print("Error: %s" % error)


## Crear DataFrame de los archivos

Se crea el DataFrame con los datos de todos los archivos terminados en ".tsv" del directorio

In [4]:
#pd.concat: Concatena todos los dataFrame creados, uno por cada archivo
#map: Iterador que aplica la funcion a todos los parametros entregados (Los nombres de los archivos)
#pd.read_csv: Lee el archivo .tsv        
#glob.glob: Encuentra el nombre de todos los archivos terminados en .tsv

df = pd.concat(map(functools.partial(pd.read_csv, sep='\t',on_bad_lines='skip', quoting = 3), glob.glob('*.tsv')))

## Cargar tabla Category

In [5]:
df_category = df[["product_category"]]

cantTotal = df_category.count()
cantNull = df_category.isnull().sum()

df_category = df_category.drop_duplicates(subset="product_category")

df_category = df_category.replace('_',' ',regex = True)

df_category.dropna(how='any', inplace=True)

cantUnicos = df_category.count()

to_database(df_category,"category")

print("Cantidad total: ",int(cantTotal),"\nCantidad nulos: ",int(cantNull),"\nCantidad únicos: ",int(cantUnicos))

df_category

Cantidad total:  21561652 
Cantidad nulos:  1 
Cantidad únicos:  20


Unnamed: 0,product_category
0,Baby
0,Camera
0,Digital Music Purchase
0,Digital Software
0,Digital Video Games
0,Furniture
0,Gift Card
0,Grocery
0,Major Appliances
0,Mobile Electronics


## Cargar tabla Customer

In [6]:
df_customers_id = df[["customer_id"]]

cantTotal = df_customers_id.count()
cantNull = df_customers_id.isnull().sum()

df_customers_id = df_customers_id.drop_duplicates(subset="customer_id")

cantUnicos = df_customers_id.count()

df_customers_id.dropna(how='any', inplace=True)

to_database(df_customers_id,"customer")

print("Cantidad total: ",int(cantTotal),"\nCantidad nulos: ",int(cantNull),"\nCantidad únicos: ",int(cantUnicos))

df_customers_id

Cantidad total:  21561653 
Cantidad nulos:  0 
Cantidad únicos:  9225148


Unnamed: 0,customer_id
0,9970739
1,23538442
2,8273344
3,24557753
4,46263340
...,...
1377532,621251
1377533,5828305
1377535,9914408
1377536,10180753


## Cargar tabla Time

In [7]:
df_time = df[["review_date"]].drop_duplicates(subset="review_date").dropna()

df_time[[ "year","month", "day"]] = df_time["review_date"].str.split("-", expand = True)

to_database(df_time,"time")

df_time

Unnamed: 0,review_date,year,month,day
0,2015-08-31,2015,08,31
2376,2015-08-30,2015,08,30
4312,2015-08-29,2015,08,29
6219,2015-08-28,2015,08,28
8124,2015-08-27,2015,08,27
...,...,...,...,...
380599,1996-08-17,1996,08,17
380600,1996-06-11,1996,06,11
380601,1996-06-05,1996,06,05
380602,1995-12-29,1995,12,29


## Cargar tabla Product

In [8]:
df_products = df[["product_id", "product_parent", "product_title", "product_category"]].drop_duplicates(subset="product_id")

print("Cantidad total: ", df_products["product_id"].count())

df_products["product_category"] = df_products["product_category"].replace('_',' ',regex = True)

csv_file = "product.csv"
table = "product"

df_products.to_csv(csv_file, sep = ';', index = False, header = False, quoting = csv.QUOTE_NONE, escapechar = '\\')

df_products

Cantidad total:  3061542


Unnamed: 0,product_id,product_parent,product_title,product_category
0,B00GSP5D94,329991347,Summer Infant SwaddleMe Adjustable Infant Wrap...,Baby
1,B00YYDDZGU,646108902,Pacifier Clip Girl (3 Pack) Ziggy Baby 2-Sided...,Baby
2,B00BUBNZC8,642922361,Udder Covers - Breast Feeding Nursing Cover,Baby
3,B00AWLZFTS,494272733,Gerber Graduates Fun Pack Utensils,Baby
4,B00KM60D3Q,305813185,Summer Infant Ultra Sight Pan/Scan/Zoom Video ...,Baby
...,...,...,...,...
1377506,B00CY4UEI8,967315020,The World - Vintage Map - Map 05 - Iphone 4/4S...,Wireless
1377528,B00QB3HH5Q,22013033,Cellphone Trendz HARD & SOFT RUBBER HYBRID ROC...,Wireless
1377533,B00NXAF90A,390870832,"4 in 1 Combo for Nokia Lumia 635, Nokia Lumia ...",Wireless
1377534,B00V9L8XTE,82127948,DuroCase ® HTC One M8 Kickstand Bumper Case - ...,Wireless


## Cargar tabla Review

In [21]:
df_review = df[["marketplace", "customer_id", "review_id", "product_id", "star_rating", "helpful_votes", "total_votes", "vine", "verified_purchase", "review_headline", "review_body","review_date"]]

df_review.dropna(subset=["star_rating"], inplace = True)

csv_file = "review.csv"
table = "review"

df_review.to_csv(csv_file, sep = ';', index = False, header = False, quoting = csv.QUOTE_NONE, escapechar = '\\')

df_review

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_review.dropna(subset=["star_rating"], inplace = True)


Unnamed: 0,marketplace,customer_id,review_id,product_id,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,9970739,R8EWA1OFT84NX,B00GSP5D94,5.0,0.0,0.0,N,Y,Great swaddled blankets,Loved these swaddle blankets and so did my dau...,2015-08-31
1,US,23538442,R2JWY4YRQD4FOP,B00YYDDZGU,5.0,0.0,0.0,N,N,Too cute and really nice,These are adorable pacifier clips. SavvyBaby h...,2015-08-31
2,US,8273344,RL5ESX231LZ0B,B00BUBNZC8,5.0,0.0,0.0,N,Y,Five Stars,Great gift,2015-08-31
3,US,24557753,RRMS9ZWJ2KD08,B00AWLZFTS,5.0,0.0,0.0,N,Y,Cute; wash up nicely in dishwasher.,These forks are great for my 10 month old daug...,2015-08-31
4,US,46263340,R14I3ZG5E6S7YM,B00KM60D3Q,5.0,0.0,0.0,N,Y,Love it!,I wanted something for piece of mind with my l...,2015-08-31
...,...,...,...,...,...,...,...,...,...,...,...,...
1377539,US,25447436,R1XXWQG3VSTTYA,B0073FCPSK,3.0,0.0,0.0,N,Y,"It works OK, just that on the picture it ...","It works OK, just that on the picture it looks...",2015-05-07
1377540,US,37097321,R29SUCTYQ8JBR,B00TOYBSH2,5.0,0.0,0.0,N,Y,This is the best screen protector I have ever ...,This is the best screen protector I have ever ...,2015-05-07
1377541,US,43608564,R20KYGMZYMGOZD,B00A1UI02Y,4.0,0.0,0.0,N,Y,Worth the Wait,"It did take a little bit for the case arrive, ...",2015-05-07
1377542,US,48974083,RCOXQL7G0TYG0,B00Q275LB8,5.0,0.0,0.0,N,Y,nice,works well in my SUV and others I rented,2015-05-07
