## Importar Librerías

In [203]:
from psycopg2 import connect, Error
import pandas as pd
import csv
from io import StringIO
import glob
import functools
from sqlalchemy import create_engine

## Conexión

In [204]:
def getConnection():
    try:
        connection = connect(host='localhost',database='amazon',user='postgres', password='1234', port='5432')   
    except(Exception, Error) as error:
        connection.rollback()
        print("Error: %s" % error)
    return connection

## Función para cargar tabla a la base de datos

In [205]:
def to_database(dataframe,table):
    #getConnection()
    engine = create_engine("postgresql+psycopg2://postgres:1234@localhost:5432/amazon")
    dataframe.to_sql(name=table,con = engine,if_exists='append',index = None,chunksize= 200000)

## Crear DataFrame de los archivos

Se crea el DataFrame con los datos de todos los archivos terminados en ".tsv" del directorio

In [206]:
#pd.concat: Concatena todos los dataFrame creados, uno por cada archivo
#map: Iterador que aplica la funcion a todos los parametros entregados (Los nombres de los archivos)
#pd.read_csv: Lee el archivo .tsv        
#glob.glob: Encuentra el nombre de todos los archivos terminados en .tsv

df = pd.concat(map(functools.partial(pd.read_csv, sep='\t',on_bad_lines='skip', quoting = 3), glob.glob('*.tsv')))

## Cargar tabla Category

In [None]:
df_category = df[["product_category"]]

cantTotal = df_category.count()
cantNull = df_category.isnull().sum()

df_category = df_category.drop_duplicates(subset="product_category")

df_category = df_category.replace('_',' ',regex = True)

df_category.dropna(how='any', inplace=True)

cantUnicos = df_category.count()

to_database(df_category,"category")

print("Cantidad total: ",int(cantTotal),"\nCantidad nulos: ",int(cantNull),"\nCantidad únicos: ",int(cantUnicos))

df_category

Cantidad total:  149086 
Cantidad nulos:  0 
Cantidad únicos:  1


Unnamed: 0,product_category
0,Gift Card


## Cargar tabla Customer

In [None]:
df_customers_id = df[["customer_id"]]

cantTotal = df_customers_id.count()
cantNull = df_customers_id.isnull().sum()

df_customers_id = df_customers_id.drop_duplicates(subset="customer_id")

cantUnicos = df_customers_id.count()

df_customers_id.dropna(how='any', inplace=True)

to_database(df_customers_id,"customer")

print("Cantidad total: ",int(cantTotal),"\nCantidad nulos: ",int(cantNull),"\nCantidad únicos: ",int(cantUnicos))

df_customers_id

Cantidad total:  149086 
Cantidad nulos:  0 
Cantidad únicos:  143181


Unnamed: 0,customer_id
0,24371595
1,42489718
2,861463
3,25283295
4,397970
...,...
149080,26443276
149081,40383801
149082,15124244
149084,30603398


## Cargar tabla Time

In [None]:
df_time = df[["review_date"]].drop_duplicates(subset="review_date").dropna()

df_time[[ "year","month", "day"]] = df_time["review_date"].str.split("-", expand = True)

to_database(df_time,"time")

df_time

Unnamed: 0,review_date,year,month,day
0,2015-08-31,2015,08,31
163,2015-08-30,2015,08,30
293,2015-08-29,2015,08,29
419,2015-08-28,2015,08,28
493,2015-08-27,2015,08,27
...,...,...,...,...
149081,2005-01-21,2005,01,21
149082,2004-12-17,2004,12,17
149083,2004-11-30,2004,11,30
149084,2004-11-10,2004,11,10


## Cargar tabla Product

In [None]:
df_products = df[["product_id", "product_parent", "product_title", "product_category"]].drop_duplicates(subset="product_id")

print("Cantidad total: ", df_products["product_id"].count())

df_products["product_category"] = df_products["product_category"].replace('_',' ',regex = True)

csv_file = "product.csv"
table = "product"

#df_products.to_csv(csv_file, sep = ';', index = False, header = False, quoting = csv.QUOTE_NONE, escapechar = '\\')
to_database(df_products,table)
df_products

Cantidad total:  1780


Unnamed: 0,product_id,product_parent,product_title,product_category
0,B004LLIL5A,346014806,Amazon eGift Card - Celebrate,Gift Card
1,B004LLIKVU,473048287,Amazon.com eGift Cards,Gift Card
2,B00IX1I3G6,926539283,Amazon.com Gift Card Balance Reload,Gift Card
4,B005ESMGV4,379368939,"Amazon.com Gift Cards, Pack of 3 (Various Desi...",Gift Card
5,B004KNWWU4,326384774,Amazon Gift Card - Print - Happy Birthday (Birds),Gift Card
...,...,...,...,...
147520,BT00CTP2B2,775486538,Amazon.com Gift Card in a Greeting Card (Vario...,Gift Card
147581,BT00CTP1EK,518174629,Amazon.com Gift Card - $500 (Christmas Tree de...,Gift Card
147630,B001H53QE4,825899505,"Amazon.com Gift Cards, Pack of 50 (Old Version...",Gift Card
148905,B000LGKQHU,941684275,Apple iTunes $25 Music Card,Gift Card


## Cargar tabla Review

In [None]:
df_review = df[["marketplace", "customer_id", "review_id", "product_id", "star_rating", "helpful_votes", "total_votes", "vine", "verified_purchase", "review_headline", "review_body","review_date"]]

df_review.dropna(subset=["star_rating"], inplace = True)

csv_file = "review.csv"
table = "review"

#df_review.to_csv(csv_file, sep = ';', index = False, header = False, quoting = csv.QUOTE_NONE, escapechar = '\\')

to_database(df_review,table)
df_review

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_review.dropna(subset=["star_rating"], inplace = True)


Unnamed: 0,marketplace,customer_id,review_id,product_id,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,24371595,R27ZP1F1CD0C3Y,B004LLIL5A,5,0,0,N,Y,Five Stars,Great birthday gift for a young adult.,2015-08-31
1,US,42489718,RJ7RSBCHUDNNE,B004LLIKVU,5,0,0,N,Y,Gift card for the greatest selection of items ...,It's an Amazon gift card and with over 9823983...,2015-08-31
2,US,861463,R1HVYBSKLQJI5S,B00IX1I3G6,5,0,0,N,Y,Five Stars,Good,2015-08-31
3,US,25283295,R2HAXF0IIYQBIR,B00IX1I3G6,1,0,0,N,Y,One Star,Fair,2015-08-31
4,US,397970,RNYLPX611NB7Q,B005ESMGV4,5,0,0,N,Y,Five Stars,I can't believe how quickly Amazon can get the...,2015-08-31
...,...,...,...,...,...,...,...,...,...,...,...,...
149081,US,40383801,R57O26VBSLMP1,B0002CZPPG,5,10,10,N,N,Way easier than explaining your musical taste ...,Finally there is a way for your family to buy ...,2005-01-21
149082,US,15124244,R375D634NGSSPI,B0002CZPPG,4,8,44,N,N,itunes paid for,its very convenient to have an idea of how mut...,2004-12-17
149083,US,40383801,R2NCVKVC9B7I9C,B0002CZPPG,5,20,30,N,N,Way easier than explaining your musical taste ...,Finally there is a way for your family to buy ...,2004-11-30
149084,US,30603398,RC2BIM4XKDCY4,B0002CZPPG,4,63,72,N,N,A great way to turn cash into songs,I picked up a few of these at Target a while b...,2004-11-10
