In [None]:
###########################################################################################################
# 002_read_CSV_export.ipynb
#
# Read the CSV of Amazon Reviews from the McAuley-Lab data as a data frame, 
#       see https://github.com/Stephen-McDaniel/Download_Amazon_Reviews_2023
#       then:
#           1) Export it to DuckDB for use by Ibis in later programs
#           2) Export it to a feather file for ultra fast access from Pandas
#
# Before running:
#   1) Create the duckdb and feather directories
#   2) Change the core_path variable
#
# Stephen McDaniel at https://PeakPython.com
# 2024-NOV-19
#
# License: MIT License
###########################################################################################################
 
# Root directory of the project data. The input CSV, the DuckDB database and
# the feather output are all addressed relative to this path, so edit it to
# match your environment before running the notebook.
core_path = '/root/pc_01_pandasibis/110_McAuley_Amazon_Data/'

In [None]:
import os

import pandas as pd

import ibis

import ibis.selectors as s
from ibis import _

from itables import show
from itables.javascript import init_notebook_mode

# Switch ibis to interactive mode for this notebook session.
ibis.options.interactive = True

# Set up itables; all_interactive=True is requested so that tables displayed
# in the notebook use the itables renderer.
init_notebook_mode(all_interactive=True)

In [None]:
table_name = 'Health_and_Household'

# Build the input path with os.path.join: core_path already ends with a slash,
# so plain string concatenation would produce a doubled '//' separator.
csv_in = os.path.join(core_path, 'amazon_reviews', 'processed', table_name + '.csv')

# Quick preview: read only the first 100 data rows (the header line supplies
# the column names) to inspect the layout without loading the whole file.
df = pd.read_csv(csv_in, nrows=100)

# To see all columns without truncation
pd.set_option('display.max_columns', None)
print(df)

# To see the data types of columns
print(df.dtypes)

# Check shape (rows, columns)
print(df.shape)

In [None]:
# Display the 100-row preview frame through itables' show().
show(df)

In [None]:
# Load the complete CSV; this replaces the 100-row preview held in df.
df = pd.read_csv(csv_in)

# The original code divided `timestamp` by 1000 before converting it as
# seconds, i.e. it holds epoch milliseconds. Convert it directly with
# unit='ms' so no lossy float division is needed, then round to whole seconds.
df['date_time'] = pd.to_datetime(df['timestamp'], unit='ms').dt.round('s')

# The raw epoch column is redundant once date_time exists.
df = df.drop(columns='timestamp')

In [None]:
# Display the full frame, which now carries `date_time` in place of `timestamp`.
show(df)

In [None]:
# Location of the DuckDB database file beneath core_path. os.path.join avoids
# the doubled '//' that concatenating onto core_path's trailing slash creates.
duckdb_path = os.path.join(core_path, 'duckdb', 'db.duckdb')

# Create the target directory if it is missing so a fresh setup does not fail
# on a manual prerequisite.
os.makedirs(os.path.dirname(duckdb_path), exist_ok=True)

con = ibis.duckdb.connect(duckdb_path)

# Store the data frame as a table named after table_name; overwrite=True is
# passed so re-running the cell replaces a previously created table.
con.create_table(table_name, df, overwrite=True)

In [None]:
# Disconnect from the DuckDB database now that the table has been written.
con.disconnect()

In [None]:
# Optional clean-up of the source CSV after its contents were exported.
# The previous version printed "has been deleted" without ever removing the
# file. Deletion is now an explicit opt-in: removing the CSV prevents the
# notebook from being re-run from the top, so the flag defaults to False.
delete_csv_after_export = False

if not os.path.exists(csv_in):
    print(f"File '{csv_in}' does not exist.")
elif delete_csv_after_export:
    os.remove(csv_in)
    print(f"File '{csv_in}' has been deleted.")
else:
    print(f"File '{csv_in}' exists and was left in place.")

In [None]:
# Writing for Pandas read
# os.path.join avoids the doubled '//' that concatenating onto core_path's
# trailing slash creates; the directory is created if it is missing.
feather_path = os.path.join(core_path, 'feather', table_name + '.feather')
os.makedirs(os.path.dirname(feather_path), exist_ok=True)
df.to_feather(feather_path)



In [None]:
# List the column labels of the exported frame as a final sanity check.
df.columns