In [None]:
###########################################################################################################
# 003b_split_tables_duckdb.ipynb
#
# Read the Amazon Reviews table (McAuley-Lab data) from DuckDB,
#       see https://github.com/Stephen-McDaniel/Download_Amazon_Reviews_2023
#       then:
#           1) Split it into a table of most columns, except the large text columns
#           2) Split it into a 2nd table of the large text columns
# row_number is the unique identifier field common to both tables
#
# Before running:
#   1) Change the core_path variable
#
# Stephen McDaniel at https://PeakPython.com
# 2024-NOV-19
#
# License: MIT License
###########################################################################################################

# Root folder of the project data; change this to match the local machine.
# The DuckDB file is opened from "duckdb/db.duckdb" beneath this folder.
core_path = '/root/pc_01_pandasibis/110_McAuley_Amazon_Data/'

In [None]:
# Start of notebook: capture the wall-clock start time so the last cell can
# report the total notebook runtime
import time
notebook_start_time = time.time()


In [None]:
# Libraries used by the notebook, kept in their original load order
import pandas as pd
import ibis
import ibis.selectors as s
from ibis import _
from itables import show
from itables.javascript import init_notebook_mode

# Display configuration:
#   - switch Ibis to interactive mode
#   - initialize itables with all_interactive enabled
ibis.options.interactive = True
init_notebook_mode(all_interactive=True)



In [None]:

# Connect to the DuckDB file. core_path may or may not end with "/", so any
# trailing slash is stripped before joining to avoid a doubled separator.
con = ibis.duckdb.connect(core_path.rstrip("/") + "/duckdb/db.duckdb")

try:
    # Reference the existing source table in DuckDB using Ibis
    original_table = con.table("Health_and_Household")

    # Define columns for the new tables; row_number appears in both lists
    # because it is the identifier common to the two tables
    columns_table1 = ['row_number', 'rating', 'asin', 'parent_asin', 'user_id', 'helpful_vote', 'verified_purchase', 'date_time']
    columns_table2 = ['row_number', 'title', 'text']

    # Name of each new permanent table mapped to the columns it receives
    split_tables = {
        'Health_and_Household_1': columns_table1,    # most columns, no large text
        'Health_and_Household_2': columns_table2,    # the large text columns
    }

    for split_name, split_columns in split_tables.items():
        con.create_table(
            split_name,
            original_table.select(split_columns),
            temp=False,
            overwrite=True,           # Will replace if table already exists
        )
finally:
    # Always release the connection, even when a table fails to build, so the
    # database file is not left open by this kernel
    con.disconnect()

In [None]:
# End of notebook: report the wall-clock time elapsed since the start cell ran
notebook_end_time = time.time()
elapsed_seconds = notebook_end_time - notebook_start_time
runtime_minutes = elapsed_seconds / 60
print(f"Total notebook runtime: {runtime_minutes:.2f} minutes")
