In [6]:
import pandas as pd
import sqlite3
from pathlib import Path
import os

# Input side: the transformed CSV files are looked up in the current working directory
data_dir = Path('.')
full_csv_path = data_dir.joinpath('transformed_full.csv')
incremental_csv_path = data_dir.joinpath('transformed_incremental.csv')

# Output side: everything this notebook writes goes under 'loaded_data'.
# exist_ok=True lets the cell be re-run without failing once the folder exists.
output_dir = Path('loaded_data')
output_dir.mkdir(exist_ok=True)

# Destination files inside the output folder (two SQLite databases, one Parquet file)
sqlite_full_path, sqlite_incremental_path, parquet_full_path = (
    output_dir / file_name
    for file_name in ('full_data.db', 'incremental_data.db', 'full_data.parquet')
)


In [8]:
# Load the full transformed dataset from CSV
df_full = pd.read_csv(full_csv_path)

# Write the frame to the 'full_data' table, replacing any table left by a
# previous run. The try/finally guarantees the connection is closed even when
# to_sql raises, so a failed write does not leave an open handle on the
# database file for the rest of the kernel session.
conn_full = sqlite3.connect(sqlite_full_path)
try:
    df_full.to_sql('full_data', conn_full, if_exists='replace', index=False)
finally:
    conn_full.close()

# Write the same frame as a Parquet file (the DataFrame index is not stored)
df_full.to_parquet(parquet_full_path, index=False)



In [12]:
# Load the incremental transformed dataset from CSV
df_incremental = pd.read_csv(incremental_csv_path)

# Write the frame to the 'incremental_data' table in its own database file,
# replacing any table left by a previous run. The try/finally guarantees the
# connection is closed even when to_sql raises, so a failed write does not
# leave an open handle on the database file.
conn_incremental = sqlite3.connect(sqlite_incremental_path)
try:
    df_incremental.to_sql(
        'incremental_data', conn_incremental, if_exists='replace', index=False
    )
finally:
    conn_incremental.close()


In [10]:
# Verify the SQLite load by reading back the first five rows of 'full_data'
print("Full Data (SQLite):")
conn = sqlite3.connect(sqlite_full_path)
try:
    # read_sql raises if the table is missing (e.g. the load cell was not run);
    # the finally clause still closes the connection in that case.
    print(pd.read_sql("SELECT * FROM full_data LIMIT 5;", conn))
finally:
    conn.close()


Full Data (SQLite):
     id customer        date  amount         last_updated  amount_with_tax  \
0  3636  Walmart  2025-04-02    1386  2025-04-02 22:55:00          1607.76   
1  2615  Walmart  2025-04-02     925  2025-04-02 01:48:00          1073.00   
2  6180   Amazon  2025-04-02    1950  2025-04-02 01:35:00          2262.00   
3  7060  BestBuy  2025-04-02     355  2025-04-02 13:22:00           411.80   
4  8236   Costco  2025-04-02     581  2025-04-02 10:29:00           673.96   

  amount_category  
0            High  
1          Medium  
2       Very High  
3             Low  
4          Medium  


In [11]:
# Preview the Parquet copy of the full dataset, if it has been written
print("\nFull Data (Parquet):")
if not parquet_full_path.exists():
    # Nothing to preview yet; tell the reader instead of raising
    print("Parquet file does not exist. Please generate it first.")
else:
    df_parquet = pd.read_parquet(parquet_full_path)
    print(df_parquet.head())



Full Data (Parquet):
     id customer        date  amount         last_updated  amount_with_tax  \
0  3636  Walmart  2025-04-02    1386  2025-04-02 22:55:00          1607.76   
1  2615  Walmart  2025-04-02     925  2025-04-02 01:48:00          1073.00   
2  6180   Amazon  2025-04-02    1950  2025-04-02 01:35:00          2262.00   
3  7060  BestBuy  2025-04-02     355  2025-04-02 13:22:00           411.80   
4  8236   Costco  2025-04-02     581  2025-04-02 10:29:00           673.96   

  amount_category  
0            High  
1          Medium  
2       Very High  
3             Low  
4          Medium  
