# Data Ingestion to Snowflake

This notebook loads the synthetic e-commerce data from local CSV files into Snowflake.

## Objectives
1. Generate synthetic data (if not already present)
2. Connect to Snowflake
3. Create the required database schema and tables
4. Load data from CSV files into Snowflake tables
5. Verify the data load
6. Disconnect from Snowflake

In [None]:
# Import libraries
import sys
sys.path.append("/home/ubuntu/snowflake_ds_project")

import os
import pandas as pd
from pathlib import Path

from src.snowflake_connector import SnowflakeConnector
from src.data_loader import DataLoader
from data.generate_synthetic_data import main as generate_data
from config import config

print("Libraries imported successfully!")

## 1. Generate Synthetic Data

In [None]:
# Generate the synthetic data unless every CSV this notebook reads is already present.
data_dir = Path("/home/ubuntu/snowflake_ds_project/data")
customers_file = data_dir / "customers.csv"

# The verification step reads all three files from data_dir, so checking only
# customers.csv would skip generation when the directory is partially populated.
required_files = [
    customers_file,
    data_dir / "transactions.csv",
    data_dir / "customer_activity.csv",
]
missing_files = [path.name for path in required_files if not path.exists()]

if missing_files:
    print("Data files not found. Generating synthetic data...")
    print(f"Missing: {', '.join(missing_files)}")
    # NOTE(review): assumes generate_data() (re)writes all three CSVs into data_dir -- confirm.
    generate_data()
else:
    print("Data files already exist. Skipping generation.")

## 2. Connect to Snowflake

In [None]:
# Validate configuration before any connection attempt is made.
try:
    config.validate()
except ValueError as e:
    print(f"Configuration Error: {e}")
    print("Please create a `.env` file in the root directory with your Snowflake credentials.")
    # Re-raise so that "Run All" stops here instead of continuing on to
    # connect with a configuration that has just been reported as invalid.
    raise
else:
    print("Configuration validated successfully.")

In [None]:
# Build the connector from the configured connection parameters, then connect
connection_params = config.get_connection_params()
connector = SnowflakeConnector(connection_params)
connector.connect()

print("Connected to Snowflake successfully!")

## 3. Create Database Schema and Tables

In [None]:
# Execute the SQL script that creates the database structure.
sql_file_path = "/home/ubuntu/snowflake_ds_project/sql/create_tables.sql"

try:
    print(f"Executing SQL script: {sql_file_path}")
    connector.execute_sql_file(sql_file_path)
except Exception as e:
    print(f"Error executing SQL script: {e}")
    # Re-raise: the following cells load data into the objects this script
    # creates, so continuing after a failure would only produce a later,
    # less informative error.
    raise
else:
    print("Database, schemas, and tables created successfully.")

## 4. Load Data into Snowflake

In [None]:
# Bind a DataLoader to the active connector
loader = DataLoader(connector)

# Push the contents of every CSV file into Snowflake
loader.load_all_data()

## 5. Verify Data Load

In [None]:
# Fetch per-table row counts as reported by the loader.
row_counts = loader.verify_data_load()

# Compare with CSV file row counts
print("\nComparing Snowflake row counts with local CSV files:")
print("-" * 50)

# Local CSV file name -> key used to look the table up in row_counts.
file_table_mapping = {
    "customers.csv": "CUSTOMERS",
    "transactions.csv": "TRANSACTIONS",
    "customer_activity.csv": "CUSTOMER_ACTIVITY"
}

for csv_file, table_name in file_table_mapping.items():
    csv_path = data_dir / csv_file
    df = pd.read_csv(csv_path)
    csv_rows = len(df)
    # A table missing from row_counts is reported as 0 rows rather than raising.
    snowflake_rows = row_counts.get(table_name, 0)
    # Built outside the f-string: reusing double quotes inside a double-quoted
    # f-string is a SyntaxError on Python versions before 3.12 (PEP 701).
    match_label = "Yes" if csv_rows == snowflake_rows else "No"

    print(f"Table: {table_name}")
    print(f"  - CSV Rows:      {csv_rows:,}")
    print(f"  - Snowflake Rows: {snowflake_rows:,}")
    print(f"  - Match:         {match_label}")
    print()

## 6. Cleanup

In [None]:
# Final step: disconnect the Snowflake connector and confirm
connector.disconnect()

print("Disconnected from Snowflake")