# Data Lake

This example demonstrates how raw data can be stored in its original JSON format within a data lake, preserving its flexibility for future analysis and processing.
This code mounts Google Drive, saves raw JSON data to a file in Google Drive, and then reads and displays the contents of the JSON file.

In [None]:
from google.colab import drive
import json

# Mount Google Drive so files under /content/drive can be read and written
drive.mount('/content/drive')

# Raw JSON data: a single sample event record
raw_data = {
    'user_id': 12345,
    'timestamp': '2023-05-01T12:34:56Z',
    'event_type': 'purchase',
    'product_id': 'ABC123',
    'amount': 99.99
}

# Define the file path in Google Drive
file_path = '/content/drive/My Drive/event_12345.json'

# Save raw data to a JSON file.
# The encoding is stated explicitly so the file is always written as UTF-8
# rather than in whatever the platform's default text encoding happens to be.
with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(raw_data, json_file)

print(f"Data stored in Google Drive at: {file_path}")

# Read the JSON data back (same explicit encoding as the write) and display it
with open(file_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

print("Contents of the JSON file:")
print(json.dumps(data, indent=4))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data stored in Google Drive at: /content/drive/My Drive/event_12345.json
Contents of the JSON file:
{
    "user_id": 12345,
    "timestamp": "2023-05-01T12:34:56Z",
    "event_type": "purchase",
    "product_id": "ABC123",
    "amount": 99.99
}


# Data warehouse architectures

This code creates an SQLite database with three tables (dim_product, dim_customer, fact_sales), inserts sample data into them, and executes a SQL query to aggregate and summarize sales data by product category and customer segment, displaying the results in a Pandas DataFrame.

In [None]:
import sqlite3
import pandas as pd

# Connect to SQLite database (creates the database file if it doesn't exist)
conn = sqlite3.connect('sales_data.db')
cursor = conn.cursor()

# SQLite does not enforce FOREIGN KEY clauses by default; enforcement has to be
# switched on per connection, so do it before any rows are written.
cursor.execute('PRAGMA foreign_keys = ON')

# Create tables: two dimension tables and one fact table that references them.
# IF NOT EXISTS keeps these statements safe to run against an existing file.
cursor.execute('''
CREATE TABLE IF NOT EXISTS dim_product (
    product_id INTEGER PRIMARY KEY,
    product_category TEXT
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS dim_customer (
    customer_id INTEGER PRIMARY KEY,
    customer_segment TEXT
)
''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS fact_sales (
    sale_id INTEGER PRIMARY KEY,
    product_id INTEGER,
    customer_id INTEGER,
    sales_amount REAL,
    sale_date TEXT,
    FOREIGN KEY (product_id) REFERENCES dim_product (product_id),
    FOREIGN KEY (customer_id) REFERENCES dim_customer (customer_id)
)
''')

# Insert sample data.
# INSERT OR IGNORE skips any row whose primary key is already present, so this
# cell can be re-run against an already-populated sales_data.db without raising
# sqlite3.IntegrityError (a plain INSERT fails on the second run).
# Dimension rows go in first so the fact rows' foreign keys can be satisfied.
cursor.executemany('''
INSERT OR IGNORE INTO dim_product (product_id, product_category) VALUES (?, ?)
''', [(1, 'Electronics'), (2, 'Furniture'), (3, 'Clothing')])

cursor.executemany('''
INSERT OR IGNORE INTO dim_customer (customer_id, customer_segment) VALUES (?, ?)
''', [(1, 'Retail'), (2, 'Wholesale'), (3, 'Online')])

cursor.executemany('''
INSERT OR IGNORE INTO fact_sales (sale_id, product_id, customer_id, sales_amount, sale_date) VALUES (?, ?, ?, ?, ?)
''', [
    (1, 1, 1, 1200.50, '2023-01-15'),
    (2, 1, 2, 800.75, '2023-02-20'),
    (3, 2, 1, 1500.00, '2023-03-10'),
    (4, 3, 3, 500.25, '2023-01-25'),
    (5, 2, 2, 300.00, '2023-02-28')
])

# Commit changes
conn.commit()


In [None]:
# SQL query to analyze sales data from a data warehouse.
# The reporting window is bound through `?` placeholders instead of being
# written into the SQL text, so the period can be changed in one place.
sales_period = ('2023-01-01', '2023-03-31')  # (start, end); BETWEEN includes both ends

query = '''
SELECT
    p.product_category,
    c.customer_segment,
    SUM(f.sales_amount) as total_sales,
    AVG(f.sales_amount) as avg_sales,
    COUNT(DISTINCT f.customer_id) as unique_customers
FROM
    fact_sales f
JOIN
    dim_product p ON f.product_id = p.product_id
JOIN
    dim_customer c ON f.customer_id = c.customer_id
WHERE
    f.sale_date BETWEEN ? AND ?
GROUP BY
    p.product_category, c.customer_segment
ORDER BY
    total_sales DESC
LIMIT 10;
'''

# Execute query (binding the period to the two placeholders) and fetch results
df = pd.read_sql_query(query, conn, params=sales_period)
print(df)


  product_category customer_segment  total_sales  avg_sales  unique_customers
0        Furniture           Retail      1500.00    1500.00                 1
1      Electronics           Retail      1200.50    1200.50                 1
2      Electronics        Wholesale       800.75     800.75                 1
3         Clothing           Online       500.25     500.25                 1
4        Furniture        Wholesale       300.00     300.00                 1


In [None]:
# Close the SQLite connection opened for sales_data.db
conn.close()


# Hybrid approaches: Data lakehouse

This example demonstrates how a data lakehouse can support SQL-like queries on large datasets stored in open formats, combining the flexibility of a data lake with the analytical capabilities of a data warehouse.

1. *Set Up Environment: The code installs Java and PySpark, sets the Spark and Java environment variables, then initializes a Spark session.*
2. *Create DataFrames and Perform SQL Query: Two example DataFrames are created, registered as temporary views, and an SQL query aggregates sales data by product category.*
3. *Display Results: The query results are displayed, showcasing data querying capabilities in a Spark-based data lakehouse environment.*

In [None]:
# Install necessary packages
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!pip install pyspark

In [None]:
import os
from pyspark.sql import SparkSession

# Set Spark environment variables.
# SPARK_HOME is derived from the installed pyspark package instead of a
# hard-coded python3.10 path, so it stays correct when the runtime's Python
# version (and therefore the dist-packages directory) changes.
import pyspark

os.environ["SPARK_HOME"] = os.path.dirname(pyspark.__file__)
# Location of the OpenJDK 11 installation
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

# Initialize Spark session (getOrCreate reuses an existing session if there is one)
spark = (
    SparkSession.builder
    .appName("DataLakehouseQuery")
    .getOrCreate()
)


In [None]:
# Build two small example DataFrames from in-memory rows
sales_rows = [
    (1, '2023-01-15', 'P001', 150.00),
    (2, '2023-02-17', 'P002', 200.00),
    (3, '2023-03-20', 'P001', 250.00),
    (4, '2023-03-25', 'P003', 300.00),
]
sales_columns = ["id", "sale_date", "product_id", "amount"]
sales_data = spark.createDataFrame(sales_rows, sales_columns)

product_rows = [
    ('P001', 'Electronics'),
    ('P002', 'Clothing'),
    ('P003', 'Electronics'),
]
product_columns = ["product_id", "category"]
product_data = spark.createDataFrame(product_rows, product_columns)

# Expose both DataFrames to Spark SQL under the view names "sales" and "products"
sales_data.createOrReplaceTempView("sales")
product_data.createOrReplaceTempView("products")

In [None]:
# SQL text: total and average sale amount per product category,
# for sales dated 2023-01-01 or later, largest total first (top 5)
category_sales_sql = """
SELECT
    p.category,
    SUM(s.amount) as total_sales,
    AVG(s.amount) as avg_sale_amount
FROM
    sales s
JOIN
    products p ON s.product_id = p.product_id
WHERE
    s.sale_date >= '2023-01-01'
GROUP BY
    p.category
ORDER BY
    total_sales DESC
LIMIT 5
"""

# Run the query against the registered temporary views
result = spark.sql(category_sales_sql)

# Print the resulting rows as a text table
result.show()


+-----------+-----------+------------------+
|   category|total_sales|   avg_sale_amount|
+-----------+-----------+------------------+
|Electronics|      700.0|233.33333333333334|
|   Clothing|      200.0|             200.0|
+-----------+-----------+------------------+

