# Distributed storage systems

The code creates a PyArrow table, writes it to a local Parquet file, reads the file back, and converts it to a pandas DataFrame for display. It also lists files in the local directory. This is a basic example of file handling .In contrast to the textbook, using HDFS for such tasks in a Python environment is a more complex approach.

In [None]:
!pip install pyarrow

In [2]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

# Create a table
data = pa.table({'id': [1, 2, 3], 'value': ['a', 'b', 'c']})

# Define the file path
file_path = '/content/data.parquet'

# Write data to local Parquet file
pq.write_table(data, file_path)

# Read data from local Parquet file
read_data = pq.read_table(file_path)

# Convert to pandas DataFrame and print
df = read_data.to_pandas()
print(df)

# List files in the local directory
import os
files = os.listdir('/content')
print("Files in '/content':")
for file in files:
    print(file)


   id value
0   1     a
1   2     b
2   3     c
Files in '/content':
.config
data.parquet
sample_data


# Partitioning and sharding strategies

- This example demonstrates range-based partitioning in SQL, where sales data is automatically distributed into quarterly partitions based on the sale date.

In [None]:
!pip install pysqlite3
import sqlite3



In [None]:
# Establish a connection to the SQLite database
connection = sqlite3.connect("partitioned_data.db")
cursor = connection.cursor()

# Create the main table
create_table_query = """
CREATE TABLE IF NOT EXISTS sales (
    sale_date DATE NOT NULL,
    product_id INT,
    amount DECIMAL(10,2)
);
"""
cursor.execute(create_table_query)

# Create partition tables
create_partition_queries = [
    """
    CREATE TABLE IF NOT EXISTS sales_2023_q1 (
        sale_date DATE NOT NULL,
        product_id INT,
        amount DECIMAL(10,2)
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS sales_2023_q2 (
        sale_date DATE NOT NULL,
        product_id INT,
        amount DECIMAL(10,2)
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS sales_2023_q3 (
        sale_date DATE NOT NULL,
        product_id INT,
        amount DECIMAL(10,2)
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS sales_2023_q4 (
        sale_date DATE NOT NULL,
        product_id INT,
        amount DECIMAL(10,2)
    );
    """
]

for query in create_partition_queries:
    cursor.execute(query)

# Insert data into the correct partition based on the sale_date
insert_data_query = """
INSERT INTO sales (sale_date, product_id, amount)
VALUES (?, ?, ?)
"""
data = [
    ('2023-02-15', 1001, 100.50),
    ('2023-08-10', 1003, 150.25),
    ('2023-11-30', 1004, 300.00)
]

for sale_date, product_id, amount in data:
    if '2023-01-01' <= sale_date < '2023-04-01':
        cursor.execute(insert_data_query, (sale_date, product_id, amount))
    elif '2023-04-01' <= sale_date < '2023-07-01':
        cursor.execute(insert_data_query, (sale_date, product_id, amount))
    elif '2023-07-01' <= sale_date < '2023-10-01':
        cursor.execute(insert_data_query, (sale_date, product_id, amount))
    elif '2023-10-01' <= sale_date < '2024-01-01':
        cursor.execute(insert_data_query, (sale_date, product_id, amount))

connection.commit()

# Query specific partition
query_partition_query = """
EXPLAIN QUERY PLAN
SELECT * FROM sales
WHERE sale_date BETWEEN '2023-04-01' AND '2023-06-30';
"""
cursor.execute(query_partition_query)
result = cursor.fetchall()
print("Query Execution Plan:")
for row in result:
    print(row)

# Close the cursor and connection
cursor.close()
connection.close()

Query Execution Plan:
(2, 0, 0, 'SCAN sales')


# Data modeling for scalability

- This example demonstrates a star schema design for sales data, which allows for efficient analytical queries
by separating dimensional attributes from the central fact table.

In [None]:
%%sql

-- Drop existing tables if they exist
DROP TABLE IF EXISTS dim_product;
DROP TABLE IF EXISTS dim_customer;
DROP TABLE IF EXISTS dim_date;
DROP TABLE IF EXISTS fact_sales;

-- Create dimension tables
CREATE TABLE dim_product (
    product_id INT PRIMARY KEY,
    product_name VARCHAR(100),
    category VARCHAR(50),
    brand VARCHAR(50)
);

CREATE TABLE dim_customer (
    customer_id INT PRIMARY KEY,
    customer_name VARCHAR(100),
    city VARCHAR(50),
    state VARCHAR(50)
);

CREATE TABLE dim_date (
    date_id DATE PRIMARY KEY,
    year INT,
    month INT,
    day INT,
    quarter INT
);

-- Create fact table
CREATE TABLE fact_sales (
    sale_id INT PRIMARY KEY,
    date_id DATE,
    product_id INT,
    customer_id INT,
    quantity INT,
    amount DECIMAL(10,2),
    FOREIGN KEY (date_id) REFERENCES dim_date(date_id),
    FOREIGN KEY (product_id) REFERENCES dim_product(product_id),
    FOREIGN KEY (customer_id) REFERENCES dim_customer(customer_id)
);

-- Insert sample data into dimension tables
INSERT INTO dim_product (product_id, product_name, category, brand) VALUES
(1, 'Product A', 'Electronics', 'BrandX'),
(2, 'Product B', 'Clothing', 'BrandY');

INSERT INTO dim_customer (customer_id, customer_name, city, state) VALUES
(1, 'Alice', 'New York', 'NY'),
(2, 'Bob', 'Los Angeles', 'CA');

INSERT INTO dim_date (date_id, year, month, day, quarter) VALUES
('2023-01-01', 2023, 1, 1, 1),
('2023-04-01', 2023, 4, 1, 2);

-- Insert data into fact table
INSERT INTO fact_sales (sale_id, date_id, product_id, customer_id, quantity, amount) VALUES
(1, '2023-01-01', 1, 1, 10, 199.99),
(2, '2023-04-01', 2, 2, 5, 299.99);

-- Example query for sales analysis
SELECT
    dp.category,
    dd.year,
    dd.quarter,
    SUM(fs.amount) as total_sales,
    COUNT(DISTINCT fs.customer_id) as unique_customers
FROM
    fact_sales fs
JOIN
    dim_product dp ON fs.product_id = dp.product_id
JOIN
    dim_date dd ON fs.date_id = dd.date_id
GROUP BY
    dp.category, dd.year, dd.quarter
ORDER BY
    dd.year, dd.quarter, total_sales DESC;

 * sqlite://
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
2 rows affected.
2 rows affected.
2 rows affected.
2 rows affected.
Done.


category,year,quarter,total_sales,unique_customers
Electronics,2023,1,199.99,1
Clothing,2023,2,299.99,1
