# Cloud storage solutions

The code mounts Google Drive, saves a pandas DataFrame as a CSV file to Google Drive, and then reads it back into a DataFrame. This contrasts with the textbook's use of Amazon S3 for data storage and retrieval.

In [None]:
from google.colab import drive
import pandas as pd
from io import StringIO
import os

# Mount Google Drive at /content/drive so the example below can read and
# write Drive files through ordinary file paths. If Drive is already mounted,
# the call just prints a notice.
drive.mount('/content/drive')

# Define functions
def save_to_drive(df, file_path):
    """Write a DataFrame to ``file_path`` as CSV, without the index column.

    Missing parent directories are created first. A bare file name (no
    directory component) is written to the current working directory.

    Args:
        df: pandas DataFrame to serialize.
        file_path: Destination path of the CSV file.
    """
    parent_dir = os.path.dirname(file_path)
    # os.makedirs('') raises FileNotFoundError, so only create directories
    # when the path actually contains a directory component.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Let pandas open the file itself instead of going through a StringIO and
    # a text-mode open(): no full in-memory copy of the CSV, line endings are
    # not translated a second time where os.linesep is '\r\n', and the file is
    # encoded as UTF-8 (pandas' default), which is what read_csv expects.
    df.to_csv(file_path, index=False)

def read_from_drive(file_path):
    """Load the CSV file at ``file_path`` into a pandas DataFrame.

    The file is parsed with the default ``pd.read_csv`` options.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The parsed DataFrame.
    """
    loaded = pd.read_csv(file_path)
    return loaded

# Round-trip demo: DataFrame -> CSV file on Drive -> DataFrame

# Sample data: five consecutive daily rows starting 2023-01-01.
df = pd.DataFrame(
    {
        'date': pd.date_range(start='2023-01-01', periods=5),
        'sales': [100, 150, 200, 120, 180],
    }
)

# Destination inside the mounted Drive folder.
file_path = '/content/drive/My Drive/data/sales_2023.csv'

# Persist the frame and report where it went.
save_to_drive(df, file_path)
print(f"Data saved to Google Drive at: {file_path}")

# Load the file back and show what was read.
df_from_drive = read_from_drive(file_path)
print("Data read from Google Drive:", df_from_drive, sep="\n")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data saved to Google Drive at: /content/drive/My Drive/data/sales_2023.csv
Data read from Google Drive:
         date  sales
0  2023-01-01    100
1  2023-01-02    150
2  2023-01-03    200
3  2023-01-04    120
4  2023-01-05    180


# Distributed file systems

The code mounts Google Drive, writes a pandas DataFrame to a Parquet file on Google Drive, and then reads it back into a DataFrame. The textbook uses HDFS for similar operations, which involves a more complex setup for big data storage and retrieval.

In [None]:
from google.colab import drive
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from io import BytesIO
import os

In [None]:
# Mount Google Drive at /content/drive so the Parquet example below can use
# ordinary file paths. If Drive is already mounted, the call just prints a
# notice.
drive.mount('/content/drive')

# Define functions
def write_to_drive(df, path):
    """Write a DataFrame to ``path`` as a Parquet file.

    Missing parent directories are created first, so callers do not have to
    prepare the destination folder themselves. A bare file name (no
    directory component) is written to the current working directory.

    Args:
        df: pandas DataFrame to serialize.
        path: Destination path of the Parquet file.
    """
    parent_dir = os.path.dirname(path)
    # os.makedirs('') raises FileNotFoundError, so skip it for bare file names.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Convert before opening the file so a conversion error does not leave an
    # empty/truncated file behind.
    table = pa.Table.from_pandas(df)

    # Hand the open file to pyarrow so it writes straight to disk instead of
    # first serializing the whole Parquet file into an in-memory BytesIO.
    with open(path, 'wb') as f:
        pq.write_table(table, f)

def read_from_drive(path):
    """Read the Parquet file at ``path`` and return it as a pandas DataFrame.

    The file's bytes are loaded completely into memory before pyarrow parses
    them.

    Args:
        path: Path of the Parquet file to read.

    Returns:
        The file's contents converted to a pandas DataFrame.
    """
    with open(path, 'rb') as parquet_file:
        raw_bytes = parquet_file.read()
    return pq.read_table(BytesIO(raw_bytes)).to_pandas()

# Round-trip demo: DataFrame -> Parquet file on Drive -> DataFrame

# Sample data: five consecutive daily rows starting 2023-01-01.
df = pd.DataFrame(
    {
        'date': pd.date_range(start='2023-01-01', periods=5),
        'sales': [100, 150, 200, 120, 180],
    }
)

# Destination inside the mounted Drive folder; make sure the folder is there
# before writing.
file_path = '/content/drive/My Drive/data/sales_2023.parquet'
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Persist the frame as Parquet and report where it went.
write_to_drive(df, file_path)
print(f"Data written to Google Drive: {file_path}")

# Load the file back and show what was read.
df_from_drive = read_from_drive(file_path)
print("Data read from Google Drive:", df_from_drive, sep="\n")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data written to Google Drive: /content/drive/My Drive/data/sales_2023.parquet
Data read from Google Drive:
        date  sales
0 2023-01-01    100
1 2023-01-02    150
2 2023-01-03    200
3 2023-01-04    120
4 2023-01-05    180


# Database management systems for DataOps

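Demonstrates how to use a relational database in a DataOps workflow, including creating a table, inserting data, and running an analytical query. The code below uses an in-memory SQLite database accessed through SQLAlchemy as a lightweight stand-in for PostgreSQL, and it showcases the integration of a DBMS with data processing libraries like pandas.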

In [None]:
!pip install pandas sqlalchemy

In [3]:
import pandas as pd
from sqlalchemy import create_engine, text

# Rows to load: one amount per day for five days starting 2023-01-01.
df = pd.DataFrame(
    {
        'date': pd.date_range('2023-01-01', periods=5),
        'amount': [100, 150, 200, 120, 180],
    }
)

# SQLite engine backed by an in-memory database.
engine = create_engine('sqlite:///:memory:')

with engine.connect() as conn:
    # Raw SQL is wrapped in text() so the connection can execute it.
    conn.execute(text("""
    CREATE TABLE IF NOT EXISTS sales (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        date DATE,
        amount FLOAT
    )
    """))

    # Append the rows to the table. pandas is handed the engine here, not
    # conn; the read below does the same for consistency.
    df.to_sql('sales', con=engine, if_exists='append', index=False)

    # Total amount per date, in date order.
    query = text("""
    SELECT date, SUM(amount) as total_sales FROM sales GROUP BY date ORDER BY date
    """)
    result_df = pd.read_sql_query(query, con=engine)

# Show the aggregated result.
print("Sales summary:", result_df, sep="\n")

Sales summary:
                         date  total_sales
0  2023-01-01 00:00:00.000000        100.0
1  2023-01-02 00:00:00.000000        150.0
2  2023-01-03 00:00:00.000000        200.0
3  2023-01-04 00:00:00.000000        120.0
4  2023-01-05 00:00:00.000000        180.0
