# Groceries E-commerce EDA


In [33]:
# Imports and constants

import os
import boto3
import requests
from dotenv import load_dotenv
import pathlib
import pandas as pd
import fastparquet as fp

# Read the .env file first so the AWS credentials become visible in the process environment
load_dotenv()
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")

# Remote S3 location of the datasets and the local directory they are saved to
S3_PATH = "s3://zrive-ds-data/groceries/sampled-datasets/"
LOCAL_DATA_PATH = "groceries_data/"


## 1. Download or get data

The first time, we need to download the data from AWS (s3://zrive-ds-data/groceries/sampled-datasets/) and save it locally. On subsequent runs, we only need to read the local copy.

In [None]:
def download_grocery_data(aws_url: str, local_path: str) -> None:
    """
    Download every .parquet object found under an S3 prefix into a local directory.

    Args:
        aws_url: S3 location in the form "s3://<bucket>/<prefix>".
        local_path: Directory that receives the files; it is created if missing.

    Each object is saved directly under ``local_path`` using the base name of
    its key. If nothing is listed under the prefix, a notice is printed and
    the function returns without downloading anything.
    """

    print(f"Downloading grocery data from {aws_url} to {local_path}")

    # Create local directory if it doesn't exist
    pathlib.Path(local_path).mkdir(parents=True, exist_ok=True)

    # Initialize S3 client
    s3 = boto3.client(
        's3',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
    )

    # "s3://bucket/a/b/" splits into ['s3:', '', 'bucket', 'a', 'b', ''],
    # so index 2 is the bucket and the remainder is the key prefix.
    url_parts = aws_url.split('/')
    bucket_name = url_parts[2]
    prefix = '/'.join(url_parts[3:])
    print(f"Bucket_name: {bucket_name}, Prefix: {prefix}")

    # A single list_objects_v2 call returns at most 1000 keys; the paginator
    # follows continuation tokens so larger prefixes are not silently truncated.
    paginator = s3.get_paginator('list_objects_v2')
    found_any = False
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        for obj in page.get('Contents', []):
            found_any = True
            file_key = obj['Key']
            file_name = os.path.basename(file_key)

            # Skip folder placeholder keys (empty base name) and anything that
            # is not a parquet file, as promised by the docstring.
            if not file_name.endswith('.parquet'):
                continue

            local_file_path = os.path.join(local_path, file_name)

            # Download the file
            print(f"Downloading {file_key} to {local_file_path}")
            s3.download_file(bucket_name, file_key, local_file_path)

    if not found_any:
        print("No contents found in the specified S3 path.")



def data_files_exists(local_path: str, files_num: int = 5) -> bool:
    """
    Check whether at least ``files_num`` .parquet files exist in ``local_path``.

    Args:
        local_path: Directory expected to hold the downloaded data files.
        files_num: Minimum number of .parquet files required.

    Returns:
        True when the directory holds at least ``files_num`` entries whose
        name ends with ".parquet"; False otherwise, including when
        ``local_path`` does not exist or is not a directory.
    """
    try:
        entries = os.listdir(local_path)
    except (FileNotFoundError, NotADirectoryError):
        # On a first run the cache directory has not been created yet;
        # that simply means the data is not available locally.
        return False

    parquet_count = sum(1 for entry in entries if entry.endswith('.parquet'))
    return parquet_count >= files_num



def get_grocery_data(aws_url: str, local_path: str) -> pd.DataFrame:
    """
    Return the orders dataset, downloading the grocery data first if needed.

    Args:
        aws_url: S3 location ("s3://<bucket>/<prefix>") to download from when
            the data is not available locally.
        local_path: Local directory where the .parquet files are stored.

    Returns:
        The contents of ``orders.parquet`` in ``local_path`` as a DataFrame.
    """
    # The local cache directory (not the S3 URL) is what must be inspected.
    # The isdir guard covers the first run, when the directory does not exist yet.
    if os.path.isdir(local_path) and data_files_exists(local_path):
        print("Data already exists locally.")
    else:
        print("Data not found locally. Downloading...")
        download_grocery_data(aws_url, local_path)

    # Load for example orders data into a DataFrame, reading it from the
    # directory the files were downloaded to rather than the working directory.
    orders_file = os.path.join(local_path, 'orders.parquet')
    orders_df = pd.read_parquet(orders_file, engine='fastparquet')

    return orders_df

In [32]:
# Fetch the orders DataFrame (downloading the data first if it is not available locally)
# and preview its first rows.
df = get_grocery_data(S3_PATH, LOCAL_DATA_PATH)
df.head()

Data not found locally. Downloading...
Downloading grocery data from s3://zrive-ds-data/groceries/sampled-datasets/ to groceries_data/
Bucket_name: zrive-ds-data, Prefix: groceries/sampled-datasets/
Downloading groceries/sampled-datasets/abandoned_carts.parquet to groceries_data/abandoned_carts.parquet
Downloading groceries/sampled-datasets/inventory.parquet to groceries_data/inventory.parquet
Downloading groceries/sampled-datasets/orders.parquet to groceries_data/orders.parquet
Downloading groceries/sampled-datasets/regulars.parquet to groceries_data/regulars.parquet
Downloading groceries/sampled-datasets/users.parquet to groceries_data/users.parquet


ImportError: Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [None]:

# Split the S3 URL once: element 2 is the bucket, everything after it forms the key prefix
_s3_parts = S3_PATH.split('/')
bucket_name, prefix = _s3_parts[2], '/'.join(_s3_parts[3:])

# S3 client authenticated with the credentials read from the environment
s3_client = boto3.client(
    's3', aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)