# Data Caching & Exploration

## DB Connection

In [None]:
import pandas as pd
import os
from dotenv import load_dotenv
from snowflake.snowpark import Session

In [None]:
# Load environment variables from the .env file
load_dotenv()

# Snowflake connection settings are read from the process environment;
# a missing variable raises KeyError here, before any connection is attempted.
env = os.environ
params = dict(
    account=env["SNOWFLAKE_ACCOUNT"],
    role=env["SNOWFLAKE_ROLE"],
    database=env["SNOWFLAKE_DATABASE"],
    schema=env["SNOWFLAKE_SCHEMA"],
    warehouse=env["SNOWFLAKE_WAREHOUSE"],
    user=env["SNOWFLAKE_USER"],
    password=env["SNOWFLAKE_PASSWORD"],
    authentication="snowflake",
    # Session-level parameters passed along with the connection settings.
    session_parameters={"QUERY_TAG": "hack4rail"},
)

# Open the Snowpark session used by the cells below.
session = Session.builder.configs(params).create()

## Row Count

In [None]:
# Count the rows of BATTERIELOK_DATA with a SQL query and print the result.
row_count_query = "SELECT COUNT(*) AS total_rows FROM BATTERIELOK_DATA"
row_count_result = session.sql(row_count_query)
row_count_result.show()

## Load BATTERIELOK_DATA

In [None]:
# Pull the whole BATTERIELOK_DATA table into a local pandas DataFrame.
batterielok_table = session.table("BATTERIELOK_DATA")
df = batterielok_table.to_pandas()

# Report the number of downloaded rows, then preview the first rows as a sanity check.
print(f"Downloaded {len(df)} rows from BATTERIELOK_DATA")
df.head()

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

## Cache Data

In [None]:
# Write the full table as a gzip-compressed parquet file into ../data, the
# location the "Use Cache" cell reads it back from. Writing to the working
# directory instead would leave that cell without a file (or with a stale one).
os.makedirs("../data", exist_ok=True)  # make sure the target directory exists
df.to_parquet("../data/batterielok_data.parquet.gzip", compression="gzip", index=False)

## Cache Subset

In [None]:
# Keep the 1,000,000 rows with the earliest TIMESTAMP_VEHICLE values as a
# smaller working set and cache them as gzip-compressed parquet.
# NOTE(review): this file is written to the working directory and has no
# ".gzip" suffix, unlike the full cache file name - confirm both are intended.
df_sub = df.sort_values("TIMESTAMP_VEHICLE").head(1000000)
df_sub.to_parquet("batterielok_data_sub.parquet", compression="gzip", index=False)

## Use Cache

In [None]:
# Load the dataset from the local parquet cache; this rebinds `df`.
df = pd.read_parquet("../data/batterielok_data.parquet.gzip")

In [None]:
# Notebook shell escape: install the ydata-profiling package with uv so the
# profiling cell below can import it.
!uv pip install ydata-profiling

In [None]:
from ydata_profiling import ProfileReport

# Every correlation method is switched off (useful for large datasets).
correlation_methods = ("auto", "pearson", "spearman", "kendall", "phi_k", "cramers")
disabled_correlations = {method: {"calculate": False} for method in correlation_methods}

# The missing-value heatmap and matrix diagrams are switched off as well.
disabled_missing_diagrams = {"heatmap": False, "matrix": False}

# Build the report for the cached dataset and write it out as an HTML file.
profile = ProfileReport(
    df,
    title="Profiling Report",
    correlations=disabled_correlations,
    missing_diagrams=disabled_missing_diagrams,
)
profile.to_file("../data/batterielok_profiling_report.html")

In [None]:
# Notebook shell escape: bring the environment back in line with the project's
# declared dependencies.
# NOTE(review): by default `uv sync` appears to remove packages that are not
# declared in the project, which would include ydata-profiling if it was only
# installed ad hoc via `uv pip install` above - confirm it is declared.
!uv sync