In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import DateType
from pyspark.sql.window import Window as W
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pyspark.sql import DataFrame

In [0]:
def null_check(df):
    """Return the names of the columns in ``df`` that contain at least one null.

    All columns are inspected in a single aggregation (one Spark job) instead
    of launching a separate ``filter().count()`` job per column.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        list[str]: Column names holding one or more nulls, in ``df.columns``
        order. Empty when ``df`` has no columns or no nulls.
    """
    column_names = df.columns
    if not column_names:
        # agg() needs at least one expression; nothing to check anyway.
        return []

    # count() ignores nulls, and when() without otherwise() yields null for
    # non-matching rows, so each expression counts the null cells of one column.
    # Positional aliases avoid clashes with awkward source column names.
    null_count_exprs = [
        F.count(F.when(F.col(name).isNull(), F.lit(1))).alias(f"null_count_{idx}")
        for idx, name in enumerate(column_names)
    ]
    null_counts = df.agg(*null_count_exprs).first()

    return [
        name
        for name, null_count in zip(column_names, null_counts)
        if null_count > 0
    ]

In [0]:
def get_tables(catalog_name, schema_name):
    """List fully qualified table names found in ``catalog_name.schema_name``.

    Runs ``SHOW TABLES`` through the notebook's global ``spark`` session and
    joins the catalog name with the ``database`` and ``tableName`` columns of
    the result. Names matching ``_sqldf`` are dropped (presumably notebook
    scratch results -- TODO confirm).

    Args:
        catalog_name: Catalog to look in; interpolated into the SQL text as-is.
        schema_name: Schema to look in; interpolated into the SQL text as-is.

    Returns:
        list[str]: One ``catalog.database.table`` string per remaining table.
    """
    listing = spark.sql(f"SHOW TABLES IN {catalog_name}.{schema_name}")
    qualified = listing.selectExpr(
        f"concat_ws('.', '{catalog_name}', database, tableName) as table"
    )
    kept = qualified.filter(""" table not rlike "_sqldf" """)

    table_names = []
    for record in kept.collect():
        table_names.append(record["table"])
    return table_names

In [0]:
# List the fully qualified names of the tables in the google_fit.silver schema.
get_tables("google_fit", "silver")

daily_activity_metrics

In [0]:
# Load the daily activity metrics table as a Spark DataFrame.
df_dam = spark.table("google_fit.silver.daily_activity_metrics")

In [0]:
# Render df_dam for visual inspection.
display(df_dam)

In [0]:
# Show which df_dam columns contain at least one null value.
null_check(df_dam)

In [0]:
# Inspect the column names available in df_dam.
df_dam.columns

In [0]:
def add_agg_metrics(
    df,
    partition_cols=["entity"],
    granularity_cols=["date", "month", "week"],
    agg_on_cols=[
        "distance_m"
        # "step_count",
        # "heart_points",
        # "heart_minutes",
        # "move_minutes_count",
    ],
    agg_metrics=["avg", "max", "min", "sum"]
):
    """Append windowed aggregate columns to ``df``.

    For every combination of granularity column, measure column and metric, a
    column named ``<daily|weekly|monthly>_<measure>_<metric>`` is added. Its
    value is the metric computed over the window partitioned by
    ``partition_cols`` plus the granularity column, so every row of a
    partition carries the same aggregate.

    Args:
        df: Spark DataFrame containing the partition, granularity and measure
            columns.
        partition_cols: Columns that are always part of the window partition.
        granularity_cols: Columns added one at a time to the partition.
            ``"week"`` maps to the ``weekly`` prefix, ``"month"`` to
            ``monthly``; any other name maps to ``daily``.
        agg_on_cols: Measure columns to aggregate.
        agg_metrics: Metrics to compute; each must be one of ``"avg"``,
            ``"max"``, ``"min"``, ``"sum"``.

    Returns:
        The input DataFrame with the aggregate columns appended.

    Raises:
        ValueError: If ``agg_metrics`` contains an unsupported metric name.
    """
    # The list defaults above are only read, never mutated, so sharing them
    # between calls is safe.
    supported_metrics = ("avg", "max", "min", "sum")
    agg_metrics = list(agg_metrics)
    unsupported = [metric for metric in agg_metrics if metric not in supported_metrics]
    if unsupported:
        # Fail early with a clear message instead of an AttributeError on None
        # deep inside the loop.
        raise ValueError(
            f"Unsupported agg_metrics {unsupported}; "
            f"expected a subset of {list(supported_metrics)}"
        )

    granularity_prefix = {"week": "weekly", "month": "monthly"}

    for granularity_col in granularity_cols:
        partition_col_list = [*partition_cols, granularity_col]
        print("partition_col_list: ", partition_col_list)
        w_spec = W.partitionBy(partition_col_list)
        prefix = granularity_prefix.get(granularity_col, "daily")
        for agg_on_col in agg_on_cols:
            for agg_metric in agg_metrics:
                # Each supported metric name is also the name of the matching
                # pyspark.sql.functions aggregate.
                agg_fn = getattr(F, agg_metric)
                df = df.withColumn(
                    f"{prefix}_{agg_on_col}_{agg_metric}",
                    agg_fn(F.col(agg_on_col)).over(w_spec)
                )
    return df

In [0]:
# Keep the entity, the date and the raw activity measures, then derive the
# calendar buckets that add_agg_metrics uses as window partitions.
df_dam_test = (
    df_dam.select(
        'entity',
        'date',
        'distance_m',
        'step_count',
        'heart_points',
        'heart_minutes',
        'move_minutes_count',
    )
    # Per the PySpark docs, next_day gives the first Sunday strictly after
    # `date`, so a Sunday is bucketed with the following Sunday.
    .withColumn('week', F.next_day('date', 'sunday'))
    # First day of the month, cast back from timestamp to a plain date.
    .withColumn('month', F.date_trunc('month', 'date').cast(DateType()))
    .orderBy(F.desc('date'))
)

In [0]:
# Render the trimmed frame with its derived week and month columns.
display(df_dam_test)

In [0]:
# Append the windowed aggregates using add_agg_metrics' default arguments.
df_dam_test_agg = add_agg_metrics(df= df_dam_test)

In [0]:
# Render the frame with the aggregate columns added.
display(df_dam_test_agg)

In [0]:
def plot_linegraphs(df_pd, figsize=(60, 30), graphs= None, category= 'entity'):
    """Draw a grid of seaborn line plots, one row per time granularity.

    Args:
        df_pd: pandas DataFrame to plot. Any object exposing a callable
            ``toPandas`` (e.g. a Spark DataFrame) is converted first.
        figsize: Size of the whole figure, passed to ``plt.subplots``.
        graphs: Mapping of x-axis column name -> list of y-axis column names;
            each key gets one row of plots. When ``None`` it is built from
            the columns of ``df_pd``: names containing ``daily`` are plotted
            against ``date``, ``weekly`` against ``week`` and ``monthly``
            against ``month``.
        category: Column used for the ``hue`` of every line plot.

    Returns:
        None. The figure is drawn as a side effect.

    Raises:
        ValueError: If there is nothing to plot (``graphs`` is empty or none
            of its entries lists a column).
    """
    # Duck-typed conversion: this also covers Spark DataFrame classes that do
    # not subclass pyspark.sql.DataFrame (e.g. Spark Connect on some versions).
    if callable(getattr(df_pd, "toPandas", None)):
        df_pd = df_pd.toPandas()

    if graphs is None:
        graphs = {
            'date': [],
            'week': [],
            'month': []
        }
        for col in df_pd.columns:
            if 'daily' in col:
                graphs['date'].append(col)
            elif 'weekly' in col:
                graphs['week'].append(col)
            elif 'monthly' in col:
                graphs['month'].append(col)

    n_cols = max((len(y_cols) for y_cols in graphs.values()), default=0)
    if n_cols == 0:
        raise ValueError(
            "Nothing to plot: no daily/weekly/monthly columns were found "
            "and `graphs` lists no columns."
        )

    # squeeze=False keeps `axs` two-dimensional even for a single row or
    # column, so axs[xi, yi] indexing is always valid.
    fig, axs = plt.subplots(
        ncols=n_cols, nrows=len(graphs), figsize=figsize, squeeze=False
    )
    for xi, (x, y_cols) in enumerate(graphs.items()):
        for yi, y in enumerate(y_cols):
            sns.lineplot(data= df_pd, x= df_pd[x], y= df_pd[y], hue= category, ax= axs[xi, yi])
            axs[xi, yi].set_title(f"{y}_by_{x}")
        # Hide the cells of this row that have no metric to show.
        for unused_ax in axs[xi, len(y_cols):]:
            unused_ax.set_visible(False)
    fig.tight_layout()


In [0]:
# Convert the aggregated Spark frame to a pandas DataFrame for plotting.
df_dam_test_agg_pd = df_dam_test_agg.toPandas()

In [0]:
# Plot the daily, weekly and monthly aggregate columns, coloured by entity.
plot_linegraphs(df_dam_test_agg_pd)

activities

In [0]:
# Load the activities table as a Spark DataFrame.
df_act = spark.table("google_fit.silver.activities")

In [0]:
# Render df_act for visual inspection.
display(df_act)

In [0]:
# Show which df_act columns contain at least one null value.
null_check(df_act)

all_sessions

In [0]:
# Load the all_sessions table as a Spark DataFrame.
df_ses = spark.table("google_fit.silver.all_sessions")

In [0]:
# Render df_ses for visual inspection.
display(df_ses)

In [0]:
# Show which df_ses columns contain at least one null value.
null_check(df_ses)

In [0]:
# Count the rows of df_ses per entity value.
display(df_ses.groupBy('entity').count())