# Table Imports

In [0]:
import pandas as pd
import plotly.express as px
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:

# Load the gold-layer tables used by this notebook. Tables that later cells
# query through SQL are additionally registered as temp views.
Base_Gold = "abfss://team1-project2@20251124eyproject2.dfs.core.windows.net/gharchive-gold"


def _read_gold(table_name):
    """Return the parquet dataset stored at ``<Base_Gold>/<table_name>``."""
    return spark.read.parquet(f"{Base_Gold}/{table_name}")


# Dimension tables
datetime_dim_df = _read_gold("datetime_dim")
event_type_dim_df = _read_gold("event_type_dim")

# Fact tables that are also exposed as temp views for SQL cells
user_activity_weekly_fact_df = _read_gold("user_activity_weekly_fact")
user_activity_weekly_fact_df.createOrReplaceTempView("user_activity_weekly_fact")

push_event_df = _read_gold("push_event_fact")
push_event_df.createOrReplaceTempView("push_event_fact")

# Fact tables for which no temp view is registered
event_hourly_fact_df = _read_gold("event_hourly_fact")
weekly_repo_activity_df = _read_gold("weekly_repo_activity")
weekly_event_type_activity_df = _read_gold("weekly_event_type_activity")

In [0]:
%sql
-- Weekly active users for 2015: distinct users per week_id.
-- week_id is decoded as year * 100 + week number; week_start_date is
-- January 1st of that year plus (week number - 1) * 7 days.
-- The highest 2015 week_id is left out (presumably an incomplete week -- confirm).
WITH weeks_2015 AS (
  SELECT week_id, user_id
  FROM user_activity_weekly_fact
  WHERE CAST(week_id/100 AS INT) = 2015
)
SELECT
  week_id,
  DATE_ADD(
    TO_DATE(CONCAT(CAST(week_id/100 AS INT), '-01-01')),
    (week_id % 100 - 1) * 7
  ) AS week_start_date,
  COUNT(DISTINCT user_id) AS weekly_active_users
FROM weeks_2015
WHERE week_id < (SELECT MAX(week_id) FROM weeks_2015)
GROUP BY week_id
ORDER BY week_id;

In [0]:

# Push-event totals per ref type, rendered as a treemap.
ref_type_totals_sql = """
    SELECT
    ref_type,
    SUM(push_event_count) AS push_event_count
    FROM push_event_fact
    GROUP BY ref_type
"""
# One row per ref_type, so collecting to pandas is cheap.
tree_df = spark.sql(ref_type_totals_sql).toPandas()

fig = px.treemap(tree_df, path=["ref_type"], values="push_event_count", title="Push Events by Ref Type in 2015")

# Centre the title above the plot.
fig.update_layout(title_x=0.5, title_xanchor="center")

# Each tile shows: label, thousands-separated count, share of the root total.
fig.update_traces(
    texttemplate="%{label}<br>%{value:,}<br>%{percentRoot:.1%}",
    textfont={"color": "white"},
)
fig.show()



In [0]:
# Average event count per (hour of day, event type), plus a synthetic "Total"
# series, drawn as one line per event type.
#
# NOTE(review): `enriched_df` is not defined in any cell visible above this one,
# so on a fresh kernel this cell raises NameError. It must expose the columns
# hour_24, event_type_name and event_count; presumably it is event_hourly_fact_df
# joined to the datetime and event-type dimensions -- TODO: add/restore the cell
# that builds it before this one.

# Aggregate on Spark, then collect the (small) result to pandas.
# F.avg is used because a bare `avg` is never imported in this notebook.
agg_df = (
    enriched_df
        .groupBy("hour_24", "event_type_name")
        .agg(F.avg("event_count").alias("avg_events"))
        .toPandas()
)

# Calculate Total: per hour, the sum of the per-type averages.
total_df = agg_df.groupby("hour_24")["avg_events"].sum().reset_index()
total_df["event_type_name"] = "Total"

# Combine the per-type rows with the "Total" rows.
agg_df = pd.concat([agg_df, total_df], ignore_index=True)

# Calculate the sort rank (descending by each series' mean across hours).
rank = agg_df.groupby("event_type_name")["avg_events"].mean().sort_values(ascending=False).index.tolist()

# Apply Categorical sorting so rows (and the legend) follow the rank order.
agg_df["event_type_name"] = pd.Categorical(agg_df["event_type_name"], categories=rank, ordered=True)
agg_df = agg_df.sort_values(["event_type_name", "hour_24"])

# Color Mapping: cycle through the palette in rank order, then pin "Total".
colors = px.colors.qualitative.Alphabet
color_map = {event: colors[i % len(colors)] for i, event in enumerate(rank)}
color_map["Total"] = "#1f77b4"  # A strong, standard blue

# Plot
fig = px.line(
    agg_df,
    x="hour_24",
    y="avg_events",
    color="event_type_name",
    color_discrete_map=color_map,
    category_orders={"event_type_name": rank},
    title="Average GitHub Events per Hour of Day in 2015 (Total Included)",
    labels={
        "hour_24": "Hour of Day (24h)",
        "avg_events": "Average Event Count",
        "event_type_name": "Event Type"
    },
    markers=True,
    template="plotly"
)

fig.update_layout(
    xaxis=dict(
        tickmode="linear",
        dtick=1,          # one tick per hour
        range=[0, 23]
    ),
    hovermode="x unified",
    legend_title_text="Event Type (High to Low)",
    height=550
)

fig.show()

In [0]:
# Top repository per calendar month, ranked by total event count.

# Map every (year, week) to exactly one month. A (year, week) pair may occur
# under more than one month in datetime_dim (e.g. a week that straddles a month
# boundary); dropDuplicates would keep an arbitrary one of those rows, so take
# the earliest month instead to make the assignment deterministic.
datetime_week_dim = (
    datetime_dim_df
    .groupBy("year", "week")
    .agg(F.min("month").alias("month_of_year"))
)

# Join weekly_repo_activity and datetime_week_dim, then build a sortable
# "YYYY-MM" key (month zero-padded to two digits).
facts_with_month = weekly_repo_activity_df.join(datetime_week_dim, on=["year", "week"], how="inner")
facts_with_month = facts_with_month.withColumn("year_month", F.concat_ws("-", F.col("year"), F.lpad(F.col("month_of_year").cast("string"), 2, "0")))

# Aggregate from weekly to monthly
monthly_repo_activity_df = (
    facts_with_month
    .groupBy("year_month", "repo_id")
    .agg(F.sum("event_count").alias("event_count"))
)

# Get top repo per month. repo_id is a tie-breaker so that months where several
# repos share the maximum event_count yield the same winner on every run.
window = Window.partitionBy("year_month").orderBy(F.desc("event_count"), F.asc("repo_id"))
top_repo_per_month = (
    monthly_repo_activity_df
    .withColumn("rank", F.row_number().over(window))
    .filter(F.col("rank") == 1)
    .select("year_month", "repo_id", "event_count")
    .orderBy("year_month")
)

# Convert to pandas df (one row per month) and plot; each bar is labelled with
# the id of the winning repo.
top_repo_pd = top_repo_per_month.toPandas()
fig = px.bar(
    top_repo_pd,
    x="year_month",
    y="event_count",
    text="repo_id",
    labels={"year_month": "Month", "event_count": "Event Count", "repo_id": "Top Repo"},
    title="Top Repo per Month by Event Count"
)
# Treat months as categories so every month gets its own tick.
fig.update_xaxes(type='category', tickmode='array', tickvals=top_repo_pd["year_month"])
fig.update_layout(xaxis_title="Month", yaxis_title="Event Count", title_x=0.5)
fig.show()

In [0]:
# Total event occurrences per event type, highest first.
# The left join keeps type_ids that have no match in event_type_dim; those rows
# get a null event_type.
df_by_type = (
    weekly_event_type_activity_df
    .join(event_type_dim_df, weekly_event_type_activity_df.type_id == event_type_dim_df.type_id, "left")
    .groupBy(weekly_event_type_activity_df.type_id, event_type_dim_df.event_type)
    .agg(F.sum("event_occurrence_count").alias("Total Event Occurrences"))
    .orderBy(F.desc("Total Event Occurrences"))
)

# Plot with Plotly Express on the collected result, like the other charts in
# this notebook. The Spark DataFrame `.plot` accessor is only available from
# PySpark 4.0, so going through pandas also works on older runtimes.
by_type_pd = df_by_type.toPandas()
fig = px.bar(
    by_type_pd,
    x="event_type",
    y="Total Event Occurrences",
    title="Total Event Occurrences by Type in 2015",
)
fig.update_layout(title_x=0.5)
fig.show()

<h1 style="text-align: left;">Analytics Dashboard</h1>