<a href="https://colab.research.google.com/github/ShikharV010/gist_daily_runs/blob/main/PagePerformance_TrendsCalculations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install psycopg2-binary sqlalchemy pandas

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.10


In [None]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# Connection settings are read from the environment so that no secret is stored
# in the notebook (or in its git history). Recognised variables:
#   GIST_DB_USER, GIST_DB_PASSWORD, GIST_DB_HOST, GIST_DB_PORT, GIST_DB_NAME
# Only the password has no default: it is prompted for when the variable is unset.
# NOTE(review): the password that used to be hardcoded here was committed to the
# repository and should be rotated.
import os
from getpass import getpass

from sqlalchemy.engine import URL

db_config = {
    'user': os.environ.get('GIST_DB_USER', 'airbyte_user'),
    'password': os.environ.get('GIST_DB_PASSWORD') or getpass('PostgreSQL password: '),
    'host': os.environ.get('GIST_DB_HOST', 'gw-rds-prod.celzx4qnlkfp.us-east-1.rds.amazonaws.com'),
    'port': os.environ.get('GIST_DB_PORT', '5432'),
    'database': os.environ.get('GIST_DB_NAME', 'gw_prod')
}

# Create SQLAlchemy engine.
# URL.create() escapes each component, so a password containing characters such
# as '@', ':' or '/' cannot corrupt the connection string the way an f-string would.
engine = create_engine(
    URL.create(
        "postgresql+psycopg2",
        username=db_config['user'],
        password=db_config['password'],
        host=db_config['host'],
        port=int(db_config['port']),
        database=db_config['database'],
    )
)


In [None]:
# Load the page-performance rows used by the rest of the notebook.
# The query casts start_date / end_date to DATE and selects the four metrics.
sql = """
SELECT
  campaign_id,
  start_date::date AS start_date,
  end_date::date AS end_date,
  impressions,
  clicks,
  ctr,
  position
FROM gist.matv_gist_pageperformance
"""
df_raw = pd.read_sql(sql=sql, con=engine)


In [None]:
# Render the loaded frame for a quick visual check of the query result.
display(df_raw)

Unnamed: 0,campaign_id,start_date,end_date,impressions,clicks,ctr,position
0,00d856fb-4297-480c-b2d5-c13701deffe0,2024-06-06,2024-07-04,0,0,0.000000,0.000000
1,00d856fb-4297-480c-b2d5-c13701deffe0,2024-06-13,2024-07-11,0,0,0.000000,0.000000
2,00d856fb-4297-480c-b2d5-c13701deffe0,2024-06-20,2024-07-18,0,0,0.000000,0.000000
3,00d856fb-4297-480c-b2d5-c13701deffe0,2024-06-27,2024-07-25,0,0,0.000000,0.000000
4,00d856fb-4297-480c-b2d5-c13701deffe0,2024-07-04,2024-08-01,0,0,0.000000,0.000000
...,...,...,...,...,...,...,...
1648814,ffcdff2b-6a78-46db-b636-7ea621dad1a7,2025-06-26,2025-07-24,3346,2,0.000598,56.184100
1648815,ffcdff2b-6a78-46db-b636-7ea621dad1a7,2025-07-03,2025-07-31,3017,2,0.000663,55.667219
1648816,ffcdff2b-6a78-46db-b636-7ea621dad1a7,2025-07-10,2025-08-07,2137,2,0.000936,55.865232
1648817,ffcdff2b-6a78-46db-b636-7ea621dad1a7,2025-07-17,2025-08-14,1374,1,0.000728,54.267831


In [None]:
# Ensure start_date and end_date are in datetime format
df_raw["start_date"] = pd.to_datetime(df_raw["start_date"])
df_raw["end_date"] = pd.to_datetime(df_raw["end_date"])

# Create a period label like "5Jun-3Jul".
# The non-padded day directive '%-d' is a platform-specific strftime extension
# (it is rejected on Windows), so format with the portable '%d' and strip the
# leading zero instead; the resulting labels are the same.
# NOTE(review): the label carries no year, so two periods that share the same
# day/month boundaries in different years would get the same label and be
# aggregated together downstream - confirm the data window makes that impossible.
start_part = df_raw["start_date"].dt.strftime('%d%b').str.lstrip('0')
end_part = df_raw["end_date"].dt.strftime('%d%b').str.lstrip('0')
df_raw["period_label"] = start_part + "-" + end_part


In [None]:
def _mean_of_nonzero(values):
    """Average a Series after turning zeros into NaN so they are ignored."""
    return values.replace(0, np.nan).mean()


# Collapse the raw rows to one row per (campaign_id, period_label):
# impressions and clicks are summed, ctr is a plain (unweighted) mean,
# and position is averaged over its non-zero values only.
period_totals = df_raw.groupby(['campaign_id', 'period_label'], as_index=False).agg(
    impressions=('impressions', 'sum'),
    clicks=('clicks', 'sum'),
    ctr=('ctr', 'mean'),
    position=('position', _mean_of_nonzero),
)

# Per-campaign maxima of the period totals, exposed as
# impressions_max / clicks_max.
max_vals = (
    period_totals
    .groupby('campaign_id')[['impressions', 'clicks']]
    .max()
    .add_suffix('_max')
    .reset_index()
)

# Attach each campaign's maxima to every one of its period rows.
df_agg = period_totals.merge(max_vals, on='campaign_id', how='left')


In [None]:
import pandas as pd
import re

# Step 1: Go from long to wide - one row per campaign and one column per
# (metric, period) pair. The per-campaign max columns are left out of the pivot.
df_to_pivot = df_agg.drop(columns=["impressions_max", "clicks_max"])
wide = df_to_pivot.pivot(index='campaign_id', columns='period_label')
# Flatten the two-level (metric, period) header into "metric_period" names.
wide.columns = wide.columns.map(lambda pair: f"{pair[0]}_{pair[1]}")
df_pivot_clean = wide.reset_index()

# Step 2: Build the period_label -> end_date lookup from df_raw,
# indexed by label and ordered from the most recent end_date to the oldest.
label_end_pairs = df_raw[['period_label', 'end_date']].drop_duplicates()
period_end_dates = (
    label_end_pairs
    .sort_values('end_date', ascending=False)
    .set_index('period_label')
)

# Step 3: Sort all metrics based on descending end_date
def sort_metric_columns(metric_prefix):
    """Return the `<metric_prefix>_<label>` columns present in df_pivot_clean.

    Labels are visited in the order of `period_end_dates.index`, so the
    result follows that ordering; labels with no matching column are skipped.
    """
    ordered = []
    for label in period_end_dates.index:
        candidate = f"{metric_prefix}_{label}"
        if candidate in df_pivot_clean.columns:
            ordered.append(candidate)
    return ordered

# Period columns of each metric, in period_end_dates order.
impr_cols, click_cols, ctr_cols, pos_cols = (
    sort_metric_columns(prefix)
    for prefix in ("impressions", "clicks", "ctr", "position")
)

# Step 4: Merge max values
df_pivot_clean = df_pivot_clean.merge(max_vals, on="campaign_id", how="left")

# Step 5: Final layout - identifier, the two max columns, then each metric's
# period columns grouped together.
final_cols = [
    "campaign_id", "impressions_max", "clicks_max",
    *impr_cols, *click_cols, *ctr_cols, *pos_cols,
]

df_pivot_clean = df_pivot_clean[final_cols]


In [None]:
# Render the wide, one-row-per-campaign table for inspection.
display(df_pivot_clean)

Unnamed: 0,campaign_id,impressions_max,clicks_max,impressions_31Jul-28Aug,impressions_24Jul-21Aug,impressions_17Jul-14Aug,impressions_10Jul-7Aug,impressions_3Jul-31Jul,impressions_26Jun-24Jul,impressions_19Jun-17Jul,...,position_15Feb-14Mar,position_8Feb-7Mar,position_1Feb-29Feb,position_25Jan-22Feb,position_18Jan-15Feb,position_11Jan-8Feb,position_4Jan-1Feb,position_28Dec-25Jan,position_21Dec-18Jan,position_14Dec-11Jan
0,00d856fb-4297-480c-b2d5-c13701deffe0,106752,212,98359.0,101364.0,101452.0,106752.0,103348.0,93992.0,87564.0,...,,,,,,,,,,
1,013300d9-d7e1-4cf7-8b88-e16f02d5c600,141220,966,77664.0,97818.0,120514.0,134977.0,140303.0,141220.0,135160.0,...,,,,,,,,,,
2,01c697d0-b570-491b-bd5b-192820325bb5,178038,587,178038.0,174023.0,170933.0,167122.0,162007.0,155592.0,146726.0,...,,,,,,,,,,
3,02344b65-6cd1-401c-89be-d69221aa428e,28257,10,28257.0,25760.0,21853.0,18932.0,12635.0,8964.0,7540.0,...,,,,,,,,,,
4,027d4576-60cb-4422-8425-50c62b0a0e16,25358,23,0.0,0.0,1.0,1.0,1.0,1.0,9.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,fdf60792-f202-4746-b7a7-d52813374cef,204687,465,204687.0,195894.0,185286.0,183522.0,181120.0,180910.0,179519.0,...,,,,,,,,,,
257,fe2dc8c0-eae4-4e20-9cee-3c95adce12f1,273773,682,273773.0,263149.0,258974.0,230674.0,222709.0,215472.0,198527.0,...,,,,,,,,,,
258,ff9b786f-1905-480b-a8dc-b96059af9b60,15503,107,15503.0,13823.0,13591.0,14076.0,14186.0,12463.0,7723.0,...,,,,,,,,,,
259,ffc5a287-b293-41e7-b5f0-e52b01fe8b98,25001,34,25001.0,23205.0,20889.0,20659.0,17899.0,12274.0,7084.0,...,,,,,,,,,,


In [None]:
import pandas as pd
import numpy as np

# ---- Helper for labeling performance change ----
def label_change(metric, change):
    """Describe a fractional change as a "<Metric> - gain/drop > N%" label.

    Args:
        metric: Metric name; it is capitalized for the label text.
        change: Fractional change (0.10 means +10%).

    Returns:
        The label for the largest threshold (30%, 20%, 10%, 5%) that `change`
        strictly exceeds in either direction, or "<Metric> - stagnant" when it
        exceeds none of them (this includes NaN, which fails every comparison).
    """
    name = metric.capitalize()
    # Checked from the largest band down so the first hit is the tightest label.
    bands = ((0.30, "30%"), (0.20, "20%"), (0.10, "10%"), (0.05, "5%"))
    for threshold, text_pct in bands:
        if change > threshold:
            return f"{name} - gain > {text_pct}"
        if change < -threshold:
            return f"{name} - drop > {text_pct}"
    return f"{name} - stagnant"

# ---- Work on a copy so df_pivot_clean itself is left untouched ----
df_performance = df_pivot_clean.copy()

# ---- Step 1: Identify latest 2 28-day periods ----
# Per-period impressions / clicks columns (the *_max columns are not periods).
impr_cols = [
    name for name in df_performance.columns
    if name != "impressions_max" and name.startswith("impressions_")
]
click_cols = [
    name for name in df_performance.columns
    if name != "clicks_max" and name.startswith("clicks_")
]

# Labels such as "5Jun-3Jul" are not sortable as text, so the real end dates
# are taken from df_raw: one row per label, newest end_date first.
label_end_pairs = df_raw[['period_label', 'end_date']].drop_duplicates()
period_end_dates = (
    label_end_pairs
    .sort_values('end_date', ascending=False)
    .set_index('period_label')
)

def sort_by_end_date(cols, prefix):
    """Return the `<prefix>_<label>` names found in `cols`.

    Labels are taken in the order of `period_end_dates.index`, so the result
    follows that ordering; names not present in `cols` are left out.
    """
    ordered = []
    for label in period_end_dates.index:
        column_name = f"{prefix}_{label}"
        if column_name in cols:
            ordered.append(column_name)
    return ordered

impr_cols_sorted = sort_by_end_date(impr_cols, "impressions")
click_cols_sorted = sort_by_end_date(click_cols, "clicks")

# The comparisons below need a latest period and the one before it. Fail with
# an explicit message instead of an opaque "not enough values to unpack".
if len(impr_cols_sorted) < 2 or len(click_cols_sorted) < 2:
    raise ValueError(
        "Need at least two periods to compare; found "
        f"{len(impr_cols_sorted)} impressions and {len(click_cols_sorted)} clicks period column(s)"
    )

# Latest 2 periods (index 0 is the most recent)
latest_impr_1, latest_impr_2 = impr_cols_sorted[:2]
latest_click_1, latest_click_2 = click_cols_sorted[:2]

def _relative_change(current, baseline):
    """Fractional change of `current` versus `baseline`.

    Infinite results (division by zero) and NaN results are both turned into 0.
    """
    ratio = (current - baseline) / baseline
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


def _combined_label(impr_change, click_change):
    """Join the impressions label and the clicks label of every row with ", "."""
    impr_text = impr_change.apply(lambda x: label_change("impressions", x))
    click_text = click_change.apply(lambda x: label_change("clicks", x))
    return impr_text + ", " + click_text


# ---- Step 2: Calculate performance_wrt_max ----
# Latest period compared with the campaign's per-period maximum.
impr_change_max = _relative_change(df_performance[latest_impr_1], df_performance["impressions_max"])
click_change_max = _relative_change(df_performance[latest_click_1], df_performance["clicks_max"])

df_performance["performance_wrt_max"] = _combined_label(impr_change_max, click_change_max)

# ---- Step 3: Calculate performance_last_week ----
# Latest period compared with the period immediately before it.
impr_change_wow = _relative_change(df_performance[latest_impr_1], df_performance[latest_impr_2])
click_change_wow = _relative_change(df_performance[latest_click_1], df_performance[latest_click_2])

df_performance["performance_last_week"] = _combined_label(impr_change_wow, click_change_wow)

# ---- Step 4: Move the two performance columns to sit right after campaign_id ----
perf_cols = ["performance_wrt_max", "performance_last_week"]
cols = [name for name in df_performance.columns.tolist() if name not in perf_cols]
campaign_idx = cols.index("campaign_id")
split_at = campaign_idx + 1

# Everything up to and including campaign_id, the labels, then the rest.
new_order = [*cols[:split_at], *perf_cols, *cols[split_at:]]
df_performance = df_performance[new_order]

# ---- Step 5: Round numeric values appropriately ----
all_columns = list(df_performance.columns)
impr_cols = [c for c in all_columns if c.startswith("impressions_") and c != "impressions_max"]
click_cols = [c for c in all_columns if c.startswith("clicks_") and c != "clicks_max"]
ctr_cols = [c for c in all_columns if c.startswith("ctr_")]
pos_cols = [c for c in all_columns if c.startswith("position_")]

# Counts become nullable integers ("Int64" keeps missing periods as <NA>).
for count_cols in (impr_cols, click_cols):
    df_performance[count_cols] = df_performance[count_cols].round(0).astype("Int64")

# Ratios and positions keep three decimals.
for ratio_cols in (ctr_cols, pos_cols):
    df_performance[ratio_cols] = df_performance[ratio_cols].round(3)


In [None]:
# Render the final table (performance labels plus per-period metrics) before it is written out.
display(df_performance)

Unnamed: 0,campaign_id,performance_wrt_max,performance_last_week,impressions_max,clicks_max,impressions_31Jul-28Aug,impressions_24Jul-21Aug,impressions_17Jul-14Aug,impressions_10Jul-7Aug,impressions_3Jul-31Jul,...,position_15Feb-14Mar,position_8Feb-7Mar,position_1Feb-29Feb,position_25Jan-22Feb,position_18Jan-15Feb,position_11Jan-8Feb,position_4Jan-1Feb,position_28Dec-25Jan,position_21Dec-18Jan,position_14Dec-11Jan
0,00d856fb-4297-480c-b2d5-c13701deffe0,"Impressions - drop > 5%, Clicks - drop > 10%","Impressions - stagnant, Clicks - drop > 10%",106752,212,98359,101364,101452,106752,103348,...,,,,,,,,,,
1,013300d9-d7e1-4cf7-8b88-e16f02d5c600,"Impressions - drop > 30%, Clicks - drop > 30%","Impressions - drop > 20%, Clicks - drop > 30%",141220,966,77664,97818,120514,134977,140303,...,,,,,,,,,,
2,01c697d0-b570-491b-bd5b-192820325bb5,"Impressions - stagnant, Clicks - drop > 10%","Impressions - stagnant, Clicks - drop > 10%",178038,587,178038,174023,170933,167122,162007,...,,,,,,,,,,
3,02344b65-6cd1-401c-89be-d69221aa428e,"Impressions - stagnant, Clicks - stagnant","Impressions - gain > 5%, Clicks - stagnant",28257,10,28257,25760,21853,18932,12635,...,,,,,,,,,,
4,027d4576-60cb-4422-8425-50c62b0a0e16,"Impressions - drop > 30%, Clicks - drop > 30%","Impressions - stagnant, Clicks - stagnant",25358,23,0,0,1,1,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,fdf60792-f202-4746-b7a7-d52813374cef,"Impressions - stagnant, Clicks - drop > 10%","Impressions - stagnant, Clicks - stagnant",204687,465,204687,195894,185286,183522,181120,...,,,,,,,,,,
257,fe2dc8c0-eae4-4e20-9cee-3c95adce12f1,"Impressions - stagnant, Clicks - stagnant","Impressions - stagnant, Clicks - stagnant",273773,682,273773,263149,258974,230674,222709,...,,,,,,,,,,
258,ff9b786f-1905-480b-a8dc-b96059af9b60,"Impressions - stagnant, Clicks - drop > 5%","Impressions - gain > 10%, Clicks - gain > 5%",15503,107,15503,13823,13591,14076,14186,...,,,,,,,,,,
259,ffc5a287-b293-41e7-b5f0-e52b01fe8b98,"Impressions - stagnant, Clicks - stagnant","Impressions - gain > 5%, Clicks - stagnant",25001,34,25001,23205,20889,20659,17899,...,,,,,,,,,,


In [None]:
from sqlalchemy import create_engine, text

# Step 0: Reuse the `engine` created in the connection cell at the top of the
# notebook rather than embedding the connection string (and password) again.

# Steps 1-3 run inside ONE transaction: engine.begin() commits only if every
# statement succeeds and rolls back otherwise. If the table write or the view
# creation fails, the previous table and materialized view are therefore kept
# instead of being left dropped.
with engine.begin() as conn:
    # Step 1: Force drop the materialized view WITH CASCADE
    # NOTE(review): CASCADE also drops any objects that depend on the view and
    # nothing below recreates them - confirm there are none.
    print("⏳ Dropping materialized view...")
    conn.execute(text("DROP MATERIALIZED VIEW IF EXISTS gist.matv_gist_pageperformancetrends CASCADE;"))
    print("✅ Dropped materialized view")

    # Step 2: Replace the base table with df_performance.
    # Passing the open connection makes pandas write within this transaction.
    df_performance.to_sql(
        name="gist_pageperformancetrends",
        con=conn,
        schema="gist",
        if_exists="replace",  # Replace table
        index=False,
        method="multi"
    )
    print("✅ Table 'gist_pageperformancetrends' written successfully")

    # Step 3: Recreate the materialized view on top of the new table
    print("⏳ Creating materialized view...")
    conn.execute(text("""
        CREATE MATERIALIZED VIEW gist.matv_gist_pageperformancetrends
        TABLESPACE pg_default
        AS
        SELECT * FROM gist.gist_pageperformancetrends
        WITH DATA;
    """))
    conn.execute(text("ALTER TABLE gist.matv_gist_pageperformancetrends OWNER TO airbyte_user;"))
    print("✅ Recreated materialized view")


⏳ Dropping materialized view...
✅ Dropped materialized view
✅ Table 'gist_pageperformancetrends' written successfully
⏳ Creating materialized view...
✅ Recreated materialized view
