<a href="https://colab.research.google.com/github/ShikharV010/gist_daily_runs/blob/main/PagePerformance_TrendsCalculations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install psycopg2-binary sqlalchemy pandas

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.10


In [2]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# Connection settings are read from the environment so the password is not
# stored in the notebook. Non-secret fields fall back to the values this
# notebook has always used; the password is prompted for when the variable
# GIST_DB_PASSWORD is not set.
# NOTE(review): the default host name contains "dev" while the default database
# is "gw_prod" — confirm this pairing is intended.
import os
from getpass import getpass

from sqlalchemy.engine import URL

db_config = {
    'user': os.environ.get('GIST_DB_USER', 'airbyte_user'),
    'password': os.environ.get('GIST_DB_PASSWORD') or getpass('PostgreSQL password: '),
    'host': os.environ.get('GIST_DB_HOST', 'gw-postgres-dev.celzx4qnlkfp.us-east-1.rds.amazonaws.com'),
    'port': os.environ.get('GIST_DB_PORT', '5432'),
    'database': os.environ.get('GIST_DB_NAME', 'gw_prod')
}

# Create SQLAlchemy engine. URL.create escapes each component, so a password
# containing characters such as "@" or "/" cannot corrupt the connection URL.
engine = create_engine(
    URL.create(
        drivername="postgresql+psycopg2",
        username=db_config['user'],
        password=db_config['password'],
        host=db_config['host'],
        port=int(db_config['port']),
        database=db_config['database'],
    )
)


In [3]:
# Load every row of the page-performance view. Both date columns are cast to
# DATE inside the query, so they arrive without a time component.
sql = """
SELECT
  campaign_id,
  start_date::date AS start_date,
  end_date::date AS end_date,
  impressions,
  clicks,
  ctr,
  position
FROM gist.matv_gist_pageperformance
"""
df_raw = pd.read_sql(sql=sql, con=engine)


In [4]:
# Every distinct (start_date, end_date) pair is treated as one reporting week.
# Weeks are numbered in start_date order, so week_no 1 is the earliest.
week_ranks = (
    df_raw.loc[:, ['start_date', 'end_date']]
    .drop_duplicates()
    .sort_values(by='start_date')
    .reset_index(drop=True)
    .assign(week_no=lambda weeks: weeks.index + 1)
)

# Attach the week number to each raw row
df_ranked = pd.merge(
    df_raw,
    week_ranks,
    how='left',
    on=['start_date', 'end_date'],
)


In [5]:
def _mean_ignoring_zeros(values):
    """Return the mean of `values` after turning zeros into NaN (which mean() skips)."""
    return values.replace(0, np.nan).mean()


# One row per (campaign_id, week_no): volumes are summed, rates are averaged.
metric_aggregations = {
    'impressions': 'sum',
    'clicks': 'sum',
    'ctr': 'mean',          # You can change to weighted avg later
    'position': _mean_ignoring_zeros,
}

df_agg = df_ranked.groupby(['campaign_id', 'week_no'], as_index=False).agg(metric_aggregations)


In [6]:
# Wide layout: one row per campaign, one column per (metric, week_no) pair.
weekly_wide = df_agg.pivot(index='campaign_id', columns='week_no')

# Collapse the two-level column index, e.g. ('clicks', 12) → clicks_week_12
weekly_wide.columns = [
    f"{name}_week_{number}" for name, number in weekly_wide.columns
]

# Bring campaign_id back as an ordinary column
df_pivot = weekly_wide.reset_index()

In [7]:
import pandas as pd
import numpy as np

# 1. Helper function
def get_trend_label(metric, change):
    """Describe a relative change as a "<metric> gain/drop > N%" label.

    Args:
        metric: Metric name; trailing "s" characters are stripped, so
            "impressions" is reported as "impression".
        change: Relative change as a fraction (0.25 means +25%).

    Returns:
        The label for the largest threshold (30%, 20%, 10%, 5%) that `change`
        strictly exceeds in magnitude, or "" when it lies within ±5%
        (a NaN `change` also yields "").
    """
    name = metric.rstrip("s")
    bands = ((0.30, 30), (0.20, 20), (0.10, 10), (0.05, 5))

    # Gains are checked first, largest threshold first.
    for limit, percent in bands:
        if change > limit:
            return f"{name} gain > {percent}%"

    # Then drops, again largest threshold first.
    for limit, percent in bands:
        if change < -limit:
            return f"{name} drop > {percent}%"

    return ""

# 2. Week numbers that have an impressions column, latest first
week_nums = sorted(
    (
        int(col.split("_week_")[1])
        for col in df_pivot.columns
        if "_week_" in col and col.startswith("impressions")
    ),
    reverse=True,
)

# 3. Up to 11 latest weeks give up to 10 week-over-week comparisons
latest_weeks = week_nums[:11]
if len(latest_weeks) < 11:
    print(
        f"Only {len(latest_weeks)} week(s) available; "
        f"computing {max(len(latest_weeks) - 1, 0)} weekly comparison(s) instead of 10"
    )

# 4. Initialize output container
performance_columns = {"campaign_id": df_pivot["campaign_id"]}

# 5. Compare each week with the one immediately before it.
#    Pairing adjacent weeks with zip (instead of indexing positions 1..10)
#    stops cleanly when fewer than 11 weeks exist rather than raising IndexError.
for week_n, week_prev in zip(latest_weeks, latest_weeks[1:]):
    label_list = []

    for metric in ["impressions", "clicks", "ctr"]:
        col_curr = f"{metric}_week_{week_n}"
        col_prev = f"{metric}_week_{week_prev}"

        # Relative change vs. the previous week. A zero baseline (±inf) or a
        # missing value on either side (NaN) is mapped to 0, i.e. "no change",
        # so it produces no label.
        change = (
            (df_pivot[col_curr] - df_pivot[col_prev]) / df_pivot[col_prev]
        ).replace([np.inf, -np.inf], np.nan).fillna(0)

        label_series = change.apply(lambda x: get_trend_label(metric, x))
        label_list.append(label_series)

    # One column per metric, one row per campaign; join the non-empty labels,
    # falling back to "Stagnant" when no metric crossed a threshold.
    combined = pd.DataFrame(label_list).T
    performance_columns[f"performance_week_{week_n}"] = combined.apply(
        lambda row: ", ".join(filter(None, row)) or "Stagnant",
        axis=1
    )

# 6. Final weekly performance DataFrame
df_weekly_perf = pd.DataFrame(performance_columns)


In [8]:
monthly_labels = []

# Week numbers that have an impressions column, latest first
week_nums = sorted(
    (
        int(col.split("_week_")[1])
        for col in df_pivot.columns
        if col.startswith("impressions_week_")
    ),
    reverse=True,
)

recent_4 = week_nums[:4]   # latest four weeks
past_4 = week_nums[4:8]    # the (up to) four weeks before those

if not past_4:
    # Nothing to compare against, so do not report a misleading "Stagnant".
    print("Not enough weeks to compute monthly performance (need ≥5 weeks)")
    monthly_labels = ["N/A"] * len(df_pivot)
else:
    # When fewer than 8 weeks exist the past window is shorter than the recent
    # one. Scaling past totals up to the recent window's length keeps the
    # comparison like-for-like; with two full 4-week windows the factor is 1.0.
    window_scale = len(recent_4) / len(past_4)

    # Column lists do not depend on the row, so build them once.
    monthly_metrics = ["impressions", "clicks", "ctr"]
    recent_cols = {
        m: [f"{m}_week_{w}" for w in recent_4 if f"{m}_week_{w}" in df_pivot.columns]
        for m in monthly_metrics
    }
    past_cols = {
        m: [f"{m}_week_{w}" for w in past_4 if f"{m}_week_{w}" in df_pivot.columns]
        for m in monthly_metrics
    }

    for _, row in df_pivot.iterrows():
        labels = []

        # Impressions and clicks are totals (past total scaled to window length)
        recent_impr = row[recent_cols["impressions"]].sum()
        past_impr = row[past_cols["impressions"]].sum() * window_scale

        recent_clicks = row[recent_cols["clicks"]].sum()
        past_clicks = row[past_cols["clicks"]].sum() * window_scale

        # CTR is an average, so it needs no scaling
        recent_ctr = row[recent_cols["ctr"]].mean()
        past_ctr = row[past_cols["ctr"]].mean()

        # Generate performance label
        for metric, recent, past in zip(
            ["impression", "clicks", "ctr"],
            [recent_impr, recent_clicks, recent_ctr],
            [past_impr, past_clicks, past_ctr]
        ):
            # A zero baseline is treated as "no change" to avoid dividing by zero
            change = 0 if past == 0 else (recent - past) / past
            label = get_trend_label(metric, change)
            if label:
                labels.append(label)

        final_label = "Stagnant" if not labels else ", ".join(labels)
        monthly_labels.append(final_label)

# Final DataFrame
df_monthly_perf = pd.DataFrame({
    "campaign_id": df_pivot["campaign_id"],
    "performance_monthly": monthly_labels
})


In [9]:
quarterly_labels = []

# Get valid weeks where all required metrics are present
quarterly_metrics = ["impressions", "clicks", "ctr"]
valid_weeks = [
    w for w in week_nums
    if all(f"{metric}_week_{w}" in df_pivot.columns for metric in quarterly_metrics)
]
valid_week_set = set(valid_weeks)

recent_12 = []
past_weeks_available = []
if valid_weeks:
    latest_week = valid_weeks[0]

    # Current quarter: latest 12 weeks
    recent_12 = [latest_week - i for i in range(12) if (latest_week - i) in valid_week_set]

    # For past quarter: up to 12 weeks before the recent block, skip 1 week in between
    past_start = latest_week - 13
    past_weeks = [past_start - i for i in range(12)]
    past_weeks_available = [w for w in past_weeks if w in valid_week_set]

# A comparison needs the full recent quarter AND at least one past week.
# Because of the skipped week that means 14 weeks of history, not 13: with
# exactly 13 the past window is empty and every campaign would read "Stagnant".
if len(recent_12) == 12 and past_weeks_available:
    # The past window may hold fewer than 12 weeks. Scaling its totals up to
    # the recent window's length keeps the comparison like-for-like; with two
    # full 12-week windows the factor is exactly 1.0.
    window_scale = len(recent_12) / len(past_weeks_available)

    # Column lists do not depend on the row, so build them once.
    recent_cols = {
        m: [f"{m}_week_{w}" for w in recent_12 if f"{m}_week_{w}" in df_pivot.columns]
        for m in quarterly_metrics
    }
    past_cols = {
        m: [f"{m}_week_{w}" for w in past_weeks_available if f"{m}_week_{w}" in df_pivot.columns]
        for m in quarterly_metrics
    }

    for _, row in df_pivot.iterrows():
        labels = []

        # Impressions and clicks are totals (past total scaled to window length)
        recent_impr = row[recent_cols["impressions"]].sum()
        past_impr = row[past_cols["impressions"]].sum() * window_scale

        recent_clicks = row[recent_cols["clicks"]].sum()
        past_clicks = row[past_cols["clicks"]].sum() * window_scale

        # CTR is an average, so it needs no scaling
        recent_ctr = row[recent_cols["ctr"]].mean()
        past_ctr = row[past_cols["ctr"]].mean()

        for metric, recent, past in zip(
            ["impression", "clicks", "ctr"],
            [recent_impr, recent_clicks, recent_ctr],
            [past_impr, past_clicks, past_ctr]
        ):
            # A zero baseline is treated as "no change" to avoid dividing by zero
            change = 0 if past == 0 else (recent - past) / past
            label = get_trend_label(metric, change)
            if label:
                labels.append(label)

        final = "Stagnant" if not labels else ", ".join(labels)
        quarterly_labels.append(final)

    df_quarterly_perf = pd.DataFrame({
        "campaign_id": df_pivot["campaign_id"],
        "performance_quarterly": quarterly_labels
    })

else:
    print("Not enough weeks to compute quarterly performance (need ≥14 weeks)")
    df_quarterly_perf = pd.DataFrame({
        "campaign_id": df_pivot["campaign_id"],
        "performance_quarterly": ["N/A"] * len(df_pivot)
    })


In [10]:
# Combine the monthly, quarterly and weekly labels into one row per campaign.
# Left joins keep every campaign present in the monthly frame.
df_perf_all = (
    df_monthly_perf
    .merge(df_quarterly_perf, how="left", on="campaign_id")
    .merge(df_weekly_perf, how="left", on="campaign_id")
)

# Do not truncate columns when a frame is displayed in the notebook
pd.options.display.max_columns = None

In [11]:
# Renumber the weekly performance columns so the latest week is always
# performance_week_10, the one before it performance_week_9, and so on.

# Columns holding weekly labels
week_perf_cols = [
    name for name in df_perf_all.columns if name.startswith("performance_week_")
]

# Their week numbers, latest first
original_week_nums = sorted(
    (int(name.rsplit("_", 1)[-1]) for name in week_perf_cols),
    reverse=True,
)

# Latest week → 10, next → 9, ...; at most ten columns are renamed
rename_map = {}
for offset, old in enumerate(original_week_nums[:10]):
    rename_map[f"performance_week_{old}"] = f"performance_week_{10 - offset}"

df_perf_all_final = df_perf_all.rename(columns=rename_map)


In [12]:
# Week numbers that have an impressions column, latest first
week_nums_all = sorted(
    (
        int(col.split("_week_")[1])
        for col in df_pivot.columns
        if col.startswith("impressions_week_")
    ),
    reverse=True,
)

# Keep the ten most recent weeks
top_10_weeks = week_nums_all[:10]

# Column order: campaign_id, then every week of impressions, then clicks,
# ctr and position
metrics = ['impressions', 'clicks', 'ctr', 'position']
ordered_cols = ['campaign_id']
for metric in metrics:
    ordered_cols.extend(f"{metric}_week_{w}" for w in top_10_weeks)

# Select those columns into an independent frame
df_pivot_filtered = df_pivot.loc[:, ordered_cols].copy()


In [13]:
# Renumber the metric columns so the latest week becomes week_10, the one
# before it week_9, and so on (top_10_weeks is ordered latest first).
rename_mapping = {
    f"{metric}_week_{week}": f"{metric}_week_{10 - i}"
    for i, week in enumerate(top_10_weeks)
    for metric in metrics
}

df_pivot_filtered_renamed = df_pivot_filtered.rename(columns=rename_mapping)


In [14]:
# Final output: performance labels followed by the renumbered weekly metrics,
# matched on campaign_id (every row of the labels frame is kept).
df_final = pd.merge(
    df_perf_all_final,
    df_pivot_filtered_renamed,
    how='left',
    on='campaign_id',
)


In [16]:
from sqlalchemy import create_engine, text

# Publish df_final: replace the base table and rebuild the materialized view
# that selects from it.
#
# The `engine` created at the top of the notebook is reused, so no second
# connection URL (with an embedded password) is needed here.
#
# All three steps run inside ONE transaction. If writing the table fails, the
# DROP is rolled back and the existing materialized view stays in place
# instead of being left missing.
with engine.begin() as conn:  # commits on success, rolls back on any exception
    # Step 1: Drop the materialized view (CASCADE also drops objects depending on it)
    print("⏳ Dropping materialized view...")
    conn.execute(text("DROP MATERIALIZED VIEW IF EXISTS gist.matv_gist_pageperformancetrends CASCADE;"))
    print("✅ Dropped materialized view")

    # Step 2: Replace the base table, on the same connection so it joins the transaction
    df_final.to_sql(
        name="gist_pageperformancetrends",
        con=conn,
        schema="gist",
        if_exists="replace",  # Replace table
        index=False,
        method="multi"
    )
    print("✅ Table 'gist_pageperformancetrends' written successfully")

    # Step 3: Recreate the materialized view (adjust if needed)
    print("⏳ Creating materialized view...")
    conn.execute(text("""
        CREATE MATERIALIZED VIEW gist.matv_gist_pageperformancetrends
        TABLESPACE pg_default
        AS
        SELECT * FROM gist.gist_pageperformancetrends
        WITH DATA;
    """))
    conn.execute(text("ALTER TABLE gist.matv_gist_pageperformancetrends OWNER TO airbyte_user;"))
    print("✅ Recreated materialized view")


⏳ Dropping materialized view...
✅ Dropped materialized view
✅ Table 'gist_pageperformancetrends' written successfully
⏳ Creating materialized view...
✅ Recreated materialized view
