<a href="https://colab.research.google.com/github/ShikharV010/gist_daily_runs/blob/main/Gush_SEO_Serper_Rank_Tracker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
pip install pandas sqlalchemy psycopg2-binary python-dateutil gspread oauth2client



In [10]:
# --- CONFIG ---
import os
import pandas as pd
from sqlalchemy import create_engine, text

# Connection string for the Postgres instance. The DB_URL environment variable
# wins when it is set; the literal below is only a fallback so existing runs
# keep working.
# SECURITY: the fallback embeds real credentials in the notebook. Rotate them
# and delete the fallback once DB_URL is provisioned in the runtime.
DB_URL = os.getenv(
    "DB_URL",
    "postgresql+psycopg2://airbyte_user:airbyte_user_password@gw-rds-prod.celzx4qnlkfp.us-east-1.rds.amazonaws.com:5432/gw_prod",
)

# Google Search Console source tables.
SCHEMA_GSC  = "airbyte_ingestion"
TBL_GSC_PQD = "gush_gsc_page_query_daily"  # (kept if you still need query-level)
TBL_GSC_PD  = "gush_gsc_page_daily"        # <-- new: page-level table

# Keyword / page dictionary tables.
SCHEMA_DICT = "airbyte_ingestion"
TABLE_DICT  = "gtm_seo_gush_seo_pages"
# Mixed-case table name, so it carries its own double quotes for use in SQL.
TABLE_BLOG_DICT = '"seo_tracker_BlogType"'

engine = create_engine(DB_URL)


In [11]:
# --- READ inputs from Postgres ---
# All four reads share one connection instead of opening a new one per query.
# Every table name comes from the CONFIG constants, including the blog-type
# table, which was previously hard-coded here while TABLE_BLOG_DICT sat unused.
with engine.connect() as con:
    # GSC: page + query level daily metrics.
    gsc_pqd_sql = text(f"""
        SELECT
            date::date          AS date,
            page                AS page,
            query               AS query,
            clicks::bigint      AS clicks,
            impressions::bigint AS impressions,
            ctr::numeric        AS ctr,
            position::numeric   AS position
        FROM {SCHEMA_GSC}.{TBL_GSC_PQD}
        WHERE date IS NOT NULL
    """)
    gsc_page_query_daily_df = pd.read_sql(gsc_pqd_sql, con)

    # GSC: page level daily metrics (use this for page-level performance).
    gsc_pd_sql = text(f"""
        SELECT
            date::date          AS date,
            page                AS page,
            clicks::bigint      AS clicks,
            impressions::bigint AS impressions
        FROM {SCHEMA_GSC}.{TBL_GSC_PD}
        WHERE date IS NOT NULL
    """)
    gsc_page_daily_df = pd.read_sql(gsc_pd_sql, con)

    # Pages/keywords dictionary: only the needed columns, hero_url kept as the
    # exact stored string. Placeholder hero_url values ('', 'New', a lone
    # backslash) are excluded in SQL.
    dict_sql = text(f"""
        SELECT
            primary_keyword,
            secondary_keyword,
            hero_url,
            volume,
            last_updated_date,
            status
        FROM {SCHEMA_DICT}.{TABLE_DICT}
        WHERE hero_url IS NOT NULL
          AND hero_url <> ''
          AND hero_url <> 'New'
          AND hero_url <> '\\'
    """)
    dict_raw_df = pd.read_sql(dict_sql, con)

    # Blog type per blog URL; updated_url is blog_url with scheme + host removed.
    dict_blog_sql = text(f"""
        SELECT DISTINCT blog_url,
              regexp_replace(blog_url, '^https?://[^/]+', '') AS updated_url,
              blog_type
          FROM {SCHEMA_DICT}.{TABLE_BLOG_DICT}
    """)
    dict_blogtype_df = pd.read_sql(dict_blog_sql, con)

In [12]:
# Inspect the keyword/page dictionary loaded from Postgres (rich display truncates long frames).
display(dict_raw_df)

Unnamed: 0,primary_keyword,secondary_keyword,hero_url,volume,last_updated_date,status
0,marketing for manufacturing,marketing for manufacturing,https://www.gushwork.ai/blog/b2b-manufacturing...,5760,"Oct 8, 1915","Jul 30, 2025"
1,marketing for manufacturing,marketing solutions for manufacturing companies,https://www.gushwork.ai/blog/b2b-manufacturing...,10,"Oct 8, 1915","Jul 30, 2025"
2,marketing for manufacturing,marketing strategy for manufacturing company,https://www.gushwork.ai/blog/b2b-manufacturing...,110,"Oct 8, 1915","Jul 30, 2025"
3,marketing for manufacturing,marketing to manufacturing companies,https://www.gushwork.ai/blog/b2b-manufacturing...,30,"Oct 8, 1915","Jul 30, 2025"
4,marketing for manufacturing,marketing manufacturing strategy,https://www.gushwork.ai/blog/b2b-manufacturing...,90,"Oct 8, 1915","Jul 30, 2025"
...,...,...,...,...,...,...
2120,construction industry marketing,marketing to construction companies,https://www.gushwork.ai/blog/proven-constructi...,30,,
2121,construction industry marketing,construction industry advertising,https://www.gushwork.ai/blog/proven-constructi...,30,,
2122,construction industry marketing,marketing for construction contractors,https://www.gushwork.ai/blog/proven-constructi...,20,,
2123,construction industry marketing,marketing ideas for general contractor,https://www.gushwork.ai/blog/proven-constructi...,0,,


GET QUERY AND PAGE PERFORMANCE

In [18]:
# -*- coding: utf-8 -*-
"""
WEEKLY SEO RANK TRACKING PIPELINE
---------------------------------
This script fetches Google rankings for each (secondary_keyword, hero_url)
and stores them in a long-format history table in PostgreSQL.

Key features:
- Does NOT modify or replace any existing SEO tables.
- Creates and maintains its own history table: gist.gush_serper_rank_history.
- Appends new rows each run (one row per keyword per date).
- Allows filtering to only "core" blogs for testing.
"""

import os
import pandas as pd
from sqlalchemy import create_engine, text

# ─────────────────────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────────────────────
# DB_URL environment variable wins; the literal is a fallback only.
DB_URL = os.getenv(
    "DB_URL",
    "postgresql+psycopg2://airbyte_user:airbyte_user_password@"
    "gw-rds-prod.celzx4qnlkfp.us-east-1.rds.amazonaws.com:5432/gw_prod"
)

SCHEMA_GSC = "airbyte_ingestion"
TABLE_GSC  = "gush_gsc_page_query_daily"

SCHEMA_DICT = "airbyte_ingestion"
TABLE_DICT  = "gtm_seo_gush_seo_pages"

# Distinct (primary_keyword, secondary_keyword, hero_url) triples with
# lower-cased column names. Built as one chain so the result is a fresh frame
# rather than a column selection whose .columns is reassigned afterwards.
df_keywords = (
    dict_raw_df[["primary_keyword", "secondary_keyword", "hero_url"]]
    .rename(columns=str.lower)
    .drop_duplicates()
)

In [15]:
# Preview the de-duplicated keyword -> hero_url mapping.
df_keywords.head()

Unnamed: 0,primary_keyword,secondary_keyword,hero_url
0,marketing for manufacturing,marketing for manufacturing,https://www.gushwork.ai/blog/b2b-manufacturing...
1,marketing for manufacturing,marketing solutions for manufacturing companies,https://www.gushwork.ai/blog/b2b-manufacturing...
2,marketing for manufacturing,marketing strategy for manufacturing company,https://www.gushwork.ai/blog/b2b-manufacturing...
3,marketing for manufacturing,marketing to manufacturing companies,https://www.gushwork.ai/blog/b2b-manufacturing...
4,marketing for manufacturing,marketing manufacturing strategy,https://www.gushwork.ai/blog/b2b-manufacturing...


In [19]:
### Remove this later!!!

# Normalize the blog-type lookup so it can be joined onto the keyword table:
#   1. lower-case the column names,
#   2. lower-case the blog_type values,
#   3. rename blog_url (the full URL) -> hero_url, the join key used downstream.
# updated_url (the path-only form) is left untouched.
dict_blogtype_df = (
    dict_blogtype_df
    .rename(columns=str.lower)
    .assign(blog_type=lambda frame: frame["blog_type"].str.lower())
    .rename(columns={"blog_url": "hero_url"})
)

In [20]:
# Attach blog_type to every keyword row (left join on hero_url), then keep
# only the rows whose blog is classified as 'core'. Rows without a blog_type
# match carry NaN and are dropped by the equality filter.
final_long_df = (
    df_keywords
    .merge(dict_blogtype_df[['hero_url', 'blog_type']], how='left', on='hero_url')
    .loc[lambda frame: frame['blog_type'] == 'core']
)

In [21]:
# Inspect the keyword rows that survived the 'core' blog filter.
display(final_long_df)

Unnamed: 0,primary_keyword,secondary_keyword,hero_url,blog_type
0,marketing for manufacturing,marketing for manufacturing,https://www.gushwork.ai/blog/b2b-manufacturing...,core
1,marketing for manufacturing,marketing solutions for manufacturing companies,https://www.gushwork.ai/blog/b2b-manufacturing...,core
2,marketing for manufacturing,marketing strategy for manufacturing company,https://www.gushwork.ai/blog/b2b-manufacturing...,core
3,marketing for manufacturing,marketing to manufacturing companies,https://www.gushwork.ai/blog/b2b-manufacturing...,core
4,marketing for manufacturing,marketing manufacturing strategy,https://www.gushwork.ai/blog/b2b-manufacturing...,core
...,...,...,...,...
937,digital marketing for manufacturers,digital marketing for manufacturing websites,https://www.gushwork.ai/blog/digital-marketing...,core
938,digital marketing for manufacturers,manufacturing digital marketing package,https://www.gushwork.ai/blog/digital-marketing...,core
1021,content marketing for industrial companies,content marketing for industrial companies,https://www.gushwork.ai/blog/content-marketing...,core
1022,content marketing for industrial companies,content marketing examples for manufacturers,https://www.gushwork.ai/blog/content-marketing...,core


GET RANK FROM SERPER

In [22]:
import os

import requests
import pandas as pd

# config
# The SERPER_API_KEY environment variable wins when set; the literal is only a
# fallback so existing runs keep working.
# SECURITY: a live key is stored in the notebook. Rotate it and delete the
# fallback once SERPER_API_KEY is provisioned in the runtime.
SERPER_API_KEY = os.getenv(
    "SERPER_API_KEY",
    "6769b8e78f7e96c5ff1793582bebbe532085d6be",
)
API_URL = "https://google.serper.dev/search"

# Headers sent with every Serper request.
headers = {
    "X-API-KEY": SERPER_API_KEY,
    "Content-Type": "application/json"
}


In [24]:
# Rank lookups run over the core-blog keywords only.
# (To run over every keyword instead, start from dict_raw_df.)
# Keep just the identifying columns and collapse repeated rows, so each
# (primary_keyword, secondary_keyword, hero_url) triple is looked up once.
final_long_df = final_long_df.loc[
    :, ["primary_keyword", "secondary_keyword", "hero_url"]
].drop_duplicates()
print(f"Secondary keywords (core pages) selected for ranking: {len(final_long_df)}")

Secondary keywords (core pages) selected for ranking: 369


In [25]:
# ------------------------------------------------------------
#  RANK EXTRACTION SECTION
# ------------------------------------------------------------
# For every (secondary_keyword, hero_url) row of final_long_df, query the
# search API and record the 1-based position of the first organic result whose
# link contains hero_url. The "rank" value is one of:
#   int     -> position of the first matching organic result
#   "100+"  -> request succeeded but hero_url was not among the results
#   None    -> the lookup itself failed (network / HTTP / JSON error). This is
#              kept distinct from "100+" so an API outage is never recorded as
#              a ranking drop.
# Rows without a usable keyword or URL are skipped: an empty hero_url would
# substring-match the very first result and produce a bogus rank of 1.
# NOTE(review): every rank in the saved output of this notebook is <= 10.
# Confirm the API still honours "num": 100; otherwise "100+" overstates how
# deep the search actually went.

NOT_FOUND_RANK = "100+"
REQUEST_TIMEOUT_S = 20

results = []

# One Session re-uses the underlying connection across all lookups.
with requests.Session() as session:
    session.headers.update(headers)

    for idx, row in final_long_df.iterrows():
        raw_keyword = row["secondary_keyword"]
        raw_url = row["hero_url"]
        # NULLs arrive as NaN/None, which have no .strip(); treat them as empty.
        keyword = raw_keyword.strip() if isinstance(raw_keyword, str) else ""
        hero_url = raw_url.strip() if isinstance(raw_url, str) else ""

        if not keyword or not hero_url:
            print(f"{idx}: skipped (missing secondary_keyword or hero_url)")
            continue

        payload = {
            "q": keyword,
            "gl": "us",
            "hl": "en",
            "num": 100
        }

        try:
            res = session.post(API_URL, json=payload, timeout=REQUEST_TIMEOUT_S)
            print(f"{idx}: status {res.status_code}")
            res.raise_for_status()
            data = res.json()
        except (requests.RequestException, ValueError) as exc:
            # Transport, HTTP-status and JSON-decoding failures: rank is unknown.
            print(f"Error for {keyword}: {exc}")
            rank_value = None
        else:
            # Be tolerant of unexpected payload shapes (non-dict body, null
            # "organic", null "link") instead of relying on a blanket except.
            organic = (data.get("organic") or []) if isinstance(data, dict) else []
            rank_value = next(
                (
                    position
                    for position, item in enumerate(organic, start=1)
                    if isinstance(item, dict) and hero_url in (item.get("link") or "")
                ),
                NOT_FOUND_RANK,
            )

        results.append({
            "primary_keyword": row["primary_keyword"],
            "secondary_keyword": keyword,
            "hero_url": hero_url,
            "rank": rank_value
        })

rank_df = pd.DataFrame(results)
print("\nExtracted ranks:")
print(rank_df.head())

0: status 200
1: status 200
2: status 200
3: status 200
4: status 200
5: status 200
6: status 200
7: status 200
8: status 200
9: status 200
10: status 200
11: status 200
12: status 200
13: status 200
14: status 200
15: status 200
16: status 200
17: status 200
18: status 200
19: status 200
20: status 200
21: status 200
22: status 200
23: status 200
24: status 200
26: status 200
27: status 200
28: status 200
29: status 200
30: status 200
31: status 200
32: status 200
33: status 200
34: status 200
35: status 200
36: status 200
37: status 200
38: status 200
39: status 200
40: status 200
41: status 200
42: status 200
43: status 200
44: status 200
45: status 200
46: status 200
47: status 200
48: status 200
49: status 200
50: status 200
51: status 200
52: status 200
53: status 200
54: status 200
55: status 200
56: status 200
57: status 200
58: status 200
59: status 200
60: status 200
61: status 200
213: status 200
214: status 200
215: status 200
216: status 200
217: status 200
218: status 200

In [26]:
# Show only the keywords whose rank is something other than the "100+" placeholder.
rank_df[rank_df["rank"] != "100+"]

Unnamed: 0,primary_keyword,secondary_keyword,hero_url,rank
62,SEO for manufacturing,seo for manufacturers,https://www.gushwork.ai/blog/boost-strategy-se...,1
75,SEO for manufacturing,seo for manufacturing industry,https://www.gushwork.ai/blog/boost-strategy-se...,8
99,SEO for manufacturing,seo content for manufacturers,https://www.gushwork.ai/blog/boost-strategy-se...,3
126,SEO for manufacturing,seo solution for manufacturers,https://www.gushwork.ai/blog/boost-strategy-se...,10
133,SEO for manufacturing,seo management for manufacturers,https://www.gushwork.ai/blog/boost-strategy-se...,4
134,SEO for manufacturing,seo for manufacturing company,https://www.gushwork.ai/blog/boost-strategy-se...,9
143,SEO for manufacturing,seo guide for manufacturers,https://www.gushwork.ai/blog/boost-strategy-se...,1
284,Top Manufacturing SEO Companies for 2025,Top Manufacturing SEO Companies for 2025,https://www.gushwork.ai/blog/top-manufacturing...,3
287,Top Manufacturing SEO Companies for 2025,best seo companies for manufacturers,https://www.gushwork.ai/blog/top-manufacturing...,8
288,Top Manufacturing SEO Companies for 2025,manufacturing seo companies,https://www.gushwork.ai/blog/top-manufacturing...,7


MERGE RANK AND WEEK ON WEEK QUERY & PAGE PERFORMANCE

In [27]:
# Stamp every row with today's UTC calendar date (the snapshot date of this run).
# Timestamp.now(tz="UTC") is the replacement for Timestamp.utcnow(), which
# recent pandas releases deprecate; both yield the same UTC date.
rank_df["extracted_date"] = pd.Timestamp.now(tz="UTC").date()

In [28]:
# Preview the rank snapshot, now including extracted_date.
rank_df.head()

Unnamed: 0,primary_keyword,secondary_keyword,hero_url,rank,extracted_date
0,marketing for manufacturing,marketing for manufacturing,https://www.gushwork.ai/blog/b2b-manufacturing...,100+,2025-12-17
1,marketing for manufacturing,marketing solutions for manufacturing companies,https://www.gushwork.ai/blog/b2b-manufacturing...,100+,2025-12-17
2,marketing for manufacturing,marketing strategy for manufacturing company,https://www.gushwork.ai/blog/b2b-manufacturing...,100+,2025-12-17
3,marketing for manufacturing,marketing to manufacturing companies,https://www.gushwork.ai/blog/b2b-manufacturing...,100+,2025-12-17
4,marketing for manufacturing,marketing manufacturing strategy,https://www.gushwork.ai/blog/b2b-manufacturing...,100+,2025-12-17


In [None]:
# DDL for the long-format rank history table: one row per
# (secondary_keyword, hero_url, extracted_date). `rank` is TEXT because it
# holds either a position or the "100+" placeholder.
RANK_HISTORY_DDL = text("""
        CREATE TABLE IF NOT EXISTS "gist"."gush_serper_rank_history" (
            primary_keyword     TEXT,
            secondary_keyword   TEXT NOT NULL,
            hero_url            TEXT NOT NULL,
            rank                TEXT,
            extracted_date      DATE NOT NULL,
            PRIMARY KEY (secondary_keyword, hero_url, extracted_date)
        );
    """)

# engine.begin() commits on success and rolls back on error.
with engine.begin() as connection:
    connection.execute(RANK_HISTORY_DDL)

In [None]:
# Attach the freshly extracted rank to the wide query/page performance table.
# final_wide_df is NOT built anywhere in this notebook, so fail with an
# explicit message (same guard style as the write cells) instead of a bare
# NameError when the kernel does not already hold it.
assert 'final_wide_df' in globals(), (
    "final_wide_df not found. It is not built in this notebook; "
    "run the query/page performance transform that defines it first."
)

# 1) Keep only join keys + rank, dedupe so the join cannot multiply rows
rank_clean = (
    rank_df[["secondary_keyword", "hero_url", "rank"]]
    .drop_duplicates(subset=["secondary_keyword", "hero_url"], keep="first")
)

# 2) LEFT JOIN: every row of final_wide_df is kept; unmatched rows get a null rank
merged = final_wide_df.merge(
    rank_clean,
    on=["secondary_keyword", "hero_url"],
    how="left"
)

# 3) Reorder so: primary_keyword, secondary_keyword, hero_url, volume, rank, then everything else
front = ["primary_keyword", "secondary_keyword", "hero_url", "volume", "rank"]
rest  = [c for c in merged.columns if c not in front]
merged = merged[front + rest]

merged.head()


Unnamed: 0,primary_keyword,secondary_keyword,hero_url,volume,rank,21st December_clicks,21st December_impressions,7th December_clicks,7th December_impressions,30th November_clicks,...,27th July_impressions,20th July_clicks,20th July_impressions,13th July_clicks,13th July_impressions,6th July_clicks,6th July_impressions,29th June_clicks,29th June_impressions,blog_type
0,Industrial SEO,Industrial SEO,https://www.gushwork.ai/blog/seo-industrial-co...,1460,100+,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,core
1,Industrial SEO,how to improve seo on an industrial website,https://www.gushwork.ai/blog/seo-industrial-co...,20,100+,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,core
2,Industrial SEO,industrial & manufacturer seo,https://www.gushwork.ai/blog/seo-industrial-co...,10,100+,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,core
3,Industrial SEO,industrial company seo,https://www.gushwork.ai/blog/seo-industrial-co...,10,100+,0,15,0,0,0,...,0,0,0,0,0,0,0,0,0,core
4,Industrial SEO,industrial company seo program,https://www.gushwork.ai/blog/seo-industrial-co...,10,100+,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,core


In [None]:
# Rows of the merged table whose rank is not the "100+" placeholder.
# Requires `merged` from the merge cell above to exist in the kernel.
merged[merged["rank"] != "100+"]

NameError: name 'merged' is not defined

WRITE QUERY AND PAGE PERFORMANCE

In [None]:
# --- Write merged to Postgres (REPLACE table) + refresh view, handling dependencies ---
#
# The view drop, table replace, index creation and view recreation run inside
# ONE transaction. If the load fails part-way, everything rolls back and the
# previous table and view stay in place; previously each step committed on its
# own, so a failed load left the view dropped.
# The engine is always disposed, including on error.

import os

import pandas as pd
from sqlalchemy import create_engine, text

# ───────────── DB config ─────────────
# DB_URL environment variable wins; the literal is a fallback only.
engine = create_engine(
    os.getenv(
        "DB_URL",
        "postgresql+psycopg2://airbyte_user:airbyte_user_password@"
        "gw-rds-prod.celzx4qnlkfp.us-east-1.rds.amazonaws.com:5432/gw_prod",
    )
)

TABLE_SCHEMA = "gist"
TABLE_NAME   = "gist_gush_query_page_seo"
VIEW_NAME    = "vw_gist_gush_query_page_seo"

try:
    # ───────────── DataFrame to load ─────────────
    assert 'merged' in globals(), "merged not found. Run the transform cell first."
    df = merged.copy()
    if df.empty:
        print("🛑 merged is empty; nothing to load.")
        raise SystemExit

    # Ensure metric cols are integers so PG creates BIGINT (not DOUBLE)
    for c in df.columns:
        if str(c).endswith("_clicks") or str(c).endswith("_impressions"):
            df[c] = df[c].fillna(0).astype("int64")

    with engine.begin() as conn:
        # Ensure schema exists
        conn.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{TABLE_SCHEMA}";'))

        # 1) Drop the dependent view (if it exists) so we can REPLACE the table
        conn.execute(text(f'DROP VIEW IF EXISTS "{TABLE_SCHEMA}"."{VIEW_NAME}";'))

        # 2) REPLACE the table so column order matches the DataFrame every run.
        #    Passing the open connection keeps the load inside this transaction.
        df.to_sql(
            name=TABLE_NAME,
            con=conn,
            schema=TABLE_SCHEMA,
            if_exists="replace",   # drop & recreate the base table
            index=False,
            method="multi",
            chunksize=5_000,
        )

        # (Optional) index on hero_url; the replace above dropped any old one.
        conn.execute(text(
            f'CREATE INDEX IF NOT EXISTS ix_{TABLE_NAME}_hero_url '
            f'ON "{TABLE_SCHEMA}"."{TABLE_NAME}" ("hero_url");'
        ))

        # 3) Recreate the passthrough view
        conn.execute(text(f'''
            CREATE OR REPLACE VIEW "{TABLE_SCHEMA}"."{VIEW_NAME}" AS
            SELECT * FROM "{TABLE_SCHEMA}"."{TABLE_NAME}";
        '''))

    # Reported only after the transaction has committed.
    print(f"✅ replaced {TABLE_SCHEMA}.{TABLE_NAME} with {len(df)} rows and {len(df.columns)} columns")
    print(f"🪟 view {TABLE_SCHEMA}.{VIEW_NAME} recreated.")

    # 4) Analyze for planner stats
    with engine.begin() as conn:
        conn.execute(text(f'ANALYZE "{TABLE_SCHEMA}"."{TABLE_NAME}";'))
finally:
    engine.dispose()


✅ replaced gist.gist_gush_query_page_seo with 776 rows and 27 columns
🪟 view gist.vw_gist_gush_query_page_seo recreated.


GET PAGE PERFORMANCE

In [None]:
# --- Strict Sunday-to-Sunday 28D aggregation (includes upcoming Sunday tail) ---

import pandas as pd
from IPython.display import display

# ---------- Helpers ----------
def prepare_pages_dict(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce the keyword/page dictionary to one row per usable hero_url.

    Works on a copy of ``df``. Column names are whitespace-stripped; any of
    ``hero_url``, ``last_updated_date`` and ``status`` that is missing is
    added as an all-NA column. ``last_updated_date`` is parsed to datetime
    (unparseable values become NaT). Rows whose ``hero_url`` is null, empty,
    ``"New"`` or a lone backslash are dropped, and for repeated URLs the last
    occurrence wins.

    Returns:
        A frame with exactly the columns hero_url, last_updated_date, status.
    """
    wanted = ["hero_url", "last_updated_date", "status"]

    pages = df.copy()
    pages.columns = pages.columns.str.strip()

    for name in wanted:
        if name not in pages.columns:
            pages[name] = pd.NA

    pages["last_updated_date"] = pd.to_datetime(pages["last_updated_date"], errors="coerce")

    url = pages["hero_url"]
    usable = url.notna() & (url != "") & (url != "New") & (url != "\\")

    deduped = pages.loc[usable].drop_duplicates(subset=["hero_url"], keep="last")
    return deduped[wanted]

def ordinal(n: int) -> str:
    """Return ``n`` followed by its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, ...)."""
    if 11 <= n % 100 <= 13:
        # 11, 12 and 13 (and 111, 212, ...) always take "th".
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return f"{n}{suffix}"

def label_for_anchor(ts: pd.Timestamp) -> str:
    """Build the column-label prefix for an anchor date, e.g. "7th September".

    The label is the ordinal day of month followed by the full month name;
    the year is not part of the label.
    """
    day_part = ordinal(ts.day)
    month_part = ts.strftime('%B')
    return f"{day_part} {month_part}"

def compute_28d_sums(pages_join_df: pd.DataFrame, anchor: pd.Timestamp) -> pd.DataFrame:
    """Sum clicks and impressions per hero_url over the 28 days ending at ``anchor``.

    The window is inclusive on both ends: ``anchor - 27 days`` through
    ``anchor``. The two metric columns are named after the anchor's label
    ("<label>_clicks", "<label>_impressions").

    Returns:
        One row per hero_url that has data in the window, or an empty frame
        with the three expected columns when nothing falls inside it.
    """
    window_start = anchor - pd.Timedelta(days=27)  # 27 days back + anchor day = 28 days
    dates = pd.to_datetime(pages_join_df["date"])
    in_window = (dates >= window_start) & (dates <= anchor)
    window_rows = pages_join_df.loc[in_window, ["hero_url", "clicks", "impressions"]].copy()

    label = label_for_anchor(anchor)
    clicks_col = f"{label}_clicks"
    impressions_col = f"{label}_impressions"

    if window_rows.empty:
        return pd.DataFrame(columns=["hero_url", clicks_col, impressions_col])

    named_sums = {
        clicks_col: ("clicks", "sum"),
        impressions_col: ("impressions", "sum"),
    }
    return window_rows.groupby("hero_url", as_index=False).agg(**named_sums)

# ---------- Inputs: use your existing DFs ----------
# dict_raw_df and gsc_page_daily_df must already exist in the session.

pages_df = prepare_pages_dict(dict_raw_df)

# Aggregate GSC to one row per (date, page)
gsc_day = (
    gsc_page_daily_df
    .groupby(["date","page"], as_index=False)
    .agg(clicks=("clicks","sum"), impressions=("impressions","sum"))
)

# Exact join: hero_url == page (no URL normalisation is applied)
pages_join_df = (
    gsc_day.merge(
        pages_df.rename(columns={"hero_url": "page"}),
        on="page",
        how="inner"
    )
    .rename(columns={"page": "hero_url"})
)

if pages_join_df.empty:
    raise SystemExit("No exact matches between hero_url and GSC.page. Check values.")

# ---------- Strict Sunday anchors + upcoming Sunday tail (tz-naive throughout) ----------
START_ANCHOR_STR = "2025-06-29"  # must be a Sunday (first weekly anchor)

anchor_start = pd.to_datetime(START_ANCHOR_STR).normalize()                 # tz-naive
# Enforce the requirement above: every later anchor is start + k*7 days, so a
# non-Sunday start would silently shift the whole weekly grid.
if anchor_start.weekday() != 6:
    raise ValueError(
        f"START_ANCHOR_STR must be a Sunday, got {START_ANCHOR_STR} "
        f"({anchor_start.day_name()})"
    )
max_gsc_date = pd.to_datetime(gsc_page_daily_df["date"]).max().normalize()  # tz-naive

# Last Sunday <= max GSC date
# weekday(): Mon=0,...,Sun=6 → days since last Sunday = (weekday+1)%7
days_since_sunday = (max_gsc_date.weekday() + 1) % 7
last_sunday = max_gsc_date - pd.Timedelta(days=days_since_sunday)

# tz-naive midnight of today's UTC date. Timestamp.now(tz="UTC") replaces
# Timestamp.utcnow(), which recent pandas releases deprecate.
today = pd.Timestamp(pd.Timestamp.now(tz="UTC").date())
days_until_sunday = (6 - today.weekday()) % 7       # Sun=6
upcoming_sunday = today + pd.Timedelta(days=days_until_sunday)  # tz-naive

# Build weekly Sundays up to last_sunday, then union upcoming_sunday as a tail if it is newer
anchors = pd.date_range(start=anchor_start, end=last_sunday, freq="7D")
latest_anchor = anchors.max() if len(anchors) else anchor_start
if upcoming_sunday > latest_anchor:
    anchors = anchors.union(pd.DatetimeIndex([upcoming_sunday])).sort_values()

print(
    f"Sundays from {anchor_start.date()} to {anchors.max().date()} "
    f"(max GSC date: {max_gsc_date.date()}, upcoming: {upcoming_sunday.date()})"
)

# ---------- Compute 28D sums and build wide table ----------
# Start from every page in the dictionary so pages without GSC data still get a row.
wide = pages_df[["hero_url", "last_updated_date", "status"]].copy()

frames = [compute_28d_sums(pages_join_df, a) for a in anchors]
for dfw in frames:
    wide = wide.merge(dfw, on="hero_url", how="left")

# Pages with no data in a window come out of the left joins as NaN; report them as 0.
metric_cols = [c for c in wide.columns if c.endswith("_clicks") or c.endswith("_impressions")]
if metric_cols:
    wide[metric_cols] = wide[metric_cols].fillna(0).astype("int64")

# Order columns: hero_url, last_updated_date, status, then weekly pairs (latest Sunday first)
ordered_cols = ["hero_url", "last_updated_date", "status"]
for a in anchors[::-1]:  # latest first (so upcoming Sunday column appears first)
    lbl = label_for_anchor(a)
    ordered_cols += [f"{lbl}_clicks", f"{lbl}_impressions"]
ordered_cols = [c for c in ordered_cols if c in wide.columns]

final_pages_wide_df = wide.reindex(columns=ordered_cols)

# ---------- Output ----------
print(f"Rows: {final_pages_wide_df.shape[0]}, Cols: {final_pages_wide_df.shape[1]}")
display(final_pages_wide_df.head(10))


Sundays from 2025-06-29 to 2025-09-07 (max GSC date: 2025-09-03, upcoming: 2025-09-07)
Rows: 43, Cols: 25


Unnamed: 0,hero_url,last_updated_date,status,7th September_clicks,7th September_impressions,31st August_clicks,31st August_impressions,24th August_clicks,24th August_impressions,17th August_clicks,...,27th July_clicks,27th July_impressions,20th July_clicks,20th July_impressions,13th July_clicks,13th July_impressions,6th July_clicks,6th July_impressions,29th June_clicks,29th June_impressions
0,/,NaT,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,https://www.gushwork.ai/blog/b2b-manufacturing...,2025-07-30,new,4,3351,4,3926,3,3314,3,...,0,0,0,0,0,0,0,0,0,0
2,https://www.gushwork.ai/blog/best-manufacturin...,2025-08-26,updated,4,18518,4,19241,4,15601,3,...,0,10969,1,10328,1,7648,1,4762,1,2508
3,https://www.gushwork.ai/blog/boost-strategy-se...,2025-07-08,not updated,1,6435,1,6411,1,2839,0,...,0,440,0,402,0,82,0,0,0,0
4,https://www.gushwork.ai/blog/digital-marketing...,2025-08-13,new,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,https://www.gushwork.ai/blog/seo-industrial-co...,2025-08-26,new,0,35,0,19,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,https://www.gushwork.ai/blog/effective-strateg...,2025-08-26,updated,0,57,0,43,0,60,0,...,0,179,0,152,0,110,0,98,0,0
7,https://www.gushwork.ai/blog/content-marketing...,2025-08-26,new,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,https://www.gushwork.ai/blog/ppc-agency-servic...,2025-08-25,new,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,https://www.gushwork.ai/blog/inbound-manufactu...,2025-08-25,new,0,7,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0


WRITE PAGE PERFORMANCE

In [None]:
# --- Write final_pages_wide_df to Postgres (REPLACE table each run) + refresh view ---
#
# The view drop, table replace, index creation and view recreation run inside
# ONE transaction. If the load fails part-way, everything rolls back and the
# previous table and view stay in place; previously each step committed on its
# own, so a failed load left the view dropped.
# The engine is always disposed, including on error.

import os

import pandas as pd
from sqlalchemy import create_engine, text

# ───────────── DB config ─────────────
# DB_URL environment variable wins; the literal is a fallback only.
engine = create_engine(
    os.getenv(
        "DB_URL",
        "postgresql+psycopg2://airbyte_user:airbyte_user_password@"
        "gw-rds-prod.celzx4qnlkfp.us-east-1.rds.amazonaws.com:5432/gw_prod",
    )
)

TABLE_SCHEMA = "gist"
TABLE_NAME   = "gist_gush_page_seo"
VIEW_NAME    = "vw_gist_gush_page_seo"

try:
    # ───────────── DataFrame to load ─────────────
    assert 'final_pages_wide_df' in globals(), "final_pages_wide_df not found. Run the transform cell first."
    df = final_pages_wide_df.copy()
    if df.empty:
        print("🛑 final_pages_wide_df is empty; nothing to load.")
        raise SystemExit

    # Cast metric columns so PG creates BIGINT (integers) instead of DOUBLE
    for c in df.columns:
        s = str(c)
        if s.endswith("_clicks") or s.endswith("_impressions"):
            df[c] = df[c].fillna(0).astype("int64")

    with engine.begin() as conn:
        # Ensure schema exists
        conn.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{TABLE_SCHEMA}";'))

        # 1) Drop the dependent view (if it exists) so we can REPLACE the table
        conn.execute(text(f'DROP VIEW IF EXISTS "{TABLE_SCHEMA}"."{VIEW_NAME}";'))

        # 2) REPLACE the table so column order matches the DataFrame every run.
        #    Passing the open connection keeps the load inside this transaction.
        df.to_sql(
            name=TABLE_NAME,
            con=conn,
            schema=TABLE_SCHEMA,
            if_exists="replace",   # drop & recreate base table; preserves DF column order
            index=False,
            method="multi",
            chunksize=5_000,
        )

        # (Optional) index on hero_url; the replace above dropped any old one.
        conn.execute(text(
            f'CREATE INDEX IF NOT EXISTS ix_{TABLE_NAME}_hero_url '
            f'ON "{TABLE_SCHEMA}"."{TABLE_NAME}" ("hero_url");'
        ))

        # 3) Recreate the passthrough view
        conn.execute(text(f'''
            CREATE OR REPLACE VIEW "{TABLE_SCHEMA}"."{VIEW_NAME}" AS
            SELECT * FROM "{TABLE_SCHEMA}"."{TABLE_NAME}";
        '''))

    # Reported only after the transaction has committed.
    print(f"✅ replaced {TABLE_SCHEMA}.{TABLE_NAME} with {len(df)} rows and {len(df.columns)} columns (ordered to match DataFrame)")
    print(f"🪟 view {TABLE_SCHEMA}.{VIEW_NAME} recreated.")

    # 4) Analyze for planner stats
    with engine.begin() as conn:
        conn.execute(text(f'ANALYZE "{TABLE_SCHEMA}"."{TABLE_NAME}";'))
finally:
    engine.dispose()


✅ replaced gist.gist_gush_page_seo with 43 rows and 25 columns (ordered to match DataFrame)
🪟 view gist.vw_gist_gush_page_seo recreated.
