<a href="https://colab.research.google.com/github/ShikharV010/gist_daily_runs/blob/main/Gush_SEO_Tracker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pandas sqlalchemy psycopg2-binary python-dateutil gspread oauth2client

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.10


In [2]:
# --- CONFIG ---
import os
import pandas as pd
from sqlalchemy import create_engine, text

# The SQLAlchemy URL is taken from the environment so that database credentials
# are never stored in the notebook or its git history. Expected form:
#   postgresql+psycopg2://<user>:<password>@<host>:5432/<database>
# In Colab, set it before running this cell (e.g. from the Secrets panel or
# `os.environ["GUSH_DB_URL"] = getpass.getpass()`).
DB_URL = os.environ.get("GUSH_DB_URL")
if not DB_URL:
    raise RuntimeError(
        "GUSH_DB_URL is not set. Export the SQLAlchemy connection URL "
        "before running this notebook."
    )

SCHEMA_GSC  = "airbyte_ingestion"
TBL_GSC_PQD = "gush_gsc_page_query_daily"  # query-level (has ctr, position)
TBL_GSC_PD  = "gush_gsc_page_daily_2"      # page-level (clicks, impressions, ctr)

SCHEMA_DICT = "airbyte_ingestion"
TABLE_DICT  = "gtm_seo_gush_seo_pages"

# Create SQLAlchemy engine
engine = create_engine(DB_URL)

# --- SQL: GSC page_query_daily (query-level) ---
# Schema/table names are interpolated from the constants above (trusted config,
# not user input); identifiers cannot be passed as bind parameters.
gsc_pqd_sql = text(f"""
    SELECT
        date::date          AS date,
        page                AS page,
        query               AS query,
        clicks::bigint      AS clicks,
        impressions::bigint AS impressions,
        ctr::numeric        AS ctr,
        position::numeric   AS position
    FROM {SCHEMA_GSC}.{TBL_GSC_PQD}
    WHERE date IS NOT NULL
""")

# --- SQL: GSC page_daily (page-level) ---
gsc_pd_sql = text(f"""
    SELECT
        date::date          AS date,
        page                AS page,
        clicks::bigint      AS clicks,
        impressions::bigint AS impressions,
        ctr::numeric        AS ctr
    FROM {SCHEMA_GSC}.{TBL_GSC_PD}
    WHERE date IS NOT NULL
""")

# --- SQL: Pages/Keywords dictionary (hero_url etc.) ---
# Rows whose hero_url is a placeholder ('', 'New', or a lone backslash) are excluded.
dict_sql = text(f"""
    SELECT
        primary_keyword,
        secondary_keyword,
        hero_url,
        volume,
        last_updated_date,
        status
    FROM {SCHEMA_DICT}.{TABLE_DICT}
    WHERE hero_url IS NOT NULL
      AND hero_url <> ''
      AND hero_url <> 'New'
      AND hero_url <> '\\'
""")

# --- READ: one connection serves all three queries ---
with engine.connect() as con:
    gsc_page_query_daily_df = pd.read_sql(gsc_pqd_sql, con)
    gsc_page_daily_df       = pd.read_sql(gsc_pd_sql, con)
    dict_raw_df             = pd.read_sql(dict_sql, con)

# --- (Optional) quick sanity checks ---
print("gsc_page_query_daily_df:", gsc_page_query_daily_df.shape, gsc_page_query_daily_df.columns.tolist())
print("gsc_page_daily_df:",       gsc_page_daily_df.shape,       gsc_page_daily_df.columns.tolist())
print("dict_raw_df:",             dict_raw_df.shape,             dict_raw_df.columns.tolist())

# Example peeks (comment out if not needed)
# print(gsc_page_query_daily_df.head(3))
# print(gsc_page_daily_df.head(3))
# print(dict_raw_df.head(3))


gsc_page_query_daily_df: (224947, 7) ['date', 'page', 'query', 'clicks', 'impressions', 'ctr', 'position']
gsc_page_daily_df: (19244, 5) ['date', 'page', 'clicks', 'impressions', 'ctr']
dict_raw_df: (1032, 6) ['primary_keyword', 'secondary_keyword', 'hero_url', 'volume', 'last_updated_date', 'status']


In [3]:
# Show the pages/keywords dictionary frame (dict_raw_df) loaded by the config cell above.
display(dict_raw_df)

Unnamed: 0,primary_keyword,secondary_keyword,hero_url,volume,last_updated_date,status
0,marketing for manufacturing,marketing for manufacturing,https://www.gushwork.ai/blog/b2b-manufacturing...,5760,"Jul 30, 2025",new
1,marketing for manufacturing,marketing solutions for manufacturing companies,https://www.gushwork.ai/blog/b2b-manufacturing...,10,"Jul 30, 2025",new
2,marketing for manufacturing,marketing strategy for manufacturing company,https://www.gushwork.ai/blog/b2b-manufacturing...,110,"Jul 30, 2025",new
3,marketing for manufacturing,marketing to manufacturing companies,https://www.gushwork.ai/blog/b2b-manufacturing...,30,"Jul 30, 2025",new
4,marketing for manufacturing,marketing manufacturing strategy,https://www.gushwork.ai/blog/b2b-manufacturing...,90,"Jul 30, 2025",new
...,...,...,...,...,...,...
1027,pre trade show marketing,best trade show marketing ideas,https://www.gushwork.ai/blog/pre-trade-show-ma...,20,,new
1028,pre trade show marketing,trade shows in marketing,https://www.gushwork.ai/blog/pre-trade-show-ma...,20,,new
1029,the role of digital marketing in trade show su...,the role of digital marketing in trade show su...,https://www.gushwork.ai/blog/role-of-digital-m...,10,,new
1030,the role of digital marketing in trade show su...,measuring the success of your trade show lead ...,https://www.gushwork.ai/blog/role-of-digital-m...,10,,new


GET QUERY AND PAGE PERFORMANCE

GET RANK

In [4]:
import os

import requests
import pandas as pd

# config
# The Serper API key is read from the environment; never commit it to the notebook.
SERPER_API_KEY = os.environ.get("SERPER_API_KEY")
if not SERPER_API_KEY:
    raise RuntimeError(
        "SERPER_API_KEY is not set. Export your Serper API key before running this cell."
    )
API_URL = "https://google.serper.dev/search"

MAX_RESULTS = 100      # number of organic results requested per keyword
NOT_RANKED  = "100+"   # stored when hero_url is not among the fetched results

headers = {
    "X-API-KEY": SERPER_API_KEY,
    "Content-Type": "application/json"
}


def find_rank(organic_results, hero_url):
    """Return the 1-based position of the first result whose link contains hero_url.

    Parameters
    ----------
    organic_results : list of dict
        The "organic" entries of a search response; each may carry a "link".
    hero_url : str
        URL to look for (substring match against each result's link).

    Returns
    -------
    int or None
        Position of the first matching result, or None when nothing matches
        or hero_url is empty (an empty string would otherwise match every link).
    """
    if not hero_url:
        return None
    for position, result in enumerate(organic_results, start=1):
        # `or ""` guards against a missing or null link in the response
        if hero_url in (result.get("link") or ""):
            return position
    return None


# run over all keywords
df_keywords = dict_raw_df.copy()

results = []

# One Session reuses the underlying HTTPS connection across all requests.
with requests.Session() as session:
    session.headers.update(headers)

    for idx, row in df_keywords.iterrows():
        keyword = str(row["secondary_keyword"]).strip()
        hero_url = str(row["hero_url"]).strip()

        payload = {
            "q": keyword,
            "gl": "us",
            "hl": "en",
            "num": MAX_RESULTS
        }

        # Default covers both "not found in the fetched results" and a failed request.
        # NOTE(review): API failures are therefore indistinguishable from
        # "not ranking" in rank_df; the error is only visible in the printed log.
        rank_value = NOT_RANKED

        try:
            res = session.post(API_URL, json=payload, timeout=20)
            print(f"{idx}: status {res.status_code}")   # print status for debugging

            res.raise_for_status()
            data = res.json()

            rank = find_rank(data.get("organic", []), hero_url)
            if rank is not None:
                rank_value = rank

        except Exception as e:
            # Best-effort: one bad keyword must not abort the whole run.
            print(f"Error for {keyword}: {e}")

        results.append({
            "secondary_keyword": keyword,
            "hero_url": hero_url,
            "rank": rank_value
        })

rank_df = pd.DataFrame(results)
print(rank_df)

0: status 200
1: status 200
2: status 200
3: status 200
4: status 200
5: status 200
6: status 200
7: status 200
8: status 200
9: status 200
10: status 200
11: status 200
12: status 200
13: status 200
14: status 200
15: status 200
16: status 200
17: status 200
18: status 200
19: status 200
20: status 200
21: status 200
22: status 200
23: status 200
24: status 200
25: status 200
26: status 200
27: status 200
28: status 200
29: status 200
30: status 200
31: status 200
32: status 200
33: status 200
34: status 200
35: status 200
36: status 200
37: status 200
38: status 200
39: status 200
40: status 200
41: status 200
42: status 200
43: status 200
44: status 200
45: status 200
46: status 200
47: status 200
48: status 200
49: status 200
50: status 200
51: status 200
52: status 200
53: status 200
54: status 200
55: status 200
56: status 200
57: status 200
58: status 200
59: status 200
60: status 200
61: status 200
62: status 200
63: status 200
64: status 200
65: status 200
66: status 200
67: s

In [5]:
# Show rank_df, the per-keyword rank results assembled by the previous cell.
display(rank_df)

Unnamed: 0,secondary_keyword,hero_url,rank
0,marketing for manufacturing,https://www.gushwork.ai/blog/b2b-manufacturing...,100+
1,marketing solutions for manufacturing companies,https://www.gushwork.ai/blog/b2b-manufacturing...,100+
2,marketing strategy for manufacturing company,https://www.gushwork.ai/blog/b2b-manufacturing...,100+
3,marketing to manufacturing companies,https://www.gushwork.ai/blog/b2b-manufacturing...,100+
4,marketing manufacturing strategy,https://www.gushwork.ai/blog/b2b-manufacturing...,100+
...,...,...,...
1027,best trade show marketing ideas,https://www.gushwork.ai/blog/pre-trade-show-ma...,100+
1028,trade shows in marketing,https://www.gushwork.ai/blog/pre-trade-show-ma...,100+
1029,the role of digital marketing in trade show su...,https://www.gushwork.ai/blog/role-of-digital-m...,100+
1030,measuring the success of your trade show lead ...,https://www.gushwork.ai/blog/role-of-digital-m...,100+


WRITE RANK

In [7]:
# --- Safe replace: load to staging, then TRUNCATE+INSERT into base table; keep dependent views/MVs intact ---

import os

import pandas as pd
from sqlalchemy import create_engine, text

# ───────────── DB config ─────────────
# The SQLAlchemy URL is taken from the environment so that database credentials
# are never stored in the notebook. Expected form:
#   postgresql+psycopg2://<user>:<password>@<host>:5432/<database>
DB_URL = os.environ.get("GUSH_DB_URL")
if not DB_URL:
    raise RuntimeError(
        "GUSH_DB_URL is not set. Export the SQLAlchemy connection URL "
        "before running this notebook."
    )

TABLE_SCHEMA = "gist"
BASE_TABLE   = "gist_gush_query_rank"        # base table
VIEW_NAME    = "vw_gist_gush_query_rank"     # view on top
STAGE_TABLE  = BASE_TABLE + "__staging"      # staging table name

# ───────────── DataFrame to load ─────────────
assert 'rank_df' in globals(), "rank_df not found. Run the transform cell first."
df = rank_df.copy()

if df.empty:
    print("🛑 rank_df is empty; nothing to load.")
    raise SystemExit

# The indexes created below need these columns; fail before touching the base table.
missing_cols = {"hero_url", "secondary_keyword"} - set(df.columns)
if missing_cols:
    raise ValueError(f"rank_df is missing required column(s): {sorted(missing_cols)}")

# Normalize metric dtypes a bit (only affects columns named *_clicks / *_impressions)
for c in df.columns:
    if str(c).endswith("_clicks") or str(c).endswith("_impressions"):
        df[c] = df[c].fillna(0).astype("int64")

engine = create_engine(DB_URL)
try:
    # Ensure schema exists
    with engine.begin() as conn:
        conn.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{TABLE_SCHEMA}";'))

    # 1) Write to a staging table (replace if it already exists)
    df.to_sql(
        name=STAGE_TABLE,
        con=engine,
        schema=TABLE_SCHEMA,
        if_exists="replace",   # creates/overwrites the staging table only
        index=False,
        method="multi",
        chunksize=5_000,
    )
    print(f"📦 staged → {TABLE_SCHEMA}.{STAGE_TABLE} ({len(df)} rows, {len(df.columns)} cols)")

    # Explicit column list for the INSERT ... SELECT (keeps order consistent)
    cols = [f'"{col}"' for col in df.columns]
    col_list = ", ".join(cols)

    # engine.begin() runs the whole swap in one transaction: if the INSERT fails,
    # the TRUNCATE is rolled back with it.
    with engine.begin() as conn:
        # Check if base table exists
        base_exists = conn.execute(text("""
            SELECT 1
            FROM information_schema.tables
            WHERE table_schema = :schema AND table_name = :table
            LIMIT 1
        """), {"schema": TABLE_SCHEMA, "table": BASE_TABLE}).scalar() is not None

        if base_exists:
            # 2a) TRUNCATE + INSERT to preserve dependencies (views/MVs)
            conn.execute(text(f'TRUNCATE TABLE "{TABLE_SCHEMA}"."{BASE_TABLE}";'))
            conn.execute(text(f'''
                INSERT INTO "{TABLE_SCHEMA}"."{BASE_TABLE}" ({col_list})
                SELECT {col_list}
                FROM "{TABLE_SCHEMA}"."{STAGE_TABLE}";
            '''))
            # Drop the staging table
            conn.execute(text(f'DROP TABLE "{TABLE_SCHEMA}"."{STAGE_TABLE}";'))
            print(f"✅ replaced data in {TABLE_SCHEMA}.{BASE_TABLE} via TRUNCATE+INSERT")
        else:
            # 2b) First-time: rename staging → base
            conn.execute(text(f'''
                ALTER TABLE "{TABLE_SCHEMA}"."{STAGE_TABLE}"
                RENAME TO "{BASE_TABLE}";
            '''))
            print(f"✅ created {TABLE_SCHEMA}.{BASE_TABLE} (renamed from staging)")

    # 3) Indexes (idempotent)
    with engine.begin() as conn:
        # helpful single-column indexes
        conn.execute(text(f'''
            CREATE INDEX IF NOT EXISTS ix_{BASE_TABLE}_hero_url
            ON "{TABLE_SCHEMA}"."{BASE_TABLE}" (hero_url);
        '''))
        conn.execute(text(f'''
            CREATE INDEX IF NOT EXISTS ix_{BASE_TABLE}_secondary_keyword
            ON "{TABLE_SCHEMA}"."{BASE_TABLE}" (secondary_keyword);
        '''))
        # Optional composite unique (best-effort; ignore if duplicates exist).
        # The inner BEGIN/EXCEPTION block swallows the failure inside the database
        # so the surrounding transaction is not aborted.
        conn.execute(text(f"""
            DO $$
            BEGIN
              BEGIN
                EXECUTE 'CREATE UNIQUE INDEX IF NOT EXISTS ux_{BASE_TABLE}_hero_url_secondary_keyword
                         ON "{TABLE_SCHEMA}"."{BASE_TABLE}" (hero_url, secondary_keyword)';
              EXCEPTION WHEN others THEN
                -- duplicates present; skip making it unique
                NULL;
              END;
            END$$;
        """))

    # 4) (Re)create passthrough view WITHOUT dropping it first
    with engine.begin() as conn:
        conn.execute(text(f'''
            CREATE OR REPLACE VIEW "{TABLE_SCHEMA}"."{VIEW_NAME}" AS
            SELECT * FROM "{TABLE_SCHEMA}"."{BASE_TABLE}";
        '''))
    print(f"🪟 view {TABLE_SCHEMA}.{VIEW_NAME} is in sync with the base table")

    # 5) Analyze for planner stats
    with engine.begin() as conn:
        conn.execute(text(f'ANALYZE "{TABLE_SCHEMA}"."{BASE_TABLE}";'))
finally:
    # Release pooled connections even when a step above raised.
    engine.dispose()

print("🎉 Done.")


📦 staged → gist.gist_gush_query_rank__staging (1032 rows, 3 cols)
✅ replaced data in gist.gist_gush_query_rank via TRUNCATE+INSERT
🪟 view gist.vw_gist_gush_query_rank is in sync with the base table
🎉 Done.
