<a href="https://colab.research.google.com/github/ShikharV010/gist_daily_runs/blob/main/Gush_SEO_Rank_Tracker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
pip install pandas sqlalchemy psycopg2-binary python-dateutil gspread oauth2client



In [21]:
# --- CONFIG ---
import os
import pandas as pd
from sqlalchemy import create_engine, text

DB_URL = "postgresql+psycopg2://airbyte_user:airbyte_user_password@gw-rds-prod.celzx4qnlkfp.us-east-1.rds.amazonaws.com:5432/gw_prod"

SCHEMA_GSC  = "airbyte_ingestion"
TBL_GSC_PQD = "gush_gsc_page_query_daily"  # (kept if you still need query-level)
TBL_GSC_PD  = "gush_gsc_page_daily"        # <-- new: page-level table

SCHEMA_DICT = "airbyte_ingestion"
TABLE_DICT  = "gtm_seo_gush_seo_pages"
# TABLE_BLOG_DICT = '"seo_tracker_BlogType"'

engine = create_engine(DB_URL)


In [22]:
# --- READ GSC: page_query_daily ---
with engine.connect() as con:
    gsc_pqd_sql = text(f"""
        SELECT
            date::date          AS date,
            page                AS page,
            query               AS query,
            clicks::bigint      AS clicks,
            impressions::bigint AS impressions,
            ctr::numeric        AS ctr,
            position::numeric   AS position
        FROM {SCHEMA_GSC}.{TBL_GSC_PQD}
        WHERE date IS NOT NULL
    """)
    gsc_page_query_daily_df = pd.read_sql(gsc_pqd_sql, con)

# --- READ GSC: page_daily (use this for page-level performance) ---
with engine.connect() as con:
    gsc_pd_sql = text(f"""
        SELECT
            date::date          AS date,
            page                AS page,
            clicks::bigint      AS clicks,
            impressions::bigint AS impressions
        FROM {SCHEMA_GSC}.{TBL_GSC_PD}
        WHERE date IS NOT NULL
    """)
    gsc_page_daily_df = pd.read_sql(gsc_pd_sql, con)

# --- READ Pages/Keywords (only the columns you need; exact hero_url strings) ---
with engine.connect() as con:
    dict_sql = text(f"""
        SELECT
            primary_keyword,
            secondary_keyword,
            hero_url,
            volume,
            last_updated_date,
            status
        FROM {SCHEMA_DICT}.{TABLE_DICT}
        WHERE hero_url IS NOT NULL
          AND hero_url <> ''
          AND hero_url <> 'New'
          AND hero_url <> '\\'
    """)
    dict_raw_df = pd.read_sql(dict_sql, con)

#display(gsc_page_daily_df.head(3))
#display(dict_raw_df.head(3))

# with engine.connect() as con:
#     dict_blog_sql = text(f"""
#         SELECT DISTINCT blog_url,
#               regexp_replace(blog_url, '^https?://[^/]+', '') AS updated_url,
#               blog_type
#           FROM airbyte_ingestion."seo_tracker_BlogType"
#     """)
#     dict_blogtype_df = pd.read_sql(dict_blog_sql, con)


with engine.connect() as con:
    dict_blog_sql = text("""
        SELECT blog_url,
               regexp_replace(blog_url, '^https?://[^/]+', '') AS updated_url,
               blog_type
        FROM airbyte_ingestion."gush_blogs_Monish"

        UNION

        SELECT blog_url,
               regexp_replace(blog_url, '^https?://[^/]+', '') AS updated_url,
               blog_type
        FROM airbyte_ingestion."gush_blogs_Preksha"
    """)

    dict_blogtype_df = pd.read_sql(dict_blog_sql, con)

In [23]:
display(dict_blogtype_df)

Unnamed: 0,blog_url,updated_url,blog_type
0,https://www.gushwork.ai/blog/locksmith-leads,/blog/locksmith-leads,scale
1,https://www.gushwork.ai/blog/affordable-seo-fo...,/blog/affordable-seo-for-hvac-contractors-stra...,scale
2,https://www.gushwork.ai/blog/manufacturing-sal...,/blog/manufacturing-sales-training,scale
3,https://www.gushwork.ai/blog/b2b-content-writi...,/blog/b2b-content-writing-beginners-guide,scale
4,https://www.gushwork.ai/blog/best-seo-software...,/blog/best-seo-software-for-agencies,scale
...,...,...,...
254,https://www.gushwork.ai/blog/content-marketing...,/blog/content-marketing-for-distributors,scale
255,https://www.gushwork.ai/blog/guerrilla-marketi...,/blog/guerrilla-marketing-ideas-for-trade-shows,scale
256,https://www.gushwork.ai/trade-show-roi-calculator,/trade-show-roi-calculator,scale
257,https://www.gushwork.ai/blog/window-and-door-c...,/blog/window-and-door-contractor-marketing,scale


GET QUERY AND PAGE PERFORMANCE

In [24]:
# -*- coding: utf-8 -*-
"""
WEEKLY SEO RANK TRACKING PIPELINE
---------------------------------
This script fetches Google rankings for each (secondary_keyword, hero_url)
and stores them in a long-format history table in PostgreSQL.

Key features:
- Does NOT modify or replace any existing SEO tables.
- Creates and maintains its own history table: gist.gush_rank_history.
- Appends new rows each run (one row per keyword per date).
- Allows filtering to only "core" blogs for testing.
"""

import os
import pandas as pd
from sqlalchemy import create_engine, text

# ─────────────────────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────────────────────
DB_URL = os.getenv(
    "DB_URL",
    "postgresql+psycopg2://airbyte_user:airbyte_user_password@"
    "gw-rds-prod.celzx4qnlkfp.us-east-1.rds.amazonaws.com:5432/gw_prod"
)

SCHEMA_GSC = "airbyte_ingestion"
TABLE_GSC  = "gush_gsc_page_query_daily"

SCHEMA_DICT = "airbyte_ingestion"
TABLE_DICT  = "gtm_seo_gush_seo_pages"

df_keywords = dict_raw_df[["primary_keyword", "secondary_keyword", "hero_url"]]

df_keywords.columns = df_keywords.columns.str.lower()
df_keywords = df_keywords.drop_duplicates()

In [25]:
df_keywords.head()

Unnamed: 0,primary_keyword,secondary_keyword,hero_url
0,marketing for manufacturing,marketing for manufacturing,https://www.gushwork.ai/blog/b2b-manufacturing...
1,marketing for manufacturing,marketing solutions for manufacturing companies,https://www.gushwork.ai/blog/b2b-manufacturing...
2,marketing for manufacturing,marketing strategy for manufacturing company,https://www.gushwork.ai/blog/b2b-manufacturing...
3,marketing for manufacturing,marketing to manufacturing companies,https://www.gushwork.ai/blog/b2b-manufacturing...
4,marketing for manufacturing,marketing manufacturing strategy,https://www.gushwork.ai/blog/b2b-manufacturing...


In [26]:
# Normalize formatting
dict_blogtype_df.columns = dict_blogtype_df.columns.str.lower()
dict_blogtype_df['blog_type'] = dict_blogtype_df['blog_type'].str.lower()

# Rename updated_url -> hero_url to match wide table
dict_blogtype_df = dict_blogtype_df.rename(columns={"blog_url": "hero_url"})

In [27]:
# Merge onto final output
final_long_df = df_keywords.merge(
    dict_blogtype_df[['hero_url', 'blog_type']],
    on='hero_url',
    how='left'
)

# Filter only scale blogs --> Remove this filter once the pipeline is set
# final_long_df = final_long_df[final_long_df['blog_type'] == 'scale']

In [28]:
# Preserve the original copy for validation purpose:
final_long_df_original = final_long_df.copy()

In [29]:
mask = final_long_df["secondary_keyword"].astype(str).str.strip().eq("None")

print("Rows with secondary_keyword == 'None':", mask.sum())

# Remove them
final_long_df = final_long_df[~mask].reset_index(drop=True)

print("Final row count:", len(final_long_df))

Rows with secondary_keyword == 'None': 4
Final row count: 1719


GET RANK FROM SERPER

In [30]:
import requests
import pandas as pd

# config
SERPER_API_KEY = "6769b8e78f7e96c5ff1793582bebbe532085d6be"   # replace with your real key
API_URL = "https://google.serper.dev/search"

headers = {
    "X-API-KEY": SERPER_API_KEY,
    "Content-Type": "application/json"
}


In [31]:
# run over all keywords
# df_keywords = dict_raw_df.copy()

# run over all the keywords for the core blogs
final_long_df = final_long_df[["primary_keyword", "secondary_keyword", "hero_url"]].drop_duplicates()
print(f"Secondary keywords (core pages) selected for ranking: {len(final_long_df)}")

Secondary keywords (core pages) selected for ranking: 1719


In [32]:
# ------------------------------------------------------------
#  RANK EXTRACTION SECTION
# ------------------------------------------------------------

results = []

for idx, row in final_long_df.iterrows():
    keyword = row["secondary_keyword"].strip()
    hero_url = row["hero_url"].strip()

    payload = {
        "q": keyword,
        "gl": "us",
        "hl": "en",
        "num": 100
    }

    try:
        res = requests.post(API_URL, headers=headers, json=payload, timeout=20)
        print(f"{idx}: status {res.status_code}")

        res.raise_for_status()
        data = res.json()

        # Find hero_url in organic results
        rank = None
        for i, r in enumerate(data.get("organic", []), start=1):
            if hero_url in r.get("link", ""):
                rank = i
                break

        rank_value = rank if rank is not None else "100+"

    except Exception as e:
        print(f"Error for {keyword}: {e}")
        rank_value = "100+"

    results.append({
        "primary_keyword": row["primary_keyword"],
        "secondary_keyword": keyword,
        "hero_url": hero_url,
        "rank": rank_value
    })

rank_df = pd.DataFrame(results)
print("\nExtracted ranks:")
print(rank_df.head())

0: status 200
1: status 200
2: status 200
3: status 200
4: status 200
5: status 200
6: status 200
7: status 200
8: status 200
9: status 200
10: status 200
11: status 200
12: status 200
13: status 200
14: status 200
15: status 200
16: status 200
17: status 200
18: status 200
19: status 200
20: status 200
21: status 200
22: status 200
23: status 200
24: status 200
25: status 200
26: status 200
27: status 200
28: status 200
29: status 200
30: status 200
31: status 200
32: status 200
33: status 200
34: status 200
35: status 200
36: status 200
37: status 200
38: status 200
39: status 200
40: status 200
41: status 200
42: status 200
43: status 200
44: status 200
45: status 200
46: status 200
47: status 200
48: status 200
49: status 200
50: status 200
51: status 200
52: status 200
53: status 200
54: status 200
55: status 200
56: status 200
57: status 200
58: status 200
59: status 200
60: status 200
61: status 200
62: status 200
63: status 200
64: status 200
65: status 200
66: status 200
67: s

In [33]:
# rank_df[rank_df["rank"] != "100+"]
#rank_df = rank_df.drop(columns=["extracted_date"])
rank_df.head()

Unnamed: 0,primary_keyword,secondary_keyword,hero_url,rank
0,seo for plumbers,plumber seo expert,https://www.gushwork.ai/blog/seo-for-plumbers,100+
1,manufacturing web design,manufacturing web design,https://www.gushwork.ai/blog/best-manufacturin...,100+
2,manufacturing web design,manufacturing website design,https://www.gushwork.ai/blog/best-manufacturin...,100+
3,manufacturing web design,web design manufacturing,https://www.gushwork.ai/blog/best-manufacturin...,100+
4,manufacturing web design,manufacturing website,https://www.gushwork.ai/blog/best-manufacturin...,100+


In [34]:
from datetime import date
today = date.today()

rank_df = rank_df.copy()   # prevent SettingWithCopy issues

rank_df.loc[:, "extracted_date"] = today
rank_df["extracted_date"] = pd.to_datetime(rank_df["extracted_date"]).dt.date

rank_df = rank_df[
    ["primary_keyword", "secondary_keyword", "hero_url", "rank", "extracted_date"]
]

In [35]:
engine.dispose()
engine = create_engine(DB_URL, pool_pre_ping=True)

today = pd.Timestamp.utcnow().date()

with engine.begin() as conn:
    rows = conn.execute(text("""
        SELECT COUNT(*)
        FROM gist.gush_serper_rank_history
        WHERE extracted_date = :d
    """), {"d": today}).scalar()

print("Rows already present for", today, "=", rows)

Rows already present for 2025-12-23 = 0


In [36]:
with engine.begin() as conn:
    deleted = conn.execute(text("""
        DELETE FROM gist.gush_serper_rank_history
        WHERE extracted_date = :d
    """), {"d": today}).rowcount

print("Deleted:", deleted)

Deleted: 0


In [37]:
dupes_inside_df = (
    rank_df[["secondary_keyword", "hero_url", "extracted_date", "rank"]]
    .duplicated()
    .sum()
)
print("Duplicates inside rank_df:", dupes_inside_df)

Duplicates inside rank_df: 0


In [38]:
rank_df = rank_df.drop_duplicates(
    subset=["secondary_keyword", "hero_url", "extracted_date"]
).reset_index(drop=True)

print("Rows after de-dup:", len(rank_df))

Rows after de-dup: 1719


In [39]:
# Reset broken transaction state
engine.dispose()
engine = create_engine(DB_URL)

rank_df.to_sql(
    name="gush_serper_rank_history",
    con=engine,
    schema="gist",
    if_exists="append",
    index=False,
    method="multi"
)

print(f"Inserted {len(rank_df)} rows.")

Inserted 1719 rows.
