This notebook merges the 5 files downloaded from the Google Keyword Planner app and then extracts the search intensity from Google Trends for each of the selected keywords. The links to the 5 keyword files from the Keyword Planner app can be found here:
1. buying/demand: https://drive.google.com/file/d/1q1jdp9DzMoQDDSa8-ww_PHjghkoecxFQ/view?usp=sharing
2. mortgage/financing: https://drive.google.com/file/d/1MOplzhKJKuS9JbHwyhYku-64lXTlioSf/view?usp=sharing
3. renting/affordability: https://drive.google.com/file/d/19-9BU7SVSgDnA9814XEAXhNGpWuAVzgD/view?usp=sharing
4. market awareness: https://drive.google.com/file/d/1CYPBYQps8IuAgioKey5pqH3b4AJF21iO/view?usp=sharing
5. economic policy: https://drive.google.com/file/d/1C8xZwsN4vK-uBq4tOuY5KG3aRcxDcZJf/view?usp=sharing

In [None]:
!pip install pytrends

Collecting pytrends
  Downloading pytrends-4.9.2-py3-none-any.whl.metadata (13 kB)
Downloading pytrends-4.9.2-py3-none-any.whl (15 kB)
Installing collected packages: pytrends
Successfully installed pytrends-4.9.2


Merging and Cleaning

In [None]:
import os
import pandas as pd
from google.colab import files

# Number of keywords kept per theme in the "top" export; the export is
# skipped when this is None.
TOP_N_PER_THEME = 300

# Ask the user for the Keyword Planner exports through the Colab upload widget
uploaded = files.upload()

# Keep only uploads whose (case-insensitive) name ends in .csv and
# contains "keyword"
keyword_files = []
for uploaded_name in uploaded.keys():
    lowered_name = uploaded_name.lower()
    if lowered_name.endswith(".csv") and "keyword" in lowered_name:
        keyword_files.append(uploaded_name)

if not keyword_files:
    no_files_message = (
        "I couldn't find any keyword CSVs in the uploaded files. "
        "Make sure the filenames include 'keywords' and end with .csv"
    )
    raise ValueError(no_files_message)

# Echo the accepted files so a wrong upload is spotted early
print("\nFiles loaded:")
print("\n".join(f" - {accepted_name}" for accepted_name in keyword_files))

def infer_theme(filename: str) -> str:
    """Derive a theme label from a keyword file's name.

    Only the lower-cased base name without its extension is inspected.
    Rules are tried in order and the first rule with a matching substring
    wins, so a name matching several rules gets the earliest theme.

    Args:
        filename: Path or bare name of the keyword file.

    Returns:
        One of "buying_demand", "mortgage_financing", "market_awareness",
        "renting_affordability", "economic_policy", or "unknown" when no
        rule matches.
    """
    stem = os.path.splitext(os.path.basename(filename.lower()))[0]

    # (substrings, theme) pairs; order matters because the first hit wins
    rules = (
        (("buy", "demand"), "buying_demand"),
        (("mortgage", "financing"), "mortgage_financing"),
        (("market", "awareness"), "market_awareness"),
        (("rent", "afford"), "renting_affordability"),
        (("policy", "economic"), "economic_policy"),
    )
    for needles, theme in rules:
        if any(needle in stem for needle in needles):
            return theme
    return "unknown"

def read_keyword_planner_file(path: str) -> pd.DataFrame:
    """Load one Keyword Planner export into a tidy keyword/volume table.

    Keyword Planner CSV exports are usually UTF-16 with tab separators.
    Only the keyword and average-monthly-searches columns are kept, and a
    theme label derived from the filename is attached to every row.

    Args:
        path: Location of the exported file.

    Returns:
        DataFrame with columns ``keyword`` (lower-cased, stripped),
        ``avg_monthly_searches`` (numeric, NaN when unparseable) and
        ``theme``, holding one row per distinct non-blank keyword.

    Raises:
        ValueError: If either expected column is absent from the file.
    """
    raw = pd.read_csv(path, encoding="utf-16", sep="\t")
    raw.columns = [c.strip() for c in raw.columns]

    needed = ["Keyword", "Avg. monthly searches"]
    missing = [c for c in needed if c not in raw.columns]
    if missing:
        raise ValueError(f"{path} is missing expected columns: {missing}")

    # Drop rows without a keyword BEFORE casting to str: astype(str) would
    # otherwise turn a missing value into the literal text "nan", which
    # passes the blank filter below and ends up as a bogus keyword.
    raw = raw.dropna(subset=["Keyword"])

    out = pd.DataFrame({
        "keyword": raw["Keyword"].astype(str).str.lower().str.strip(),
        # Remove thousands separators so values such as "1,000" parse
        "avg_monthly_searches": pd.to_numeric(
            raw["Avg. monthly searches"].astype(str).str.replace(",", "", regex=False),
            errors="coerce"
        )
    })

    out["theme"] = infer_theme(path)

    # Dropping blanks and retaining one row per keyword (first occurrence)
    out = out[out["keyword"].str.len() > 0].drop_duplicates(subset="keyword").reset_index(drop=True)
    return out

# Read each uploaded export and report how many keywords it contributed

frames = []
for upload_name in keyword_files:
    theme_frame = read_keyword_planner_file(upload_name)
    frames.append(theme_frame)

    with_volume = theme_frame["avg_monthly_searches"].count()
    print(f"Loaded {os.path.basename(upload_name)}: {len(theme_frame)} keywords ({with_volume} with search volume)")

stacked = pd.concat(frames, ignore_index=True)

# A keyword present in several themes is kept once: sorting by volume
# (descending) first means the surviving row is the highest-volume one.
ranked = stacked.sort_values("avg_monthly_searches", ascending=False)
deduplicated = ranked.drop_duplicates(subset="keyword", keep="first")
master = deduplicated.sort_values(["theme", "keyword"]).reset_index(drop=True)

# Write the full table and print a short summary

master_path = "master_keywords_with_volume.csv"
top_path = "master_keywords_top.csv"

master.to_csv(master_path, index=False)
print(f"\nSaved: {master_path}")

print("\nKeywords per theme:")
print(master.groupby("theme").size())

print("\nSearch volume summary (non-missing only):")
print(master["avg_monthly_searches"].dropna().describe())

# Optional export of the highest-volume keywords within each theme
export_top = TOP_N_PER_THEME is not None
if export_top:
    ranked_within_theme = (
        master.dropna(subset=["avg_monthly_searches"])
              .sort_values(["theme", "avg_monthly_searches"], ascending=[True, False])
    )
    topN = (
        ranked_within_theme.groupby("theme", as_index=False)
                           .head(TOP_N_PER_THEME)
                           .reset_index(drop=True)
    )
    topN.to_csv(top_path, index=False)
    print(f"\nSaved: {top_path} (top {TOP_N_PER_THEME} per theme)")

# Send the generated files to the browser
files.download(master_path)
if export_top:
    files.download(top_path)


Saving buying_demand_keywords.csv to buying_demand_keywords.csv
Saving economic_policy_keywords.csv to economic_policy_keywords.csv
Saving mortgage_financing_keywords.csv to mortgage_financing_keywords.csv
Saving market_awareness_keywords.csv to market_awareness_keywords.csv
Saving renting and affordability_keywords.csv to renting and affordability_keywords.csv

Files loaded:
 - buying_demand_keywords.csv
 - economic_policy_keywords.csv
 - mortgage_financing_keywords.csv
 - market_awareness_keywords.csv
 - renting and affordability_keywords.csv
Loaded buying_demand_keywords.csv: 2055 keywords (2054 with search volume)
Loaded economic_policy_keywords.csv: 3192 keywords (3192 with search volume)
Loaded mortgage_financing_keywords.csv: 4540 keywords (4530 with search volume)
Loaded market_awareness_keywords.csv: 2709 keywords (2665 with search volume)
Loaded renting and affordability_keywords.csv: 1367 keywords (1356 with search volume)

Saved: master_keywords_with_volume.csv

Keywords pe

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Selecting top 20 keywords from each theme

In [None]:
import pandas as pd

# Load the full master file
kw = pd.read_csv("master_keywords_with_volume.csv")

# Keeping only non-null search volumes, since ranking needs a volume
kw = kw.dropna(subset=["avg_monthly_searches"])

# Selecting the top 20 keywords per theme: sort by volume within each theme,
# then take the first 20 rows of every theme group
top20 = (kw.sort_values(["theme","avg_monthly_searches"], ascending=[True, False])
            .groupby("theme", as_index=False)
            .head(20)
            .reset_index(drop=True))

print(top20["theme"].value_counts())
top20.to_csv("master_keywords_top20.csv", index=False)
# Report the real row count: a theme with fewer than 20 keywords that have
# a volume makes the total smaller than 100.
print(f"Saved: master_keywords_top20.csv ({len(top20)} rows)")


theme
buying_demand            20
economic_policy          20
market_awareness         20
mortgage_financing       20
renting_affordability    20
Name: count, dtype: int64
Saved: master_keywords_top20.csv (100 rows)


Extracting the selected keyword search intensity from Google Trends

In [None]:
# Google Trends input preparation
import pandas as pd, numpy as np, time
from pytrends.request import TrendReq

# Request settings; GEO and TIMEFRAME are handed to pytrends when the
# payload is built in the download cell.
KEYWORD_FILE = "master_keywords_top20.csv"
GEO = "GB"
TIMEFRAME = "2005-01-01 2025-06-30"

# Normalise the keyword text (strip, lower-case) and keep one row per keyword
kw = pd.read_csv(KEYWORD_FILE)
kw = (
    kw.assign(keyword=kw["keyword"].astype(str).str.strip().str.lower())
      .drop_duplicates(subset="keyword")
      .reset_index(drop=True)
)
print("Total keywords:", len(kw))


Total keywords: 100


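Downloading the data in batches of 5 keywords (requested as weekly data, but the run below returns 246 observations per keyword between 2005-01 and 2025-06, i.e. monthly observations — Google Trends appears to return monthly data for a window this long)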

In [None]:
# Downloading weekly Google Trend indices for the 5 themes
from pytrends.request import TrendReq
import pandas as pd
import numpy as np
import time

# Initialising pytrends session
# hl and tz are passed straight to TrendReq; presumably hl is the interface
# language and tz the timezone offset (0 = UTC) -- NOTE(review): confirm
# against the pytrends documentation.
# NOTE: this variable has the same name as the `pytrends` package imported above.
pytrends = TrendReq(hl="en-GB", tz=0)

# Splitting a list into batches of size 5
def batch_keywords(keywords, batch_size=5):
    """Yield consecutive slices of ``keywords`` of at most ``batch_size`` items.

    Every batch is full except possibly the last one; an empty input
    yields nothing.
    """
    yield from (
        keywords[start:start + batch_size]
        for start in range(0, len(keywords), batch_size)
    )

# Mapping each keyword to its theme
theme_map = dict(kw[["keyword", "theme"]].values)

weekly_frames = []
failed_keywords = []

print(f"Requesting weekly Google Trends data for {len(kw)} keywords...")

# NOTE(review): Google Trends is understood to scale "hits" relative to the
# terms inside a single request, so values from different 5-keyword batches
# may not share a common scale -- confirm before comparing across batches.
# NOTE(review): despite the "weekly" naming used here, the recorded run of
# this cell produced 24600 rows for 100 keywords (246 per keyword), which
# matches monthly rather than weekly observations -- confirm the frequency
# actually returned for this TIMEFRAME.
for kw_batch in batch_keywords(list(kw["keyword"]), batch_size=5):
    try:
        pytrends.build_payload(
            kw_batch,
            geo=GEO,
            timeframe=TIMEFRAME
        )

        trends_wide = pytrends.interest_over_time()

        # If Google returns no data, mark batch as failed and say so
        if trends_wide.empty:
            failed_keywords.extend(kw_batch)
            print(f"Batch returned no data: {kw_batch}")
            time.sleep(2)
            continue

        # Drop metadata column if present
        trends_wide = trends_wide.drop(columns=["isPartial"], errors="ignore")

        # Convert to long format: one row per (date, keyword)
        trends_long = (
            trends_wide
            .reset_index()
            .melt(id_vars="date", var_name="keyword", value_name="hits")
        )

        # Attaching theme information
        trends_long["theme"] = trends_long["keyword"].map(theme_map)

        weekly_frames.append(trends_long)

    except Exception as e:
        # Deliberately broad: one failing batch must not abort the whole
        # download. The cause is reported so it is not silently lost.
        failed_keywords.extend(kw_batch)
        print(f"Batch failed: {kw_batch} ({type(e).__name__}: {e})")

    # Delay to avoid rate limiting
    time.sleep(1.5)

# Combining all successful batches (empty, correctly-shaped frame if none)
trends_long_weekly = (
    pd.concat(weekly_frames, ignore_index=True)
    if weekly_frames
    else pd.DataFrame(columns=["date", "keyword", "hits", "theme"])
)

# Final type cleaning
trends_long_weekly["date"] = pd.to_datetime(trends_long_weekly["date"])
trends_long_weekly["hits"] = pd.to_numeric(trends_long_weekly["hits"], errors="coerce")

print(f"Downloaded rows: {len(trends_long_weekly)}")
print(f"Failed keywords: {len(failed_keywords)}")

# Saving output
out_path = "trends_long_weekly.csv"
trends_long_weekly.to_csv(out_path, index=False)
print(f"Saved weekly Google Trends data → {out_path}")


Requesting weekly Google Trends data for 100 keywords...
Downloaded rows: 24600
Failed keywords: 0
Saved weekly Google Trends data → trends_long_weekly.csv


Conversion of weekly frequency to monthly

In [None]:
# Aggregating the downloaded Google Trends series to monthly frequency.
# Work on a typed copy so the downloaded frame itself is left untouched.
df = trends_long_weekly.assign(
    date=pd.to_datetime(trends_long_weekly["date"]),
    hits=pd.to_numeric(trends_long_weekly["hits"], errors="coerce"),
)

first_day = df["date"].min().date()
last_day = df["date"].max().date()
print("Weekly data span:", first_day, "→", last_day)
print("Total weekly rows:", len(df))

# Mean of hits per keyword/theme within each calendar month, labelled with
# the month-start date ("MS")
per_keyword_months = (
    df.set_index("date")
      .groupby(["keyword", "theme"])
      .resample("MS")
)
trends_long_monthly = per_keyword_months["hits"].mean().reset_index()

print("Monthly rows:", len(trends_long_monthly))

# Wide layout: one row per month, one column per keyword
trends_wide_monthly = pd.pivot_table(
    trends_long_monthly,
    index="date",
    columns="keyword",
    values="hits",
)

# The wide table keeps its index on disk because the index holds the dates
trends_long_monthly.to_csv("trends_long_monthly.csv", index=False)
trends_wide_monthly.to_csv("trends_wide_monthly.csv")
print("Saved: trends_long_monthly.csv and trends_wide_monthly.csv")

Weekly data span: 2005-01-01 → 2025-06-01
Total weekly rows: 24600
Monthly rows: 24600
Saved: trends_long_monthly.csv and trends_wide_monthly.csv


Filtering the sparse keywords

In [None]:
hits_by_keyword = trends_long_monthly.groupby("keyword")["hits"]

# Criterion 1: at least 24 monthly observations that are not missing
non_na = hits_by_keyword.count()
keep_min24 = non_na[non_na >= 24].index

# Criterion 2: fewer than 80% of months are zero (missing counts as zero)
zero_share = hits_by_keyword.apply(lambda s: s.fillna(0).eq(0).mean())
keep_nonflat = zero_share[zero_share < 0.80].index

# A keyword must satisfy both criteria to be retained
keep_keywords = set(keep_min24) & set(keep_nonflat)
is_kept = trends_long_monthly["keyword"].isin(keep_keywords)
tlm_f = trends_long_monthly.loc[is_kept].copy()

print("Kept keywords:", tlm_f["keyword"].nunique(), "of", trends_long_monthly["keyword"].nunique())
tlm_f.to_csv("trends_long_monthly_filtered.csv", index=False)


Kept keywords: 74 of 100


Generating theme indices

In [None]:
# Theme index = plain average of the retained keywords' hits in each month
theme_monthly = tlm_f.groupby(["theme", "date"], as_index=False)["hits"].mean()

# Output columns carry a "trend_" prefix in front of the theme label
theme_labels = (
    "buying_demand",
    "mortgage_financing",
    "market_awareness",
    "renting_affordability",
    "economic_policy",
)
prefixed_names = {label: f"trend_{label}" for label in theme_labels}

# One row per month, one column per theme
theme_wide = (
    theme_monthly
    .pivot(index="date", columns="theme", values="hits")
    .reset_index()
    .rename(columns=prefixed_names)
    .sort_values("date")
)

theme_wide.to_csv("theme_trends_monthly.csv", index=False)
print("Saved theme_trends_monthly.csv")


Saved theme_trends_monthly.csv


Standardisation of the indices

In [None]:
# Reload the theme indices from disk so this cell can run on its own
theme_wide = pd.read_csv("theme_trends_monthly.csv", parse_dates=["date"])

# Z-score every theme column: subtract its mean and divide by its
# population standard deviation (ddof=0); the date column is left as is
theme_std = theme_wide.copy()
index_columns = [name for name in theme_std.columns if name != "date"]
for col in index_columns:
    series = theme_std[col]
    centre = series.mean()
    spread = series.std(ddof=0)
    theme_std[col] = (series - centre) / spread

theme_std.to_csv("theme_trends_monthly_standardised_modified.csv", index=False)
print("Saved: theme_trends_monthly_standardised_modified.csv")


Saved: theme_trends_monthly_standardised_modified.csv
