This notebook merges the five keyword CSV files downloaded from Google Keyword Planner.

In [None]:
from google.colab import files
# Upload the input files through Colab's upload helper; the call's return
# value is kept in `uploaded` (not referenced again in the cells shown here).
uploaded = files.upload()


Saving buying_demand_keywords.csv to buying_demand_keywords.csv
Saving economic_policy_keywords.csv to economic_policy_keywords.csv
Saving market_awareness_keywords.csv to market_awareness_keywords.csv
Saving mortgage_financing_keywords.csv to mortgage_financing_keywords.csv
Saving renting and affordability_keywords.csv to renting and affordability_keywords.csv
Saving trends_long_weekly.csv to trends_long_weekly.csv


In [None]:
!pip install pytrends

Collecting pytrends
  Downloading pytrends-4.9.2-py3-none-any.whl.metadata (13 kB)
Downloading pytrends-4.9.2-py3-none-any.whl (15 kB)
Installing collected packages: pytrends
Successfully installed pytrends-4.9.2


Merging and Cleaning

In [None]:
import pandas as pd, glob, os

# -------- settings --------
TOP_N_PER_THEME = 300   # set None to skip the top-N file

# -------- locate files --------
# glob returns matches in a file-system dependent order; sort them so the
# files are always processed (and concatenated) in the same order.
csv_paths = sorted(glob.glob("*keywords.csv"))
print("Files found:", csv_paths, "\n")
assert csv_paths, "No *keywords.csv files found in the working folder."

# -------- helpers --------
def infer_theme(filename: str) -> str:
    """Derive the theme label from a keyword file's name.

    The lower-cased base name (extension removed) is checked against a list
    of marker substrings. Rules are tried in order and the first one that
    matches decides the theme; "unknown" is returned when none match.
    """
    stem = os.path.splitext(os.path.basename(filename.lower()))[0]
    rules = (
        (("buy", "demand"), "buying_demand"),
        (("mortgage", "financing"), "mortgage_financing"),
        (("market", "awareness"), "market_awareness"),
        (("rent", "afford"), "renting_affordability"),
        (("policy", "economic"), "economic_policy"),
    )
    for markers, theme in rules:
        if any(marker in stem for marker in markers):
            return theme
    return "unknown"

def read_kp_file(path: str) -> pd.DataFrame:
    """Load one Keyword Planner export and reduce it to keyword/volume/theme.

    Parameters
    ----------
    path : str
        Path of the CSV to read. It is read as UTF-16, tab-separated text.

    Returns
    -------
    pd.DataFrame
        Columns ``keyword`` (lower-cased, stripped), ``avg_monthly_searches``
        (numeric, NaN where the value cannot be parsed) and ``theme`` (from
        ``infer_theme(path)``). Rows without a keyword are removed and only
        the first occurrence of each keyword is kept.

    Raises
    ------
    AssertionError
        If the ``Keyword`` or ``Avg. monthly searches`` column is missing.
    """
    # Keyword Planner CSVs are typically UTF-16 + tab
    df = pd.read_csv(path, encoding="utf-16", sep="\t")
    # standardise headers
    df.columns = [c.strip() for c in df.columns]
    # keep only the two columns we need
    assert "Keyword" in df.columns, f"'Keyword' not found in {path}"
    assert "Avg. monthly searches" in df.columns, f"'Avg. monthly searches' not found in {path}"
    # Drop rows with a missing keyword BEFORE casting to str: astype(str) turns
    # a missing value into the literal text "nan", which would otherwise pass
    # the empty-string filter below and end up as a bogus keyword.
    df = df[df["Keyword"].notna()]
    out = pd.DataFrame({
        "keyword": df["Keyword"].astype(str).str.lower().str.strip(),
        "avg_monthly_searches": pd.to_numeric(
            # volumes may carry thousands separators, e.g. "5,000"
            df["Avg. monthly searches"].astype(str).str.replace(",", "", regex=False),
            errors="coerce"
        )
    })
    out["theme"] = infer_theme(path)
    # tidy
    out = out[out["keyword"].str.len() > 0].drop_duplicates(subset="keyword").reset_index(drop=True)
    return out

# -------- process & merge --------
# Read every keyword file, reporting row counts and how many volumes parsed.
frames = []
for p in csv_paths:
    cur = read_kp_file(p)
    frames.append(cur)
    print(f"✓ {os.path.basename(p)} — {len(cur)} rows, volumes non-null: {cur['avg_monthly_searches'].notna().sum()}")

master = pd.concat(frames, ignore_index=True)
# if the same keyword appears in multiple themes, keep the highest volume row.
# Equal volumes are broken by theme name so the surviving row does not depend
# on file order or on how the sort happens to order ties.
master = (master.sort_values(["avg_monthly_searches", "theme"], ascending=[False, True])
                .drop_duplicates(subset="keyword", keep="first")
                .sort_values(["theme","keyword"])
                .reset_index(drop=True))

master.to_csv("master_keywords_with_volume.csv", index=False)
print("\nSaved: master_keywords_with_volume.csv")
print("Per-theme counts:\n", master.groupby("theme").size())
print("\nVolume summary (non-null):\n", master.loc[master["avg_monthly_searches"].notna(),"avg_monthly_searches"].describe())

# -------- optional: top-N per theme --------
if TOP_N_PER_THEME is not None:
    # rows without a volume cannot be ranked, so they are excluded first
    topN = (master.dropna(subset=["avg_monthly_searches"])
                   .sort_values(["theme","avg_monthly_searches"], ascending=[True, False])
                   .groupby("theme", as_index=False)
                   .head(TOP_N_PER_THEME)
                   .reset_index(drop=True))
    topN.to_csv("master_keywords_top.csv", index=False)
    print(f"\nSaved top {TOP_N_PER_THEME} per theme → master_keywords_top.csv")


Files found: ['renting and affordability_keywords.csv', 'economic_policy_keywords.csv', 'buying_demand_keywords.csv', 'market_awareness_keywords.csv', 'mortgage_financing_keywords.csv'] 

✓ renting and affordability_keywords.csv — 1367 rows, volumes non-null: 1356
✓ economic_policy_keywords.csv — 3192 rows, volumes non-null: 3192
✓ buying_demand_keywords.csv — 2055 rows, volumes non-null: 2054
✓ market_awareness_keywords.csv — 2709 rows, volumes non-null: 2665
✓ mortgage_financing_keywords.csv — 4540 rows, volumes non-null: 4530

Saved: master_keywords_with_volume.csv
Per-theme counts:
 theme
buying_demand            2031
economic_policy          3055
market_awareness         2707
mortgage_financing       4453
renting_affordability    1365
dtype: int64

Volume summary (non-null):
 count     13545.000000
mean       2185.939461
std       21578.221515
min           0.000000
25%          50.000000
50%          50.000000
75%         500.000000
max      500000.000000
Name: avg_monthly_search

Selecting only top 5 keywords from each theme

In [None]:
import pandas as pd, glob, os

# ---- 1) Find your files ----
# glob returns matches in a file-system dependent order; sort them so the
# files are always processed in the same order.
csv_paths = sorted(glob.glob("*keywords.csv"))
assert csv_paths, "No *keywords.csv files found."
print("Found:", csv_paths, "\n")

# ---- 2) Helpers ----
def infer_theme(filename: str) -> str:
    """Return the theme label implied by a keyword file's name.

    Works on the lower-cased base name without its extension. Marker
    substrings are tested in a fixed order and the first match wins;
    "unknown" is returned if nothing matches.
    """
    base = os.path.basename(filename.lower())
    name, _ext = os.path.splitext(base)

    def has(*needles: str) -> bool:
        # True when any of the given substrings occurs in the file name
        return any(needle in name for needle in needles)

    if has("buy", "demand"):
        theme = "buying_demand"
    elif has("mortgage", "financing"):
        theme = "mortgage_financing"
    elif has("market", "awareness"):
        theme = "market_awareness"
    elif has("rent", "afford"):
        theme = "renting_affordability"
    elif has("policy", "economic"):
        theme = "economic_policy"
    else:
        theme = "unknown"
    return theme

def read_kp(path: str) -> pd.DataFrame:
    """Load one Keyword Planner export as keyword/volume/theme rows.

    Parameters
    ----------
    path : str
        Path of the CSV to read. It is read as UTF-16, tab-separated text.

    Returns
    -------
    pd.DataFrame
        Columns ``keyword`` (lower-cased, stripped), ``avg_monthly_searches``
        (numeric, NaN where the value cannot be parsed) and ``theme`` (from
        ``infer_theme(path)``). Rows without a keyword are removed; duplicate
        keywords are NOT removed here.

    Raises
    ------
    AssertionError
        If the ``Keyword`` or ``Avg. monthly searches`` column is missing.
    """
    # Keyword Planner CSVs are UTF-16 + TAB
    df = pd.read_csv(path, encoding="utf-16", sep="\t")
    df.columns = [c.strip() for c in df.columns]
    assert "Keyword" in df.columns and "Avg. monthly searches" in df.columns, \
        f"Expected columns missing in {path}"
    # Drop rows with a missing keyword BEFORE casting to str: astype(str) turns
    # a missing value into the literal text "nan", which would otherwise pass
    # the empty-string filter below and could be ranked as a real keyword.
    df = df[df["Keyword"].notna()]
    out = pd.DataFrame({
        "keyword": df["Keyword"].astype(str).str.lower().str.strip(),
        "avg_monthly_searches": pd.to_numeric(
            # volumes may carry thousands separators, e.g. "5,000"
            df["Avg. monthly searches"].astype(str).str.replace(",", "", regex=False),
            errors="coerce"
        )
    })
    out["theme"] = infer_theme(path)
    out = out[out["keyword"].str.len() > 0]
    return out

# ---- 3) Per-file: select Top 5 by volume ----
top_per_file = []
for p in csv_paths:
    cur = read_kp(p)
    # Drop NaNs, sort, then take top 5
    cur = cur.dropna(subset=["avg_monthly_searches"])
    # read_kp does not de-duplicate, so collapse repeated keywords (keeping the
    # highest-volume row) before taking the head; otherwise one keyword could
    # occupy several of the five slots. Ties on volume are ordered by keyword
    # so the selection is reproducible.
    top5 = (cur.sort_values(["avg_monthly_searches", "keyword"], ascending=[False, True])
                .drop_duplicates(subset="keyword", keep="first")
                .head(5)
                .reset_index(drop=True))
    print(f"✓ {os.path.basename(p)} → picked {len(top5)} rows")
    # Save optional per-file top 5
    top5.to_csv(f"{infer_theme(p)}_top5.csv", index=False)
    top_per_file.append(top5)

# ---- 4) Merge the five Top-5 into one file ----
merged_top5 = pd.concat(top_per_file, ignore_index=True)

# If the same keyword appears in more than one theme, keep the highest-volume instance.
# Equal volumes are broken by theme name so the surviving row is deterministic.
merged_top5 = (merged_top5.sort_values(["avg_monthly_searches", "theme"], ascending=[False, True])
                           .drop_duplicates(subset="keyword", keep="first")
                           .sort_values(["theme","avg_monthly_searches"], ascending=[True, False])
                           .reset_index(drop=True))

merged_top5.to_csv("master_keywords_top5_per_file_merged.csv", index=False)

print("\nCounts per theme:")
print(merged_top5["theme"].value_counts())
print("\nSaved:")
print(" - master_keywords_top5_per_file_merged.csv")
print(" - <theme>_top5.csv for each file")
print("\nPreview:")
print(merged_top5.head(25))


AssertionError: No *keywords.csv files found.

Selecting top 20 keywords from each theme

In [None]:
import pandas as pd

# Load the full master file (already has avg_monthly_searches + theme)
kw = pd.read_csv("master_keywords_with_volume.csv")

# Keep only non-null search volumes
kw = kw.dropna(subset=["avg_monthly_searches"])

# Pick top 20 keywords per theme (a theme with fewer than 20 ranked keywords
# contributes fewer rows)
top20 = (kw.sort_values(["theme","avg_monthly_searches"], ascending=[True, False])
            .groupby("theme", as_index=False)
            .head(20)
            .reset_index(drop=True))

print(top20["theme"].value_counts())
top20.to_csv("master_keywords_top20.csv", index=False)
# Report the real row count instead of assuming 5 themes x 20 keywords.
print(f"Saved: master_keywords_top20.csv ({len(top20)} rows)")


theme
buying_demand            20
economic_policy          20
market_awareness         20
mortgage_financing       20
renting_affordability    20
Name: count, dtype: int64
✅ Saved: master_keywords_top20.csv (100 rows)


Extracting the selected keyword search intensity from Google Trends

In [None]:
!pip -q install pytrends
import pandas as pd, numpy as np, time
from pytrends.request import TrendReq

# Inputs for the Google Trends download
KEYWORD_FILE = "master_keywords_top20.csv"
GEO = "GB"
TIMEFRAME = "2005-01-01 2025-06-30"  # Jan 2005 → June 2025

# Load the selected keywords, normalise their text and keep one row per keyword
kw = (
    pd.read_csv(KEYWORD_FILE)
      .assign(keyword=lambda frame: frame["keyword"].astype(str).str.strip().str.lower())
      .drop_duplicates(subset="keyword")
      .reset_index(drop=True)
)
print("Total keywords:", len(kw))


Total keywords: 100


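Downloading the Google Trends data in batches of 5 keywords. Weekly frequency was intended, but the row counts recorded below (246 per keyword for January 2005 – June 2025) indicate that this timeframe yields one observation per month.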

In [None]:
# Client object used for every Google Trends request in this cell
# (constructed with hl='en-GB' and tz=0).
pytrends = TrendReq(hl='en-GB', tz=0)

def batches(lst, n=5):
    """Yield consecutive slices of ``lst`` containing at most ``n`` items each.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``;
    an empty ``lst`` yields nothing.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

# keyword -> theme lookup used to label each downloaded series
theme_map = dict(kw[["keyword","theme"]].values)

# weekly_frames collects one long-format frame per successful batch;
# failed collects the keywords of every batch that returned nothing or raised.
weekly_frames, failed = [], []

# NOTE(review): Google Trends scales values relative to the other terms in the
# same request, so hits from different batches may not share a common scale —
# confirm before averaging across keywords.
# NOTE(review): the recorded run returned 246 rows per keyword for this
# timeframe, which looks like monthly rather than weekly points — confirm.
for group in batches(list(kw["keyword"]), 5):
    try:
        pytrends.build_payload(group, geo=GEO, timeframe=TIMEFRAME)
        df = pytrends.interest_over_time()
        if df.empty:
            print("No data returned for batch:", group)
            failed.extend(group); time.sleep(2); continue
        # isPartial is a flag column, not a keyword series
        df = df.drop(columns="isPartial", errors="ignore")
        df = df.reset_index()
        long = df.melt(id_vars="date", var_name="keyword", value_name="hits")
        long["theme"] = long["keyword"].map(theme_map)
        weekly_frames.append(long)
    except Exception as e:
        # Best-effort download: keep going, but report why the batch failed so
        # throttling can be told apart from genuine errors.
        print(f"Request failed for batch {group}: {type(e).__name__}: {e}")
        failed.extend(group)
    time.sleep(1.5)  # polite delay to avoid throttling

trends_long_weekly = (pd.concat(weekly_frames, ignore_index=True)
                      if weekly_frames else
                      pd.DataFrame(columns=["date","keyword","hits","theme"]))
# types
trends_long_weekly["date"] = pd.to_datetime(trends_long_weekly["date"])
trends_long_weekly["hits"] = pd.to_numeric(trends_long_weekly["hits"], errors="coerce")

print("Downloaded weekly rows:", len(trends_long_weekly))
print("Failed keywords:", len(failed))
trends_long_weekly.to_csv("trends_long_weekly.csv", index=False)


Downloaded weekly rows: 18450
Failed keywords: 25


In [None]:
# Show the keywords whose batch could not be downloaded in the first pass
print(failed)

['mortgage rates', '0 interest credit cards', 'annual percentage rate', 'apr rate', 'bank england base rate', 'bank of england interest rate', 'bank of england interest rate uk', 'bank of england lending rate', 'bank of england official bank rate', 'barclays mortgage rates', 'rightmove sold prices', 'zoopla house prices', 'comparative market', 'house prices', 'house prices crash', 'right move sold', 'right move sold prices', 'rightmove house prices', 'rightmove sold', 'rightmove sold house prices', 'home loan mortgage calculator', 'home loan mortgage rates', 'house loan repayment calculator', 'mortgage calculator', 'mortgage estimator']


In [None]:
# Keywords that failed in the first download pass (copied from the output above)
failed_keys = ['mortgage rates', '0 interest credit cards', 'annual percentage rate', 'apr rate', 'bank england base rate', 'bank of england interest rate', 'bank of england interest rate uk', 'bank of england lending rate', 'bank of england official bank rate', 'barclays mortgage rates', 'rightmove sold prices', 'zoopla house prices', 'comparative market', 'house prices', 'house prices crash', 'right move sold', 'right move sold prices', 'rightmove house prices', 'rightmove sold', 'rightmove sold house prices', 'home loan mortgage calculator', 'home loan mortgage rates', 'house loan repayment calculator', 'mortgage calculator', 'mortgage estimator']


# Rows of kw whose keyword needs to be retried. One vectorised filter replaces
# the row-by-row loop (which recomputed this same filter on every match) and
# returns an empty frame that still has kw's columns when nothing matches.
new_kw = kw[kw["keyword"].isin(failed_keys)].copy()
print(new_kw)

                               keyword  avg_monthly_searches  \
25                      mortgage rates              500000.0   
26             0 interest credit cards               50000.0   
27              annual percentage rate               50000.0   
28                            apr rate               50000.0   
29              bank england base rate               50000.0   
35       bank of england interest rate               50000.0   
36    bank of england interest rate uk               50000.0   
37        bank of england lending rate               50000.0   
38  bank of england official bank rate               50000.0   
39             barclays mortgage rates               50000.0   
40               rightmove sold prices              500000.0   
41                 zoopla house prices              500000.0   
42                  comparative market               50000.0   
43                        house prices               50000.0   
44                  house prices crash  

In [None]:
# Fresh client object for the retry requests in this cell
# (constructed with hl='en-GB' and tz=0).
pytrends = TrendReq(hl='en-GB', tz=0)

def batches(lst, n=5):
    """Split ``lst`` into successive chunks of up to ``n`` items, lazily.

    Chunks are produced in order; the last one may be shorter, and an empty
    ``lst`` produces no chunks.
    """
    yield from map(lambda offset: lst[offset:offset + n], range(0, len(lst), n))

# keyword -> theme lookup for the keywords being retried
theme_map = dict(new_kw[["keyword","theme"]].values)

# weekly_frames collects one long-format frame per successful batch;
# failed collects the keywords of every batch that returned nothing or raised.
weekly_frames, failed = [], []

for group in batches(list(new_kw["keyword"]), 5):
    try:
        pytrends.build_payload(group, geo=GEO, timeframe=TIMEFRAME)
        df = pytrends.interest_over_time()
        if df.empty:
            print("No data returned for batch:", group)
            failed.extend(group); time.sleep(2); continue
        # isPartial is a flag column, not a keyword series
        df = df.drop(columns="isPartial", errors="ignore")
        df = df.reset_index()
        long = df.melt(id_vars="date", var_name="keyword", value_name="hits")
        long["theme"] = long["keyword"].map(theme_map)
        weekly_frames.append(long)
    except Exception as e:
        # Best-effort download: keep going, but report why the batch failed.
        print(f"Request failed for batch {group}: {type(e).__name__}: {e}")
        failed.extend(group)
    time.sleep(1.5)  # polite delay to avoid throttling

trends_long_weekly_2 = (pd.concat(weekly_frames, ignore_index=True)
                      if weekly_frames else
                      pd.DataFrame(columns=["date","keyword","hits","theme"]))
# types — convert the retry frame's OWN columns. Reading them from
# trends_long_weekly (the first-pass frame) would overwrite the retried
# keywords' dates and hits with index-aligned values from the first pass.
trends_long_weekly_2["date"] = pd.to_datetime(trends_long_weekly_2["date"])
trends_long_weekly_2["hits"] = pd.to_numeric(trends_long_weekly_2["hits"], errors="coerce")

print("Downloaded weekly rows:", len(trends_long_weekly_2))
print("Failed keywords:", len(failed))
trends_long_weekly_2.to_csv("trends_long_weekly_2.csv", index=False)


Downloaded weekly rows: 6150
Failed keywords: 0


In [None]:
# Keywords still missing after the retry pass (an empty list means all were downloaded)
print(failed)

[]


In [None]:
# Stack the first-pass and retry downloads into a single long table, then save it
combined_df = pd.concat(
    objs=[trends_long_weekly, trends_long_weekly_2],
    ignore_index=True,
)
combined_df.to_csv(path_or_buf="trends_long_weekly_combined.csv", index=False)

Conversion of weekly frequency to monthly

In [None]:
# Make sure the stacked table has a datetime `date` and numeric `hits`
combined_df = combined_df.assign(
    date=pd.to_datetime(combined_df["date"]),
    hits=pd.to_numeric(combined_df["hits"], errors="coerce"),
)

# Average hits per calendar month (month-start labels) for each keyword/theme pair
trends_long_monthly = (
    combined_df.set_index("date")
               .groupby(["keyword", "theme"])
               .resample("MS")["hits"]
               .mean()
               .reset_index()
)

# Wide layout: one row per month, one column per keyword
trends_wide_monthly = trends_long_monthly.pivot_table(
    index="date",
    columns="keyword",
    values="hits",
)

trends_long_monthly.to_csv("trends_long_monthly.csv", index=False)
trends_wide_monthly.to_csv("trends_wide_monthly.csv")
print("Saved: trends_long_monthly.csv and trends_wide_monthly.csv")


Saved: trends_long_monthly.csv and trends_wide_monthly.csv


Filtering the sparse keywords

In [None]:
# Non-NA count per keyword
non_na = trends_long_monthly.groupby("keyword")["hits"].apply(lambda s: s.notna().sum())
keep_min24 = non_na[non_na >= 24].index

# Zero share
zero_share = (trends_long_monthly
              .groupby("keyword")["hits"]
              .apply(lambda s: (s.fillna(0)==0).mean()))

keep_nonflat = zero_share[zero_share < 0.80].index  # keep if <80% zeros

keep_keywords = set(keep_min24).intersection(set(keep_nonflat))
tlm_f = trends_long_monthly[trends_long_monthly["keyword"].isin(keep_keywords)].copy()

print("Kept keywords:", tlm_f["keyword"].nunique(), "of", trends_long_monthly["keyword"].nunique())
tlm_f.to_csv("trends_long_monthly_filtered.csv", index=False)


Kept keywords: 77 of 100


Generating theme indices

In [None]:
# Theme index = mean hits across the theme's retained keywords, per month
theme_monthly = tlm_f.groupby(["theme", "date"], as_index=False)["hits"].mean()

# One row per month, one column per theme
theme_wide = theme_monthly.pivot(index="date", columns="theme", values="hits")
theme_wide = theme_wide.reset_index()
theme_wide = theme_wide.rename(
    columns={
        "buying_demand": "trend_buying_demand",
        "mortgage_financing": "trend_mortgage_financing",
        "market_awareness": "trend_market_awareness",
        "renting_affordability": "trend_renting_affordability",
        "economic_policy": "trend_economic_policy",
    }
)
theme_wide = theme_wide.sort_values("date")

theme_wide.to_csv("theme_trends_monthly.csv", index=False)
print("Saved theme_trends_monthly.csv")


Saved theme_trends_monthly.csv


Standardisation of the indices

In [None]:
import pandas as pd

# Reload the monthly theme indices saved by the previous step
theme_wide = pd.read_csv("theme_trends_monthly.csv", parse_dates=["date"])

# Standardise each theme index (z-score, population standard deviation)
theme_std = theme_wide.copy()
for col in theme_std.columns:
    if col == "date":
        continue  # the date column is carried through unchanged
    centred = theme_std[col] - theme_std[col].mean()
    theme_std[col] = centred / theme_std[col].std(ddof=0)

theme_std.to_csv("theme_trends_monthly_standardised.csv", index=False)
print("Saved: theme_trends_monthly_standardised.csv")


✅ Saved: theme_trends_monthly_standardised.csv
