## Appendix A: Google Search Trends Data Pipeline (Code and Description)
_Note: I added extra comments to the code to make it easier to follow._

## A1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
# pytrends: unofficial Google Trends API
# pulls weekly Google search interest data
from pytrends.request import TrendReq

import matplotlib.pyplot as plt
from pathlib import Path
import statsmodels.api as sm

In [None]:
from pathlib import Path

# Folder that receives the cleaned monthly CSV outputs; created if it is missing.
DATA_CLEAN = Path("data_clean")
DATA_CLEAN.mkdir(parents=False, exist_ok=True)

## A2. Add cities and keep names consistent

In [None]:
# Cities covered by the pipeline (these labels are also used in output file names)
CITIES = ["Washington DC", "Seattle", "Boston"]

# City label -> phrase inserted into the Google Trends queries.
# Every label currently doubles as its own search phrase, so the mapping is
# generated from CITIES; swap in an explicit entry if a city ever needs a
# different search phrase.
SEARCH_CITY_NAME = {label: label for label in CITIES}



## A3. Filter search for tourism

In [None]:
# Search phrasings used as tourism signals; "{city}" is filled in per city
# with str.format before each Google Trends query.
TOURISM_TEMPLATES = [
    "things to do in {city}",
    "visit {city}",
    "{city} tourist attractions",
]

# Query window passed to Google Trends, written as "start end"
TIMEFRAME = "2017-01-01 2024-12-31"

## A4. Tourism proxy construction
Purpose: this function constructs a monthly tourism intensity index for each city by averaging Google Trends search interest across multiple tourism-related keywords.

How this works:
* Pull Google Trends interest for several tourism-related searches (templates)
* Convert daily/weekly timestamps into month buckets
* Average across templates to get one monthly index per city

In [None]:
#Tourism data function
def fetch_city_tourism(city):
    """Build a monthly tourism intensity index for ONE city from Google Trends.

    Each template in TOURISM_TEMPLATES is filled in with the city's search
    name and queried on its own over TIMEFRAME. The interest values of each
    keyword are averaged within every calendar month, and the per-keyword
    monthly series are then averaged across keywords into a single index.
    Keywords that return no data are skipped, so a month is averaged only
    over the keywords that actually have a value for it.

    Parameters
    ----------
    city : str
        City label; must be a key of SEARCH_CITY_NAME.

    Returns
    -------
    pandas.DataFrame
        Columns "city", "month" (timestamp at the start of the month) and
        "tourism_intensity", one row per month, sorted by month.

    Raises
    ------
    KeyError
        If ``city`` is not a key of SEARCH_CITY_NAME.
    ValueError
        If Google Trends returned no data for every tourism keyword.
    """
    # tz is the timezone offset in minutes expected by pytrends
    pytrends = TrendReq(hl="en-US", tz=360)
    search_city = SEARCH_CITY_NAME[city]

    # One monthly frame per keyword that returned data (combined below)
    monthly_series = []

    for template in TOURISM_TEMPLATES:
        # Several phrasings are queried so that no single wording drives the index
        keyword = template.format(city=search_city)

        pytrends.build_payload(
            kw_list=[keyword],
            timeframe=TIMEFRAME
        )

        data = pytrends.interest_over_time()

        if data.empty:
            # Nothing for this phrasing; the other templates may still have data
            continue

        # errors="ignore": do not fail if the response lacks the "isPartial" flag column
        data = data.drop(columns=["isPartial"], errors="ignore")
        data = data.reset_index()
        data.columns = ["date", "value"]

        # Convert dates into month buckets so the index matches the monthly
        # 311 aggregation (appendix B)
        data["date"] = pd.to_datetime(data["date"])
        data["month"] = data["date"].dt.to_period("M").dt.to_timestamp()

        monthly = (
            data
            .groupby("month")["value"]
            .mean()
            .reset_index()
        )

        monthly_series.append(monthly)

    if not monthly_series:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # raise the same exception type with a message that names the city.
        raise ValueError(
            f"Google Trends returned no data for any tourism keyword for {city!r}"
        )

    combined = pd.concat(monthly_series)

    # Average the per-keyword monthly values into one index per month
    combined = (
        combined
        .groupby("month")["value"]
        .mean()
        .reset_index(name="tourism_intensity")
    )

    combined["city"] = city

    return combined[["city", "month", "tourism_intensity"]]


## A5. City level tourism data generation and validation

In [None]:
# Build the tourism intensity table for every city, save each one as its own
# CSV, and keep the frames so they can be stacked into a single table.
all_cities = []

for city in CITIES:
    print(f"Fetching tourism data for {city}")
    df = fetch_city_tourism(city)

    # File-friendly name, e.g. "Washington DC" -> washington_dc_tourism_monthly.csv
    out = DATA_CLEAN / f"{city.lower().replace(' ', '_')}_tourism_monthly.csv"
    df.to_csv(out, index=False)

    all_cities.append(df)

# Stack the per-city frames into one table with a fresh 0..n-1 index
tourism_all = pd.concat(all_cities, ignore_index=True)

In [None]:
# Validation: summary statistics of the tourism index for each city.
# tourism_all is the final trends dataset across all cities (one row per city-month).
(
    tourism_all
    .groupby("city")["tourism_intensity"]
    .describe()
)