# Celebrity News Timeline Database Builder

This notebook builds a timeline database mapping celebrities to significant news events across the years 2004-2025. It leverages the Google Custom Search JSON API to find major news articles for each celebrity and organizes them by year.

**Purpose:** The generated database can be used as a basis for analyzing public sentiment.


## 1. Setup: Imports and Constants

In [1]:
import os
import csv
import json
import time
import requests
from datetime import datetime
from typing import Dict, List, Any, Optional, Set
from urllib.parse import quote_plus, urlparse
from google import GoogleSearchAPI, RateLimitExceededError, GoogleSearchError

import os 
from dotenv import load_dotenv
# The .env file is expected one directory above the current working directory.
parent_dir = os.path.dirname(os.getcwd())
load_dotenv(dotenv_path=os.path.join(parent_dir, ".env"))

# Google credentials; each is None when the variable is not set.
api_key = os.getenv("GOOGLE_API_KEY")
search_engine_id = os.getenv("GOOGLE_CSE_ID")

# Constants
# First and last year searched for each celebrity. The timeline loop iterates
# range(START_YEAR, END_YEAR + 1), so both ends are inclusive.
START_YEAR = 2004
END_YEAR = 2025
MAX_ARTICLES_PER_YEAR = 3 # maximum number of articles kept per celebrity per year
CACHE_FILE = "celeb_timeline_cache.json" # JSON cache of per-celebrity, per-year search results
OUTPUT_DB_FILE = "celebrity_timeline_db.json" # output database (JSON)
INPUT_CSV_FILE = "../celeb_data.csv" # input CSV; celebrity names are read from its "Name" column

# Allow-list of news domains consulted by is_from_reputable_source().
REPUTABLE_DOMAINS = {
    "nytimes.com", "washingtonpost.com", "wsj.com", "reuters.com", "apnews.com",
    "bbc.com", "bbc.co.uk", "cnn.com", "nbcnews.com", "abcnews.go.com", "cbsnews.com",
    "theguardian.com", "npr.org", "time.com", "forbes.com", "bloomberg.com",
    "economist.com", "latimes.com", "usatoday.com", "politico.com", "thehill.com",
    "foxnews.com", "cnbc.com", "businessinsider.com", "vox.com", "huffpost.com",
    "variety.com", "hollywoodreporter.com", "ew.com", "rollingstone.com", "billboard.com",
    "espn.com", "sports.yahoo.com", "si.com", "people.com", "eonline.com", "tmz.com"
}

# Lowercase keywords that is_major_event() looks for in an article's
# lower-cased title and snippet.
MAJOR_EVENT_KEYWORDS = [
    "award", "win", "won", "nominated", "nomination", "oscar", "grammy", "emmy", "golden globe",
    "marriage", "divorce", "wedding", "engaged", "engagement", "child", "baby", "born",
    "scandal", "controversy", "lawsuit", "legal", "court", "trial", "arrested", "prison",
    "movie", "film", "album", "song", "release", "premiere", "debut", "launch",
    "milestone", "achievement", "breakthrough", "record", "bestseller", "box office",
    "died", "death", "accident", "health", "illness", "surgery", "hospitalized",
    "political", "campaign", "election", "president", "appointed", "named", "ceo",
    "founded", "company", "business", "startup", "investment", "philanthropy", "charity",
    "comeback", "return", "retirement", "memoir", "autobiography", "book"
]


## 2. Configuration: API Client Initialization

This cell initializes the `GoogleSearchAPI` client with the Google API credentials that the setup cell loaded from environment variables, together with its retry and rate-limit settings.


In [2]:

# Build the search client used by every query in this notebook.
# NOTE(review): api_key and search_engine_id are read from the environment and
# are None when the variables are unset -- confirm GoogleSearchAPI rejects that.
# NOTE(review): retry_delay is presumably in seconds; verify against GoogleSearchAPI.
api_client = GoogleSearchAPI(
    api_key=api_key,
    search_engine_id=search_engine_id,
    max_retries=5,
    retry_delay=3,
    requests_per_day=9999,  # daily request limit passed to the client
    requests_per_second=5   # per-second rate limit passed to the client
)

## 3. Helper Functions

These functions handle tasks like caching, filtering search results based on relevance and source reputation, and saving/loading data.

In [3]:
def load_cache(cache_file):
    """Load cached search results from a JSON file.

    Args:
        cache_file: Path to the JSON cache file.

    Returns:
        The cached mapping, or an empty dict when the file does not exist,
        cannot be parsed (for example because an earlier save was interrupted
        and left it truncated), or does not hold a JSON object.
    """
    if not os.path.exists(cache_file):
        return {}

    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            cache = json.load(f)
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses. An unreadable cache only costs repeated searches, so
        # start fresh instead of aborting the whole build.
        print(f"Ignoring unreadable cache file {cache_file}: {e}")
        return {}

    if not isinstance(cache, dict):
        # Callers use "key in cache" and item assignment, which need a dict.
        print(f"Ignoring cache file {cache_file}: expected a JSON object.")
        return {}

    return cache

def save_cache(cache, cache_file):
    """Save the current cache state to a JSON file.

    The JSON is first written to a temporary sibling file and then moved over
    cache_file with os.replace. An interruption or a serialization error
    part-way through therefore leaves the previous cache file untouched
    instead of truncated.

    Args:
        cache: JSON-serializable cache mapping.
        cache_file: Destination path of the cache file.

    Raises:
        TypeError: If cache contains a value json cannot serialize.
        OSError: If the file cannot be written or replaced.
    """
    tmp_file = f"{cache_file}.tmp"
    try:
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(cache, f, indent=2)
        os.replace(tmp_file, cache_file)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up the partial file left by a failed write.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
        
def get_cache_key(celebrity, year):
    """Build the cache key for one celebrity/year pair.

    The name is lower-cased, every space becomes an underscore, and the year
    is appended after one more underscore.
    """
    name_slug = "_".join(celebrity.lower().split(" "))
    return f"{name_slug}_{year}"

def is_major_event(title, snippet):
    """Report whether an article looks like a major event.

    The title and snippet are joined with a single space and lower-cased; the
    result is True as soon as any entry of MAJOR_EVENT_KEYWORDS occurs in that
    text, and False when none does.
    """
    searchable_text = " ".join((title, snippet)).lower()
    for keyword in MAJOR_EVENT_KEYWORDS:
        if keyword in searchable_text:
            return True
    return False

def is_from_reputable_source(url, reputable_domains=None):
    """Check whether a URL belongs to a reputable news domain.

    The URL's host name is compared case-insensitively and without any port
    or credentials. A host matches when it equals a listed domain or is a
    subdomain of one, so "www.cnn.com", "edition.cnn.com" and
    "news.bbc.co.uk" are all accepted when "cnn.com" and "bbc.co.uk" are
    listed. A listed subdomain such as "sports.yahoo.com" does not make its
    siblings (for example "news.yahoo.com") reputable.

    Args:
        url: Article URL, including its scheme.
        reputable_domains: Optional collection of accepted domains; defaults
            to the module-level REPUTABLE_DOMAINS.

    Returns:
        True if the host matches an accepted domain; False otherwise,
        including when the URL has no host or cannot be parsed.
    """
    if reputable_domains is None:
        reputable_domains = REPUTABLE_DOMAINS

    try:
        # .hostname is already lower-cased and stripped of port and userinfo.
        host = urlparse(url).hostname
    except ValueError:
        # urlparse rejects some malformed netlocs (e.g. unbalanced brackets).
        return False
    if not host:
        return False

    # Try the full host first, then each parent domain:
    # "a.b.example.com" -> "b.example.com" -> "example.com" -> "com".
    labels = host.rstrip('.').split('.')
    return any(
        '.'.join(labels[start:]) in reputable_domains
        for start in range(len(labels))
    )

def save_database(db, output_file):
    """Save the timeline database to a JSON file.

    The JSON is first written to a temporary sibling file and then moved over
    output_file with os.replace. A failure part-way through a save therefore
    leaves the previously saved database intact instead of truncated.

    Args:
        db: JSON-serializable timeline database.
        output_file: Destination path of the database file.

    Raises:
        TypeError: If db contains a value json cannot serialize.
        OSError: If the file cannot be written or replaced.
    """
    tmp_file = f"{output_file}.tmp"
    try:
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(db, f, indent=2)
        os.replace(tmp_file, output_file)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up the partial file left by a failed write.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

def load_database(input_file):
    """Read a timeline database from a JSON file.

    Returns the parsed JSON content, or None when input_file does not exist.
    """
    if os.path.exists(input_file):
        with open(input_file, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    return None


## 4. Core Logic: Searching and Building Timelines

These functions perform the actual search using the Google API client, filter results, and assemble the timeline data structure.

In [4]:
def search_celebrity_year(
    api_client,
    celebrity,
    year,
    cache,
    use_cache,
    cache_file):
    """
    Searches for major news events for a celebrity in a specific year.

    Up to four query variants are sent through api_client.search_by_year. A
    result is kept only if it passes both is_major_event and
    is_from_reputable_source. Results are de-duplicated by URL, and
    searching stops once MAX_ARTICLES_PER_YEAR articles have been collected.

    Args:
        api_client: Client providing search_by_year(query, year) and
            extract_search_results(response).
        celebrity: Celebrity name; it is placed in double quotes in each query.
        year: Year to search.
        cache: Dict of stored results keyed by get_cache_key(celebrity, year).
            Updated in place when a new result is stored.
        use_cache: When true, the cache is consulted first and written after.
        cache_file: Path the cache is saved to after a new entry is stored.

    Returns:
        A list (possibly empty) of dicts with "title", "link" and "snippet".
        Search errors are reported with print and do not propagate.
    """
    cache_key = get_cache_key(celebrity, year)

    if use_cache and cache_key in cache:
        return cache[cache_key]

    queries = [
        f'"{celebrity}" {year} news',
        f'"{celebrity}" {year} major event',
        f'"{celebrity}" {year} award',
        f'"{celebrity}" {year} controversy',
    ]

    found_articles = {}  # {url: {details}}; keyed by URL to drop duplicates
    search_failed = False  # set when any query raised, i.e. results may be partial

    for query_index, query in enumerate(queries):
        if len(found_articles) >= MAX_ARTICLES_PER_YEAR:
            break

        if query_index > 0:
            time.sleep(1.2)  # small delay between consecutive API requests

        try:
            response = api_client.search_by_year(query, year)
            results = api_client.extract_search_results(response)  # list of dicts

            for result in results or []:
                url = result.get('link', '')
                title = result.get('title', '')
                snippet = result.get('snippet', '')

                if not url or not title or url in found_articles:
                    continue

                if is_major_event(title, snippet) and is_from_reputable_source(url):
                    found_articles[url] = {
                        "title": title,
                        "link": url,
                        "snippet": snippet
                    }
                    if len(found_articles) >= MAX_ARTICLES_PER_YEAR:
                        break

        except RateLimitExceededError as e:
            search_failed = True
            # Only the remaining queries for this year are abandoned; the
            # caller decides whether to go on with other years.
            print(f"RLE for {celebrity} ({year}): {e}. Stopping searches for this year.")
            break
        except GoogleSearchError as e:
            search_failed = True
            print(f"GSE for {celebrity} ({year}): {e}. Trying next query.")
            continue
        except Exception as e:
            # Best effort: one malformed response must not abort the build.
            search_failed = True
            print(f"Unexpected error for {celebrity} ({year}): {e}. Trying next query.")
            continue

    major_events_list = list(found_articles.values())

    # Store the result only when it is known to be complete. Caching a partial
    # or empty list produced by a failed query would make every later run
    # return it from the cache and never retry this year.
    is_complete = (
        not search_failed
        or len(found_articles) >= MAX_ARTICLES_PER_YEAR
    )
    if use_cache and is_complete:
        cache[cache_key] = major_events_list
        save_cache(cache, cache_file)  # save cache after each year

    return major_events_list


def build_timeline_for_celebrity(
    api_client,
    celebrity,
    cache,
    use_cache,
    cache_file
):
    """
    Builds a year-by-year timeline of major news events for a single celebrity.

    Calls search_celebrity_year for every year from START_YEAR to END_YEAR
    inclusive and keeps the years that returned at least one event.

    Args:
        api_client: Search client passed through to search_celebrity_year.
        celebrity: Celebrity name.
        cache: Cache dict passed through to search_celebrity_year.
        use_cache: Whether cached results may be used and stored.
        cache_file: Path the cache is saved to.

    Returns:
        Dict mapping the year as a string to that year's list of events.
        Years without events are omitted.
    """
    celebrity_timeline = {}

    for year in range(START_YEAR, END_YEAR + 1):
        # Must be decided before the search, which may add this key to cache.
        served_from_cache = use_cache and get_cache_key(celebrity, year) in cache

        yearly_events = search_celebrity_year(
            api_client=api_client,
            celebrity=celebrity,
            year=year,
            cache=cache,
            use_cache=use_cache,
            cache_file=cache_file
        )

        if yearly_events:
            celebrity_timeline[str(year)] = yearly_events  # string keys for JSON compatibility

        # Throttle only when the API was actually queried; a cache hit makes
        # no request, so pausing would just slow down resumed runs.
        if not served_from_cache:
            time.sleep(0.5)

    return celebrity_timeline


## 5. Execution: Build the Complete Database

This cell orchestrates the process:
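1. Loads the existing cache and the existing output database (if any).
2. Reads celebrity names from the `Name` column of the input CSV file, skipping names that are already in the database.
3. Iterates through each remaining celebrity, building their timeline using the functions above.
4. Saves the cache after each celebrity-year search, and saves the database every `save_interval` celebrities and once more at the end.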

In [5]:
def run_database_build(
    api_client,
    input_csv,
    output_db_file,
    cache_file,
    use_cache=True,
    save_interval=5  # save db every N celebrities
):
    """
    Main function to run the database build process.

    Loads the cache and any existing database, reads celebrity names from the
    "Name" column of input_csv, builds a timeline for every name not yet in
    the database, and writes the database to output_db_file.

    Args:
        api_client: Search client passed to build_timeline_for_celebrity.
        input_csv: Path to a CSV file with a "Name" column.
        output_db_file: Path of the JSON database to resume from and write to.
        cache_file: Path of the JSON search cache.
        use_cache: Whether cached search results may be used and stored.
        save_interval: Save the database after every this many newly
            processed celebrities. 0 or None disables the periodic saves;
            the final save still happens.

    Returns:
        None. If the CSV has no "Name" column a message is printed and
        nothing is processed.
    """
    cache = load_cache(cache_file)

    timeline_db = load_database(output_db_file)
    if timeline_db is None:
        timeline_db = {}

    # Collect the names that still need a timeline.
    celebrities_to_process = []

    # utf-8-sig also reads plain UTF-8 and drops a leading BOM, which would
    # otherwise turn the first header into "\ufeffName". newline='' is what
    # the csv module expects for files it reads.
    with open(input_csv, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        # fieldnames is None when the file is empty.
        if not reader.fieldnames or 'Name' not in reader.fieldnames:
            print(f"No 'Name' column found in {input_csv}; nothing to process.")
            return
        for row in reader:
            # A row shorter than the header yields None for the missing cell.
            name = (row['Name'] or '').strip()
            if name and name not in timeline_db:
                celebrities_to_process.append(name)

    processed_count = 0
    try:
        for celebrity in celebrities_to_process:
            if celebrity in timeline_db:
                continue  # the name appears more than once in the CSV

            celebrity_timeline = build_timeline_for_celebrity(
                api_client, celebrity, cache, use_cache, cache_file
            )

            timeline_db[celebrity] = celebrity_timeline
            processed_count += 1

            # Save progress periodically.
            if save_interval and processed_count % save_interval == 0:
                save_database(timeline_db, output_db_file)
    finally:
        # Runs on normal completion and on interruption or error, so finished
        # timelines are kept. timeline_db only ever holds complete entries.
        save_database(timeline_db, output_db_file)



# Run the full build. Celebrities already present in OUTPUT_DB_FILE are skipped,
# and with use_cache=True any celebrity/year pair already stored in CACHE_FILE
# is returned from the cache instead of being searched again.
run_database_build(
    api_client=api_client,
    input_csv=INPUT_CSV_FILE,
    output_db_file=OUTPUT_DB_FILE,
    cache_file=CACHE_FILE,
    use_cache=True,
    save_interval=5
)


## 6. View Results

Load the generated JSON database and inspect a sample entry.


In [None]:
# Load the generated database and print one sample entry.
final_db = load_database(OUTPUT_DB_FILE)
if not final_db:
    # load_database returns None when the file does not exist, and an empty
    # database has no first entry to show.
    print(f"No timeline data found in {OUTPUT_DB_FILE}.")
else:
    print(f" {len(final_db)} celebrities.")
    first_celebrity = next(iter(final_db))
    print(f"First celebrity: {first_celebrity}")
    print(f"Timeline for {first_celebrity}:")
    print(json.dumps(final_db[first_celebrity], indent=2))

