# Jeopardy Web Scraper

This program scrapes https://j-archive.com to collect Jeopardy questions, which you can then use however you wish.

In [12]:
import requests
import time
import json
import re

from bs4 import BeautifulSoup

base_url = "https://j-archive.com/"

def sanitize_filename(filename):
    """Return *filename* with characters that are unsafe in file names removed.

    The stripped characters are the angle brackets, colon, double quote,
    forward slash, backslash, pipe, question mark and asterisk; everything
    else (including spaces) is left untouched.
    """
    forbidden_chars = re.compile(r'[<>:"/\\|?*]')
    cleaned = forbidden_chars.sub('', filename)
    return cleaned


# Helper function to parse clue IDs
def parse_clue_id(clue_id):
    """Split a clue cell id into its round prefix, category index and row index.

    The id is split on underscores; the second piece is taken as the round
    prefix (J, DJ, or FJ), the third as the category number and the fourth as
    the row number.

    Args:
        clue_id: The id string of a clue cell, e.g. ``"clue_J_1_2"``.

    Returns:
        A ``(round_prefix, category, row)`` tuple, or ``(None, None, None)``
        when the id has fewer than four underscore-separated pieces or its
        category/row pieces are not integers.
    """
    parts = clue_id.split("_")
    if len(parts) < 4:
        return None, None, None

    try:
        category = int(parts[2])
        row = int(parts[3])
    except ValueError:
        # A malformed id should be skipped by the caller, not abort the scrape.
        return None, None, None

    return parts[1], category, row

In [None]:
# GET EVERY SEASON
# Download the season index page and build one metadata record for every
# table row that links to a season.
list_seasons_URL = base_url + "listseasons.php"
list_seasons_page = requests.get(list_seasons_URL, timeout=10)
list_seasons_page.raise_for_status()  # fail loudly instead of parsing an error page
soup = BeautifulSoup(list_seasons_page.content, "html.parser")

seasons_json_array = []
seasons_trs = soup.find_all("tr")
for season_tr in seasons_trs:
    # Rows without a link carry no season to follow, so skip them.
    a_tag = season_tr.find("a")
    if a_tag is None or not a_tag.get("href"):
        continue

    # init new season json object
    season_json = {"name": None, "from_date": None, "to_date": None, "URL": None, "file_path": None, "episodes": []}
    season_json["URL"] = base_url + a_tag["href"]

    # The link text becomes the season name: unsafe filename characters and
    # spaces are removed so the name can double as a folder name.
    link_text = a_tag.string if a_tag.string is not None else "NoName"
    season_json["name"] = sanitize_filename(link_text).replace(" ", "")

    # The second cell holds the date range as "<from> to <to>".
    cells = season_tr.find_all("td")
    date_text = ""
    if len(cells) > 1 and cells[1].string is not None:
        date_text = cells[1].string.strip()
    if " to " in date_text:
        dates = date_text.split(" to ")
        season_json["from_date"] = dates[0]
        season_json["to_date"] = dates[1]

    # Single quotes inside the f-string keep this valid on Python < 3.12.
    season_json["file_path"] = f"data/seasons/{season_json['name']}"
    seasons_json_array.append(season_json)

seasons_json_array_with_game_data = []

# GET A SEASON'S EPISODES
import os

# Optional cap on the number of episodes scraped per season; None scrapes all.
# The previous counter was never incremented, so its limit check never fired
# and every episode was scraped. None keeps that effective behaviour while
# letting a small integer be set here for quick test runs.
total_per_season = None


def _category_names(round_div):
    """Return the stripped category names in a round's <div>, or [] when the round is absent."""
    if round_div is None:
        return []
    return [cat.text.strip() for cat in round_div.find_all("td", class_="category_name")]


def _correct_response(answer_tag):
    """Return the correct-response text of an answer cell, or "No Answer" when it cannot be found."""
    if answer_tag is None:
        return "No Answer"
    response = answer_tag.find("em", class_="correct_response")
    return response.text.strip() if response is not None else "No Answer"


for season in seasons_json_array:
    episodes_json_array = []
    episodes_json_array_with_game_data = []

    # Single quotes inside the f-string keep this valid on Python < 3.12.
    folder_path = f"data/seasons/{season['name']}"

    # Create the folder (if it doesn't exist)
    os.makedirs(folder_path, exist_ok=True)

    # Fetch the season page; a failed season is reported and recorded as empty
    # so that the per-season lists stay aligned with seasons_json_array.
    episodes = []
    try:
        season_episodes_page = requests.get(season["URL"], timeout=5)
    except requests.exceptions.RequestException as e:
        print(f"Request failed for {season['URL']}: {e}")
    else:
        if season_episodes_page.status_code != 200:
            print(f"HTTP request unsuccessful for {season['URL']}")
        else:
            season_soup = BeautifulSoup(season_episodes_page.content, "html.parser")
            episodes = season_soup.find_all("tr")

    count = 0
    for episode in episodes:
        if total_per_season is not None and count >= total_per_season:
            break

        # get 'a' tag for ID, air date, and URL; rows without a link are skipped
        a_tag = episode.find("a")
        if a_tag is None or not a_tag.get("href"):
            continue

        # init new episode json object
        episode_json = {"id": None, "name": None, "air_date": None, "URL": None, "file_path": None, "game_data": None}

        link_text = a_tag.string
        if link_text is not None:
            episode_json["id"] = link_text.split(",")[0].lstrip("#")
            episode_json["air_date"] = link_text[-10:]
        episode_json["URL"] = base_url + a_tag["href"]

        # get second 'td' tag for episode name (left as None when unavailable)
        cells = episode.find_all("td")
        if len(cells) > 1 and cells[1].string is not None:
            episode_json["name"] = cells[1].string.strip()

        ########################
        ### GET JEOPARDY Q/A ###
        ########################

        episode_url = episode_json["URL"]
        file_path = f"{folder_path}/episode_{episode_json['id']}.json"
        episode_json["file_path"] = file_path

        # make request before status check
        try:
            response = requests.get(episode_url, timeout=5)  # Set timeout
        except requests.exceptions.RequestException as e:
            print(f"Request failed for {episode_url}: {e}")
            continue

        if response.status_code != 200:  # status check
            print(f"HTTP request unsuccessful for {episode_url}")
            continue  # skips to next episode

        episode_soup = BeautifulSoup(response.content, "html.parser")  # Parse the HTML

        # Category lists are rebuilt for every episode so that a missing round
        # yields an empty list instead of reusing the previous episode's names.
        single_categories = _category_names(episode_soup.find("div", id="jeopardy_round"))
        double_categories = _category_names(episode_soup.find("div", id="double_jeopardy_round"))
        final_category = _category_names(episode_soup.find("div", id="final_jeopardy_round"))

        # round prefix -> (round name, category names, value of one row step)
        round_settings = {
            "J": ("Jeopardy Round", single_categories, 200),
            "DJ": ("Double Jeopardy", double_categories, 400),
        }

        # Initialize game_data
        game_data = {
            "Jeopardy Round": {},
            "Double Jeopardy": {},
            "Final Jeopardy": {},
        }

        # Loop through all clues
        for clue in episode_soup.find_all("td", class_="clue_text"):
            clue_id = clue.get("id", "No ID")

            # Cells whose id ends in "_r" hold responses, not clues.
            if clue_id.endswith("_r"):
                continue

            # Parse clue ID to get category (X) and row index (Y)
            round_prefix, X, Y = parse_clue_id(clue_id)
            if round_prefix not in round_settings or X is None or Y is None:
                continue  # Skip invalid clues and rounds handled elsewhere

            round_name, category_list, value_step = round_settings[round_prefix]
            clue_value = Y * value_step
            category = category_list[X - 1] if 1 <= X <= len(category_list) else "Unknown Category"

            # The response lives in the cell whose id is the clue id plus "_r".
            answer = _correct_response(episode_soup.find(id=clue_id + "_r"))

            # Store the clue in game_data, creating the category on first use
            game_data[round_name].setdefault(category, []).append({
                "value": clue_value,
                "question": clue.text.strip(),
                "answer": answer,
            })

        # Final Jeopardy is looked up once per episode, after the clue loop.
        final_jeopardy_clue = episode_soup.find("td", id="clue_FJ")
        if final_jeopardy_clue:
            fj_category = final_category[0] if final_category else "Final Jeopardy"
            game_data["Final Jeopardy"][fj_category] = [{
                "value": "Final Jeopardy",
                "question": final_jeopardy_clue.text.strip(),
                "answer": _correct_response(episode_soup.find("td", id="clue_FJ_r")),
            }]
        else:
            print("No final Jeopardy question found!")

        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(game_data, f, indent=4)

        # append episode json to array
        episodes_json_array.append(episode_json)
        episode_json_with_game_data = episode_json.copy()
        episode_json_with_game_data["game_data"] = game_data
        episodes_json_array_with_game_data.append(episode_json_with_game_data)

        count += 1

    # NOTE: throttling between requests is currently disabled; add a
    # time.sleep(...) here if the website starts rejecting requests.

    season["episodes"] = episodes_json_array
    seasons_json_array_with_game_data.append(episodes_json_array_with_game_data)

# Debug peek at one season record; guarded so a short season list cannot crash.
if len(seasons_json_array) > 2:
    print(seasons_json_array[2])

# Make sure the output folder exists even when no season folder was created.
os.makedirs("data", exist_ok=True)

with open("data/seasons.json", "w", encoding="utf-8") as file:
    json.dump(seasons_json_array, file, indent=4)

with open("data/all_data.json", "w", encoding="utf-8") as file:
    json.dump(seasons_json_array_with_game_data, file, indent=4)

No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopardy question found!
No final Jeopa

In [11]:
## GET EPISODE OF SEASON
# Load the season index written by the scraper, pick the first episode of the
# season named in name_to_search, and load that episode's game data.
seasons = []

with open("data/seasons.json", "r") as file:
    seasons = json.load(file)

print(seasons)

name_to_search = "Season39"
episode_filepath = ""
for season in seasons:
    # Only a matching season that actually has episodes can provide a file.
    if season["name"] == name_to_search and season["episodes"]:
        episode_filepath = season["episodes"][0]["file_path"]
        break

episode_tiles = []

if episode_filepath:
    with open(episode_filepath, "r") as file:
        episode_tiles = json.load(file)
else:
    # Avoid open("") and its confusing FileNotFoundError when nothing matched.
    print(f"No episode file found for season {name_to_search}")

print(episode_tiles)

[{'name': 'Season41', 'from_date': '2024-09-09', 'to_date': '2025-07-25', 'URL': 'https://j-archive.com/showseason.php?season=41', 'file_path': 'data/seasons/Season41', 'episodes': [{'id': '9291', 'name': 'Alex DeFrank vs. Clare Murray vs. Brett Aresco', 'air_date': '2025-03-17', 'URL': 'https://j-archive.com/showgame.php?game_id=9144', 'file_path': 'data/seasons/Season41/episode_9291.json', 'game_data': None}]}, {'name': 'Audio-onlygames', 'from_date': '2024-01-12', 'to_date': '2024-02-02', 'URL': 'https://j-archive.com/showseason.php?season=cwcpi', 'file_path': 'data/seasons/Audio-onlygames', 'episodes': [{'id': '3', 'name': 'Eric Anderson vs. Ed Petersen vs. Ilena Di Toro', 'air_date': '2024-02-02', 'URL': 'https://j-archive.com/showgame.php?game_id=8792', 'file_path': 'data/seasons/Audio-onlygames/episode_3.json', 'game_data': None}]}, {'name': 'Season40', 'from_date': '2023-09-11', 'to_date': '2024-07-26', 'URL': 'https://j-archive.com/showseason.php?season=40', 'file_path': 'data

In [None]:
"""
code for generating the unique hashes 
used as PKS for the sql tables for clues and categories
"""
'''
---why use MD5?
-deterministic, so we'll always get the same hash for the same input; collisions between different inputs are very unlikely
-shorter than SHA
-fast
'''
'''
***potential functions for hash generation***
'''
# import the hashlib library
import hashlib

def generate_category_id(game_id, round_name, category_name):
    """Return the MD5 hex digest identifying one category of one game.

    The digest is computed over ``"{game_id}_{round_name}_{category_name}"``,
    so the same three inputs always produce the same id.
    """
    base = f"{game_id}_{round_name}_{category_name}"  # high entropy inputs for the hash generation
    return hashlib.md5(base.encode()).hexdigest()  # md5 for low collision but not overkill for usecase


def generate_clue_id(category_id, clue_text):
    """Return the MD5 hex digest identifying one clue within a category.

    The digest is computed over ``"{category_id}_{clue_text}"``. This function
    previously reused the name ``generate_category_id`` and therefore replaced
    the category-id generator defined above it.
    """
    base = f"{category_id}_{clue_text}"  # use cat_id and clue text for high entropy generation (low collision)
    return hashlib.md5(base.encode()).hexdigest()


# more modular hash_md5 function 

import hashlib

def hash_md5(s):
    """Return the MD5 hex digest of the string *s* (encoded with the default UTF-8).

    The digest is returned as a 32-character hexadecimal string. The method is
    called here; returning the bound ``hexdigest`` method itself would hand the
    caller a function object instead of the id.
    """
    return hashlib.md5(s.encode()).hexdigest()

# Example of building the two ids with hash_md5: the category id is hashed from
# the game id, round name and category name; the clue id is hashed from that
# category id followed by the clue's question text.
# NOTE(review): game_id, round_name, category_name and clue are not defined in
# this cell — presumably they come from a scraped game record; confirm before running.
category_id = hash_md5(f"{game_id}_{round_name}_{category_name}")
clue_id = hash_md5(f"{category_id}{clue['question']}")

In [None]:
'''
DataProcessor class for making the game tables (notes)
Here I'm noting the values for the table column assigner.
***workflow***
- process all games
    - extract data from JSON and append to lists
- write the lists to CSVs using the csv import
'''
# imports
import re
import hashlib
import csv

class DataProcessor:
    """Flatten scraped game records into row lists for the games, categories and clues tables.

    Call ``process`` with the scraped game records, then call ``write_csv``
    once for each of ``games``, ``categories`` and ``clues``.
    """

    def __init__(self):
        # One list of row dicts per output table.
        self.games = []
        self.categories = []
        self.clues = []

    def extract_season_number(self, file_path):
        """Return the integer following "Season" in *file_path*, or None if there is none."""
        match = re.search(r'Season(\d+)', file_path)
        return int(match.group(1)) if match else None

    def hash_md5(self, string):
        """Return the MD5 hex digest of *string*; used to build the table ids."""
        return hashlib.md5(string.encode()).hexdigest()

    @staticmethod
    def _iter_games(all_data):
        """Yield game dicts from *all_data*, descending one level into nested lists.

        A flat list of game dicts is passed through unchanged. The scraper cell
        of this notebook appends one list of games per season before dumping
        ``all_data.json``, so that nested shape is accepted as well.
        """
        for entry in all_data:
            if isinstance(entry, dict):
                yield entry
            else:
                yield from entry

    def process(self, all_data):
        """Append rows for every regular-season game in *all_data* to the three lists.

        Args:
            all_data: An iterable of game dicts (or of lists of game dicts).
                Each game needs ``file_path``, ``id`` and ``air_date``; its
                ``game_data`` maps round name -> category name -> list of
                clue dicts with ``value``, ``question`` and ``answer``.

        Games whose ``file_path`` does not contain "Season" are skipped, and a
        game whose ``game_data`` is missing or None contributes a games row only.
        """
        for game in self._iter_games(all_data):  # loop for games table
            file_path = game['file_path']
            if 'Season' not in file_path:  # excludes special seasons
                continue

            # data to be extracted
            game_id = game['id']
            air_date = game['air_date']
            season = self.extract_season_number(file_path)

            # add the game info to the games list
            self.games.append({
                "game_id": game_id,  # pk
                "air_date": air_date,
                "season": season
            })

            # move to next nested level in json, categories
            rounds = game.get('game_data') or {}
            for round_name, round_data in rounds.items():
                for category_name, clues in round_data.items():
                    # use the hash_md5 method for the category id
                    category_id = self.hash_md5(f"{game_id}_{round_name}_{category_name}")
                    self.categories.append({
                        "category_id": category_id,  # pk
                        "game_id": game_id,  # fk
                        "round_name": round_name,
                        "category_name": category_name
                    })

                    # move to clue data
                    for clue in clues:
                        clue_id = self.hash_md5(f"{category_id}{clue['question']}")
                        self.clues.append({
                            "clue_id": clue_id,  # pk
                            "category_id": category_id,  # fk
                            "game_id": game_id,  # fk
                            "value": clue["value"],
                            "clue_text": clue["question"],
                            "correct_response": clue["answer"]
                        })

    def write_csv(self, data, path, fieldnames):
        """Write the row dicts in *data* to a UTF-8 CSV file at *path*.

        Args:
            data: Iterable of dicts whose keys are listed in *fieldnames*.
            path: Destination file path; an existing file is overwritten.
            fieldnames: Column names, written as the header row in this order.
        """
        # newline='' lets the csv module control line endings itself.
        with open(path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)  # map keys to columns using DictWriter
            writer.writeheader()  # writes the column headers
            writer.writerows(data)  # writes the rows
        

