# Scraping NYT 

Scraping data from the NYT mini games.

Data Format Ideal Setup

Puzzle metadata.

| puzzle_id | publication_date | creator | editor | 
| --------- | ---------------- | ------------ | ------ | 
| sample_id    | sample_date       | sample_creator | sample_editor |


Other table is information on the puzzles 
| puzzle_id | clue | answer | direction |
| --------- | --- | ------- | ----------|
| sample_id | text | ans_text | across     |


In [94]:
import pandas as pd
import numpy as np
import json
import requests
import os 

from datetime import datetime, timedelta

In [87]:
def get_crossword_data(date):
    slash_format = date.strftime("%Y/%m/%d")
    dash_format = date.strftime("%Y-%m-%d")
    
    nyt_s=os.environ["NYT_S"]
    
    cookies = {
        "NYT-S": f"{nyt_s}"
    }
    
    headers = {
        'accept': '*/*',
        'accept-language': 'en-GB,en;q=0.7',
        'content-type': 'application/x-www-form-urlencoded',
        'priority': 'u=1, i',
        'referer': f'https://www.nytimes.com/crosswords/game/mini/{slash_format}',
        'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Brave";v="128"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Linux"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
        'x-games-auth-bypass': 'true',
    }
    
    response = requests.get(f'https://www.nytimes.com/svc/crosswords/v6/puzzle/mini/{dash_format}.json', cookies=cookies, headers=headers)
    response_json = response.json()
    return response_json

In [96]:
def parse_puzzle_metadata(json_request):
    creator = json_request['constructors'][0]
    editor = json_request.get('editor', np.nan)
    puzzle_id = json_request.get('id', np.nan)
    publication_date = json_request.get('publicationDate', np.nan)

    datetime_format = "%Y-%m-%d"

    return {
        'creator': creator, 
        'editor': editor,
        'puzzle_id': puzzle_id,
        'publication_date': datetime.strptime(publication_date, datetime_format)
    }

In [89]:
def parse_clue_data(json_request):
    puzzle_id = json_request['id']
    answers_df = pd.DataFrame([x for x in json_request['body'][0]['cells'] if len(x) > 0])

    answers_df = pd.concat([
        answers_df, 
        pd.DataFrame(answers_df['clues'].to_list(), columns = ['clue_y', 'clue_x'])
    ], axis=1)
    
    answers_df = answers_df.drop(columns=["clues"])
    across_words = df.groupby("clue_y")['answer'].sum()
    down_words = df.groupby("clue_x")['answer'].sum()

    clues = [
        x['text'][0]['plain'] for x in 
        json_request['body'][0]['clues']
        if len(x) > 0
    ]

    table = []
    
    across_clues = clues[:len(across_words)]
    down_clues = clues[len(across_words):]
    for (word, clue) in zip(across_words, across_clues):
        table.append((puzzle_id, clue, word, 'across'))

    for (word, clue) in zip(down_words, down_clues):
        table.append((puzzle_id, clue, word, 'down'))
    
    return table    

## Requesting Crossword Data

In [115]:
date = datetime(2014, 8, 21)
end_date = datetime.now()

json_requests = []
while date <= end_date:
    print(f"Requesting date: {date}", end="\r")
    json_req = get_crossword_data(date)
    json_requests.append(json_req)
    date = date + timedelta(days=1)

Requesting date: 2024-09-28 00:00:00

## Parsing Metadata

In [117]:
puzzles_metadata = [] 
for json in json_requests:
    puzzle_data = parse_puzzle_metadata(json)    
    puzzles_metadata.append(puzzle_data)

metadata_df = pd.DataFrame(puzzles_metadata)
# metadata_df.to_csv("puzzle_metadata_all.csv", index=False)

## Parsing Clues and Words

In [118]:
clues_and_words = []
for json in json_requests:
    puzzle_data = parse_clue_data(json)
    clues_and_words.append(puzzle_data)

flattened_clues = [
    clue 
    for puzzle in clues_and_words
    for clue in puzzle
]

In [119]:
puzzle_words = pd.DataFrame(flattened_clues, columns=["puzzle_id", "clue", "word", "direction"])
# puzzle_words.to_csv("puzzle_words_all.csv", index=False)

In [124]:
metadata_df['creator'].value_counts()

Joel Fagliano                                         3487
Christina Iverson                                       62
Wyna Liu                                                60
Tracy Bennett                                           51
Sam Ezersky                                             15
 Joel Fagliano                                           3
Helen Chen                                               2
Kanyin Ajayi                                             2
Ailee Yoshida                                            2
Trey Mendez                                              2
Everdeen Mason                                           2
Dani Brown (age 11)                                      1
Joel Fagliiano                                           1
Gustie Owens                                             1
Joel Fagliano and The School of The New York Times       1
Name: creator, dtype: int64