# This notebook uses New York Times API Article Search
1. https://developer.nytimes.com/apis
1. We will use article search, where the query is the name of the company. 
2. If the company had multiple names throughout history, we will query both names

In [1]:
import yaml

# Stored outside of github folder
with open("../../config.yml", "r") as stream:
    config = yaml.safe_load(stream)
    
api_key = config["api_key_3"]

In [2]:
from pynytimes import NYTAPI
import numpy as np
import pandas as pd

nyt = NYTAPI(api_key, parse_dates=True)
# This was created manually, so there he no notebook to log it. We documented firms where they had
# more than one name throughout history
nyt_query_helper = pd.read_csv("../data/nyt_query_helper.csv")
print(nyt_query_helper.secondary_query[nyt_query_helper.secondary_query.notna()].values)
nyt_query_helper.head()

['Bread Financial' 'Valaris ' 'Federated Hermes' 'Sanofi'
 'Brookfield Property REIT' 'IQVIA' 'Paramount' 'Altaba']


Unnamed: 0,Symbol,Security,wiki_ticker,new_ticker,secondary_query
0,A,Agilent Technologies,A,A,
1,AA,Alcoa Inc,AA,AA,
2,AAL,American Airlines Group,AAL,AAL,
3,AAP,Advance Auto Parts,AAP,AAP,
4,AAPL,Apple Inc.,AAPL,AAPL,


# Scraping By Search Result

In [3]:
import time
import datetime 
from tqdm.notebook import tqdm
from pathlib import Path

# Our backtest is from 2010-01-01 onwards
# So we need data from 2009-12-01 onwards (at least 1 month more of data)
dates = {
        "begin": datetime.datetime(2009, 
                                   12, 1),
        "end": datetime.datetime(2023, 2, 1)
    }

for i in tqdm(range(1, len(nyt_query_helper))):
    row = nyt_query_helper.iloc[i]
    ticker = row["Symbol"]
    # We will first query security, the company name
    security = row["Security"]
    secondary_query = row["secondary_query"]
    path = Path(f"../data/new_york_times/company/{ticker}.csv")
    current_company_result = nyt.article_search(query = security, dates = dates, results = 1000)
    
    # If there was a second company name in history, we use it
    if type(secondary_query) == str:
        current_company_result.extend(nyt.article_search(query = secondary_query, dates = dates, results = 1000))

    current_company_result = pd.DataFrame(current_company_result)
    current_company_result.to_csv(f"../data/new_york_times/company/{ticker}.csv")
    time.sleep(10)

  0%|          | 0/735 [00:00<?, ?it/s]