# 0- Setting the folders

In [48]:
import os

In [49]:
API_DIRECTORY_PATH = './NYT_API_RAW/'
PARSED_DIRECTORY_PATH = './NYT_API_clean/'

if os.path.exists(API_DIRECTORY_PATH):
    print("Raw NYT data folder exist")
else:
    print("Raw NYT data created")
    os.makedirs(API_DIRECTORY_PATH)

if os.path.exists(PARSED_DIRECTORY_PATH):
    print("Clean NYT data folder exist")
else:
    os.makedirs(PARSED_DIRECTORY_PATH)

Raw NYT data folder exist
Clean NYT data folder exist


# 1- Archive API request

In [53]:
import requests
import pandas as pd
import time 
import numpy as np
import os

#Progress bar for pandas
from tqdm import tqdm
tqdm.pandas()

from pandas import json_normalize

NYT_API_KEY = 'YOUR API KEY'

In [64]:
import os
import time
import requests
import pandas as pd


def get_nyt_archive_api(year_start=1980, year_end=2023, directory_path=API_DIRECTORY_PATH):
    """
    Function to get the data from the NYT Archive API
    """
    for year in range(year_start, year_end+1):
        nyt_data_year_path = os.path.join(directory_path, str(year))
        
        if not os.path.isdir(nyt_data_year_path):
            os.makedirs(nyt_data_year_path)

        for month in range(1, 13):           
            nyt_data_year_month_file = os.path.join(nyt_data_year_path, f"NYT_Archive_API_{year}_{month}.csv")
            
            try:
                monthly_data = pd.read_csv(nyt_data_year_month_file)
                continue
            except:
                print(f'File empty or does not exist: {nyt_data_year_month_file}')

                time.sleep(15) # To stay within API limits
                
                try:
                    nyt_archive = requests.get(f'https://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={NYT_API_KEY}')

                    if nyt_archive.status_code != 200:
                        print(f"Request failed with status code {nyt_archive.status_code} for {month}-{year}")
                        break

                    nyt_df = pd.DataFrame.from_dict(nyt_archive.json()['response']['docs'])
                    nyt_df.reset_index(drop=True, inplace=True)
                    nyt_df.to_csv(nyt_data_year_month_file, index=False)

                    print(f"{year} - {month} - {len(nyt_archive.json()['response']['docs'])}" + " - Export finished")

                except requests.exceptions.RequestException as e:
                    print(f"{month} - {year} - FAIL: {e}")


In [70]:
#Give the year to download the data from:

year_start = 1853
year_end = 2023

get_nyt_archive_api(year_start, year_end, API_DIRECTORY_PATH)

File empty or does not exist: ./NYT_API_RAW/1978/NYT_Archive_API_1978_9.csv
1978 - 9 - 0 - Export finished
File empty or does not exist: ./NYT_API_RAW/1978/NYT_Archive_API_1978_10.csv
1978 - 10 - 0 - Export finished
File empty or does not exist: ./NYT_API_RAW/2023/NYT_Archive_API_2023_7.csv
Request failed with status code 403 for 7-2023


# 2- Raw Data Aggregator 

In [33]:
start_year = 2010
end_year = 2012

data_frames = []

for root, dirs, files in os.walk(API_DIRECTORY_PATH):
    for dir in dirs:
        subfolder_year = int(dir)
        if start_year <= subfolder_year <= end_year:
            subfolder_path = os.path.join(root, dir)
            for file in os.listdir(subfolder_path):
                if file.endswith('.csv'):
                    file_path = os.path.join(subfolder_path, file)
                    monthly_data = pd.read_csv(file_path)
                    data_frames.append(monthly_data)

combined_data = pd.concat(data_frames, ignore_index=True)


In [43]:
def KeywordParser(json):

    ''' 
    Parse the 5 top keywords from the NYT data in their order of importance
    '''
    
    array_1 = np.empty(10, dtype=object)
    array_1[:]= np.nan
    array_2 = json_normalize(eval(json)).values[:10,1:2].flatten()
    array_1[:len(array_2)] = array_2

    return pd.Series(array_1.reshape(10))

def extract_main(text):
    dictionary = eval(text)  # Evaluate the string as a dictionary
    main = dictionary.get('main')  # Extract the 'main' value
    return main

def NYT_data_cleaning_parsing(year_start, year_end, NYT_df, directory_path = ''):

    '''
    Parse and clean the NYT raw output
    '''

    #Remove irrelevent columns
    NYT_df = NYT_df.drop(["multimedia", "print_section","print_page", "source", "_id", "word_count", "uri", "snippet", "lead_paragraph"], axis = 1)
    
    file_path = directory_path + "NYT_Articles_" + str(year_start) + '_' + str(year_end) + ".csv"
    print(file_path)
    print(NYT_df.shape)

    ##PARSING
    #Setting the date column in a DateTime format
    NYT_df['pub_date'] = pd.to_datetime(NYT_df['pub_date'])

    #Remove empty abstract entries
    NYT_df = NYT_df[NYT_df['abstract'].notna()]
    NYT_df.reset_index(drop = True, inplace = True)

    #Parse headline data and keep main

    NYT_df['headline'] = NYT_df['headline'].progress_apply(extract_main)

    #Parse top 5 keywords
    NYT_df["keywords"] = NYT_df['keywords'].astype(str)
    NYT_df[["keyword_1", "keyword_2", "keyword_3", "keyword_4", "keyword_5", 
            "keyword_6", "keyword_7", "keyword_8", "keyword_9", "keyword_10"]] = NYT_df.keywords.progress_apply(KeywordParser)

    #Format the column order
    columns_order = ['headline', 'abstract', 'pub_date', 'web_url',
                     "keyword_1", "keyword_2", "keyword_3", "keyword_4", "keyword_5", 
                    "keyword_6", "keyword_7", "keyword_8", "keyword_9", "keyword_10"]

    NYT_df = NYT_df[columns_order]

    NYT_df.reset_index(drop = True, inplace = True)
    print(NYT_df.shape)
    
    #NYT_df.to_csv(file_path, index=False)
    
    return NYT_df

In [44]:
year_start = 2020
year_end = 2023

NYT_clean_df = NYT_data_cleaning_parsing(start_year, end_year, combined_data, PARSED_DIRECTORY_PATH)

./NYT_API_clean/NYT_Articles_2010_2012.csv
(316672, 11)


100%|██████████| 316614/316614 [00:04<00:00, 68891.31it/s]
100%|██████████| 316614/316614 [01:00<00:00, 5191.16it/s]

(316614, 14)





In [46]:
NYT_clean_df.sort_values('pub_date')

Unnamed: 0,headline,abstract,pub_date,web_url,keyword_1,keyword_2,keyword_3,keyword_4,keyword_5,keyword_6,keyword_7,keyword_8,keyword_9,keyword_10
203845,"A Camera and an Eye, Both One of a Kind","In an age of digital imagery, one man still ea...",2010-01-01 00:04:53+00:00,https://www.nytimes.com/2010/01/03/nyregion/03...,Cameras,Photography,New York City,,,,,,,
203846,Looking Ahead: Dance,Assessing notable dance performances coming up...,2010-01-01 00:18:05+00:00,https://www.nytimes.com/2010/01/01/arts/dance/...,Dancing,,,,,,,,,
203847,Chronicle of a Changing City,"Jimmy Van Bramer, newly elected to the New Yor...",2010-01-01 00:23:36+00:00,https://www.nytimes.com/2010/01/03/nyregion/03...,Queens (NYC),"van Bramer, Jimmy","Gioia, Eric N",City Council (New York City),City Councils,,,,,
203848,Chick-fil-A Bowl: No. 11 Virginia Tech (9-3) v...,The Hokies may be the most consistent program ...,2010-01-01 00:24:25+00:00,https://thequad.blogs.nytimes.com/2009/12/31/c...,,,,,,,,,,
203849,A Taxing Estate,"This week, answers to readers’ questions about...",2010-01-01 00:38:34+00:00,https://www.nytimes.com/2010/01/03/fashion/03s...,Parties (Social),"Customs, Etiquette and Manners",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66484,How Many Slaves Work for You?,We need a new Emancipation Proclamation to fre...,2012-12-31 23:45:32+00:00,https://www.nytimes.com/2013/01/01/opinion/how...,Human Trafficking,Emancipation Proclamation (1863),Forced Labor,Slavery,"Obama, Barack",United States,Blacks,Black People,,
66485,"After Crispy Pig Ears, 10 Trends for 2013","From long-aged meat to artisanal soft-serve, h...",2012-12-31 23:47:20+00:00,https://www.nytimes.com/2013/01/02/dining/afte...,Cooking and Cookbooks,Restaurants,Food,,,,,,,
66486,Hurting Knicks Uncertain of Tuesday’s Lineup,Carmelo Anthony and Amar’e Stoudemire both pra...,2012-12-31 23:48:51+00:00,https://offthedribble.blogs.nytimes.com/2012/1...,Basketball,"ANTHONY, CARMELO","Chandler, Tyson","Kidd, Jason","Stoudemire, Amar'e","Wallace, Rasheed",New York Knicks,,,
66487,Pursuing a First Win in the Bowls Since 1949,If Northwestern beats Mississippi State in the...,2012-12-31 23:51:17+00:00,https://www.nytimes.com/2013/01/01/sports/ncaa...,Football (College),"Fitzgerald, Pat (1974- )","Nagurski, Bronko",Mississippi State University,Northwestern University,Bowl Games,,,,
