# 0- Importing libraries and setting parameters

In [1]:
import ast
import os
import time

import numpy as np
import pandas as pd
import requests
from pandas import json_normalize

#Progress bar for pandas
from tqdm import tqdm
tqdm.pandas()

# Read the key from the NYT_API_KEY environment variable so the real credential
# never has to be saved inside the notebook; the placeholder is kept as fallback.
NYT_API_KEY = os.environ.get('NYT_API_KEY', 'YOUR API KEY')

In [3]:
# First and last year to download (both included)
year_start, year_end = 2010, 2019

# Output folders: raw API dump and cleaned article table
API_DIRECTORY_PATH = './NYT_API_RAW/'
PARSED_DIRECTORY_PATH = './NYT_clean_data/'

# exist_ok=True keeps this cell re-runnable. A bitwise `~` applied to the bool
# returned by os.path.exists() is always truthy (~True == -2, ~False == -1),
# so it cannot be used as a "not" to guard makedirs.
os.makedirs(API_DIRECTORY_PATH, exist_ok=True)
os.makedirs(PARSED_DIRECTORY_PATH, exist_ok=True)

# 1- Archive API request

In [2]:
def GET_NYT_archive_API(year_start = 1980, year_end = 2023, directory_path = ''):

    '''
    Download every month of the NYT Archive API between two years and save the result as CSV.

    One request is sent per (year, month). A month whose request or decoding
    fails is reported with a "FAIL" line and skipped; the remaining months are
    still downloaded. The API key is read from the notebook-level NYT_API_KEY.

    Parameters
    ----------
    year_start : int
        First year to download (included).
    year_end : int
        Last year to download (included).
    directory_path : str
        Prefix of the output file. It is concatenated as-is with the file
        name, so it must end with a path separator when it names a folder.

    Returns
    -------
    pandas.DataFrame
        One row per document returned by the API, indexed 0..n-1.
        Empty when no month could be downloaded.

    Raises
    ------
    requests.HTTPError
        When the API answers 401 or 403: the key was rejected, so every
        remaining month would fail as well.
    '''

    file_path = directory_path + "NYT_Archive_API_" + str(year_start) + "_" + str(year_end) +".csv"
    print(file_path)

    #One DataFrame per downloaded month, concatenated once at the end
    #(concatenating inside the loop copies the growing frame every time)
    monthly_frames = []

    for year in range(year_start, year_end+1):
        for month in range(1, 13):
            time.sleep(15) #Pause between requests to stay under the API rate limit

            label = str(month) + " - " + str(year)
            response = None

            try:
                response = requests.get(
                    'https://api.nytimes.com/svc/archive/v1/' + str(year) + '/' + str(month) + '.json',
                    params={'api-key': NYT_API_KEY},
                    timeout=60, #Without a timeout a stalled connection blocks forever
                )
                response.raise_for_status()
                docs = response.json()['response']['docs']

            except (requests.RequestException, ValueError, KeyError, TypeError) as error:
                if response is not None and response.status_code in (401, 403):
                    raise
                #response is None when the request itself failed: show the error instead
                print(label + ' - ' + str(response if response is not None else error) + ' - FAIL')
                continue

            print(label + " - " + str(len(docs)))
            monthly_frames.append(pd.DataFrame.from_dict(docs))

    if monthly_frames:
        NYT_df = pd.concat(monthly_frames, ignore_index=True)
    else:
        NYT_df = pd.DataFrame()

    print('Export begin')
    NYT_df.to_csv(file_path, index=False)
    print('Export finish')

    return NYT_df

In [None]:
# Download the raw archive for the configured years and store it as CSV in
# API_DIRECTORY_PATH. Slow: the function waits 15 s before each monthly request.
NYT_df = GET_NYT_archive_API(
    year_start=year_start,
    year_end=year_end,
    directory_path=API_DIRECTORY_PATH,
)

# 2- Cleaning and parsing NYT data

In [None]:
def KeywordParser(json):

    '''
    Extract up to five keywords from one stringified NYT ``keywords`` entry.

    Parameters
    ----------
    json : str
        Python-literal text of a list of keyword dicts, as produced by
        ``str()`` on the raw value.

    Returns
    -------
    pandas.Series
        Five values (index 0..4): the first five keywords in the order they
        appear in the list, padded with NaN. All NaN when the text cannot be
        parsed (e.g. 'nan' for a missing value) or holds no keyword.
    '''

    top_keywords = np.full(5, np.nan, dtype=object)

    try:
        #literal_eval only accepts Python literals, so text coming from the
        #API can never run code (which eval() would allow)
        parsed = ast.literal_eval(json)
    except (ValueError, SyntaxError, TypeError):
        return pd.Series(top_keywords)

    if not isinstance(parsed, (list, tuple, dict)):
        return pd.Series(top_keywords)

    #Second column of the normalized keywords, first five rows.
    #NOTE(review): presumably the 'value' field of each keyword dict - confirm against the API schema
    values = json_normalize(parsed).values[:5,1:2].flatten()
    top_keywords[:len(values)] = values

    return pd.Series(top_keywords)

def NYT_data_cleaning_parsing(NYT_df, directory_path = ''):

    '''
    Turn the raw NYT Archive API table into a compact article table and export it as CSV.

    Steps: drop the unused columns, parse ``pub_date``, keep only rows with a
    selected ``type_of_material``, a non-empty abstract and
    ``document_type == 'article'``, extract the main headline, split the
    keywords into five columns and discard rows tagged with an unwanted keyword.

    The input DataFrame is not modified. The output file name is built from
    the notebook-level ``year_start`` / ``year_end`` variables.

    Parameters
    ----------
    NYT_df : pandas.DataFrame
        Raw table as returned by GET_NYT_archive_API.
    directory_path : str
        Prefix of the output file. It is concatenated as-is with the file
        name, so it must end with a path separator when it names a folder.

    Returns
    -------
    pandas.DataFrame
        Columns headline, abstract, pub_date, web_url, keyword_1 .. keyword_5,
        indexed 0..n-1.
    '''

    file_path = directory_path + "NYT_Articles_" + str(year_start) + '_' + str(year_end) + ".csv"
    print(file_path)
    print(NYT_df.shape)

    ##PARSING
    #Remove irrelevant columns. drop() returns a new DataFrame, so none of the
    #assignments below touches the frame owned by the caller.
    columns_to_drop = ["multimedia", "print_section","print_page", "source", "_id", "word_count", "uri", "snippet", "lead_paragraph"]
    articles = NYT_df.drop(columns = columns_to_drop, errors = 'ignore')

    #Setting the date column in a DateTime format
    articles['pub_date'] = pd.to_datetime(articles['pub_date'])

    #Types of articles to keep
    type_of_material_to_keep = ['Archives', 'An Analysis', 'Editorial', 'Interactive Feature', 'News', 'News Analysis', 'Newsletter', 'Op-Ed', 'Quote', 'Review', 'Slideshow', 'briefing']

    #All row filters are applied before the keyword parsing, which is the
    #slow step: rows that would be discarded anyway are never parsed.
    rows_to_keep = (
        articles['type_of_material'].isin(type_of_material_to_keep)
        & articles['abstract'].notna()
        & (articles['document_type'] == 'article')
    )
    articles = articles[rows_to_keep].reset_index(drop = True)

    #Keep the main headline. Mapping row by row keeps each value on its own
    #row instead of relying on index alignment with a separately built frame.
    articles['headline'] = articles['headline'].map(
        lambda headline: headline.get('main', np.nan) if isinstance(headline, dict) else np.nan
    )

    #Parse top 5 keywords
    keyword_columns = ["keyword_1", "keyword_2", "keyword_3", "keyword_4", "keyword_5"]
    articles[keyword_columns] = articles['keywords'].astype(str).progress_apply(KeywordParser)

    #CLEANING
    #Discard every row where any of the five keywords is in the removal list
    to_remove = ['MOTION PICTURES']
    has_removed_keyword = articles[keyword_columns].isin(to_remove).any(axis = 1)
    articles = articles[~has_removed_keyword]

    #Format the column order
    columns_order = ['headline', 'abstract', 'pub_date', 'web_url'] + keyword_columns
    articles = articles[columns_order].reset_index(drop = True)
    print(articles.shape)

    articles.to_csv(file_path, index=False)

    return articles

In [None]:
# Clean the downloaded table and write the resulting article table as CSV
# in PARSED_DIRECTORY_PATH.
NYT_articles_df = NYT_data_cleaning_parsing(
    NYT_df=NYT_df,
    directory_path=PARSED_DIRECTORY_PATH,
)