# Data Extraction Using API 

### Importing Libraries

In [1]:
import requests
import json
import pandas as pd

### Function to Extract Headlines and Summary of Articles from News Website
In this Case Website Used- The New York Times 

In [11]:
def fetch_nyt_archives(api_key, query, year):
    all_articles = []
    
    for month in range(1, 7):
        url = f'https://api.nytimes.com/svc/archive/v1/{year}/{month}.json'
        params = {
            'api-key': api_key,
            'q': query,
            'begin_date': f'{year}{month:02d}01', 
            'end_date': f'{year}{month:02d}31'
        }
        retry_count = 3 
        retry_delay = 5
        while retry_count > 0:
            response = requests.get(url, params=params)
            
            if response.status_code == 200:
                data = response.json()
                if 'response' in data and 'docs' in data['response']:
                    all_articles.extend(data['response']['docs'])
                break
            elif response.status_code == 429:
                print(f"Rate limit exceeded. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                retry_delay *= 2
                retry_count -= 1
            else:
                print(f"Error fetching data for {query} in {year}/{month}: {response.status_code}, {response.text}")
                break
    
    return all_articles

### Function to Store Articles for Particular Stock

In [12]:
def process_articles(articles, query):
    processed_articles = []
    for article in articles:
        processed_articles.append({
            'Headlines': article['headline']['main'],
            'Publication_Date': article['pub_date'],
            'Summary': article['snippet'],
            'Query': query
        })
    return processed_articles

### Fetching And Saving Articles in .csv File Format

In [13]:
def fetch_and_save_articles(api_key, queries, year, output_dir):
    for query in queries:
        all_articles = fetch_nyt_archives(api_key, query, year)
        processed_articles = process_articles(all_articles, query)
        
        if processed_articles:
            df = pd.DataFrame(processed_articles)
            output_file = f"{output_dir}/{query}_News_{year}_Test.csv"
            df.to_csv(output_file, index=False)
            print(f"Saved {len(df)} articles for '{query}' to {output_file}")
        else:
            print(f"No articles fetched for '{query}' in {year}")

### Necessary Information Required in Data Extraction and Function Call to Fetch and Store Data

In [14]:
api_key = 'Enter Your API Key'
queries = [ 'Microsoft', 'Google']
year = 2024
output_dir = 'NYT_Articles'
fetch_and_save_articles(api_key, queries, year, output_dir)

Saved 22099 articles for 'Microsoft' to NYT_Articles/Microsoft_News_2024_Test.csv
Saved 22099 articles for 'Google' to NYT_Articles/Google_News_2024_Test.csv
