Fetching News Articles: Company Name (Dynamic)

In [2]:
import requests
from config import API_KEY

def fetch_company_news(company_name):
    """Fetch recent news about a company's suppliers and save it to a text file.

    Queries the NewsAPI ``/everything`` endpoint for ``"<company_name> suppliers"``
    (English, newest first, up to 20 articles) and writes each uniquely titled
    article to ``<company_name>_news_articles.txt`` in the current working
    directory, with spaces in the company name replaced by underscores.

    Args:
        company_name: Company to search for, e.g. ``"Apple"``.

    Returns:
        None. If the request fails or the API answers with a non-200 status,
        an error is printed and no file is written.
    """
    base_url = 'https://newsapi.org/v2'
    endpoint = '/everything'

    search_query = '{} suppliers'.format(company_name)

    # Query parameters
    query_params = {
        'q': search_query,
        'apiKey': API_KEY,
        'sortBy': 'publishedAt',
        'language': 'en',
        'pageSize': 20,  # Number of articles
    }

    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(base_url + endpoint, params=query_params, timeout=10)
    except requests.RequestException as exc:
        print('Error:', exc)
        return

    if response.status_code != 200:
        print('Error:', response.status_code)
        return

    # Tolerate a payload without an 'articles' list instead of raising KeyError.
    articles = response.json().get('articles') or []

    seen_titles = set()  # To keep track of titles already written

    output_filename = '{}_news_articles.txt'.format(company_name.replace(" ", "_"))
    with open(output_filename, 'w', encoding='utf-8') as file:
        for article in articles:
            title = article['title']

            # Skip articles whose title has already been written.
            if title in seen_titles:
                continue
            seen_titles.add(title)

            # The key can be present with a null value, so a .get() default
            # alone is not enough; `or` covers both missing and null content.
            content = article.get('content') or 'No content available'
            published_at = article['publishedAt']
            url = article['url']

            file.write('Title: {}\n'.format(title))
            file.write('Content: {}\n'.format(content))
            file.write('Published Date: {}\n'.format(published_at))
            file.write('URL: {}\n'.format(url))
            file.write('-' * 30 + '\n')  # Separator between news articles

# Entry point for this cell: ask for a company and save its supplier news.
if __name__ == "__main__":
    # Trim the input so stray spaces do not end up in the search query or
    # in the output filename (spaces are turned into underscores there).
    company = input("Enter the company name: ").strip()
    fetch_company_news(company)

Enter the company name: Apple


A limitation of the above code is that the content column can only display 200 characters (due to the NewsAPI free-account restriction).

Using BeautifulSoup, we go to each article's URL and scrape the full article from the source website (including a time frame).
(The time frame is not applicable for a free NewsAPI account, which is restricted to searching articles up to a month old.)

In [None]:
import requests
from bs4 import BeautifulSoup
from config import API_KEY
from datetime import datetime

def fetch_full_content(url):
    """Download a web page and return the text of all its ``<p>`` elements.

    Args:
        url: Address of the article page to scrape.

    Returns:
        The paragraph texts joined with newlines. If the download or parsing
        fails, including an HTTP 4xx/5xx status, a string starting with
        ``"Error fetching full content: "`` is returned instead of raising.
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx replies as failures rather than scraping the
        # error page as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return '\n'.join(p.get_text() for p in soup.find_all('p'))
    except Exception as e:
        # Deliberately best-effort: one unreachable article should not
        # abort the whole run, so the failure is reported as text.
        return f"Error fetching full content: {e}"

def is_phrase_in_sentence(phrase, company, content):
    """Report whether ``phrase`` and ``company`` share a sentence in ``content``.

    Sentences are approximated by splitting ``content`` on ``'.'``. Matching is
    by substring and ignores letter case, so a company typed as ``"apple"``
    still matches ``"Apple"`` in the article text.

    Args:
        phrase: Text to look for, e.g. ``"supplies to"``.
        company: Company name that must appear in the same sentence.
        content: Article text to scan.

    Returns:
        True if at least one sentence contains both strings, otherwise False.
    """
    # Fold case once up front so every sentence comparison is case-insensitive.
    phrase_key = phrase.casefold()
    company_key = company.casefold()
    return any(
        phrase_key in sentence and company_key in sentence
        for sentence in content.casefold().split('.')
    )

def fetch_company_suppliers_news(company_name, from_date, to_date):
    """Save articles that mention a supply or partner relationship with a company.

    Searches the NewsAPI ``/everything`` endpoint for English articles matching
    ``("supplies to" OR "partner with") <company_name>`` within the given date
    range, scrapes each article page with ``fetch_full_content``, and keeps only
    articles where "supplies to" or "partner with" appears in the same sentence
    as the company name. Kept articles are written to
    ``<company_name>_supplier_articles_<from_date>_to_<to_date>.txt`` (spaces in
    the company name replaced by underscores).

    Args:
        company_name: Company to search for.
        from_date: Start of the range, sent to the API as ``from``; the entry
            point below supplies it as ``YYYY-MM-DD``.
        to_date: End of the range, sent to the API as ``to``.

    Returns:
        None. If the request fails or the API answers with a non-200 status,
        an error is printed and no file is written.
    """
    base_url = 'https://newsapi.org/v2'
    endpoint = '/everything'

    search_query = '("supplies to" OR "partner with") {}'.format(company_name)

    query_params = {
        'q': search_query,
        'from': from_date,
        'to': to_date,
        'apiKey': API_KEY,
        'sortBy': 'publishedAt',
        'language': 'en',
        'pageSize': 20,  # Ideally more results as we will filter them further
    }

    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(base_url + endpoint, params=query_params, timeout=10)
    except requests.RequestException as exc:
        print('Error:', exc)
        return

    if response.status_code != 200:
        print('Error:', response.status_code)
        return

    # Tolerate a payload without an 'articles' list instead of raising KeyError.
    articles = response.json().get('articles') or []

    output_filename = '{}_supplier_articles_{}_to_{}.txt'.format(
        company_name.replace(" ", "_"), from_date, to_date
    )
    with open(output_filename, 'w', encoding='utf-8') as file:
        for article in articles:
            title = article['title']
            url = article['url']
            published_date = article['publishedAt']

            full_content = fetch_full_content(url)

            # The API matches the query anywhere in the article; keep only
            # those where a phrase and the company share one sentence.
            mentions_relationship = any(
                is_phrase_in_sentence(phrase, company_name, full_content)
                for phrase in ("supplies to", "partner with")
            )
            if not mentions_relationship:
                continue

            file.write('Title: {}\n'.format(title))
            file.write('Published Date: {}\n'.format(published_date))
            file.write('Full Content: {}\n'.format(full_content))
            file.write('URL: {}\n'.format(url))
            file.write('-' * 50 + '\n')  # Separator

# Entry point for this cell: ask for a company and a date range, then save
# the matching supplier/partner articles.
if __name__ == "__main__":
    company = input("Enter the company name: ").strip()

    # Get the date range from the user
    date_format = "%Y-%m-%d"
    while True:
        from_date = input("Enter the start date (YYYY-MM-DD): ")
        to_date = input("Enter the end date (YYYY-MM-DD): ")
        try:
            # Validate date format
            start = datetime.strptime(from_date.strip(), date_format)
            end = datetime.strptime(to_date.strip(), date_format)
        except ValueError:
            print("Invalid date format. Please use YYYY-MM-DD.")
            continue

        # A reversed range can never match anything, so ask again.
        if start > end:
            print("The start date must not be after the end date.")
            continue

        # strptime also accepts non-padded input such as 2023-1-5; re-format
        # so the API and the output filename always get zero-padded dates.
        from_date = start.strftime(date_format)
        to_date = end.strftime(date_format)
        break

    fetch_company_suppliers_news(company, from_date, to_date)
