# Setup

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime
import calendar
import time
import requests
from typing import List, Union

# Article Scraper

Testing out manual fetching of articles

In [2]:
query_string = "NVIDIA NVIDIA Corporation (environment OR social OR governance OR sustainability OR ESG rating OR climate change OR diversity OR emissions OR resource use OR corporate social responsibility) sourcelang:english"
mode = "artlist"
headers = {"User-Agent": f"GDELT Python API"}

In [3]:
response = requests.get(
            f"https://api.gdeltproject.org/api/v2/doc/doc?query={query_string}&mode={mode}&format=json&maxrecords=200&sourcelang=ara&startdatetime=20180101000000&enddatetime=20180131235959",
            headers=headers
        )

In [4]:
response.text

'{"articles": [ { "url": "https://www.marketwatch.com/story/this-etf-shows-how-you-can-pick-stocks-for-rapid-growth-while-doing-good-2018-01-24", "url_mobile": "https://www.marketwatch.com/amp/story/guid/05F4A382-003C-11E8-8B37-5A39F2EFB447", "title": "This ETF shows how you can pick stocks for rapid growth while doing good", "seendate": "20180124T181500Z", "socialimage": "http://s.marketwatch.com/public/resources/MWimages/MW-GC275_MMM_20_ZG_20180123123931.jpg", "domain": "marketwatch.com", "language": "English", "sourcecountry": "United States" },{ "url": "http://presstelegraph.com/as-nvidia-corp-nvda-market-value-rose-holder-myriad-asset-management-ltd-cut-by-15-13-million-its-stake/", "url_mobile": "", "title": "As Nvidia Corp ( NVDA ) Market Value Rose , Holder Myriad Asset Management LTD Cut by $15 . 13 Million Its Stake", "seendate": "20180121T134500Z", "socialimage": "", "domain": "presstelegraph.com", "language": "English", "sourcecountry": "United States" },{ "url": "http://ww

In [5]:
pd.DataFrame(response.json(strict=False)['articles'])

Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry
0,https://www.marketwatch.com/story/this-etf-sho...,https://www.marketwatch.com/amp/story/guid/05F...,This ETF shows how you can pick stocks for rap...,20180124T181500Z,http://s.marketwatch.com/public/resources/MWim...,marketwatch.com,English,United States
1,http://presstelegraph.com/as-nvidia-corp-nvda-...,,"As Nvidia Corp ( NVDA ) Market Value Rose , Ho...",20180121T134500Z,,presstelegraph.com,English,United States
2,http://www.4-traders.com/NVIDIA-CORPORATION-10...,http://www.4-traders.com/amp/NVIDIA-CORPORATIO...,NVIDIA Announces Upcoming Events for Financial...,20180130T223000Z,,4-traders.com,English,United States
3,https://www.whatsonthorold.com/2018/01/21/1-15...,,$1 . 15 EPS Expected for NVIDIA ( NVDA ); USD ...,20180121T153000Z,https://www.whatsonthorold.com/wp-content/uplo...,whatsonthorold.com,English,
4,https://normanobserver.com/analysts-see-1-15-e...,,Analysts See $1 . 15 EPS for NVIDIA ( NVDA ); ...,20180129T134500Z,https://normanobserver.com/wp-content/uploads/...,normanobserver.com,English,
...,...,...,...,...,...,...,...,...
195,http://thesivertimes.com/2018/01/01/the-10-yea...,,The 10,20180101T163000Z,,thesivertimes.com,English,
196,http://dietpillo.com/2018/01/salah-named-afric...,,Salah named African Player of the Year,20180110T164500Z,,dietpillo.com,English,
197,http://www.nasdaq.com/article/amds-q4-earnings...,http://www.nasdaq.com/article/amds-q4-earnings...,AMD Q4 Earnings to Grow on Portfolio Strength ...,20180126T171500Z,http://www.nasdaq.com/images/dreamit.jpg,nasdaq.com,English,United States
198,http://www.greencarcongress.com/mapping/,,Mapping,20180117T001500Z,http://up6.typepad.com/6a00d8341c4fbe53ef00e54...,greencarcongress.com,English,United States


Writing methods to automate the article fetcher

In [6]:
Filter = Union[List[str], str]
FilterGroup = List[List[str]]

In [7]:
languages = ["eng", "English"]
with open('../LOOKUP-LANGUAGES.TXT', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line:  # Skip empty lines
            code, name = line.split('\t')  # Split by tab character
            languages.append(code)
            languages.append(name) 

print(languages)

['eng', 'English', 'afr', 'Afrikaans', 'sqi', 'Albanian', 'ara', 'Arabic', 'hye', 'Armenian', 'axe', 'Azerbaijani', 'ben', 'Bengali', 'bos', 'Bosnian', 'bul', 'Bulgarian', 'cat', 'Catalan', 'zho', 'Chinese', 'hrv', 'Croatian', 'ces', 'Czech', 'dan', 'Danish', 'nld', 'Dutch', 'est', 'Estonian', 'fin', 'Finnish', 'fra', 'French', 'glg', 'Galician', 'kat', 'Georgian', 'deu', 'German', 'ell', 'Greek', 'guj', 'Gujarati', 'heb', 'Hebrew', 'hin', 'Hindi', 'hun', 'Hungarian', 'isl', 'Icelandic', 'ind', 'Indonesian', 'ita', 'Italian', 'jpn', 'Japanese', 'kan', 'Kannada', 'kaz', 'Kazakh', 'kor', 'Korean', 'lav', 'Latvian', 'lit', 'Lithuanian', 'mkd', 'Macedonian', 'msa', 'Malay', 'mal', 'Malayalam', 'mar', 'Marathi', 'mon', 'Mongolian', 'nep', 'Nepali', 'nor', 'Norwegian', 'nno', 'NorwegianNynorsk', 'fas', 'Persian', 'pol', 'Polish', 'por', 'Portuguese', 'pan', 'Punjabi', 'ron', 'Romanian', 'rus', 'Russian', 'srp', 'Serbian', 'sin', 'Sinhalese', 'slk', 'Slovak', 'slv', 'Slovenian', 'som', 'Somal

In [8]:
def query_maker(companies: FilterGroup, keywords: FilterGroup, language=None):
    query_string = ""

    # process companies (assuming all companies are mandatory)
    # if type(companies) == str:
    #     query_string += f'"{companies} "' if " " in companies else f"{companies} "
    # else:
    #     query_string = query_string + " ".join([f'"{company}"' if " " in company else company for company in companies]) + " " # use space seperation for AND statements

    for group in companies:
        if not group: # skip empty groups
            continue
        # join groups with multiple companies with OR
        if len(group) > 1: 
            group_string = "(" + " OR ".join([f'"{company}"' if " " in company else company for company in group]) + ")"
        else:
            company = group[0]
            group_string = f'"{company}" ' if " " in company else f"{company}"

        query_string += group_string + " "

    # process keywords (assuming all keywords are interchangeable)
    # if type(keywords) == str:
    #     query_string += f'"{keywords} "'
    # else:
    #     query_string = query_string + "(" + " OR ".join([f'"{keyword}"' if " " in keyword else keyword for keyword in keywords]) + ") "

    # query_string += " "

    for group in keywords:
        if not group:  # skip empty groups
            continue
        # join groups with multiple keywords with OR 
        if len(group) > 1:
            group_string = "(" + " OR ".join([f'"{keyword}"' if " " in keyword else keyword for keyword in group]) + ")"
        else:
            keyword = group[0]
            group_string = f'"{keyword}" ' if " " in keyword else f"{keyword}"
            
        query_string += group_string + " "

    # process language
    if language:
        if language in languages:
            query_string += f"sourcelang:{language} "
        else:
            print("Language not found in lookup table")
    
    return query_string

In [9]:
companies = [["NVIDIA", "NVIDIA Corporation"]]
keywords = [["environment", "social", "governance", "sustainability", "ESG rating", "ESG"], ["climate change", "diversity", "emissions", "resource use", "corporate social responsibility"]]

In [10]:
query_maker(companies, keywords, "eng")

'(NVIDIA OR "NVIDIA Corporation") (environment OR social OR governance OR sustainability OR "ESG rating" OR ESG) ("climate change" OR diversity OR emissions OR "resource use" OR "corporate social responsibility") sourcelang:eng '

In [21]:
def query(query_string, mode, headers, start_date=None, end_date=None):

    
    start_dt = datetime.strptime(start_date, "%Y-%m-%d")  
    start_dt = start_dt.strftime("%Y%m%d%H%M%S")

    end_dt = datetime.strptime(end_date, "%Y-%m-%d")
    end_dt = end_dt.replace(hour=23, minute=59, second=59)
    end_dt = end_dt.strftime("%Y%m%d%H%M%S")
    # end_date = "20180131235959"
    
    response = requests.get(
            f"https://api.gdeltproject.org/api/v2/doc/doc?query={query_string}&mode={mode}&format=json&maxrecords=250&startdatetime={start_dt}&enddatetime={end_dt}",
            headers=headers
        )
    
    data = pd.DataFrame(response.json(strict=False)['articles'])
    return data
    # return response

In [22]:
query_string = query_maker(companies, keywords, language="English")
df = query(query_string, mode, headers, "2024-01-01", "2024-12-31")

In [25]:
dataframes = []
for i in range(12):
    start_date = f"2024-{i+1}-01"

    end_date = datetime.strptime(f"2024-{i+1}-01", "%Y-%m-%d")
    last_day = calendar.monthrange(end_date.year, end_date.month)[1] # makes sure the last day of the given month is used
    end_date = end_date.replace(day=last_day).strftime("%Y-%m-%d")

    query_string = query_maker(companies, keywords, language="English")
    dataframes.append(query(query_string, mode, headers, start_date, end_date))
    print(f"Fetched data for {start_date} to {end_date}")
    time.sleep(5) # wait for 5 seconds to avoid overloading API calls
df = pd.concat(dataframes)

Fetched data for 2024-1-01 to 2024-01-31
Fetched data for 2024-2-01 to 2024-02-29
Fetched data for 2024-3-01 to 2024-03-31
Fetched data for 2024-4-01 to 2024-04-30
Fetched data for 2024-5-01 to 2024-05-31
Fetched data for 2024-6-01 to 2024-06-30
Fetched data for 2024-7-01 to 2024-07-31
Fetched data for 2024-8-01 to 2024-08-31
Fetched data for 2024-9-01 to 2024-09-30
Fetched data for 2024-10-01 to 2024-10-31
Fetched data for 2024-11-01 to 2024-11-30
Fetched data for 2024-12-01 to 2024-12-31


In [26]:
df.shape

(1216, 8)

In [27]:
df

Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry
0,https://finance.yahoo.com/news/decoding-servic...,,Decoding ServiceNow Inc ( NOW ): A Strategic S...,20240127T061500Z,https://media.zenfs.com/en/us.finance.gurufocu...,finance.yahoo.com,English,United States
1,https://www.jdsupra.com/legalnews/ai-regulatio...,,AI Regulation in India : Current State and Fut...,20240129T201500Z,https://jdsupra-static.s3.amazonaws.com/profil...,jdsupra.com,English,United States
2,https://www.lightreading.com/data-centers/sing...,,Singtel boosts AI data center business with Nv...,20240201T124500Z,https://eu-images.contentstack.com/v3/assets/b...,lightreading.com,English,United States
3,https://www.streetinsider.com/PRNewswire/Annou...,,Announcing ClimateGPT : The First Open Source ...,20240119T164500Z,,streetinsider.com,English,United States
4,https://www.cbsnews.com/news/glassdoor-best-pl...,https://www.cbsnews.com/amp/news/glassdoor-bes...,Glassdoor unveils the best places to work in 2...,20240110T054500Z,https://assets1.cbsnewsstatic.com/hub/i/r/2023...,cbsnews.com,English,United States
...,...,...,...,...,...,...,...,...
156,https://www.osnews.com/story/author/osnews/,,OS News – OSnews,20241209T154500Z,,osnews.com,English,United States
157,https://www.galvnews.com/news_ap/nation/ap-new...,,AP News in Brief at 6 : 04 p . m . EST,20241226T233000Z,https://bloximages.newyork1.vip.townnews.com/g...,galvnews.com,English,United States
158,https://www.yumasun.com/news/national_news/ap-...,,AP News in Brief at 12 : 04 a . m . EST,20241227T063000Z,https://bloximages.newyork1.vip.townnews.com/y...,yumasun.com,English,United States
159,https://www.legalbusinessonline.com/sites/defa...,,ALB ASIA DECEMBER 2024,20241219T161500Z,https://www.legalbusinessonline.com/sites/defa...,legalbusinessonline.com,English,China


Writing methods to scrape the URL contents

In [36]:
def scrape_article_content(url):
    """
    Scrapes article content from a URL.
    
    Args:
        url (str): The URL of the article.
    
    Returns:
        str: The extracted article content as plain text.
    """
    try:
        response = requests.get(url, timeout=1)
        if response.status_code != 200:
            print(f"Error fetching URL {url}: {response.status_code}")
            return ""
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Try to find an <article> tag first
        article = soup.find('article')
        if article:
            text = article.get_text(separator=' ', strip=True)
        else:
            # Fallback: concatenate text from all <p> tags
            paragraphs = soup.find_all('p')
            text = ' '.join(p.get_text(separator=' ', strip=True) for p in paragraphs)
        
        return text
    except Exception as e:
        # In production, you might log the exception here
        print(f"Error processing URL {url}: {e}")
        return ""

def add_article_content_column(df, url_column='url', content_column='content'):
    """
    Adds a new column to the DataFrame with the scraped content for each URL.
    
    Args:
        df (pd.DataFrame): The DataFrame containing URLs.
        url_column (str): The name of the column with URLs.
        content_column (str): The name of the new column to store article content.
    
    Returns:
        pd.DataFrame: The updated DataFrame with the new content column.
    """
    df[content_column] = df[url_column].apply(scrape_article_content)
    return df

# Example usage:
# Load your dataset into a DataFrame
# df = pd.read_csv('your_dataset.csv')
# df = add_article_content_column(df)
# df.to_csv('your_dataset_with_content.csv', index=False)


In [37]:
df_content = add_article_content_column(df)

Error fetching URL https://finance.yahoo.com/news/decoding-servicenow-inc-now-strategic-052156052.html: 429
Error fetching URL https://www.lightreading.com/data-centers/singtel-boosts-ai-data-center-business-with-nvidia-alliance: 403
Error fetching URL https://www.streetinsider.com/PRNewswire/Announcing+ClimateGPT:+The+First+Open+Source+Foundational+AI+Platform+Dedicated+to+Addressing+the+Impact+of+Climate+Change/22649393.html: 403
Error processing URL https://www.tri-cityherald.com/news/business/article284647000.html: HTTPSConnectionPool(host='www.tri-cityherald.com', port=443): Read timed out. (read timeout=1)
Error fetching URL https://venturebeat.com/ai/the-ces-tech-trends-to-watch-in-2024-cta/: 403
Error fetching URL https://kmod.iheart.com/featured/big-mad-morning-show/content/2024-01-11-the-best-places-to-work-in-the-us-in-2024/: 404
Error fetching URL https://techwireasia.com/02/2024/singtel-partners-nvidia-for-sovereign-ai/: 403
Error fetching URL https://www.tmcnet.com/usubmi

In [38]:
df_content.shape

(1216, 10)

In [40]:
df_content['text_length'] = df_content['content'].apply(lambda x: len(x.split()))

In [41]:
df_content

Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry,content,text_length
0,https://finance.yahoo.com/news/decoding-servic...,,Decoding ServiceNow Inc ( NOW ): A Strategic S...,20240127T061500Z,https://media.zenfs.com/en/us.finance.gurufocu...,finance.yahoo.com,English,United States,,0
1,https://www.jdsupra.com/legalnews/ai-regulatio...,,AI Regulation in India : Current State and Fut...,20240129T201500Z,https://jdsupra-static.s3.amazonaws.com/profil...,jdsupra.com,English,United States,Artificial intelligence (AI) presents big opp...,1005
2,https://www.lightreading.com/data-centers/sing...,,Singtel boosts AI data center business with Nv...,20240201T124500Z,https://eu-images.contentstack.com/v3/assets/b...,lightreading.com,English,United States,,0
3,https://www.streetinsider.com/PRNewswire/Annou...,,Announcing ClimateGPT : The First Open Source ...,20240119T164500Z,,streetinsider.com,English,United States,,0
4,https://www.cbsnews.com/news/glassdoor-best-pl...,https://www.cbsnews.com/amp/news/glassdoor-bes...,Glassdoor unveils the best places to work in 2...,20240110T054500Z,https://assets1.cbsnewsstatic.com/hub/i/r/2023...,cbsnews.com,English,United States,MoneyWatch Glassdoor unveils the best places t...,560
...,...,...,...,...,...,...,...,...,...,...
156,https://www.osnews.com/story/author/osnews/,,OS News – OSnews,20241209T154500Z,,osnews.com,English,United States,PC/GEOS source code released under Apache 2.0 ...,184
157,https://www.galvnews.com/news_ap/nation/ap-new...,,AP News in Brief at 6 : 04 p . m . EST,20241226T233000Z,https://bloximages.newyork1.vip.townnews.com/g...,galvnews.com,English,United States,,0
158,https://www.yumasun.com/news/national_news/ap-...,,AP News in Brief at 12 : 04 a . m . EST,20241227T063000Z,https://bloximages.newyork1.vip.townnews.com/y...,yumasun.com,English,United States,,0
159,https://www.legalbusinessonline.com/sites/defa...,,ALB ASIA DECEMBER 2024,20241219T161500Z,https://www.legalbusinessonline.com/sites/defa...,legalbusinessonline.com,English,China,December 2024 ASIA EDITION MCI (P) 004/02/2024...,7947


In [42]:
df_content.loc[df_content['text_length'] > 50]

Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry,content,text_length
1,https://www.jdsupra.com/legalnews/ai-regulatio...,,AI Regulation in India : Current State and Fut...,20240129T201500Z,https://jdsupra-static.s3.amazonaws.com/profil...,jdsupra.com,English,United States,Artificial intelligence (AI) presents big opp...,1005
4,https://www.cbsnews.com/news/glassdoor-best-pl...,https://www.cbsnews.com/amp/news/glassdoor-bes...,Glassdoor unveils the best places to work in 2...,20240110T054500Z,https://assets1.cbsnewsstatic.com/hub/i/r/2023...,cbsnews.com,English,United States,MoneyWatch Glassdoor unveils the best places t...,560
5,https://euobserver.com/agenda/157884,,"New Belgian presidency , Red Sea tensions This...",20240108T074500Z,https://media.euobserver.com/90a8a196bcd72f0e7...,euobserver.com,English,United Kingdom,"Returning from the Christmas holidays, the new...",152
8,https://classiccountry957.iheart.com/content/2...,,The Best Places To Work In The U . S . | Class...,20240111T130000Z,https://i.iheart.com/v3/re/assets.getty/63c983...,classiccountry957.iheart.com,English,United States,"Having a job you like is great, but working in...",266
9,https://kiss983.iheart.com/content/2024-01-11-...,,The Best Places To Work In The U . S . | KISS ...,20240111T130000Z,https://i.iheart.com/v3/re/assets.getty/63c983...,kiss983.iheart.com,English,United States,"Having a job you like is great, but working in...",266
...,...,...,...,...,...,...,...,...,...,...
153,https://www.thehindubusinessline.com/companies...,,Today Business News Live : Adani Group entitie...,20241203T054500Z,https://bl-i.thgim.com/public/incoming/dxyssh/...,thehindubusinessline.com,English,India,-67.30 -25.80 + 82.00 + 119.00 + 174.00 -67.30...,11402
155,https://www.miragenews.com/geraldine-slattery-...,,Geraldine Slattery At Melbourne Mining Club,20241205T040000Z,https://cdn1.miragenews.com/tmp_cache?cdn=imag...,miragenews.com,English,United States,Mirage News Mirage News Mirage News National 0...,1981
156,https://www.osnews.com/story/author/osnews/,,OS News – OSnews,20241209T154500Z,,osnews.com,English,United States,PC/GEOS source code released under Apache 2.0 ...,184
159,https://www.legalbusinessonline.com/sites/defa...,,ALB ASIA DECEMBER 2024,20241219T161500Z,https://www.legalbusinessonline.com/sites/defa...,legalbusinessonline.com,English,China,December 2024 ASIA EDITION MCI (P) 004/02/2024...,7947
