# Setup

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime
import calendar
import time
import requests
from typing import List, Union

# Article Scraper

Testing out manual fetching of articles

In [2]:
query_string = "NVIDIA NVIDIA Corporation (environment OR social OR governance OR sustainability OR ESG rating OR climate change OR diversity OR emissions OR resource use OR corporate social responsibility) sourcelang:english"
mode = "artlist"
headers = {"User-Agent": f"GDELT Python API"}

In [3]:
response = requests.get(
            f"https://api.gdeltproject.org/api/v2/doc/doc?query={query_string}&mode={mode}&format=json&maxrecords=200&sourcelang=ara&startdatetime=20180101000000&enddatetime=20180131235959",
            headers=headers
        )

In [4]:
response.text

'{"articles": [ { "url": "https://www.marketwatch.com/story/this-etf-shows-how-you-can-pick-stocks-for-rapid-growth-while-doing-good-2018-01-24", "url_mobile": "https://www.marketwatch.com/amp/story/guid/05F4A382-003C-11E8-8B37-5A39F2EFB447", "title": "This ETF shows how you can pick stocks for rapid growth while doing good", "seendate": "20180124T181500Z", "socialimage": "http://s.marketwatch.com/public/resources/MWimages/MW-GC275_MMM_20_ZG_20180123123931.jpg", "domain": "marketwatch.com", "language": "English", "sourcecountry": "United States" },{ "url": "http://presstelegraph.com/as-nvidia-corp-nvda-market-value-rose-holder-myriad-asset-management-ltd-cut-by-15-13-million-its-stake/", "url_mobile": "", "title": "As Nvidia Corp ( NVDA ) Market Value Rose , Holder Myriad Asset Management LTD Cut by $15 . 13 Million Its Stake", "seendate": "20180121T134500Z", "socialimage": "", "domain": "presstelegraph.com", "language": "English", "sourcecountry": "United States" },{ "url": "http://ww

In [5]:
pd.DataFrame(response.json(strict=False)['articles'])

Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry
0,https://www.marketwatch.com/story/this-etf-sho...,https://www.marketwatch.com/amp/story/guid/05F...,This ETF shows how you can pick stocks for rap...,20180124T181500Z,http://s.marketwatch.com/public/resources/MWim...,marketwatch.com,English,United States
1,http://presstelegraph.com/as-nvidia-corp-nvda-...,,"As Nvidia Corp ( NVDA ) Market Value Rose , Ho...",20180121T134500Z,,presstelegraph.com,English,United States
2,http://www.4-traders.com/NVIDIA-CORPORATION-10...,http://www.4-traders.com/amp/NVIDIA-CORPORATIO...,NVIDIA Announces Upcoming Events for Financial...,20180130T223000Z,,4-traders.com,English,United States
3,https://www.whatsonthorold.com/2018/01/21/1-15...,,$1 . 15 EPS Expected for NVIDIA ( NVDA ); USD ...,20180121T153000Z,https://www.whatsonthorold.com/wp-content/uplo...,whatsonthorold.com,English,
4,https://normanobserver.com/analysts-see-1-15-e...,,Analysts See $1 . 15 EPS for NVIDIA ( NVDA ); ...,20180129T134500Z,https://normanobserver.com/wp-content/uploads/...,normanobserver.com,English,
...,...,...,...,...,...,...,...,...
195,http://thesivertimes.com/2018/01/01/the-10-yea...,,The 10,20180101T163000Z,,thesivertimes.com,English,
196,http://dietpillo.com/2018/01/salah-named-afric...,,Salah named African Player of the Year,20180110T164500Z,,dietpillo.com,English,
197,http://www.nasdaq.com/article/amds-q4-earnings...,http://www.nasdaq.com/article/amds-q4-earnings...,AMD Q4 Earnings to Grow on Portfolio Strength ...,20180126T171500Z,http://www.nasdaq.com/images/dreamit.jpg,nasdaq.com,English,United States
198,http://www.greencarcongress.com/mapping/,,Mapping,20180117T001500Z,http://up6.typepad.com/6a00d8341c4fbe53ef00e54...,greencarcongress.com,English,United States


Writing methods to automate the article fetcher

In [23]:
Filter = Union[List[str], str]
FilterGroup = List[List[str]]

In [7]:
foo = "(" 
foo += "company"
foo

'(company'

In [8]:
languages = ["eng", "English"]
with open('../LOOKUP-LANGUAGES.TXT', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line:  # Skip empty lines
            code, name = line.split('\t')  # Split by tab character
            languages.append(code)
            languages.append(name) 

print(languages)

['eng', 'English', 'afr', 'Afrikaans', 'sqi', 'Albanian', 'ara', 'Arabic', 'hye', 'Armenian', 'axe', 'Azerbaijani', 'ben', 'Bengali', 'bos', 'Bosnian', 'bul', 'Bulgarian', 'cat', 'Catalan', 'zho', 'Chinese', 'hrv', 'Croatian', 'ces', 'Czech', 'dan', 'Danish', 'nld', 'Dutch', 'est', 'Estonian', 'fin', 'Finnish', 'fra', 'French', 'glg', 'Galician', 'kat', 'Georgian', 'deu', 'German', 'ell', 'Greek', 'guj', 'Gujarati', 'heb', 'Hebrew', 'hin', 'Hindi', 'hun', 'Hungarian', 'isl', 'Icelandic', 'ind', 'Indonesian', 'ita', 'Italian', 'jpn', 'Japanese', 'kan', 'Kannada', 'kaz', 'Kazakh', 'kor', 'Korean', 'lav', 'Latvian', 'lit', 'Lithuanian', 'mkd', 'Macedonian', 'msa', 'Malay', 'mal', 'Malayalam', 'mar', 'Marathi', 'mon', 'Mongolian', 'nep', 'Nepali', 'nor', 'Norwegian', 'nno', 'NorwegianNynorsk', 'fas', 'Persian', 'pol', 'Polish', 'por', 'Portuguese', 'pan', 'Punjabi', 'ron', 'Romanian', 'rus', 'Russian', 'srp', 'Serbian', 'sin', 'Sinhalese', 'slk', 'Slovak', 'slv', 'Slovenian', 'som', 'Somal

In [24]:
def query_maker(companies: FilterGroup, keywords: FilterGroup, language=None):
    query_string = ""

    # process companies (assuming all companies are mandatory)
    # if type(companies) == str:
    #     query_string += f'"{companies} "' if " " in companies else f"{companies} "
    # else:
    #     query_string = query_string + " ".join([f'"{company}"' if " " in company else company for company in companies]) + " " # use space seperation for AND statements

    for group in companies:
        if not group: # skip empty groups
            continue
        # join groups with multiple companies with OR
        if len(group) > 1: 
            group_string = "(" + " OR ".join([f'"{company}"' if " " in company else company for company in group]) + ") "
        else:
            company = group[0]
            group_string = f'"{company}" ' if " " in company else f"{company} "

        query_string += group_string + " "

    # process keywords (assuming all keywords are interchangeable)
    # if type(keywords) == str:
    #     query_string += f'"{keywords} "'
    # else:
    #     query_string = query_string + "(" + " OR ".join([f'"{keyword}"' if " " in keyword else keyword for keyword in keywords]) + ") "

    # query_string += " "

    for group in keywords:
        if not group:  # skip empty groups
            continue
        # join groups with multiple keywords with OR 
        if len(group) > 1:
            group_string = "(" + " OR ".join([f'"{keyword}"' if " " in keyword else keyword for keyword in group]) + ") "
        else:
            keyword = group[0]
            group_string = f'"{keyword}" ' if " " in keyword else f"{keyword} "
            
        query_string += group_string + " "

    # process language
    if language:
        if language in languages:
            query_string += f"sourcelang:{language} "
        else:
            print("Language not found in lookup table")
    
    return query_string

In [None]:
companies = [["NVIDIA", "NVIDIA Corporation"]]
keywords = [["environment", "social", "governance"], ["sustainability"], ["ESG rating", "ESG"],[ "climate change", "diversity", "emissions", "resource use", "corporate social responsibility"]]

In [14]:
query_maker(companies, keywords, "eng")

'NVIDIA "NVIDIA Corporation" (environment OR social OR governance OR sustainability OR "ESG rating" OR ESG OR "climate change" OR diversity OR emissions OR "resource use" OR "corporate social responsibility") sourcelang:eng '

In [19]:
def query(query_string, mode, headers):
    
    response = requests.get(
            f"https://api.gdeltproject.org/api/v2/doc/doc?query={query_string}&mode={mode}&format=json&maxrecords=250&startdatetime=20180101000000&enddatetime=20180131235959",
            headers=headers
        )
    
    data = pd.DataFrame(response.json(strict=False)['articles'])
    return data
    # return response

In [20]:
query_string = query_maker(companies, keywords, language="English")
df = query(query_string, mode, headers)

In [21]:
df.shape

(100, 8)

In [22]:
df

Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry
0,http://www.benchmarkmonitor.com/digital-curren...,,Digital Currency Miners Generate Nvidia and AM...,20180123T191500Z,,benchmarkmonitor.com,English,United States
1,http://www.4-traders.com/NVIDIA-CORPORATION-10...,http://www.4-traders.com/amp/NVIDIA-CORPORATIO...,NVIDIA Announces World First Functionally Safe...,20180110T143000Z,,4-traders.com,English,United States
2,http://wisdomsave.com/2018/01/09/nvda-shares-r...,,"( NVDA ) Shares Rose , Holder Friess Associat...",20180109T173000Z,,wisdomsave.com,English,
3,http://www.nasdaq.com/article/is-silicon-motio...,http://www.nasdaq.com/article/is-silicon-motio...,Is Silicon Motion ( NOW ) Place in Your Portfo...,20180103T151500Z,http://www.nasdaq.com/images/dreamit.jpg,nasdaq.com,English,United States
4,https://investorplace.com/2018/01/three-reason...,https://investorplace.com/2018/01/three-reason...,3 Reasons Nvidia Corporation Stock Will Contin...,20180109T124500Z,https://investorplace.com/wp-content/uploads/2...,investorplace.com,English,United States
...,...,...,...,...,...,...,...,...
95,http://currenthollywood.com/2018/01/orange-ora...,,Orange ( ORA ) PT Set at €19 . 20 by Goldman S...,20180119T230000Z,,currenthollywood.com,English,
96,http://dietpillo.com/2018/01/turkey-denies-u-s...,,Turkey denies U . S . account of conversation ...,20180126T181500Z,,dietpillo.com,English,
97,https://investorplace.com/2018/01/best-etfs-bu...,https://investorplace.com/2018/01/best-etfs-bu...,10 Best ETFs to Buy for a Stellar 2018,20180102T171500Z,https://investorplace.com/wp-content/uploads/2...,investorplace.com,English,United States
98,http://www.msn.com/en-us/money/mutualfunds/10-...,,10 Best ETFs to Buy for a Stellar 2018,20180107T034500Z,,msn.com,English,United States


In [53]:
query_string

'(NVIDIA AND NVIDIA Corporation AND environment OR social OR governance OR sustainability OR ESG rating OR ESG OR climate change OR diversity OR emissions OR resource use OR corporate social responsibility) '

In [54]:
"(NVIDIA OR NVIDIA Corporation AND environment OR social OR governance OR sustainability OR ESG rating OR climate change OR diversity OR emissions OR resource use OR corporate social responsibility) "

'(NVIDIA OR NVIDIA Corporation AND environment OR social OR governance OR sustainability OR ESG rating OR climate change OR diversity OR emissions OR resource use OR corporate social responsibility) '

Writing methods to scrape the URL contents

In [131]:
def scrape_article_content(url):
    """
    Scrapes article content from a URL.
    
    Args:
        url (str): The URL of the article.
    
    Returns:
        str: The extracted article content as plain text.
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return ""
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Try to find an <article> tag first
        article = soup.find('article')
        if article:
            text = article.get_text(separator=' ', strip=True)
        else:
            # Fallback: concatenate text from all <p> tags
            paragraphs = soup.find_all('p')
            text = ' '.join(p.get_text(separator=' ', strip=True) for p in paragraphs)
        
        return text
    except Exception as e:
        # In production, you might log the exception here
        return ""

def add_article_content_column(df, url_column='url', content_column='content'):
    """
    Adds a new column to the DataFrame with the scraped content for each URL.
    
    Args:
        df (pd.DataFrame): The DataFrame containing URLs.
        url_column (str): The name of the column with URLs.
        content_column (str): The name of the new column to store article content.
    
    Returns:
        pd.DataFrame: The updated DataFrame with the new content column.
    """
    df[content_column] = df[url_column].apply(scrape_article_content)
    return df

# Example usage:
# Load your dataset into a DataFrame
# df = pd.read_csv('your_dataset.csv')
# df = add_article_content_column(df)
# df.to_csv('your_dataset_with_content.csv', index=False)


In [133]:
df_content = add_article_content_column(df)

In [135]:
df_content

Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry,content
0,https://www.investopedia.com/news/amplifying-e...,,Amplifying The ESG Definition,20180201T150000Z,https://i.investopedia.com/image/jpeg/15138787...,investopedia.com,English,United States,
1,https://www.cnbc.com/advertorial/2017/12/18/in...,,Investing with impact,20180130T171500Z,https://fm.cnbc.com/applications/cnbc.com/reso...,cnbc.com,English,United States,
2,https://www.etftrends.com/how-etf-investors-ca...,https://www.etftrends.com/how-etf-investors-ca...,How ETF Investors Can Tap Into the Expanding E...,20180117T003000Z,https://www.etftrends.com/wp-content/uploads/2...,etftrends.com,English,United States,
3,https://www.etftrends.com/smart-beta-channel/e...,https://www.etftrends.com/smart-beta-channel/e...,ESG And Bonds : A Compelling Combination,20180103T211500Z,https://www.etftrends.com/wp-content/uploads/2...,etftrends.com,English,United States,
4,http://www.4-traders.com/news/MISC-Berhad-Mala...,http://www.4-traders.com/amp/news/MISC-Berhad-...,MISC Berhad Malaysia International Shipping,20180123T044500Z,,4-traders.com,English,United States,
...,...,...,...,...,...,...,...,...,...
195,http://www.4-traders.com/news/B-redygtig-finan...,http://www.4-traders.com/amp/news/B-redygtig-f...,Bæredygtig finansiering : Ekspertgruppe på høj...,20180131T124500Z,,4-traders.com,English,United States,
196,https://www.stuff.co.nz/business/opinion-analy...,,Internet retailers like Amazon dont make the c...,20180108T030000Z,https://resources.stuff.co.nz/content/dam/imag...,stuff.co.nz,English,New Zealand,
197,https://www.stuff.co.nz/business/opinion-analy...,,Internet retailers like Amazon dont make the c...,20180108T010000Z,https://resources.stuff.co.nz/content/dam/imag...,stuff.co.nz,English,New Zealand,
198,http://justmeans.com/newsletter/2017s-top-10-s...,,2017 Top 10 Sustainable Business Stories - HBR...,20180109T000000Z,,justmeans.com,English,United States,
