# Setup

In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime
import calendar
import time
import requests
from typing import List, Union

# Article Scraper

Testing out manual fetching of articles

In [100]:
query_string = "(NVIDIA OR NVIDIA Corporation AND environment OR social OR governance OR sustainability OR ESG rating OR climate change OR diversity OR emissions OR resource use OR corporate social responsibility) sourcelang:english"
mode = "artlist"
headers = {"User-Agent": f"GDELT Python API"}

In [101]:
response = requests.get(
            f"https://api.gdeltproject.org/api/v2/doc/doc?query={query_string}&mode={mode}&format=json&maxrecords=200&sourcelang=ara&startdatetime=20180101000000&enddatetime=20180131235959",
            headers=headers
        )

In [102]:
pd.DataFrame(response.json()['articles'])

Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry
0,https://www.investopedia.com/news/amplifying-e...,,Amplifying The ESG Definition,20180201T150000Z,https://i.investopedia.com/image/jpeg/15138787...,investopedia.com,English,United States
1,https://www.cnbc.com/advertorial/2017/12/18/in...,,Investing with impact,20180130T171500Z,https://fm.cnbc.com/applications/cnbc.com/reso...,cnbc.com,English,United States
2,http://www.4-traders.com/news/MISC-Berhad-Mala...,http://www.4-traders.com/amp/news/MISC-Berhad-...,MISC Berhad Malaysia International Shipping,20180123T044500Z,,4-traders.com,English,United States
3,http://www.eco-business.com/press-releases/cdl...,,CDL receives global recognition for gender div...,20180124T043000Z,http://www.eco-business.com/media/cache/31/8f/...,eco-business.com,English,China
4,https://www.hardocp.com/news/2015/10/13/jeff_b...,,[ H ] ardOCP : Jeff Bezos Plummets Down Corpo...,20180201T230000Z,,hardocp.com,English,United States
...,...,...,...,...,...,...,...,...
195,https://business.inquirer.net/244786/urc-adopt...,https://business.inquirer.net/244786/urc-adopt...,URC adopts sustainability agenda,20180125T130000Z,https://business.inquirer.net/files/2011/09/jo...,business.inquirer.net,English,Philippines
196,http://www.ttrweekly.com/site/2018/01/go-vacat...,,Go Vacation gains Travelife status,20180111T094500Z,,ttrweekly.com,English,Thailand
197,https://finance.yahoo.com/news/nvidia-corporat...,https://www.yahoo.com/amphtml/finance/news/nvi...,Nvidia Corporation ( NVDA ) Is Taking the Auto...,20180110T141500Z,,finance.yahoo.com,English,United States
198,http://www.ata.org.au/who/board/,,The Alternative Technology Association » ATA B...,20180118T030000Z,,ata.org.au,English,


Writing methods to automate the article fetcher

In [18]:
Filter = Union[List[str], str]

In [19]:
foo = "(" 
foo += "company"
foo

'(company'

In [108]:
languages = ["eng", "English"]
with open('../LOOKUP-LANGUAGES.TXT', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line:  # Skip empty lines
            code, name = line.split('\t')  # Split by tab character
            languages.append(code)
            languages.append(name) 

print(languages)

['eng', 'English', 'afr', 'Afrikaans', 'sqi', 'Albanian', 'ara', 'Arabic', 'hye', 'Armenian', 'axe', 'Azerbaijani', 'ben', 'Bengali', 'bos', 'Bosnian', 'bul', 'Bulgarian', 'cat', 'Catalan', 'zho', 'Chinese', 'hrv', 'Croatian', 'ces', 'Czech', 'dan', 'Danish', 'nld', 'Dutch', 'est', 'Estonian', 'fin', 'Finnish', 'fra', 'French', 'glg', 'Galician', 'kat', 'Georgian', 'deu', 'German', 'ell', 'Greek', 'guj', 'Gujarati', 'heb', 'Hebrew', 'hin', 'Hindi', 'hun', 'Hungarian', 'isl', 'Icelandic', 'ind', 'Indonesian', 'ita', 'Italian', 'jpn', 'Japanese', 'kan', 'Kannada', 'kaz', 'Kazakh', 'kor', 'Korean', 'lav', 'Latvian', 'lit', 'Lithuanian', 'mkd', 'Macedonian', 'msa', 'Malay', 'mal', 'Malayalam', 'mar', 'Marathi', 'mon', 'Mongolian', 'nep', 'Nepali', 'nor', 'Norwegian', 'nno', 'NorwegianNynorsk', 'fas', 'Persian', 'pol', 'Polish', 'por', 'Portuguese', 'pan', 'Punjabi', 'ron', 'Romanian', 'rus', 'Russian', 'srp', 'Serbian', 'sin', 'Sinhalese', 'slk', 'Slovak', 'slv', 'Slovenian', 'som', 'Somal

In [None]:
def query_maker(companies: Filter, keywords: Filter, language=None):
    query_string = "("

    # process companies
    if type(companies) == str:
        query_string += companies
    else:
        query_string = query_string + " OR ".join([company for company in companies])

    # process keywords
    if type(keywords) == str:
        query_string += f" AND {keywords}"
    else:
        query_string = query_string + " AND " + " OR ".join([keyword for keyword in keywords])

    query_string += ") "

    # process language
    if language:
        if language in languages:
            query_string += f"sourcelang:{language} "
        else:
            print("Language not found in lookup table")
    
    return query_string

In [115]:
companies = ["NVIDIA", "NVIDIA Corporation"]
keywords = ["environment", "social", "governance", "sustainability", "ESG rating", "ESG", "climate change", "diversity", "emissions", "resource use", "corporate social responsibility"]

In [116]:
query_maker(companies, keywords, "eng")

'(NVIDIA OR NVIDIA Corporation AND environment OR social OR governance OR sustainability OR ESG rating OR ESG OR climate change OR diversity OR emissions OR resource use OR corporate social responsibility) sourcelang:eng '

In [124]:
def query(query_string, mode, headers):
    
    response = requests.get(
            f"https://api.gdeltproject.org/api/v2/doc/doc?query={query_string}&mode={mode}&format=json&maxrecords=200&startdatetime=20180101000000&enddatetime=20180131235959",
            headers=headers
        )
    
    data = pd.DataFrame(response.json(strict=False)['articles'])
    return data
    # return response

In [128]:
query_string = query_maker(companies, keywords, language="English")
df = query(query_string, mode, headers)

In [129]:
df.shape

(200, 8)

In [130]:
df

Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry
0,https://www.investopedia.com/news/amplifying-e...,,Amplifying The ESG Definition,20180201T150000Z,https://i.investopedia.com/image/jpeg/15138787...,investopedia.com,English,United States
1,https://www.cnbc.com/advertorial/2017/12/18/in...,,Investing with impact,20180130T171500Z,https://fm.cnbc.com/applications/cnbc.com/reso...,cnbc.com,English,United States
2,https://www.etftrends.com/how-etf-investors-ca...,https://www.etftrends.com/how-etf-investors-ca...,How ETF Investors Can Tap Into the Expanding E...,20180117T003000Z,https://www.etftrends.com/wp-content/uploads/2...,etftrends.com,English,United States
3,https://www.etftrends.com/smart-beta-channel/e...,https://www.etftrends.com/smart-beta-channel/e...,ESG And Bonds : A Compelling Combination,20180103T211500Z,https://www.etftrends.com/wp-content/uploads/2...,etftrends.com,English,United States
4,http://www.4-traders.com/news/MISC-Berhad-Mala...,http://www.4-traders.com/amp/news/MISC-Berhad-...,MISC Berhad Malaysia International Shipping,20180123T044500Z,,4-traders.com,English,United States
...,...,...,...,...,...,...,...,...
195,http://www.4-traders.com/news/B-redygtig-finan...,http://www.4-traders.com/amp/news/B-redygtig-f...,Bæredygtig finansiering : Ekspertgruppe på høj...,20180131T124500Z,,4-traders.com,English,United States
196,https://www.stuff.co.nz/business/opinion-analy...,,Internet retailers like Amazon dont make the c...,20180108T030000Z,https://resources.stuff.co.nz/content/dam/imag...,stuff.co.nz,English,New Zealand
197,https://www.stuff.co.nz/business/opinion-analy...,,Internet retailers like Amazon dont make the c...,20180108T010000Z,https://resources.stuff.co.nz/content/dam/imag...,stuff.co.nz,English,New Zealand
198,http://justmeans.com/newsletter/2017s-top-10-s...,,2017 Top 10 Sustainable Business Stories - HBR...,20180109T000000Z,,justmeans.com,English,United States


In [53]:
query_string

'(NVIDIA AND NVIDIA Corporation AND environment OR social OR governance OR sustainability OR ESG rating OR ESG OR climate change OR diversity OR emissions OR resource use OR corporate social responsibility) '

In [54]:
"(NVIDIA OR NVIDIA Corporation AND environment OR social OR governance OR sustainability OR ESG rating OR climate change OR diversity OR emissions OR resource use OR corporate social responsibility) "

'(NVIDIA OR NVIDIA Corporation AND environment OR social OR governance OR sustainability OR ESG rating OR climate change OR diversity OR emissions OR resource use OR corporate social responsibility) '