# Scraping articles from Irish Times website

In [5]:
import requests
from bs4 import BeautifulSoup
import datetime
import pandas as pd


In [42]:
def parseArticle( url ):
    
    page = requests.get(url)
    htmlResponse = page.text
    
    soup = BeautifulSoup(htmlResponse, 'html.parser')
    
    # Ensure article is not 'subscriber only'
    subOnlyElem = soup.find("div", {"class": "intercept-modal"})
    if(subOnlyElem != None):
        return
    
    # Get article title
    headerSectionElem = soup.find("hgroup")
    titleElem = headerSectionElem.find("h1")
    titleText = titleElem.text
    print(titleText)
    
    # Get article date
    timeElem = soup.find("time")
    timeText = timeElem.text
    timeText = timeText[:timeText.rindex(',')]
    dateText = datetime.datetime.strptime(timeText, '%a, %b %d, %Y').strftime('%Y-%m-%d')
    
    # Get article text
    articleElem = soup.find("div", {"class": "article_bodycopy"})
    print(articleElem)
    paragraphElems = articleElem.find_all("p", {"class": "selectionShareable"})
    
    paragraphText = ""
    
    for paragraphElem in paragraphElems:
        paragraphText += paragraphElem.text 
        
    data = [[titleText, dateText, paragraphText]]
    df = pd.DataFrame(data, columns=['title', 'date', 'text'])
    df.to_csv('trial2.csv', mode='a', header=False, index=False)
    

In [34]:
def getArticleLinks( url ):
    articleUrls = []
    
    page = requests.get(url)
    htmlResponse = page.text
    
    soup = BeautifulSoup(htmlResponse, 'html.parser')
    searchResultDivs = soup.find_all("div", {"class": "search_items_title"})
    
    for searchResultDiv in searchResultDivs:
            spanElem = searchResultDiv.find("span", {"class":"h2"})
            articleUrls.append('https://www.irishtimes.com' + spanElem.contents[0]['href'])
    
    return articleUrls

In [43]:
baseUrl = "https://www.irishtimes.com/search/search-7.2285082?q=irish+economy&toDate=09-06-2020&pageId=2.709&page="

for i in range(529,634):
    print(str(i))
    articleLinks = getArticleLinks(baseUrl + str(i))
    for link in articleLinks:
        parseArticle(link)
        

529
['https://www.irishtimes.com/business/economy/cantillon-1.1319332', 'https://www.irishtimes.com/business/economy/europe/europe-can-be-a-leader-of-the-new-industrial-revolution-1.1319323', 'https://www.irishtimes.com/business/economy/europe/we-should-explore-any-avenue-that-will-help-ireland-stand-on-its-own-feet-1.1319349', 'https://www.irishtimes.com/business/economy/world/irish-service-sector-grows-at-slowest-rate-in-six-months-1.1318166', 'https://www.irishtimes.com/business/economy/swiss-pay-cap-puts-some-execs-in-the-soup-1.1317400', 'https://www.irishtimes.com/business/economy/ireland/irish-consumer-sentiment-plunges-1.1318319', 'https://www.irishtimes.com/business/economy/ireland/services-sector-slows-in-february-1.1318352', 'https://www.irishtimes.com/business/economy/ireland/troika-to-devise-extension-options-1.1318281', 'https://www.irishtimes.com/business/economy/ireland/businesses-want-postcodes-introduced-1.1317055', 'https://www.irishtimes.com/business/economy/ireland

KeyboardInterrupt: 

### CSV Structure
id | url | date | text

In [11]:
parseArticle("https://www.irishtimes.com/business/economy/christine-lagarde-calls-for-ambitious-actions-as-ecb-disappoints-1.4201089")

<div class="article_bodycopy">
<p class="no_name"><a class="search" href="/topics/topics-7.1213540?article=true&amp;tag_organisation=European+Central+Bank">European Central Bank</a> (ECB) president <a class="search" href="/topics/topics-7.1213540?article=true&amp;tag_person=Christine+Lagarde">Christine Lagarde</a> called on governments to launch an “ambitious and coordinated” financial response in light of the “major shock” to economic growth caused by Covid-19, as her institution disappointed financial markets by leaving its key interest rates unchanged.</p>
<p class="no_name">“Governments and all other policy institutions are called upon to take timely and targeted actions to address the public health challenge of containing the spread of the coronavirus and mitigate its economic risk,” Ms Lagarde said at a press conference on Thursday in Frankfurt.</p>
<p class="no_name">“In particular, an ambitious and coordinated fiscal policy response is required to support businesses and workers

In [23]:
getArticleLinks("https://www.irishtimes.com/search/search-7.2285082?q=irish+economy&toDate=09-06-2020&pageId=2.709&page=529")

['https://www.irishtimes.com/business/economy/cantillon-1.1319332',
 'https://www.irishtimes.com/business/economy/europe/europe-can-be-a-leader-of-the-new-industrial-revolution-1.1319323',
 'https://www.irishtimes.com/business/economy/europe/we-should-explore-any-avenue-that-will-help-ireland-stand-on-its-own-feet-1.1319349',
 'https://www.irishtimes.com/business/economy/world/irish-service-sector-grows-at-slowest-rate-in-six-months-1.1318166',
 'https://www.irishtimes.com/business/economy/swiss-pay-cap-puts-some-execs-in-the-soup-1.1317400',
 'https://www.irishtimes.com/business/economy/ireland/irish-consumer-sentiment-plunges-1.1318319',
 'https://www.irishtimes.com/business/economy/ireland/services-sector-slows-in-february-1.1318352',
 'https://www.irishtimes.com/business/economy/ireland/troika-to-devise-extension-options-1.1318281',
 'https://www.irishtimes.com/business/economy/ireland/businesses-want-postcodes-introduced-1.1317055',
 'https://www.irishtimes.com/business/economy/ir