In [2]:
import requests
from bs4 import BeautifulSoup

import pandas as pd

In [3]:
def getArticles(base_link, max_pages):
    """Collect article links from the paginated listing pages.

    Requests ``base_link + "1"`` through ``base_link + str(max_pages)`` and,
    on every page answered with HTTP 200, gathers the ``href`` of the first
    anchor following each ``<h3>`` inside the entries list.

    Args:
        base_link: URL prefix to which the page number is appended.
        max_pages: Number of the last listing page to request (inclusive).

    Returns:
        list: The collected ``href`` values, in page order. Headings without
        a following anchor, or whose anchor has no ``href``, are skipped.
    """
    articles = []

    for i in range(1, max_pages + 1):
        # Request page i; the page number has to follow the loop counter,
        # otherwise the first page is fetched over and over.
        response = requests.get(base_link + str(i))
        print(f"Page: {i}; Response: {response.status_code}")

        # Only parse pages that were actually delivered.
        if response.status_code != 200:
            continue

        # Process the response with BeautifulSoup.
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the list holding all the articles and other links.
        content_div = soup.find('ul', class_="l-entries l-entries--bordered")
        if content_div is None:
            print("No article content found.")
            continue

        # Each <h3> is followed by the anchor that links to the article.
        for h in content_div.find_all("h3"):
            a = h.find_next("a")
            href = a.get("href") if a is not None else None
            if href:
                articles.append(href)

    return articles

In [4]:
def readArticle(link):
    """Download one article page and extract its metadata.

    Any element that cannot be found on the page leaves the corresponding
    field as ``None`` instead of raising, so a single page with a different
    layout does not abort a whole scraping run.

    Args:
        link: URL of the article to fetch.

    Returns:
        pandas.DataFrame: A one-row frame with the columns ``Title``,
        ``Link``, ``Author``, ``Date``, ``Category``, ``Views``,
        ``Time to read`` and ``Sections``. ``Sections`` holds the list of
        ``<h2>`` heading texts of the article body without "Read also".
        When the response status is not 200, only ``Link`` is filled.
    """
    row = {
        "Title": None,
        "Link": link,
        "Author": None,
        "Date": None,
        "Category": None,
        "Views": None,
        "Time to read": None,
        "Sections": None
    }

    # Send the request.
    response = requests.get(link)
    print(f"Page: {link}; Response: {response.status_code}")

    # Without a successful response there is nothing to parse.
    if response.status_code != 200:
        return pd.DataFrame([row])

    # Process the response with BeautifulSoup.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Title
    title_tag = soup.find("h1", class_="c-entry__title c-title c-title--h1 font-bold")
    if title_tag is not None:
        row["Title"] = title_tag.get_text(strip=True)

    # Author
    author_tag = soup.find("a", class_="text-current c-bar__link")
    if author_tag is not None:
        row["Author"] = author_tag.get_text(strip=True)

    # Labels of the info bar whose <dd> text is stored unchanged.
    plain_fields = {
        "Date of publication": "Date",
        "Category": "Category",
        "Number of views": "Views",
    }

    # The info bar is a <dl> of screen-reader-only <dt> labels, each followed
    # by the <dd> that carries the value.
    header = soup.find('dl', class_="c-entry__info c-bar")
    labels = header.find_all("dt", class_="sr-only") if header is not None else []
    for dt in labels:
        label = dt.get_text(strip=True)
        dd = dt.find_next_sibling("dd")
        if dd is None:
            continue

        if label in plain_fields:
            row[plain_fields[label]] = dd.get_text(strip=True)
        elif label == "Time to read":
            span = dd.find("span")
            if span is not None:
                icon = span.find("i")
                if icon is not None:
                    icon.extract()   # remove the clock icon
                row["Time to read"] = span.get_text(strip=True)

    # Find the div with the article contents and list its section headings,
    # leaving out every "Read also" cross-link heading.
    article_body = soup.find('div', class_="c-prose c-post__inner")
    if article_body is not None:
        headings = (h2.get_text(strip=True) for h2 in article_body.find_all("h2"))
        row["Sections"] = [text for text in headings if text != "Read also"]

    # Return as a one-row DataFrame.
    return pd.DataFrame([row])


In [7]:
#Amount of pages we wish to look through
MAX_PAGES = 10

base_link = "https://tsn.ua/en/page-"

# Column order of the output table; used directly when nothing was scraped.
COLUMNS = ["Title", "Link", "Author", "Date", "Category", "Views", "Time to read", "Sections"]

# Scrape every listed article and keep the one-row frames in a list, so the
# table is assembled with a single concat instead of being re-copied on
# every iteration.
frames = [readArticle(article) for article in getArticles(base_link, MAX_PAGES)]

# pd.concat raises on an empty list, so fall back to an empty table.
results = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=COLUMNS)

results.to_csv("../../data/output/scrapped.csv")
print(len(results))

Page: 1; Response: 200
Page: 2; Response: 200
Page: 3; Response: 200
Page: 4; Response: 200
Page: 5; Response: 200
Page: 6; Response: 200
Page: 7; Response: 200
Page: 8; Response: 200
Page: 9; Response: 200
Page: 10; Response: 200
Page: https://tsn.ua/en/ukrayina/emergency-rescue-operations-completed-in-zaporizhzhia-the-aftermath-2901818.html; Response: 200
Page: https://tsn.ua/en/ukrayina/zelenskyy-responds-to-russias-massive-overnight-strike-on-ukraine-2901755.html; Response: 200
Page: https://tsn.ua/en/politika/yermak-reveals-details-of-talks-with-witkoff-2901473.html; Response: 200
Page: https://tsn.ua/en/politika/us-talks-zelenskyy-outlines-key-topics-for-discussion-with-witkoff-2901341.html; Response: 200
Page: https://tsn.ua/en/ato/zelenskyy-updates-on-frontline-situation-key-danger-zones-identified-2901320.html; Response: 200
Page: https://tsn.ua/en/politika/russian-tricolor-appears-at-venice-film-festival-ukraine-responds-2901194.html; Response: 200
Page: https://tsn.ua/en/exc

150