In [1]:
import bs4
import requests
import pandas as pd
from datetime import datetime
import os

In [2]:
class NovinkyScraper:
    """Download a news front page, collect article titles/links, and export them to CSV.

    Intended call order: ``fetch_page()`` -> ``extract_articles()`` -> ``export_to_csv()``.
    """

    def __init__(self, url, save_folder, timeout=10):
        """Store the scraping configuration; no network or disk access happens here.

        Args:
            url: Address of the page to download.
            save_folder: Directory the CSV export is written into.
            timeout: Seconds ``fetch_page`` waits for the server before giving up.
        """
        self.url = url
        self.res = None   # HTTP response, set by fetch_page()
        self.soap = None  # parsed HTML document, set by fetch_page()
        # Unique (title, link, download_date) tuples; a set drops duplicate teasers.
        self.articles = set()
        self.save_folder = save_folder
        self.timeout = timeout
        # Minute-resolution timestamp, stamped on every row and used in the file name.
        self.download_date = datetime.now().strftime('%Y-%m-%d-%H-%M')

    def fetch_page(self):
        """Download ``self.url`` and parse the response body into ``self.soap``.

        Raises:
            requests.RequestException: On connection failure, timeout, or an
                HTTP error status (4xx/5xx).
        """
        # Without a timeout requests may block forever on an unresponsive server.
        self.res = requests.get(self.url, timeout=self.timeout)
        # Fail loudly instead of silently parsing an error page.
        self.res.raise_for_status()
        self.soap = bs4.BeautifulSoup(self.res.text, 'lxml')

    def extract_articles(self, selector='.c_aM.c_aP'):
        """Add every titled, linked article found in the parsed page to ``self.articles``.

        Args:
            selector: CSS selector for the elements that wrap one article each.
                The default is the selector this scraper has always used
                (presumably site-generated class names - verify if the page
                layout changes).

        Raises:
            RuntimeError: If no page has been parsed yet.
        """
        if self.soap is None:
            raise RuntimeError(
                'No page has been loaded; call fetch_page() before extract_articles().'
            )

        for article in self.soap.select(selector):
            title = article.get_text(strip=True)
            a_tag = article.find('a')
            # Skip elements without visible text or without an anchor.
            if not title or a_tag is None:
                continue
            link = a_tag.get('href')
            if link is None:
                continue
            self.articles.add((title, link, self.download_date))

    def save_to_df(self):
        """Return the collected articles as a DataFrame.

        Returns:
            pandas.DataFrame with columns ``Title``, ``Link`` and
            ``Download_date``; empty (but with those columns) when nothing
            has been collected.
        """
        # Set iteration order is not stable between runs; sorting makes the
        # exported row order reproducible.
        rows = sorted(self.articles)
        return pd.DataFrame(rows, columns=['Title', 'Link', 'Download_date'])

    def export_to_csv(self):
        """Write the collected articles to ``novinky_cz_<download_date>.csv``.

        The target folder is created when it does not exist yet.

        Returns:
            The path of the written CSV file.
        """
        file_name = f'novinky_cz_{self.download_date}.csv'
        file_path = os.path.join(self.save_folder, file_name)

        # An empty save_folder means "current directory"; makedirs('') would raise.
        if self.save_folder:
            os.makedirs(self.save_folder, exist_ok=True)

        # utf-8-sig writes a BOM ahead of the UTF-8 data.
        self.save_to_df().to_csv(file_path, index=False, encoding='utf-8-sig')
        print('Data has been successfully exported')
        return file_path




In [3]:
if __name__ == '__main__':
    # Front page to scrape.
    url = 'https://www.novinky.cz/'
    # Export directory. Set the NOVINKY_EXPORT_DIR environment variable to run this
    # on another machine; the fallback is the original author-specific location.
    save_folder = os.environ.get(
        'NOVINKY_EXPORT_DIR',
        r'C:\Users\ondra\OneDrive\Data analyst\GITHUB\Python-Bootcamp\data_export',
    )

    # Download the page, collect the articles, then write them to CSV.
    scraper = NovinkyScraper(url, save_folder)
    scraper.fetch_page()
    scraper.extract_articles()
    scraper.export_to_csv()

Data has been successfully exported
