In [53]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
from glob import glob
from datetime import date
from datetime import datetime
import hashlib

In [54]:
def get_hash_from_string(string):
    """Return the SHA-1 hex digest of ``string``.

    The text is encoded with ``str.encode()``'s default codec (UTF-8)
    before hashing. The result is a 40-character lowercase hex string.
    """
    return hashlib.sha1(string.encode()).hexdigest()

In [55]:
# Constants
# Site root that the relative links scraped from the pages are appended to.
base_url = "https://www.finanzen.net"
# Stock master table; later cells read its 'Symbol', 'ISIN' and 'news_link' columns.
df_stocks = pd.read_csv(
    filepath_or_buffer="data/stocks/processed/20220824T074737_xetra_finanzen.csv"
)


In [56]:
# Input data
# Ticker symbols to scrape (matched against the 'Symbol' column of df_stocks).
stock_ticker = ['UNLYF']
# Inclusive date window for the articles to keep.
start_date = date(year=2021, month=1, day=1)
end_date = date(year=2021, month=12, day=31)



In [57]:
def get_soup(url, timeout=30):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without a timeout a single stalled connection would block the whole
        scraping run indefinitely.

    Returns
    -------
    BeautifulSoup
        The response text parsed with the built-in ``'html.parser'``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # The HTTP status is deliberately not checked: callers treat a page
    # without the expected markup as "nothing found".
    return BeautifulSoup(response.text, 'html.parser')


In [58]:
def get_links_for_all_pages(soup):
    """Collect the pagination hrefs found on a parsed news overview page.

    Looks for the ``<ul class="pagination__list">`` element and reads the
    ``href`` of every ``<a class="pagination__text">`` inside it. The last
    anchor is dropped (presumably the "next page" control - verify against
    the live markup).

    Returns an empty list when the page has no pagination list.
    """
    pagination_list = soup.find('ul', {'class': "pagination__list"})
    if not pagination_list:
        return []

    hrefs = []
    for anchor in pagination_list.find_all('a', {'class': 'pagination__text'}):
        hrefs.append(anchor.attrs['href'])
    # Discard the trailing pagination entry.
    return hrefs[:-1]
    

In [59]:
def _repair_mojibake(text):
    """Re-decode ``text`` from Latin-1 bytes to UTF-8, falling back to ``text``.

    The round-trip presumably compensates for UTF-8 pages that were decoded
    as Latin-1 upstream (TODO confirm against the live responses). Text that
    cannot make the round-trip is returned unchanged instead of raising.
    """
    try:
        return text.encode('latin').decode()
    except (UnicodeEncodeError, UnicodeDecodeError):
        return text


def _extract_news_entry(news, ISIN, start_date, end_date):
    """Build the record for one news teaser, or return None if it is skipped.

    A teaser is skipped when it has no ``news__date`` element, when its date
    text does not match ``dd.mm.yy``, or when the date lies outside the
    inclusive ``[start_date, end_date]`` window.
    """
    date_tag = news.find('time', {'class': 'news__date'})
    if date_tag is None:
        return None
    try:
        article_date = datetime.strptime(date_tag.text.strip(), '%d.%m.%y').date()
    except ValueError:
        # Not a dd.mm.yy date; skip this teaser rather than abort the run.
        return None
    if not (start_date <= article_date <= end_date):
        return None

    anchor = news.find('a', {'class': 'news__card'})
    article_link = anchor.attrs.get('href') if anchor is not None else None

    raw_fields = {
        # The id is derived from the relative link, so it is None without one.
        'id': get_hash_from_string(article_link) if article_link is not None else None,
        'ISIN': ISIN,
        'date': date_tag,
        'title': news.find('span', {'class': 'news__title'}),
        'source': news.find('span', {'class': 'news__source'}),
        'kicker': news.find('span', {'class': 'news__kicker'}),
        'link_article': article_link,
    }

    news_entry = {}
    for key, value in raw_fields.items():
        # Parsed tags expose ``.text``; plain strings and None are stored as is.
        if hasattr(value, 'text'):
            news_entry[key] = _repair_mojibake(value.text)
        else:
            news_entry[key] = value

    if news_entry['link_article']:
        news_entry['link_article'] = base_url + news_entry['link_article']
    return news_entry


def get_news_links(df_stocks, start_date, end_date, failed_links=None):
    """Scrape the news teasers of every stock in ``df_stocks`` for a date range.

    For each row, the stock's news overview page is fetched, its pagination
    links are followed, and every ``div.news.news--item-with-media`` teaser
    dated within ``[start_date, end_date]`` (inclusive) is recorded. Each
    visited page URL is printed as progress output.

    Parameters
    ----------
    df_stocks : pandas.DataFrame
        Must provide the columns ``'ISIN'`` and ``'news_link'``.
    start_date, end_date : datetime.date
        Inclusive bounds for the article date.
    failed_links : list, optional
        If given, the ``news_link`` of every stock whose overview page yields
        no pagination links is appended to it. These URLs are printed in any
        case.

    Returns
    -------
    list of dict
        One dict per article with the keys ``id``, ``ISIN``, ``date``,
        ``title``, ``source``, ``kicker`` and ``link_article``. ``date`` is
        the text shown on the page, not a parsed date. Elements missing from
        a teaser are stored as None.

    Notes
    -----
    NOTE(review): a stock whose overview page has no pagination list is
    reported as failed and its page is not scraped. Confirm whether
    single-page news lists exist on the site.
    """
    extracted_news_properties = []
    for ISIN, url_news in df_stocks[['ISIN', 'news_link']].values:
        page_links = get_links_for_all_pages(get_soup(url_news))
        if not page_links:
            print(url_news)
            if failed_links is not None:
                failed_links.append(url_news)
            continue

        for page_link in page_links:
            url = base_url + page_link
            print(url)
            soup = get_soup(url)
            for news in soup.find_all('div', {'class': 'news news--item-with-media'}):
                news_entry = _extract_news_entry(news, ISIN, start_date, end_date)
                if news_entry is not None:
                    extracted_news_properties.append(news_entry)
    return extracted_news_properties


In [60]:
# Keep only the rows whose 'Symbol' is one of the requested tickers, then
# scrape their news for the configured date window.
df_stocks_selected = df_stocks.query("Symbol in @stock_ticker").copy()
extracted_news_properties = get_news_links(
    df_stocks=df_stocks_selected,
    start_date=start_date,
    end_date=end_date,
)

https://www.finanzen.net/news/unilever-news
https://www.finanzen.net/news/unilever-news@intpagenr_2
https://www.finanzen.net/news/unilever-news@intpagenr_3
https://www.finanzen.net/news/unilever-news@intpagenr_4
https://www.finanzen.net/news/unilever-news@intpagenr_5
https://www.finanzen.net/news/unilever-news@intpagenr_6
https://www.finanzen.net/news/unilever-news@intpagenr_7


In [61]:
# Output file name: news_<ticker1-ticker2-...>_<start date>_<end date>.csv
stock_tickers = '-'.join(stock_ticker)
file_name = '_'.join(['news', stock_tickers, str(start_date), str(end_date)]) + '.csv'

In [62]:
# Write the scraped articles to data/stocks/<file_name> as CSV.
file_path = f"data/stocks/{file_name}"
# Create the target directory if needed so to_csv cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
if extracted_news_properties:
    df_stock_news = pd.DataFrame(extracted_news_properties)
else:
    # With no articles the frame would have no columns and the CSV no header,
    # which pd.read_csv cannot load. Use the record keys produced by
    # get_news_links so the file always has a header row.
    df_stock_news = pd.DataFrame(
        columns=['id', 'ISIN', 'date', 'title', 'source', 'kicker', 'link_article']
    )
df_stock_news.to_csv(file_path, index=False)
print(f"Save to {file_path}")


Save to data/stocks/news_UNLYF_2021-01-01_2021-12-31.csv
