In [1]:
import os
from datetime import date, datetime
from glob import glob

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Site root; prepended to the relative hrefs scraped from the news pages.
base_url = "https://www.finanzen.net"


In [4]:
# Load the pre-processed stock table (one row per stock, including its news page URL).
df_stocks = pd.read_csv("data/stocks/processed/20220824T074737_xetra_finanzen.csv")

In [5]:
# Display the loaded stock table for a quick visual check.
df_stocks

Unnamed: 0,ISIN,STOCK,CURRENCY,stock_name,WKN,Symbol,news_link
0,FR0000031122,AIR FRANCE-KLM INH. EO 1,EUR,Air France-KLM Aktie,855111,AFRAF,https://www.finanzen.net/news/air_france-klm-news
1,GB00B10RZP78,"UNILEVER PLC LS-,031111",EUR,Unilever Aktie,A0JNE2,UNLYF,https://www.finanzen.net/news/unilever-news
2,NL00150001Q9,"STELLANTIS NV EO -,01",EUR,Stellantis Aktie,A2QL01,STLA,https://www.finanzen.net/news/stellantis-news
3,AT0000969985,AT+S AUSTR.T.+SYSTEMT.,EUR,AT S (AT&S) Aktie,922230,ASAAF,https://www.finanzen.net/news/at_s-news
4,NL0013332471,"TOMTOM NV NAM. EO-,20",EUR,TomTom Aktie,A2PK2B,TMOAF,https://www.finanzen.net/news/tomtom-news
...,...,...,...,...,...,...,...
95,DE0005654933,EINHELL GERMANY VZO O.N.,EUR,Einhell Germany Aktie,565493,,https://www.finanzen.net/news/einhell_germany-...
96,FR0000131757,"ERAMET SA INH. EO 3,05",EUR,Eramet Aktie,892800,ERMAF,https://www.finanzen.net/news/eramet-news
97,US6541101050,NIKOLA CORP.,EUR,Nikola Aktie,A2P4A9,NKLA,https://www.finanzen.net/news/nikola-news
98,US4062161017,"HALLIBURTON CO. DL 2,50",EUR,Halliburton Aktie,853986,HAL,https://www.finanzen.net/news/halliburton-news


In [6]:
# Restrict the stock table to the tickers of interest (currently only Unilever).
stock_ticker = ['UNLYF']
# Boolean mask on the Symbol column; .copy() detaches the selection from df_stocks.
df_stocks_selected = df_stocks.loc[df_stocks['Symbol'].isin(stock_ticker)].copy()
print(df_stocks_selected)

           ISIN                      STOCK CURRENCY      stock_name     WKN  \
1  GB00B10RZP78  UNILEVER PLC   LS-,031111      EUR  Unilever Aktie  A0JNE2   

  Symbol                                    news_link  
1  UNLYF  https://www.finanzen.net/news/unilever-news  


In [7]:
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        ``requests.get``). Without a timeout a stalled connection would block
        the whole scrape indefinitely.

    Returns
    -------
    BeautifulSoup
        The parsed document, built from ``response.text`` with the
        ``html.parser`` backend.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    # The HTTP status is deliberately not checked: callers inspect the parsed
    # page and handle pages without the expected elements themselves.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')


In [8]:
def get_links_for_all_pages(soup):
    """Collect the pagination hrefs found on a parsed news page.

    Looks up the ``<ul class="pagination__list">`` element in ``soup`` and
    reads the ``href`` attribute of every ``<a class="pagination__text">``
    inside it.

    Returns
    -------
    list
        The hrefs in document order with the last one removed, or an empty
        list when the page has no pagination list.
    """
    pager = soup.find('ul', {'class': "pagination__list"})
    if not pager:
        return []

    hrefs = []
    for anchor in pager.find_all('a', {'class': 'pagination__text'}):
        hrefs.append(anchor.attrs['href'])

    # The final anchor is dropped — presumably a "next page" control that
    # duplicates a numbered page; TODO confirm against the site markup.
    return hrefs[:-1]
    

In [42]:
def _repair_mojibake(text):
    """Undo UTF-8-read-as-Latin-1 garbling; return ``text`` unchanged otherwise."""
    try:
        return text.encode('latin').decode()
    except UnicodeError:
        # The text was already decoded correctly (or holds characters outside
        # Latin-1), so the round trip does not apply — keep it as is.
        return text


def _text_or_value(value):
    """Return the repaired ``.text`` of a tag-like object, or ``value`` itself."""
    if hasattr(value, 'text'):
        return _repair_mojibake(value.text)
    return value


def _parse_article_date(date_tag):
    """Parse a ``dd.mm.yy`` date tag into a ``date``; return None if impossible."""
    if date_tag is None:
        return None
    try:
        return datetime.strptime(date_tag.text.strip(), '%d.%m.%y').date()
    except ValueError:
        print(f"Skipping news item with unparseable date {date_tag.text!r}")
        return None


def _build_news_entry(news, ISIN, date_tag):
    """Assemble the record dict for one news teaser element."""
    card = news.find('a', {'class': 'news__card'})
    # A teaser without a link card (or without href) is kept with an empty link
    # instead of aborting the whole scrape.
    href = card.attrs.get('href') if card is not None else None
    fields = {
        'ISIN': ISIN,
        'date': date_tag,
        'title': news.find('span', {'class': 'news__title'}),
        'source': news.find('span', {'class': 'news__source'}),
        'kicker': news.find('span', {'class': 'news__kicker'}),
        'link_article': href,
    }
    news_entry = {key: _text_or_value(value) for key, value in fields.items()}
    if news_entry['link_article']:
        # Scraped hrefs are site-relative; make them absolute.
        news_entry['link_article'] = base_url + news_entry['link_article']
    return news_entry


def get_news_links(df_stocks, start_date, end_date):
    """Scrape news teasers for every stock in ``df_stocks`` within a date window.

    For each row, the stock's ``news_link`` page is fetched to discover its
    pagination links; every paginated page is then fetched and each
    ``<div class="news news--item-with-media">`` element is turned into a record.

    Parameters
    ----------
    df_stocks : pandas.DataFrame
        Must provide the columns ``ISIN`` and ``news_link``.
    start_date, end_date : datetime.date
        Inclusive bounds; only teasers dated within them are kept.

    Returns
    -------
    list of dict
        One dict per kept teaser with the keys ``ISIN``, ``date`` (the text as
        shown on the page), ``title``, ``source``, ``kicker`` and
        ``link_article`` (absolute URL). Fields missing on the page are None.

    Notes
    -----
    Progress is reported with ``print``: every fetched page URL is printed, and
    the news URL of a stock is printed when no pagination links were found for
    it (such stocks contribute no records). Teasers without a date element or
    with a date that is not in ``dd.mm.yy`` format are skipped.
    """
    extracted_news_properties = []
    for ISIN, url_news in df_stocks[['ISIN', 'news_link']].values:
        page_links = get_links_for_all_pages(get_soup(url_news))
        if not page_links:
            # No pagination found: report the stock's news URL and move on.
            print(url_news)
            continue
        for page_link in page_links:
            url = base_url + page_link
            print(url)
            soup = get_soup(url)
            for news in soup.find_all('div', {'class': 'news news--item-with-media'}):
                date_tag = news.find('time', {'class': 'news__date'})
                article_date = _parse_article_date(date_tag)
                if article_date is None:
                    continue
                if not (start_date <= article_date <= end_date):
                    continue
                extracted_news_properties.append(
                    _build_news_entry(news, ISIN, date_tag))
    return extracted_news_properties


In [43]:
from datetime import date
from datetime import datetime

In [44]:
# Date window for the news scrape: calendar year 2022, first to last day.
start_date = date(year=2022, month=1, day=1)
end_date = date(year=2022, month=12, day=31)

In [45]:
# Scrape the news pages of the selected stocks (network access); every fetched
# page URL is printed as the scrape progresses.
extracted_news_properties = get_news_links(df_stocks_selected, start_date, end_date)

https://www.finanzen.net/news/unilever-news
https://www.finanzen.net/news/unilever-news@intpagenr_2
https://www.finanzen.net/news/unilever-news@intpagenr_3
https://www.finanzen.net/news/unilever-news@intpagenr_4
https://www.finanzen.net/news/unilever-news@intpagenr_5
https://www.finanzen.net/news/unilever-news@intpagenr_6
https://www.finanzen.net/news/unilever-news@intpagenr_7


In [48]:
# Turn the scraped records into a table and store it as CSV; the file name
# carries the selected tickers joined with "-".
df_stock_news = pd.DataFrame(extracted_news_properties)
file_path = "data/stocks/news-" + "-".join(stock_ticker) + ".csv"
df_stock_news.to_csv(path_or_buf=file_path, index=False)
print(file_path)


data/stocks/news-UNLYF.csv


In [47]:
# Display the scraped news table for a quick visual check.
df_stock_news

Unnamed: 0,ISIN,date,title,source,kicker,link_article
0,GB00B10RZP78,07.09.22,Mehr als nur Pepsi Cola: Diese Marken gehören ...,finanzen.net,PepsiCo-Produkte,https://www.finanzen.net/nachricht/aktien/peps...
1,GB00B10RZP78,02.09.22,Aktivistischer Investor Nelson Peltz - Trian-I...,Dow Jones,Kapital wird zurückgegeben,https://www.finanzen.net/nachricht/aktien/kapi...
2,GB00B10RZP78,24.08.22,Breites Sortiment: Diese Marken gehören zu Col...,finanzen.net,Colgate-Palmolive Portfolio,https://www.finanzen.net/nachricht/aktien/colg...
3,GB00B10RZP78,26.07.22,Unilever-Aktie höher: Unilever rechnet für 202...,Dow Jones,Preiserhöhungen,https://www.finanzen.net/nachricht/aktien/prei...
4,GB00B10RZP78,14.07.22,Droht der große Aktien-Crash? 10 Experten und ...,Redaktion Finanzen Verlag,Kommt es noch schlimmer?,https://www.finanzen.net/nachricht/aktien/komm...
5,GB00B10RZP78,06.07.22,Unilever-Aktie gefragt: Ben & Jerry's klagt ge...,Dow Jones,Nach Kontroverse,https://www.finanzen.net/nachricht/aktien/nach...
6,GB00B10RZP78,29.06.22,Unilever-Aktie gefragt: Unilever trennt sich v...,Dow Jones,Nach Kontroverse,https://www.finanzen.net/nachricht/aktien/nach...
7,GB00B10RZP78,15.06.22,Unilever-Aktie: Aktie mit Kurssprung,Redaktion Finanzen Verlag,Euro am Sonntag-Aktien-Check,https://www.finanzen.net/nachricht/aktien/euro...
8,GB00B10RZP78,01.06.22,Haleon-IPO: GlaxoSmithKline plant Börsengang d...,finanzen.net,Zeitplan konkretisiert,https://www.finanzen.net/nachricht/aktien/zeit...
9,GB00B10RZP78,31.05.22,Unilever-Aktie schließt im Rallymodus: Unileve...,Dow Jones,Nonexecutive Director,https://www.finanzen.net/nachricht/aktien/none...
