In [44]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
from glob import glob

In [45]:
# Site root. The hrefs extracted further down are concatenated onto this
# value to build the URLs that are fetched and stored.
base_url = "https://www.finanzen.net"


In [54]:
# Stock master data with one news overview URL per ISIN.
# nrows=5 restricts the load to the first five rows of the file.
df_stocks = pd.read_csv(
    filepath_or_buffer="data/stocks/processed/20220824T074737_xetra_finanzen.csv",
    nrows=5,
)

In [56]:
# Display the loaded rows (ISIN and news_link are the columns used below).
df_stocks

Unnamed: 0,ISIN,STOCK,CURRENCY,stock_name,WKN,Symbol,news_link
0,FR0000031122,AIR FRANCE-KLM INH. EO 1,EUR,Air France-KLM Aktie,855111,AFRAF,https://www.finanzen.net/news/air_france-klm-news
1,GB00B10RZP78,"UNILEVER PLC LS-,031111",EUR,Unilever Aktie,A0JNE2,UNLYF,https://www.finanzen.net/news/unilever-news
2,NL00150001Q9,"STELLANTIS NV EO -,01",EUR,Stellantis Aktie,A2QL01,STLA,https://www.finanzen.net/news/stellantis-news
3,AT0000969985,AT+S AUSTR.T.+SYSTEMT.,EUR,AT S (AT&S) Aktie,922230,ASAAF,https://www.finanzen.net/news/at_s-news
4,NL0013332471,"TOMTOM NV NAM. EO-,20",EUR,TomTom Aktie,A2PK2B,TMOAF,https://www.finanzen.net/news/tomtom-news


In [57]:
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        BeautifulSoup: ``response.text`` parsed with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    # The HTTP status is intentionally not checked here: an error page is
    # parsed like any other page and simply lacks the elements callers look for.
    return BeautifulSoup(response.text, 'html.parser')


In [58]:
def get_links_for_all_pages(soup):
    """Collect the pagination hrefs found in a parsed page.

    Looks up the ``<ul class="pagination__list">`` element and reads the
    ``href`` of every ``<a class="pagination__text">`` inside it.

    Args:
        soup: Parsed page offering ``find`` (BeautifulSoup-style).

    Returns:
        list: The hrefs in document order with the final one left out, or an
        empty list when the page has no pagination list.
    """
    pager = soup.find('ul', {'class': "pagination__list"})
    if not pager:
        return []

    hrefs = []
    for anchor in pager.find_all('a', {'class': 'pagination__text'}):
        hrefs.append(anchor.attrs['href'])
    # The last pagination anchor is always discarded.
    return hrefs[:-1]
    

In [72]:
def _redecode_latin_as_utf8(text):
    """Re-interpret ``text`` as UTF-8 bytes that were decoded as Latin-1.

    Falls back to the unchanged text when it cannot be encoded as Latin-1 or
    the resulting bytes are not valid UTF-8, so a single odd headline does not
    abort the whole crawl.
    NOTE(review): presumably needed because the pages are decoded as Latin-1
    upstream although they are UTF-8; confirm against the response encoding.
    """
    try:
        return text.encode('latin').decode()
    except UnicodeError:
        return text


# One dict per news teaser; turned into a DataFrame afterwards.
extracted_news_properties = []
# News overview URLs for which no pagination links were found.
failed_links = []
for ISIN, url_news in df_stocks[['ISIN', 'news_link']].values:
    links_all_pages = get_links_for_all_pages(get_soup(url_news))
    if not links_all_pages:
        print(url_news)
        failed_links.append(url_news)
        continue

    # Distinct names for the page link and the article link: the original
    # reused `link` for both, shadowing the loop variable.
    for page_link in links_all_pages:
        url = base_url + page_link
        print(url)
        soup = get_soup(url)
        for news in soup.find_all('div', {'class': 'news news--item-with-media'}):
            # A teaser without a card anchor (or without href) yields None
            # instead of raising and stopping the crawl.
            card = news.find('a', {'class': 'news__card'})
            article_href = card.attrs.get('href') if card is not None else None

            # Key order defines the column order of the resulting table.
            fields = {
                'ISIN': ISIN,
                'date': news.find('time', {'class': 'news__date'}),
                'title': news.find('span', {'class': 'news__title'}),
                'source': news.find('span', {'class': 'news__source'}),
                'kicker': news.find('span', {'class': 'news__kicker'}),
                'link_article': article_href,
            }

            news_entry = {}
            for key, value in fields.items():
                # Parsed elements expose `.text`; plain strings and None are
                # stored unchanged.
                if hasattr(value, 'text'):
                    news_entry[key] = _redecode_latin_as_utf8(value.text)
                else:
                    news_entry[key] = value

            if news_entry['link_article']:
                news_entry['link_article'] = base_url + news_entry['link_article']

            extracted_news_properties.append(news_entry)


https://www.finanzen.net/news/air_france-klm-news
https://www.finanzen.net/news/air_france-klm-news@intpagenr_2
https://www.finanzen.net/news/air_france-klm-news@intpagenr_3
https://www.finanzen.net/news/air_france-klm-news@intpagenr_4
https://www.finanzen.net/news/air_france-klm-news@intpagenr_5
https://www.finanzen.net/news/air_france-klm-news@intpagenr_6
https://www.finanzen.net/news/air_france-klm-news@intpagenr_7
https://www.finanzen.net/news/unilever-news
https://www.finanzen.net/news/unilever-news@intpagenr_2
https://www.finanzen.net/news/unilever-news@intpagenr_3
https://www.finanzen.net/news/unilever-news@intpagenr_4
https://www.finanzen.net/news/unilever-news@intpagenr_5
https://www.finanzen.net/news/unilever-news@intpagenr_6
https://www.finanzen.net/news/unilever-news@intpagenr_7
https://www.finanzen.net/news/stellantis-news
https://www.finanzen.net/news/stellantis-news@intpagenr_2
https://www.finanzen.net/news/stellantis-news@intpagenr_3
https://www.finanzen.net/news/stella

In [73]:
# Build one table from the collected per-article dicts and persist it
# without the positional index column.
df_stock_news = pd.DataFrame(data=extracted_news_properties)
df_stock_news.to_csv(
    path_or_buf="data/stocks/finanzen-dot-net-news.csv",
    index=False,
)


In [74]:
# Display the scraped news table (one row per extracted article teaser).
df_stock_news

Unnamed: 0,ISIN,date,title,source,kicker,link_article
0,FR0000031122,23.08.22,Lufthansa-Aktie: Weichenstellung zu Gebot von ...,Reuters,Privatisierung von ITA,https://www.finanzen.net/nachricht/aktien/priv...
1,FR0000031122,04.08.22,Lufthansa-Aktie: Italienische Gewerkschaften s...,dpa-afx,Übernahmepoker,https://www.finanzen.net/nachricht/aktien/uebe...
2,FR0000031122,31.07.22,Air France-KLM-Aktie: Das sind die Expertenmei...,finanzen.net,Experten-Meinungen,https://www.finanzen.net/nachricht/aktien/air-...
3,FR0000031122,29.07.22,Air France-Aktie höher: Air France-KLM landet ...,dpa-afx,Nettogewinn,https://www.finanzen.net/nachricht/aktien/nett...
4,FR0000031122,12.07.22,Lufthansa-Aktie steigt: ITA-Gebot von Lufthans...,dpa-afx,Bessere Aussichten,https://www.finanzen.net/nachricht/aktien/bess...
...,...,...,...,...,...,...
300,AT0000969985,30.01.19,AT&S nach drei Quartalen mit deutlich besserem...,dpa-afx,Umsatzerlöse steigen,https://www.finanzen.net/nachricht/aktien/umsa...
301,AT0000969985,21.06.18,AT & S: Günstiger Elektronikspezialist,finanzen.net,Euro am Sonntag-Aktien-Tipp,https://www.finanzen.net/nachricht/aktien/euro...
302,AT0000969985,07.05.18,AT&S kehrte 2017/18 in Gewinnzone zurück,dpa-afx,Höhere Dividende geplant,https://www.finanzen.net/nachricht/aktien/hoeh...
303,AT0000969985,12.10.17,AT&S: Comeback!,finanzen.net,Christian Scheid-Kolumne,https://www.finanzen.net/nachricht/zertifikate...
