In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
import json
from glob import glob

In [2]:
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    BeautifulSoup or None
        The parsed document when the server answers with status code 200.
        ``None`` when the request fails or any other status code is
        returned; a message is printed in both cases.
    """
    # Browser-like User-Agent sent with every request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        # Connection problems and timeouts are reported the same way as a
        # refused page so that callers only have to check for None.
        print(f"Request failed for url: {url} ({error}). Output is None.")
        return None

    if response.status_code == 200:
        # response.text is kept as-is: the scraping cell of this notebook
        # re-encodes the extracted strings itself.
        return BeautifulSoup(response.text, 'html.parser')

    print(f"Access denied for url: {url}. Output is None.")
    return None


In [3]:
# Root address of the finanzen.net website.
# NOTE(review): not referenced in the cells visible here — confirm it is still needed.
base_url = "https://www.finanzen.net"


## Choose a sample from the stock news overviews

In [6]:
# Draw five random rows from the news overview table; every run picks a
# different sample because no random_state is set.
df_news_links = (
    pd.read_csv("data/stocks/finanzen-dot-net-news.csv")
    .sample(n=5)
)
print(f"Shape: {df_news_links.shape}")
df_news_links.head()

Shape: (5, 6)


Unnamed: 0,ISIN,date,title,source,kicker,link_article
245,NL00150001Q9,26.01.21,Stellantis-Aktie zieht an: SocGen startet Stel...,dpa-afx,Überzeugende Aussichten,https://www.finanzen.net/nachricht/aktien/uebe...
92,GB00B10RZP78,08.05.22,So schätzen die Analysten die Zukunft der Unil...,finanzen.net,Aussichten,https://www.finanzen.net/nachricht/aktien/auss...
189,NL00150001Q9,08.04.22,Stellantis-Aktie gewinnt: Stellantis verkauft ...,finanzen.net,25 Prozent,https://www.finanzen.net/nachricht/aktien/25-p...
157,GB00B10RZP78,28.11.17,Unilever bestätigt Ausblick - Strategische Maß...,Dow Jones,Wachstum erwartet,https://www.finanzen.net/nachricht/aktien/wach...
144,GB00B10RZP78,28.01.19,Unilever übernimmt US-Waschmittelunternehmen T...,Dow Jones,Kaufsumme unbekannt,https://www.finanzen.net/nachricht/aktien/kauf...


In [7]:
def append_to_json(file_path_json, dict_to_save):
    """Append ``dict_to_save`` to the JSON array stored at ``file_path_json``.

    The whole array is read, extended by one element and written back.

    Parameters
    ----------
    file_path_json : str or os.PathLike
        Location of the JSON file. A missing file (or missing parent
        directory) is created; an empty file is treated as an empty array.
    dict_to_save : dict
        JSON-serialisable record to add at the end of the array.

    Raises
    ------
    json.JSONDecodeError
        If the file exists, is not empty and does not contain valid JSON.
        The file is left untouched in that case.
    """
    file_path_json = os.fspath(file_path_json)
    news_object = []
    try:
        with open(file_path_json, 'r', encoding='utf-8') as f:
            raw_content = f.read()
        # A zero-length/whitespace-only file counts as "no records yet";
        # anything else must be valid JSON so existing data is never
        # silently discarded.
        if raw_content.strip():
            news_object = json.loads(raw_content)
    except FileNotFoundError:
        print(f"File {file_path_json} not found and will be created")

    news_object.append(dict_to_save)

    # Make sure the target directory exists before writing.
    parent_dir = os.path.dirname(file_path_json)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write to a sibling temp file first and swap it in afterwards, so an
    # interrupted run cannot leave a half-written JSON file behind.
    tmp_path = f"{file_path_json}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as json_file:
        json.dump(news_object, json_file)
    os.replace(tmp_path, file_path_json)
     

In [8]:
# JSON file in which the scraped article contents are collected.
file_path_news_content = "data/stocks/finanzen-dot-net-news-content.json"

In [15]:
def is_url_ingested(url, file_path):
    """Tell whether ``url`` is already stored in the JSON file at ``file_path``.

    The file is expected to hold a list of dicts that each carry a
    ``'link_article'`` key. A missing file means nothing has been stored
    yet, so the answer is ``False``.
    """
    try:
        handle = open(file_path, 'r')
    except FileNotFoundError:
        return False

    with handle:
        stored_entries = json.load(handle)

    ingested_links = [entry['link_article'] for entry in stored_entries]
    return url in ingested_links


In [16]:
def is_div_element_in_soup(soup, element_prop):
    """Report whether ``soup`` contains a ``div`` matching ``element_prop``.

    ``element_prop`` is handed to ``soup.find`` as the attribute filter;
    the result of that lookup is reduced to a plain ``True``/``False``.
    """
    return bool(soup.find('div', element_prop))
    

In [17]:

def _repair_mojibake(text):
    """Round-trip ``text`` through Latin-1 bytes and decode them as UTF-8.

    This is the re-encoding the notebook applies to headline and teaser
    (presumably because the page text arrives mis-decoded as Latin-1 —
    TODO confirm). If the text cannot take the round trip, e.g. because it
    is already correct or contains characters outside Latin-1, it is
    returned unchanged instead of aborting the run with a UnicodeError.
    """
    try:
        return text.encode('latin').decode()
    except UnicodeError:
        return text


# Rules for divs that are stripped from the article body before storing it.
# For every rule only the FIRST matching div is removed.
# NOTE(review): {'class': ''} presumably targets a div without a class —
# confirm against the Beautiful Soup attribute-matching rules.
div_properties_to_delete = [
    {'class': 'dropdown-container-triangle seperate-triangle'},
    {'class': 'dropdown-container-chartflow relative'},
    {'class': 'visible-xs-block'},
    {'class': 'pull-right'},
    {'class': 'lvgSearchOuter'},
    {'class': 'native-content-ad-container'},
    {'class': 'medium-font light-grey'},
    {'class': ''},
]

for url, ISIN in df_news_links[['link_article', 'ISIN']].values:
    print(url)

    # Articles stored by an earlier run are not downloaded again.
    if is_url_ingested(url, file_path_news_content):
        continue

    soup = get_soup(url)
    if not soup:
        # get_soup already printed why the page is unavailable.
        continue

    # headline: two known page layouts, tried in this order
    headline_html = (
        soup.find('div', {'class': 'row news-snapshot'})
        or soup.find('div', {'class': 'single-article'})
    )
    try:
        headline_text = _repair_mojibake(headline_html.find('h1').text)
    except AttributeError:
        # Either no known container or no <h1> inside it: stop so the
        # extraction rules can be extended.
        print('Headline has not been identified from the implemented rules.')
        print(headline_html)
        raise

    # teaser: text of the last div inside the teaser container
    teaser_html = soup.find('div', {'class': 'teaser teaser-snapshot'})
    try:
        teaser_text = _repair_mojibake(teaser_html.find_all('div')[-1].text)
    except (AttributeError, IndexError):
        # IndexError: the container exists but holds no inner div.
        print('Teaser has not been identified from the implemented rules.')
        raise

    # timestamp: optional, a missing one does not stop the run
    datetime_html = soup.find('div', {'class': 'pull-left mright-20'})
    if datetime_html is None:
        print('No date found in article')
        datetime_text = None
    else:
        datetime_text = datetime_html.text

    # news content
    news_container = soup.find('div', {'id': 'news-container'})
    if news_container is None:
        print('News container has not been identified from the implemented rules.')
        raise AttributeError(f"No div with id 'news-container' found in {url}")

    for div_properties in div_properties_to_delete:
        unwanted_div = news_container.find('div', div_properties)
        if unwanted_div:
            unwanted_div.decompose()

    news_content = news_container.prettify()

    # put the content together in a structured format
    news_extracted = {
        'ISIN': ISIN,
        'link_article': url,
        'timestamp': datetime_text,
        'headline': headline_text,
        'teaser': teaser_text,
        'content_html': news_content,
    }
    append_to_json(file_path_news_content, news_extracted)
        
    

https://www.finanzen.net/nachricht/aktien/ueberzeugende-aussichten-stellantis-aktie-zieht-an-socgen-startet-stellantis-mit-34-buy-34-9732429
File data/stocks/finanzen-dot-net-news-content.json not found and will be created
https://www.finanzen.net/nachricht/aktien/aussichten-so-schaetzen-die-analysten-die-zukunft-der-unilever-aktie-ein-11299275
https://www.finanzen.net/nachricht/aktien/25-prozent-stellantis-aktie-gewinnt-stellantis-verkauft-restliche-beteiligung-an-gefco-11223393
https://www.finanzen.net/nachricht/aktien/wachstum-erwartet-unilever-bestaetigt-ausblick-strategische-massnahmen-auf-kurs-5837852
https://www.finanzen.net/nachricht/aktien/kaufsumme-unbekannt-unilever-uebernimmt-us-waschmittelunternehmen-the-laundress-7072255


In [18]:
# Read the collected articles back from disk and turn them into a table.
with open(file_path_news_content, mode='r') as content_file:
    news_data = json.loads(content_file.read())

df_news = pd.DataFrame(news_data)


In [19]:
# Display the articles that were loaded from the JSON file.
df_news

Unnamed: 0,ISIN,link_article,timestamp,headline,teaser,content_html
0,NL00150001Q9,https://www.finanzen.net/nachricht/aktien/uebe...,26.01.2021 14:07,Stellantis-Aktie zieht an: SocGen startet Stel...,Die französische Großbank Societe Generale (So...,"<div class=""teaser teaser-xs color-news"" id=""n..."
1,GB00B10RZP78,https://www.finanzen.net/nachricht/aktien/auss...,08.05.2022 17:24,So schätzen die Analysten die Zukunft der Unil...,Die jüngsten Expertenmeinungen zur Unilever-Ak...,"<div class=""teaser teaser-xs color-news"" id=""n..."
2,NL00150001Q9,https://www.finanzen.net/nachricht/aktien/25-p...,08.04.2022 16:29,Stellantis-Aktie gewinnt: Stellantis verkauft ...,Der Autobauer Stellantis NV hat seinen verblei...,"<div class=""teaser teaser-xs color-news"" id=""n..."
3,GB00B10RZP78,https://www.finanzen.net/nachricht/aktien/wach...,28.11.2017 10:16,Unilever bestätigt Ausblick - Strategische Maß...,Der Konsumgüterkonzern Unilever hat seinen Aus...,"<div class=""teaser teaser-xs color-news"" id=""n..."
4,GB00B10RZP78,https://www.finanzen.net/nachricht/aktien/kauf...,28.01.2019 20:10,Unilever übernimmt US-Waschmittelunternehmen T...,Der Konsumgüterkonzern Unilever hat sich mit e...,"<div class=""teaser teaser-xs color-news"" id=""n..."
