In [18]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
import json
from glob import glob

In [19]:
def get_soup(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the calling
        loop indefinitely. Defaults to 30.

    Returns
    -------
    BeautifulSoup or None
        The document parsed with ``'html.parser'`` when the server answers
        with status 200; otherwise ``None`` after printing a notice.

    Raises
    ------
    requests.exceptions.RequestException
        Errors raised by ``requests.get`` (including a timeout) are not
        caught here and propagate to the caller.
    """
    # A browser-like User-Agent header is sent with every request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')

    # Every non-200 status is reported with the same message.
    print(f"Access denied for url: {url}. Output is None.")
    return None


In [20]:
# Root address of the site the article links point to
# (not referenced by the cells shown in this notebook).
base_url = 'https://www.finanzen.net'


## Choose a sample from the stock news overviews

In [21]:
# Load the overview table of news links for one stock; its `id`,
# `link_article` and `ISIN` columns drive the scraping loop further down.
file_path = 'data/stocks/news_UNLYF_2021-01-01_2021-12-31.csv'
df_news_links = pd.read_csv(filepath_or_buffer=file_path)
print("Shape: {}".format(df_news_links.shape))
df_news_links.head(5)

Shape: (10, 7)


Unnamed: 0,id,ISIN,date,title,source,kicker,link_article
0,02d8c86bb5de13952ff0d3dd84ab66923bfbd660,GB00B10RZP78,16.12.21,Maersk und Unilever schließen Logistik-Partner...,Dow Jones,Supply-Chain-Management,https://www.finanzen.net/nachricht/aktien/supp...
1,b747700e4a9a9117f14914ba35378aa955eeb3fd,GB00B10RZP78,19.11.21,Unilever-Aktie knapp im Minus: Unilever verkau...,Dow Jones,Verkauf,https://www.finanzen.net/nachricht/aktien/verk...
2,9d9f061dbb3c23a0f564520c966e0af95009ca34,GB00B10RZP78,10.11.21,Unilever-Aktie fester: Unilever legt wohl Verk...,Dow Jones,Interesse zu gering,https://www.finanzen.net/nachricht/aktien/inte...
3,7366b6898ce8dd20d22460dae0d28226883bca82,GB00B10RZP78,21.10.21,Unilever-Aktie steigt: Unilever mit Umsatzplus,dpa-afx,Umfeld unbeständig,https://www.finanzen.net/nachricht/aktien/umfe...
4,0897e96b96833749c3472b95d97577af0ce7f038,GB00B10RZP78,13.08.21,"Deo, Eis und Waschmittel: Hinter diesen Produk...",finanzen.net,Suppen und Diät-Drinks,https://www.finanzen.net/nachricht/aktien/supp...


In [22]:
def append_to_json(file_path_json, dict_to_save):
    """Append one record to the JSON list stored at ``file_path_json``.

    The file is expected to hold a JSON array. It is read completely, the
    new record is appended, and the whole array is written back. A missing
    file is treated as an empty list and a notice is printed.

    The new content is serialized before anything is written and is then
    put in place via a temporary sibling file and ``os.replace``. A record
    that cannot be serialized, or an interruption during the write, thus
    leaves the previously stored records untouched instead of truncating
    the file.

    Parameters
    ----------
    file_path_json : str or path-like
        Location of the JSON file.
    dict_to_save : dict
        JSON-serializable record to append.

    Raises
    ------
    TypeError
        If ``dict_to_save`` is not JSON-serializable (raised by ``json.dumps``
        before the file is modified).
    json.JSONDecodeError
        If the existing file does not contain valid JSON.
    """
    news_object = []
    try:
        with open(file_path_json, 'r') as f:
            news_object = json.load(f)
    except FileNotFoundError:
        print(f"File {file_path_json} not found and will be created")

    news_object.append(dict_to_save)

    # Serialize first: a failure here must not touch the existing file.
    serialized = json.dumps(news_object)

    # Write next to the target and swap it in, so the target is never
    # observed in a half-written state.
    tmp_path = f"{file_path_json}.tmp"
    try:
        with open(tmp_path, 'w') as json_file:
            json_file.write(serialized)
        os.replace(tmp_path, file_path_json)
    except BaseException:
        # Do not leave a stale temp file behind when the write fails.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
     

In [23]:
def is_url_ingested(url, file_path):
    """Tell whether ``url`` is already stored in the JSON file at ``file_path``.

    The file is expected to contain a list of records, each with a
    ``'link_article'`` entry. A missing file counts as "nothing ingested".

    Returns
    -------
    bool
        True when some record's ``'link_article'`` equals ``url``.
    """
    try:
        with open(file_path, 'r') as handle:
            stored_records = json.load(handle)
    except FileNotFoundError:
        # No file yet means no article has been stored.
        return False

    ingested_links = []
    for record in stored_records:
        ingested_links.append(record['link_article'])
    return url in ingested_links


In [24]:
def is_div_element_in_soup(soup, element_prop):
    """Report whether ``soup`` contains a ``div`` matching ``element_prop``.

    ``element_prop`` is handed to ``soup.find`` as the attribute filter;
    the truthiness of the lookup result is returned as a plain bool.
    """
    return bool(soup.find('div', element_prop))
    

In [25]:
# JSON file that accumulates the scraped article contents; the scraping
# loop below reads it to skip known URLs and appends new records to it.
file_path_news_content = "data/stocks/news_content_UNLYF_2021-01-01_2021-12-31.json"

In [26]:

# Scrape every article listed in `df_news_links` that is not yet stored in
# `file_path_news_content`, and append one record per article to that file.
# Because stored URLs are skipped, the cell can be re-run after a failure.

# Attribute filters of <div> elements that are stripped from the article
# body before it is stored (only the first match of each filter is removed).
div_properties_to_delete = [
    {'class': 'dropdown-container-triangle seperate-triangle'},
    {'class': 'dropdown-container-chartflow relative'},
    {'class': 'visible-xs-block'},
    {'class': 'pull-right'},
    {'class': 'lvgSearchOuter'},
    {'class': 'native-content-ad-container'},
    {'class': 'medium-font light-grey'},
    # NOTE(review): empty class value — confirm what this is meant to match.
    {'class': '',},
    ]

# `news_id` instead of `id`: a top-level loop variable stays alive in the
# kernel and would otherwise shadow the builtin `id` for all later cells.
for news_id, url, ISIN in df_news_links[['id', 'link_article', 'ISIN']].values:
    headline_text = None
    teaser_text = None
    news_extracted = None
    print(url)

    if is_url_ingested(url, file_path_news_content):
        continue

    soup = get_soup(url)
    if soup is None:
        # get_soup has already printed why the page is unavailable.
        continue

    # headline: try the two known container layouts in order
    headline_html = (
        soup.find('div', {'class': 'row news-snapshot'})
        or soup.find('div', {'class': 'single-article'})
    )
    try:
        # Re-encode as Latin-1 and decode as UTF-8 — presumably repairs text
        # that was decoded with the wrong charset; TODO confirm.
        headline_text = headline_html.find('h1').text.encode('latin').decode()
    except AttributeError:
        print('Headline has not been identified from the implemented rules.')
        print(headline_html)
        raise

    # teaser: text of the last <div> inside the teaser container
    teaser_html = soup.find('div', {'class': 'teaser teaser-snapshot'})
    try:
        teaser_text = teaser_html.find_all('div')[-1].text.encode('latin').decode()
    except (AttributeError, IndexError):
        # IndexError: the container exists but holds no inner <div>.
        print('Teaser has not been identified from the implemented rules.')
        raise

    # timestamp: optional, a missing date does not abort the run
    datetime_html = soup.find('div', {'class': 'pull-left mright-20'})
    if datetime_html is None:
        print('No date found in article')
        datetime_text = None
    else:
        datetime_text = datetime_html.text

    # news content
    news_container = soup.find('div', {'id': 'news-container'})
    if news_container is None:
        # Same exception type as the former implicit failure, but naming the page.
        raise AttributeError(f"No news container found for url: {url}")

    for div_properties in div_properties_to_delete:
        unwanted_div = news_container.find('div', div_properties)
        if unwanted_div is not None:
            unwanted_div.decompose()

    news_content = news_container.prettify()

    # put the content together in a structured format
    news_extracted = {
        'id': news_id,
        'ISIN': ISIN,
        'link_article': url,
        'timestamp': datetime_text,
        'headline': headline_text,
        'teaser': teaser_text,
        'content_html': news_content,
    }
    append_to_json(file_path_news_content, news_extracted)
        
    

https://www.finanzen.net/nachricht/aktien/supply-chain-management-maersk-und-unilever-schliessen-logistik-partnerschaft-maersk-aktie-hoeher-10851584
File data/stocks/news_content_UNLYF_2021-01-01_2021-12-31.json not found and will be created
https://www.finanzen.net/nachricht/aktien/verkauf-unilever-aktie-knapp-im-minus-unilever-verkauft-teegeschaeft-fuer-milliardenbetrag-an-cvc-capital-10763354
https://www.finanzen.net/nachricht/aktien/interesse-zu-gering-unilever-aktie-fester-unilever-legt-wohl-verkaufsplaene-fuer-q-tips-auf-eis-10728573
https://www.finanzen.net/nachricht/aktien/umfeld-unbestaendig-unilever-aktie-steigt-unilever-mit-umsatzplus-10648601
https://www.finanzen.net/nachricht/aktien/suppen-und-diaet-drinks-deo-eis-und-waschmittel-hinter-diesen-produkten-steckt-unilever-9013248
https://www.finanzen.net/nachricht/aktien/prognose-gesenkt-margenwarnung-unilever-durch-hoehere-kosten-belastet-unilever-aktie-verliert-10358729
https://www.finanzen.net/nachricht/aktien/pflanzliche-

In [27]:
# Read the stored article records back and turn them into a table.
with open(file_path_news_content, mode='r') as f:
    news_data = json.loads(f.read())
df_news = pd.DataFrame(data=news_data)


In [28]:
# Export the article table as CSV (without the index column) next to the JSON file.
file_name_news_content = "data/stocks/news_content_UNLYF_2021-01-01_2021-12-31.csv"
df_news.to_csv(path_or_buf=file_name_news_content, index=False)
print(f"Saved to {file_name_news_content}")

Saved to data/stocks/news_content_UNLYF_2021-01-01_2021-12-31.csv
