In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
import json
from glob import glob

In [2]:
def get_soup(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; handed through to
        ``requests.get``. Without a timeout a stalled connection would block
        the notebook indefinitely. Defaults to 30.

    Returns
    -------
    BeautifulSoup or None
        The parsed document when the server answers with status code 200,
        otherwise ``None`` (after printing a notice).

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged from ``requests.get``, e.g. when the connection
        fails or the timeout expires.
    """
    # Browser-like User-Agent header sent with the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print(f"Access denied for url: {url}. Output is None.")
        return None
    return BeautifulSoup(response.text, 'html.parser')


In [3]:
# Root address of the site the news articles are scraped from.
base_url = "https://www.finanzen.net"


## Choose a sample from the stock news overviews

In [6]:
# News overview for one stock; each row carries the ISIN and the link to the full article.
file_path = 'data/stocks/news-UNLYF.csv'
df_news_links = pd.read_csv(file_path)
print(f"Shape: {df_news_links.shape}")
# Preview the first rows of the overview.
df_news_links.head()

Shape: (26, 6)


Unnamed: 0,ISIN,date,title,source,kicker,link_article
0,GB00B10RZP78,07.09.22,Mehr als nur Pepsi Cola: Diese Marken gehören ...,finanzen.net,PepsiCo-Produkte,https://www.finanzen.net/nachricht/aktien/peps...
1,GB00B10RZP78,02.09.22,Aktivistischer Investor Nelson Peltz - Trian-I...,Dow Jones,Kapital wird zurückgegeben,https://www.finanzen.net/nachricht/aktien/kapi...
2,GB00B10RZP78,24.08.22,Breites Sortiment: Diese Marken gehören zu Col...,finanzen.net,Colgate-Palmolive Portfolio,https://www.finanzen.net/nachricht/aktien/colg...
3,GB00B10RZP78,26.07.22,Unilever-Aktie höher: Unilever rechnet für 202...,Dow Jones,Preiserhöhungen,https://www.finanzen.net/nachricht/aktien/prei...
4,GB00B10RZP78,14.07.22,Droht der große Aktien-Crash? 10 Experten und ...,Redaktion Finanzen Verlag,Kommt es noch schlimmer?,https://www.finanzen.net/nachricht/aktien/komm...


In [12]:
def append_to_json(file_path_json, dict_to_save):
    """Append one record to a JSON file that stores a list of records.

    The existing list is read in full, the record is appended and the whole
    list is written back. A missing file is treated as an empty list and is
    created, together with its parent directory if necessary.

    Parameters
    ----------
    file_path_json : str or os.PathLike
        Location of the JSON file.
    dict_to_save : dict
        Record to append; must be JSON-serialisable.

    Raises
    ------
    TypeError
        If the record cannot be serialised to JSON. The file on disk is left
        untouched in that case.
    """
    news_object = []
    try:
        with open(file_path_json, 'r') as f:
            news_object = json.load(f)
    except FileNotFoundError:
        print(f"File {file_path_json} not found and will be created")

    news_object.append(dict_to_save)

    # Serialise before touching the disk: opening the target with 'w' truncates
    # it, so a failure during serialisation would otherwise wipe every record
    # collected so far.
    payload = json.dumps(news_object)

    # The notice above promises the file will be created, so make sure the
    # directory it lives in exists.
    parent_dir = os.path.dirname(file_path_json)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Write to a sibling temp file and swap it in, so an interrupted write
    # cannot leave a half-written JSON file behind.
    tmp_path = f"{file_path_json}.tmp"
    with open(tmp_path, 'w') as json_file:
        json_file.write(payload)
    os.replace(tmp_path, file_path_json)
     

In [13]:
# JSON file in which the scraped article contents are accumulated.
file_path_news_content = 'data/stocks/news-content-UNLYF.json'


In [14]:
def is_url_ingested(url, file_path):
    """Tell whether an article URL is already stored in the JSON news file.

    Parameters
    ----------
    url : str
        Article link to look for.
    file_path : str
        JSON file holding a list of records, each with a ``'link_article'`` key.

    Returns
    -------
    bool
        ``True`` when some stored record has ``url`` as its ``'link_article'``;
        ``False`` otherwise, including when the file does not exist.
    """
    try:
        with open(file_path, 'r') as handle:
            stored_records = json.load(handle)
    except FileNotFoundError:
        # Nothing has been saved yet, so no URL can have been ingested.
        return False

    ingested_links = []
    for record in stored_records:
        ingested_links.append(record['link_article'])
    return url in ingested_links


In [15]:
def is_div_element_in_soup(soup, element_prop):
    """Report whether ``soup`` contains a ``<div>`` matching ``element_prop``.

    Parameters
    ----------
    soup : object
        Parsed document (or sub-tree) offering a ``find(name, attrs)`` method.
    element_prop : dict
        Attribute filter handed to ``find``, e.g. ``{'class': 'pull-right'}``.

    Returns
    -------
    bool
        ``True`` when ``find`` returns a truthy result, ``False`` otherwise.
    """
    match = soup.find('div', element_prop)
    return bool(match)
    

In [16]:

# Scrape every article linked in df_news_links that is not yet stored in the
# JSON file. Each article is persisted as soon as it has been extracted, so
# re-running the cell after a failure continues with the remaining links.

# Attribute filters of <div> elements stripped from the article body before it
# is stored; for each filter only the first matching div is removed.
div_properties_to_delete = [
    {'class': 'dropdown-container-triangle seperate-triangle'},
    {'class': 'dropdown-container-chartflow relative'},
    {'class': 'visible-xs-block'},
    {'class': 'pull-right'},
    {'class': 'lvgSearchOuter'},
    {'class': 'native-content-ad-container'},
    {'class': 'medium-font light-grey'},
    {'class': '',},
]

for url, ISIN in df_news_links[['link_article', 'ISIN']].values:
    # Reset per-article results so values from the previous article never linger.
    headline_text = teaser_text = news_extracted = None
    print(url)

    if is_url_ingested(url, file_path_news_content):
        continue

    soup = get_soup(url)
    if not soup:
        # get_soup has already reported the failed request; try the next link.
        continue

    # headline: try the known page layouts in order and keep the first match
    headline_html = (
        soup.find('div', {'class': 'row news-snapshot'})
        or soup.find('div', {'class': 'single-article'})
    )
    try:
        # NOTE(review): the encode('latin').decode() round trip presumably
        # repairs UTF-8 text that was decoded as Latin-1 upstream - confirm.
        headline_text = headline_html.find('h1').text.encode('latin').decode()
    except AttributeError:
        print('Headline has not been identified from the implemented rules.')
        print(headline_html)
        raise

    # teaser: the text sits in the last <div> nested inside the teaser block
    teaser_html = soup.find('div', {'class': 'teaser teaser-snapshot'})
    try:
        teaser_text = teaser_html.find_all('div')[-1].text.encode('latin').decode()
    except (AttributeError, IndexError):
        # IndexError: the teaser block exists but holds no nested <div>.
        print('Teaser has not been identified from the implemented rules.')
        raise

    # timestamp: optional, stored as None when the page shows no date
    datetime_html = soup.find('div', {'class': 'pull-left mright-20'})
    if datetime_html is None:
        print('No date found in article')
        datetime_text = None
    else:
        datetime_text = datetime_html.text

    # news content
    news_container = soup.find('div', {'id': 'news-container'})
    if news_container is None:
        # Fail with an explicit message, in line with the headline and teaser
        # handling, instead of an opaque "'NoneType' has no attribute 'find'".
        print('News content has not been identified from the implemented rules.')
        raise AttributeError(f"No div with id 'news-container' found in {url}")

    for div_properties in div_properties_to_delete:
        unwanted_div = news_container.find('div', div_properties)
        if unwanted_div:
            unwanted_div.decompose()

    news_content = news_container.prettify()

    # put the content together in a structured format
    news_extracted = {
        'ISIN': ISIN,
        'link_article': url,
        'timestamp': datetime_text,
        'headline': headline_text,
        'teaser': teaser_text,
        'content_html': news_content,
    }
    append_to_json(file_path_news_content, news_extracted)
        
    

https://www.finanzen.net/nachricht/aktien/pepsico-produkte-mehr-als-nur-pepsi-cola-diese-marken-gehoeren-zu-pepsi-9044500
File data/stocks/news-content-UNLYF.json not found and will be created
https://www.finanzen.net/nachricht/aktien/kapital-wird-zurueckgegeben-aktivistischer-investor-nelson-peltz-trian-investmentvehikel-loest-sich-nach-kritik-der-investoren-auf-11681384
https://www.finanzen.net/nachricht/aktien/colgate-palmolive-portfolio-breites-sortiment-diese-marken-gehoeren-zu-colgate-palmolive-9381504
https://www.finanzen.net/nachricht/aktien/preiserhoehungen-unilever-aktie-hoeher-unilever-rechnet-fuer-2022-mit-staerkerem-umsatzwachstum-11563803
https://www.finanzen.net/nachricht/aktien/kommt-es-noch-schlimmer-droht-der-grosse-aktien-crash-10-experten-und-ihre-prognosen-11495625
https://www.finanzen.net/nachricht/aktien/nach-kontroverse-unilever-aktie-gefragt-ben-jerry-s-klagt-gegen-israel-verkauf-11504791
https://www.finanzen.net/nachricht/aktien/nach-kontroverse-unilever-aktie

In [17]:
# Read the stored articles back from the JSON file and tabulate them, one row per record.
with open(file_path_news_content, 'r') as f:
    news_data = json.load(f)
df_news = pd.DataFrame(news_data)


In [18]:
# Display the collected articles.
df_news

Unnamed: 0,ISIN,link_article,timestamp,headline,teaser,content_html
0,GB00B10RZP78,https://www.finanzen.net/nachricht/aktien/peps...,07.09.2022 06:21,Mehr als nur Pepsi Cola: Diese Marken gehören ...,Pepsi-Marken sind sowohl auf Getränkekarten in...,"<div class=""teaser teaser-xs color-news"" id=""n..."
1,GB00B10RZP78,https://www.finanzen.net/nachricht/aktien/kapi...,02.09.2022 12:52,Aktivistischer Investor Nelson Peltz - Trian-I...,Ein in London börsennotiertes Investmentvehike...,"<div class=""teaser teaser-xs color-news"" id=""n..."
2,GB00B10RZP78,https://www.finanzen.net/nachricht/aktien/colg...,24.08.2022 06:16,Breites Sortiment: Diese Marken gehören zu Col...,"Egal ob Zahnpasta, Spülmittel oder Tierfutter ...","<div class=""teaser teaser-xs color-news"" id=""n..."
3,GB00B10RZP78,https://www.finanzen.net/nachricht/aktien/prei...,26.07.2022 17:35,Unilever-Aktie höher: Unilever rechnet für 202...,Unilever ist nach dem ersten Halbjahr optimist...,"<div class=""teaser teaser-xs color-news"" id=""n..."
4,GB00B10RZP78,https://www.finanzen.net/nachricht/aktien/komm...,14.07.2022 21:55,Droht der große Aktien-Crash? 10 Experten und ...,Prominente Börsianer warnen vor neuen Turbulen...,"<div class=""teaser teaser-xs color-news"" id=""n..."
5,GB00B10RZP78,https://www.finanzen.net/nachricht/aktien/nach...,06.07.2022 13:37,Unilever-Aktie gefragt: Ben & Jerry's klagt ge...,Der Eishersteller Ben & Jerry's will den Verka...,"<div class=""teaser teaser-xs color-news"" id=""n..."
6,GB00B10RZP78,https://www.finanzen.net/nachricht/aktien/nach...,29.06.2022 17:26,Unilever-Aktie gefragt: Unilever trennt sich v...,Der Konsumgüterkonzern Unilever trennt sich na...,"<div class=""teaser teaser-xs color-news"" id=""n..."
7,GB00B10RZP78,https://www.finanzen.net/nachricht/aktien/euro...,15.06.2022 09:33,Unilever-Aktie: Aktie mit Kurssprung,Der US-Investor Nelson Peltz wird Direktor im ...,"<div class=""teaser teaser-xs color-news"" id=""n..."
8,GB00B10RZP78,https://www.finanzen.net/nachricht/aktien/zeit...,01.06.2022 15:52,Haleon-IPO: GlaxoSmithKline plant Börsengang d...,Die Abspaltung der Konsumgütersparte des Pharm...,"<div class=""teaser teaser-xs color-news"" id=""n..."
9,GB00B10RZP78,https://www.finanzen.net/nachricht/aktien/none...,31.05.2022 17:52,Unilever-Aktie schließt im Rallymodus: Unileve...,Der Lebensmittel- und Konsumgüterkonzern Unile...,"<div class=""teaser teaser-xs color-news"" id=""n..."
