In [10]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
import json
from glob import glob

In [60]:
def get_soup(url, timeout=30):
    """Download `url` and parse the response body as HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Passed to `requests.get`; number of seconds to wait for the server
        before giving up. Defaults to 30.

    Returns
    -------
    BeautifulSoup or None
        The parsed document when the server answers with status 200,
        otherwise None (a message is printed in that case).

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from `requests.get`, e.g. on connection errors or when
        the timeout expires.
    """
    # Send a browser-like User-Agent with the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}
    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would freeze the whole scraping loop on a single bad URL.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print(f"Access denied for url: {url}. Output is None.")
        return None
    return BeautifulSoup(response.text, 'html.parser')


In [61]:
# Root address of the scraped site (not referenced by the other cells shown here).
base_url = "https://www.finanzen.net"


In [62]:
# Load the table of news articles to scrape; the scraping loop further down
# reads its `link_article` and `ISIN` columns.
df_news_links = pd.read_csv("data/stocks/finanzen-dot-net-news.csv")
print(f"Shape: {df_news_links.shape}")
# Preview the first rows as the cell output.
df_news_links.head()

Shape: (305, 6)


Unnamed: 0,ISIN,date,title,source,kicker,link_article
0,FR0000031122,23.08.22,Lufthansa-Aktie: Weichenstellung zu Gebot von ...,Reuters,Privatisierung von ITA,https://www.finanzen.net/nachricht/aktien/priv...
1,FR0000031122,04.08.22,Lufthansa-Aktie: Italienische Gewerkschaften s...,dpa-afx,Übernahmepoker,https://www.finanzen.net/nachricht/aktien/uebe...
2,FR0000031122,31.07.22,Air France-KLM-Aktie: Das sind die Expertenmei...,finanzen.net,Experten-Meinungen,https://www.finanzen.net/nachricht/aktien/air-...
3,FR0000031122,29.07.22,Air France-Aktie höher: Air France-KLM landet ...,dpa-afx,Nettogewinn,https://www.finanzen.net/nachricht/aktien/nett...
4,FR0000031122,12.07.22,Lufthansa-Aktie steigt: ITA-Gebot von Lufthans...,dpa-afx,Bessere Aussichten,https://www.finanzen.net/nachricht/aktien/bess...


In [63]:
def append_to_json(file_path_json, dict_to_save):
    """Append one record to the JSON list stored in `file_path_json`.

    The file is expected to contain a JSON array. It is read completely,
    `dict_to_save` is added at the end, and the whole array is written back.
    If the file does not exist yet, it is created with a one-element array.

    The new content is first written to a sibling ``<file_path_json>.tmp``
    file and then moved over the target, so a failure while serialising
    (or an interrupted run) leaves the previously stored records untouched
    instead of a truncated file.

    Parameters
    ----------
    file_path_json : str or path-like
        Location of the JSON file.
    dict_to_save : dict
        JSON-serialisable record to append.

    Raises
    ------
    TypeError
        If `dict_to_save` cannot be serialised by `json.dump`.
    json.JSONDecodeError
        If the existing file does not contain valid JSON.
    """
    news_object = []
    try:
        with open(file_path_json, 'r') as f:
            news_object = json.load(f)
    except FileNotFoundError:
        print(f"File {file_path_json} not found and will be created")

    news_object.append(dict_to_save)

    # Write to a temporary file first; the target is only replaced once the
    # full array has been written successfully.
    tmp_path = f"{file_path_json}.tmp"
    try:
        with open(tmp_path, 'w') as json_file:
            json.dump(news_object, json_file)
        os.replace(tmp_path, file_path_json)
    finally:
        # After a successful replace the temp file no longer exists; this
        # only cleans up a partial file left behind by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
     

In [64]:
# JSON file that collects the scraped articles; it is read and extended by the cells below.
file_path_news_content = "data/stocks/finanzen-dot-net-news-content.json"

In [65]:
def is_url_ingested(url, file_path):
    """Tell whether `url` is already stored in the JSON file at `file_path`.

    The file is expected to hold a JSON array of records, each with a
    `link_article` key.

    Parameters
    ----------
    url : str
        Article address to look for.
    file_path : str or path-like
        Location of the JSON file with the stored records.

    Returns
    -------
    bool
        True if any stored record has `link_article` equal to `url`.
        False otherwise, including when the file does not exist yet.
    """
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
    except FileNotFoundError:
        # On the very first run nothing has been stored, so no URL can have
        # been ingested; `append_to_json` creates the file later.
        return False
    return any(data_point['link_article'] == url for data_point in data)


In [66]:
def is_div_element_in_soup(soup, element_prop):
    """Report whether `soup` contains a `div` matching `element_prop`.

    `element_prop` is handed to `soup.find` as the attribute filter.
    Returns True when the lookup yields a truthy result, False otherwise.
    """
    match = soup.find('div', element_prop)
    return bool(match)
    

In [67]:



# Divs removed from the article body before it is stored. The list does not
# depend on the article, so it is built once instead of on every iteration.
div_properties_to_delete = [
    {'class': 'dropdown-container-triangle seperate-triangle'},
    {'class': 'dropdown-container-chartflow relative'},
    {'class': 'visible-xs-block'},
    {'class': 'pull-right'},
    {'class': 'lvgSearchOuter'},
    {'class': 'native-content-ad-container'},
    {'class': 'medium-font light-grey'},
    {'class': '',},
    ]

# Scrape every article that is not stored yet and append it to the JSON file.
for url, ISIN in df_news_links[['link_article', 'ISIN']].values:
    headline_text = None
    teaser_text = None
    news_extracted = None
    print(url)

    # Articles that are already stored are skipped, so the cell can be
    # re-run after an interruption without fetching them again.
    if is_url_ingested(url, file_path_news_content):
        continue

    soup = get_soup(url)
    if not soup:
        # get_soup already printed a message for this URL.
        continue

    # headline: try the two known page layouts in order.
    headline_html = (
        soup.find('div', {'class': 'row news-snapshot'})
        or soup.find('div', {'class': 'single-article'})
    )
    try:
        # Re-encode as Latin-1 and decode with the default codec (UTF-8).
        # NOTE(review): presumably this undoes a mis-decoded response body - confirm.
        headline_text = headline_html.find('h1').text.encode('latin').decode()
    except AttributeError:
        print('Headline has not been identified from the implemented rules.')
        print(headline_html)
        raise

    # teaser: the text of the last nested div inside the teaser container.
    teaser_html = soup.find('div', {'class': 'teaser teaser-snapshot'})
    try:
        teaser_text = teaser_html.find_all('div')[-1].text.encode('latin').decode()
    except (AttributeError, IndexError):
        # IndexError: the teaser container exists but holds no nested div.
        print('Teaser has not been identified from the implemented rules.')
        raise

    # news content
    news_container = soup.find('div', {'id': 'news-container'})
    if news_container is None:
        # Fail with an explicit message instead of an opaque NoneType error.
        print('News content has not been identified from the implemented rules.')
        raise AttributeError(f"No div with id 'news-container' found for url: {url}")

    for div_properties in div_properties_to_delete:
        # NOTE(review): only the first matching div per rule is removed - confirm this is intended.
        unwanted_div = news_container.find('div', div_properties)
        if unwanted_div:
            unwanted_div.decompose()

    news_content = news_container.prettify()

    # put the content together in a structured format
    news_extracted = {
        'ISIN': ISIN,
        'link_article': url,
        'headline': headline_text,
        'teaser': teaser_text,
        'content_html': news_content,
    }
    append_to_json(file_path_news_content, news_extracted)
        
    

https://www.finanzen.net/nachricht/aktien/privatisierung-von-ita-lufthansa-aktie-weichenstellung-zu-gebot-von-lufthansa-und-msc-fuer-ita-bis-monatsende-moeglich-11650055
https://www.finanzen.net/nachricht/aktien/uebernahmepoker-lufthansa-aktie-italienische-gewerkschaften-sprechen-sich-bei-ita-uebernahme-fuer-lufthansa-und-msc-aus-11599685
https://www.finanzen.net/nachricht/aktien/air-france-klm-aktie-das-sind-die-expertenmeinungen-des-monats-juli-11583073
https://www.finanzen.net/nachricht/aktien/nettogewinn-air-france-aktie-hoeher-air-france-klm-landet-wieder-in-der-gewinnzone-11578525
https://www.finanzen.net/nachricht/aktien/bessere-aussichten-lufthansa-aktie-steigt-ita-gebot-von-lufthansa-und-msc-wohl-favorit-11521806
https://www.finanzen.net/nachricht/aktien/brief-an-draghi-lufthansa-aktie-verliert-lufthansa-draengt-auf-entscheidung-bei-ita-verkauf-11464284
https://www.finanzen.net/nachricht/aktien/corona-krise-air-france-klm-aktie-tiefrott-air-france-klm-sichert-sich-milliardensu

In [68]:
# Read back everything stored in the news-content JSON file.
with open(file_path_news_content, 'r') as f:
    news_data = json.load(f)

In [69]:
# Turn the loaded records into a DataFrame for inspection.
df_news = pd.DataFrame(news_data)

In [74]:
# Display the scraped articles as the cell output.
df_news

Unnamed: 0,ISIN,link_article,headline,teaser,content_html
0,FR0000031122,https://www.finanzen.net/nachricht/aktien/priv...,Lufthansa-Aktie: Weichenstellung zu Gebot von ...,Im Ringen um die Privatisierung der italienisc...,"<div class=""teaser teaser-xs color-news"" id=""n..."
1,FR0000031122,https://www.finanzen.net/nachricht/aktien/uebe...,Lufthansa-Aktie: Italienische Gewerkschaften s...,In den stockenden Übernahmeverhandlungen um di...,"<div class=""teaser teaser-xs color-news"" id=""n..."
2,FR0000031122,https://www.finanzen.net/nachricht/aktien/air-...,Air France-KLM-Aktie: Das sind die Expertenmei...,Experten haben im letzten Monat einen genauen ...,"<div class=""teaser teaser-xs color-news"" id=""n..."
3,FR0000031122,https://www.finanzen.net/nachricht/aktien/nett...,Air France-Aktie höher: Air France-KLM landet ...,Die Fluggesellschaft Air France-KLM hat im zwe...,"<div class=""teaser teaser-xs color-news"" id=""n..."
4,FR0000031122,https://www.finanzen.net/nachricht/aktien/bess...,Lufthansa-Aktie steigt: ITA-Gebot von Lufthans...,Die italienische Regierung bevorzugt einem Med...,"<div class=""teaser teaser-xs color-news"" id=""n..."
...,...,...,...,...,...
297,AT0000969985,https://www.finanzen.net/nachricht/aktien/umsa...,AT&S nach drei Quartalen mit deutlich besserem...,Der börsennotierte steirische Leiterplattenher...,"<div class=""teaser teaser-xs color-news"" id=""n..."
298,AT0000969985,https://www.finanzen.net/nachricht/aktien/euro...,AT & S: Günstiger Elektronikspezialist,Nach dem erfolgreichen Turnaround geht es mit ...,"<div class=""teaser teaser-xs color-news"" id=""n..."
299,AT0000969985,https://www.finanzen.net/nachricht/aktien/hoeh...,AT&S kehrte 2017/18 in Gewinnzone zurück,Der österreichische Leiterplattenhersteller AT...,"<div class=""teaser teaser-xs color-news"" id=""n..."
300,AT0000969985,https://www.finanzen.net/nachricht/zertifikate...,AT&S: Comeback!,Eine lange Durststrecke haben die Aktionäre de...,"<div class=""teaser teaser-xs color-news"" id=""n..."
