In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

import random
from time import sleep

In [2]:
from tqdm import tqdm
from itertools import product
import numpy as np

In [4]:
def clean_mio(x):
    x = str(x).replace(',', '.')
    for i, j in [('Tsd.',10**3), ('Mio.',10**6), ('Mrd.',10**9)]:
        if i in x:
            return float(x.replace(i, '').strip()) * j
    return float(x)

In [5]:
def get_fundamentals(html):
	t = pd.read_html(html)[0]
	t = t.drop(t.tail(1).index, axis=0)
	t = t.replace('-', np.nan)
	t['EPS'] = t['EPS'].astype(float) / 100
	t['KGV'] = t['KGV'].astype(float) / 100
	t['KUV'] = t['KUV'].astype(float) / 100
	t['Dividende'] = t['Dividende'].astype('string').str.replace(',', '.').str.replace('%', '').astype(float) / 100
	cols = t['Datum'].values
	for col in ['Umsatz', 'EBIT', 'Gewinn']:
		t[col] = t[col].apply(clean_mio)

	t = t.T
	t = t.drop(['Datum'], axis=0).sort_index()
	cols = list(product(cols, t.index))
	cols.sort(key=lambda x: x[1])
	cols = ['_'.join([a.split(' ')[0],b]) for a, b in cols]

	df = pd.DataFrame([t.stack(dropna=False).values], columns=cols)
	# # [  row for col in t for row in t[col]  ]
	return df


In [36]:
links = open('traderfox.txt', 'w')
markets = ['alle-deutschen-aktien', 'alle-nasdaq-aktien', 'alle-nyse-aktien', 'stoxx-europe-600', 'nikkei-225']

for market in markets:
    stocks = requests.get(f'https://traderfox.de/aktien/{market}-bestandteile').content
    soup = BeautifulSoup(stocks, "html.parser")
    ahrefs = soup.select('#insert-stocks > tbody > tr > td > a')#insert-stocks > tbody > tr:nth-child(1) > td.name
    site_links = [href.get('href') for href in ahrefs]
    [links.write(f"{link.replace('/aktien/', '')}\n") for link in site_links]
    respect = random.randint(300,1400)/1000
    print('Market', market, 'done ... spleeping for', respect, 'ms')
    sleep(respect)

links.close()

Market alle-deutschen-aktien done ... spleeping for 0.424 ms
Market alle-nasdaq-aktien done ... spleeping for 0.611 ms
Market alle-nyse-aktien done ... spleeping for 0.701 ms
Market stoxx-europe-600 done ... spleeping for 1.183 ms
Market nikkei-225 done ... spleeping for 1.081 ms


In [3]:
with open('traderfox.txt', 'r') as f:
    comp_links = f.readlines()

In [145]:
# get_fundamentals(str(soup_fund))

Unnamed: 0,2021_Dividende,2020_Dividende,2019_Dividende,2018_Dividende,2017_Dividende,2016_Dividende,2015_Dividende,2014_Dividende,2013_Dividende,2021_EBIT,...,2013_KUV,2021_Umsatz,2020_Umsatz,2019_Umsatz,2018_Umsatz,2017_Umsatz,2016_Umsatz,2015_Umsatz,2014_Umsatz,2013_Umsatz
0,0.0053,0.0048,0.0048,0.0048,0.0045,0.0043,0.0039,0.0039,0.0039,17720000.0,...,3.29,266350000.0,246730000.0,236400000.0,209780000.0,189400000.0,174300000.0,152880000.0,186610000.0,126130000.0


In [53]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
driver = webdriver.Chrome(ChromeDriverManager().install())
comps = []
counter = 0
options.add_argument('--disable-gpu')
options.add_argument('user-agent=fake-useragent')
driver.maximize_window()

for comp_link in tqdm(comp_links):
    link = comp_link.split('\n')[0]
    idx = link.find('-') + 1
    company = pd.DataFrame({'name': [link[idx:]]})
    driver.get(f'http://web.archive.org/web/20221209031735/https://traderfox.de/aktien/{link}')
    source = driver.page_source.encode()
    try:
        company['sector'] = BeautifulSoup(source, 'html.parser').select('#stock-top-summary > div > table > tbody > tr > td')[2].get_text()
    except:
        company['sector'] = np.nan

    comps.append(company)
    sleep(random.randint(200,900)/1000) # RESPECT!
    counter += 1

    if counter % 10 == 0 or counter == len(comp_links):
        pd.concat(comps,axis=0).to_csv('traderfox_sectors.csv', index=False)

# driver.close()
display(pd.concat(comps,axis=0))

  driver = webdriver.Chrome(ChromeDriverManager().install())
100%|██████████| 2/2 [00:39<00:00, 19.90s/it]


Unnamed: 0,name,sector
0,11-ag,Telekommunikation / Telekomdienstleister
0,11-88-0-solutions-ag,Telekommunikation / Telekomdienstleister


In [55]:
BeautifulSoup(requests.get('http://web.archive.org/web/20221209031735/https://traderfox.de/aktien/386713-11-ag').content, "html.parser").select('#stock-top-summary > div > table > tbody > tr > td')


[<td>2,25 Mrd. EUR (176,30 Mio. x 12,780€)</td>,
 <td>https://unternehmen.1und1.de/</td>,
 <td>Telekommunikation / Telekomdienstleister</td>,
 <td><a href="/web/20221209031735/https://traderfox.de/nachrichten/dpa-afx-pro/">dpa-AFX Pro
 									<span>5</span></a></td>,
 <td><a href="/web/20221209031735/https://traderfox.de/nachrichten/386713-11-ag-inh-on/quellen-newsbot">Newsbot
 									<span>-</span></a></td>,
 <td><a href="/web/20221209031735/https://traderfox.de/nachrichten/386713-11-ag-inh-on/quellen-dpa-afx-compact">dpa-AFX
 									<span>5</span></a></td>,
 <td><a href="/web/20221209031735/https://traderfox.de/nachrichten/386713-11-ag-inh-on/quellen-reuters">Reuters
 									<span>-</span></a></td>,
 <td>0,62</td>,
 <td>0,61</td>,
 <td>0,59</td>,
 <td>0,58</td>,
 <td>5,56</td>,
 <td>6,03</td>,
 <td>10,26</td>,
 <td>6,09</td>,
 <td>0,39%</td>,
 <td>0,39%</td>,
 <td>0,39%</td>,
 <td>0,39%</td>]

In [44]:
comp_links[2]

'2079583-2g-energy-ag\n'

In [None]:
companies = []
counter = 0

for comp_link in tqdm(comp_links):
    link = comp_link.split('\n')[0]
    idx = link.find('-') + 1
    company = pd.DataFrame({'name': [link[idx:]]})
    soup = BeautifulSoup(requests.get(f'https://markets.traderfox.com/aktien/{link}').content, "html.parser")
    # try:   
    # company['sector'] = soup.select('#stock-top-summary > div > table > tbody > tr > td')[2].get_text()
    # except:
    #     company['sector'] = np.nan            # THANK YOU TRADERFOX!!! :(

    try:
        company['description'] = soup.select('#profile > div')[0].get_text()
    except:
        company['description'] = np.nan

    try:
        soup_fund = soup.select('#table-pd')[0]
        company = pd.concat([company, get_fundamentals(str(soup_fund))],axis=1)
    except:
        pass
        
    companies.append(company)
    sleep(random.randint(200,900)/1000) # RESPECT!
    counter += 1

    if counter % 10 == 0 or counter == len(comp_links):
        pd.concat(companies,axis=0).to_csv('traderfox.csv', index=False)

display(pd.concat(companies,axis=0))

In [4]:
# get sectors from cached site
companies = []
counter = 0
shuffled = random.sample(comp_links, len(comp_links))

for comp_link in tqdm(shuffled):
    link = comp_link.split('\n')[0]
    idx = link.find('-') + 1
    company = pd.DataFrame({'name': [link[idx:]]})
    try:   
        soup = BeautifulSoup(requests.get(f'http://web.archive.org/web/https://traderfox.de/aktien/{link}', timeout=4).content, "html.parser")
        company['sector'] = soup.select('#stock-top-summary > div > table > tbody > tr > td')[2].get_text()
    except:
        company['sector'] = np.nan            # THANK YOU TRADERFOX!!! :(

    # try:
    #     company['description'] = soup.select('#profile > div')[0].get_text()
    # except:
    #     company['description'] = np.nan

    # try:
    #     soup_fund = soup.select('#table-pd')[0]
    #     company = pd.concat([company, get_fundamentals(str(soup_fund))],axis=1)
    # except:
    #     pass
        
    companies.append(company)
    sleep(random.randint(200,900)/1000) # RESPECT!
    counter += 1

    if counter % 10 == 0 or counter == len(comp_links):
        pd.concat(companies,axis=0).to_csv('traderfox_sectors.csv', index=False)

display(pd.concat(companies,axis=0))

  1%|          | 33/5419 [03:16<8:09:01,  5.45s/it] 