In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
def is_site_check_successful():
    """Check whether the Xetra stock list page is reachable and shows results.

    Returns:
        bool: True if the page answers with HTTP 200 and its ``div.results``
        element contains the text "Ergebnisse". False in every other case,
        including when the site cannot be reached.
    """
    url = "https://www.xetra.com/xetra-de/instrumente/aktien/liste-der-handelbaren-aktien"
    try:
        # A timeout keeps the check from hanging forever on a stalled connection.
        response = requests.get(url=url, timeout=30)
    except requests.RequestException:
        # Connection errors and timeouts are simply a "no" for this check.
        print("Not reachable")
        return False

    if response.status_code != 200:
        print("Not reachable")
        # Return an explicit bool; this branch used to fall through to None.
        return False

    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find('div', {'class': 'results'})
    if results is None:
        return False
    return "Ergebnisse" in results.get_text()
        
def get_url_for_page(page_number):
    """Build the search URL for one page of the Xetra stock list.

    Args:
        page_number: Value inserted as the ``pageNum`` query parameter.

    Returns:
        str: The stock list URL followed by the search path and query string
        (sorted by title ascending, 50 hits per page).
    """
    list_url = "https://www.xetra.com/xetra-de/instrumente/aktien/liste-der-handelbaren-aktien"
    search_path = "/xetra/3002!search"
    # NOTE(review): the ``state`` value is an opaque token taken over verbatim;
    # presumably it encodes the search form's filter settings - confirm.
    query = (
        "?state=H4sIAAAAAAAAADWKsQoCMRAFf0W2TmFjkw-wsgh42IfkRQNrgrsb5Dju3z2EdDPMbJSj4Sr9Tb4NZvf3pU8rMcGU_LYfXEXtBjPIzK9qGiAhPkH-cnZUW-KRca8GnVNvvIZcyJfICkefAVnJEzkS6GB7VHznrF3saLpUY5yiJtp_lYqXCqQAAAA"
        "&sort=sTitle+asc"
        "&hitsPerPage=50"
        f"&pageNum={page_number}"
    )
    return f"{list_url}{search_path}{query}"

def get_max_page_value(soup):
    """Return the highest page number offered by the pagination panel.

    Looks at every ``<button>`` inside the ``<li>`` entries of the
    ``ul.nav-page`` element and keeps those whose attribute names are exactly
    ``value``, ``type``, ``title`` and ``name``.

    Args:
        soup: Parsed result page (a ``BeautifulSoup`` object).

    Returns:
        int: The largest ``value`` among the qualifying buttons, or 0 when the
        page has no ``ul.nav-page`` element or no qualifying button.
    """
    nav_panel = soup.find('ul', {'class': 'nav-page'})
    if nav_panel is None:
        # NOTE(review): assumes a result without pagination is a single page
        # (page 0) - confirm against the site.
        return 0

    # NOTE(review): the exact attribute-set match presumably filters out
    # buttons carrying extra attributes (e.g. current/prev/next) - confirm.
    expected_attrs = {'value', 'type', 'title', 'name'}
    page_values_in_nav_panel = []
    for list_item in nav_panel.find_all('li'):
        button = list_item.find('button')
        if button is None:
            continue
        if button.attrs.keys() == expected_attrs:
            page_values_in_nav_panel.append(int(button['value']))

    # ``default`` avoids a ValueError when no button qualified.
    return max(page_values_in_nav_panel, default=0)

def get_number_of_stocks_listed(soup):
    """Read the total hit count shown in the page's ``div.results`` element.

    The count is the first whitespace-separated token of the element's text;
    any ``.`` characters in it are removed before it is converted.

    Args:
        soup: Parsed result page (a ``BeautifulSoup`` object).

    Returns:
        int: The number taken from the start of the results text.
    """
    results_box = soup.find('div', {'class': 'results'})
    results_text = results_box.get_text()
    leading_token = results_text.split()[0]
    # Dots are stripped so that a value such as "1.053" parses as 1053.
    digits_only = leading_token.replace('.', '')
    return int(digits_only)

In [3]:
# Fetch the first result page once: it supplies both the pagination range
# and the total number of stocks the site reports.
response = requests.get(get_url_for_page(0), timeout=30)
# Fail fast on an HTTP error instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
max_page_value = get_max_page_value(soup)
number_of_stocks_listed = get_number_of_stocks_listed(soup)

In [17]:
# Walk every result page and collect the stock name and ISIN of each entry.
stock_names = []
isin_numbers = []

for page_number in range(max_page_value + 1):
    # A timeout keeps one stalled request from blocking the whole run.
    response = requests.get(get_url_for_page(page_number), timeout=30)
    if response.status_code != 200:
        # Still best-effort (the page is skipped), but the gap is now visible.
        print(f"Skipping page {page_number}: HTTP status {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    search_list = soup.find('div', {'class': 'searchList list'})
    if search_list is None:
        print(f"Skipping page {page_number}: no result list found")
        continue

    for item in search_list.find_all('li'):
        # Look each tag up once; entries lacking either one are skipped.
        name_tag = item.find('h4')
        isin_tag = item.find('p')
        if name_tag is None or isin_tag is None:
            continue
        isin_tokens = isin_tag.get_text().split()
        if not isin_tokens:
            # An empty <p> has no last token to take.
            continue
        stock_names.append(name_tag.get_text().strip())
        # The ISIN is taken as the last whitespace-separated token of the <p> text.
        isin_numbers.append(isin_tokens[-1])

df_stocks = pd.DataFrame({'ISIN': isin_numbers, 'STOCK': stock_names})

In [18]:
# Compare the number of scraped rows with the total the site reports.
stocks_extracted = df_stocks.shape[0]
# Guard against a reported total of 0 so the report itself cannot crash.
extraction_ratio = (
    stocks_extracted / number_of_stocks_listed if number_of_stocks_listed else float("nan")
)

print(f"Number of stocks listed: {number_of_stocks_listed}")
# Show the raw count and two decimals: rounding to a whole percent would
# still print 100 % with a handful of rows missing.
print(f"Stocks extracted: {stocks_extracted} ({extraction_ratio * 100:.2f} %)")


Number of stocks listed: 1053
Stocks extracted: 100.0 %


In [20]:
from datetime import datetime
import os

In [23]:
# Write the scraped table to a timestamped CSV inside the raw-data folder.
stocks_folder = "data/stocks/raw"
os.makedirs(stocks_folder, exist_ok=True)  # no error if the folder already exists

# Timestamp (local time) in the form YYYYMMDDTHHMMSS, used as file-name prefix.
datetime_string = f"{datetime.now():%Y%m%dT%H%M%S}"
file_name_stocks = f"{datetime_string}_xetra.csv"
file_path_stocks = os.path.join(stocks_folder, file_name_stocks)

# index=False leaves the DataFrame's row index out of the file.
df_stocks.to_csv(file_path_stocks, index=False)


In [22]:
# Show three randomly chosen rows as a quick sanity check of the scraped table.
df_stocks.sample(n=3)

Unnamed: 0,ISIN,STOCK
516,DE0006223605,INTERTAINMENT AG INH O.N.\n\t\t\t(EUR)
451,DE000A0S8488,HAMBURG.HAFEN LOG.A-SP NA\n\t\t\t(EUR)
252,DE000A2LQUA5,CREDITSHELF AG IA O.N.\n\t\t\t(EUR)
