In [None]:
import pandas as pd
import requests
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import json
import collections

In [None]:
def connections(origin, destination, dt=None, only_direct=False):
    """
    Find connections between two stations.

    Sends a GET request to the mobile.bahn.de query endpoint, prints the
    resulting request URL and passes the response HTML to
    ``parse_connections``.

    Args:
        origin (str): origin station
        destination (str): destination station
        dt (datetime, optional): date and time for the query. When omitted
            (``None``) the current time at call time is used.
        only_direct (bool): only direct connections

    Returns:
        The value returned by ``parse_connections`` for the response body.
    """
    # A `dt=datetime.now()` default would be evaluated only once, when the
    # function is defined, so later calls would silently query a stale time.
    if dt is None:
        dt = datetime.now()

    query = {
        'S': origin,
        'Z': destination,
        'date': dt.strftime("%d.%m.%y"),
        'time': dt.strftime("%H:%M"),
        'start': 1,
        'REQ0JourneyProduct_opt0': 1 if only_direct else 0
    }
    rsp = requests.get('http://mobile.bahn.de/bin/mobil/query.exe/dox?', params=query)
    print(rsp.url)
    return parse_connections(rsp.text)

In [None]:
def parse_connections(html):
    """
    Parse a connection overview page into a list of connection dicts.

    Args:
        html (str): HTML text of the overview page.

    Returns:
        list[dict]: one dict per ``td.overview.timelink`` cell found, with the
        keys 'details', 'departure', 'arrival', 'transfers', 'time',
        'products', 'price' and 'trains'. 'price' is ``None`` when the price
        cell is missing or cannot be converted to ``float``. 'trains' is the
        result of ``get_trains`` for connections with at least one transfer
        and an empty list otherwise.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Named `results` so the notebook-level function `connections` is not shadowed.
    results = []

    for row in soup.find_all("td", class_="overview timelink"):
        columns = row.parent.find_all("td")

        try:
            price_raw = columns[3].find("span", class_="bold").text.strip().replace(',', '.')
            price = float(price_raw)
        except (AttributeError, IndexError, ValueError):
            # Missing column, missing <span class="bold">, or non-numeric text:
            # treat the price as unknown. Anything else (e.g. KeyboardInterrupt)
            # is allowed to propagate.
            price = None
        data = {
            'details': columns[0].a.get('href').replace('!details=opened!', '!details=opened!detailsVerbund=opened!'),
            'departure': columns[0].a.contents[0].string,
            'arrival': columns[0].a.contents[2].string,
            'transfers': int(columns[2].contents[0]),
            'time': columns[2].contents[2],
            'products': columns[3].contents[0].split(', '),
            'price': price,
        }
        # The details page is only fetched for connections with transfers.
        if data['transfers'] != 0:
            data['trains'] = get_trains(data['details'].replace('&abroadage=1&', ''))
        else:
            data['trains'] = []

        results.append(data)
    return results

In [None]:
def _tag_tokens(row):
    """Return the tag's markup split into tokens at every '<', '>' and newline."""
    return str(row).replace('<', '\n').replace('>', '\n').split('\n')


def _split_platform_field(raw):
    """Split a space-separated field; drop the token at index 4 when it has more than five tokens."""
    parts = raw.split(' ')
    if len(parts) > 5:
        # weird edge cases 'fern'
        print('Edge case:', parts)
        parts.pop(4)
    return parts


def get_trains(url):
    """
    Fetch a connection details page and extract its legs and transfers.

    The token positions used below (e.g. index 5 / 10 for the start row,
    index 9 / 3 for the end row) depend on the exact markup of the page.
    NOTE(review): these offsets are assumed to match the current
    mobile.bahn.de layout - confirm against a live page.

    Args:
        url (str): URL of the details page.

    Returns:
        list: The first element is a summary dict with the keys 'startbhf',
        'endbhf', 'starttrain', 'startplatform', 'endplatform', 'starttime'
        and 'endtime'. It is followed by one element per transfer
        (``len(trains) - 1`` elements): a dict with the keys 'anbhf', 'abbhf',
        'antrain', 'abtrain', 'transfertime', 'anplatform', 'abplatform',
        'anzeit' and 'abzeit', or an empty string if that transfer could not
        be parsed.

    Raises:
        ValueError: if no route start or route end row is found on the page.
    """
    rsp = requests.get(url)
    soup = BeautifulSoup(rsp.text, "html.parser")
    # `start` / `end` must be local: previously they were never initialised,
    # so a fresh kernel raised NameError (or stale globals were reused).
    start, end = {}, {}
    an, ab = {}, {}
    legs, trains, date = [], [], []

    # get start
    for row in soup.find_all("div", class_="rline haupt routeStart"):
        erg = _tag_tokens(row)
        start['bhf'] = erg[5]
        start['platform'] = _split_platform_field(erg[10])

    # get end
    for row in soup.find_all("div", class_="rline haupt routeEnd routeEnd__IV"):
        erg = _tag_tokens(row)
        end['bhf'] = erg[9]
        end['platform'] = _split_platform_field(erg[3])

    # get trains
    for row in soup.find_all("a", class_="flaparrow"):
        train = str(row).split('\n')
        # check if this is the train line
        if len(train) > 2:
            # get train number into right format
            train = train[2].split(' ')
            trains.append(train[0] + '_' + train[-1])

    # get arrival info at each transfer station
    for row in soup.find_all("div", class_="rline haupt routeChange routeChange__IV"):
        erg = _tag_tokens(row)
        an[erg[9]] = _split_platform_field(erg[3])

    # get departure info at each transfer station
    departure_rows = [
        *soup.find_all("div", class_="rline haupt stationDark routeChange routeChangeIV"),
        *soup.find_all("div", class_="rline haupt routeChange routeChangeIV"),
    ]
    for row in departure_rows:
        erg = _tag_tokens(row)
        ab[erg[5]] = _split_platform_field(erg[10])

    # get date and time of connection
    for row in soup.find_all("span", class_="querysummary2"):
        erg = str(row).replace(',', '\n').split('\n')
        if "img" in erg[1]:
            erg.pop(1)
        if 'span' in erg[4]:
            erg.pop(4)
        print(erg)
        date.append(erg[3].replace(' ', ''))  # first start time
        date.append(erg[2])  # start date
        if len(erg) > 7:
            date.append(erg[6])  # end date, if it differs from the start date

    # Check that there is an equal amount of everything. The former chained
    # `a != b != c` only fired when *both* adjacent pairs differed.
    if not (len(ab) == len(an) == len(trains) - 1):
        print('ERROR')

    if 'bhf' not in start or 'bhf' not in end:
        raise ValueError('Could not find route start/end on details page: %s' % url)

    print(start)
    print(end)
    end_date = date[1] if len(date) == 2 else date[2]
    info = {
        'startbhf': start['bhf'],
        'endbhf': end['bhf'],
        'starttrain': trains[0],
        'startplatform': start['platform'][-1],
        'endplatform': end['platform'][-1],
        'starttime': datetime.strptime(start['platform'][1] + date[1], '%H:%M %d.%m.%y'),
        'endtime': datetime.strptime(end['platform'][1] + end_date, '%H:%M %d.%m.%y'),
    }
    legs.append(info)

    an_stations, ab_stations = list(an), list(ab)
    for no in range(len(trains) - 1):
        try:
            an_fields = an[an_stations[no]]
            ab_fields = ab[ab_stations[no]]
            an_time, ab_time = an_fields[1], ab_fields[1]
            # NOTE(review): both times are parsed without a date, so a transfer
            # spanning midnight would yield a negative 'transfertime' - confirm
            # whether that case matters.
            info = {
                'anbhf': an_stations[no],
                'abbhf': ab_stations[no],
                'antrain': trains[no],
                'abtrain': trains[no + 1],
                'transfertime': datetime.strptime(ab_time, '%H:%M') - datetime.strptime(an_time, '%H:%M'),
                # sometimes the platform is the same, then only one side lists it
                'anplatform': an_fields[4] if len(an_fields) == 5 else ab_fields[4],
                'abplatform': ab_fields[4] if len(ab_fields) == 5 else an_fields[4],
            }
            if len(date) > 2:
                # The trip ends on a different date: a clock time earlier than
                # the first departure time is attributed to the end date.
                first_departure = datetime.strptime(date[0], '%H:%M')
                an_day = date[2] if first_departure > datetime.strptime(an_time, '%H:%M') else date[1]
                ab_day = date[2] if first_departure > datetime.strptime(ab_time, '%H:%M') else date[1]
            else:
                an_day = ab_day = date[1]
            info['anzeit'] = datetime.strptime(an_time + an_day, '%H:%M %d.%m.%y')
            info['abzeit'] = datetime.strptime(ab_time + ab_day, '%H:%M %d.%m.%y')
            legs.append(info)
        except Exception:
            # Best effort: keep a placeholder for this transfer, but no longer
            # swallow KeyboardInterrupt / SystemExit like a bare `except:` did.
            print('Looks like something is weird with that url: \n %s' % url)
            legs.append('')
    return legs

In [None]:
# Earlier experiment, kept for reference:
#   cons = connections('Tübingen HbF', 'Mannheim HbF', datetime.now() - timedelta(4))
#   datetime(2019, 9, 14, 17, 0)
# Sample `querysummary2` token lists for trips that end on the following day
# (they look like the lists get_trains() prints while parsing):
#   ['<span class="querysummary2" id="dtlOpen_2">', 'Do', ' 03.10.19', ' 17:30', '-', 'Fr', ' 04.10.19', '06:06', '</span>']
#   ['<span class="querysummary2" id="dtlOpen_2">', 'Sa', ' 14.09.19', ' 17:15', '-', 'So', ' 15.09.19', '06:06', '</span>']
cons = connections(
    'Tübingen Hbf',
    'Berlin HbF',
    dt=datetime(2019, 9, 14, 17, 0),
)

In [None]:
# Removed leftover debug lookup of `date`: that name is only a local list
# inside get_trains() and is never defined at notebook level here, so
# evaluating it in this cell fails with NameError after Restart & Run All.

In [None]:
# Show the parsed connection list as this cell's output.
cons