In [None]:
import pandas as pd
from datetime import datetime
from progress.bar import Bar

In [None]:
def get_pred_data(bhf, zugname):
    """Look up the timetable features of one station on one train's route.

    The timetable is read from ``../data/Streckendaten/<zugname>.csv`` and
    indexed by its ``bhf`` column. Before building the path, every ``.csv``
    substring is removed from ``zugname`` and spaces become underscores.

    Args:
        bhf: Station name, matched against the ``bhf`` column of the timetable.
        zugname: Train name, with or without a ``.csv`` suffix, with spaces
            or underscores.

    Returns:
        A dict with the keys ``track_length_since_start``,
        ``time_since_first_station``, ``station_number``, ``lat``, ``lon``,
        ``track_length``, ``stay_time`` and ``time_since_last_station``
        (a missing ``stay_time`` is returned as 0), or ``False`` when

        * the timetable cannot be read,
        * the station is not in the timetable, or
        * the station occurs more than once in the timetable.

    Raises:
        KeyError: If the timetable lacks the ``bhf`` column or one of the
            feature columns.
    """
    # Normalise the name to make it compatible with more input spellings.
    zugname = zugname.replace('.csv', '').replace(' ', '_')

    fahrplan_path = '../data/Streckendaten/' + zugname + '.csv'
    try:
        fahrplan = pd.read_csv(fahrplan_path, sep=",", index_col=False, engine='c')
    except Exception:
        # Deliberately broad (missing, empty or malformed file all mean "no
        # usable timetable"), but unlike a bare ``except`` this no longer
        # swallows KeyboardInterrupt / SystemExit.
        print("\x1b[1;31m No Timetable for "+ zugname + "\x1b[0m")
        return False

    fahrplan = fahrplan.set_index('bhf')
    if bhf not in fahrplan.index:
        # Without the station in the timetable there is nothing to look up.
        print("\x1b[1;31m Bhf not in Timetable "+ bhf + "\x1b[0m")
        return False

    # A station listed several times makes every ``.at`` lookup return a
    # Series instead of a scalar (this is the "stay_time is kind of an array"
    # case). Such ambiguous stations are rejected, silently as before.
    if (fahrplan.index == bhf).sum() > 1:
        return False

    feature_keys = ('track_length_since_start',
                    'time_since_first_station',
                    'station_number',
                    'lat',
                    'lon',
                    'track_length',
                    'stay_time',
                    'time_since_last_station')
    data = {key: fahrplan.at[bhf, key] for key in feature_keys}

    # A missing stay time is treated as "no stop time".
    if pd.isna(data['stay_time']):
        data['stay_time'] = 0

    return data

In [None]:
# Connection table; it is read as a zip archive despite the .csv suffix.
path = '../data/connecting_trains.csv'

# Load the table and discard every row that contains a missing value.
df = pd.read_csv(path, compression='zip', index_col=False).dropna()

In [None]:
# Timetable features copied onto each connection row, once for the arriving
# train (prefix "an") and once for the departing train (prefix "ab").
FEATURE_NAMES = ('track_length_since_start',
                 'time_since_first_station',
                 'lat',
                 'lon',
                 'track_length',
                 'stay_time',
                 'time_since_last_station')

# Create the target columns up front so that rows whose lookup fails keep NaN
# instead of the columns only appearing once the first lookup succeeds.
for prefix in ('an', 'ab'):
    for feature in FEATURE_NAMES:
        df[prefix + '_' + feature] = float('nan')

# Iterate over the real index labels: after dropna() the index has gaps, so
# range(len(df)) would address missing or wrong rows through df.at.
for i, row_label in enumerate(df.index):
    if (i % 100000) == 0:
        print('.', end='')
    aninfo = get_pred_data(df.at[row_label, 'bhf'], df.at[row_label, 'antrain'])
    abinfo = get_pred_data(df.at[row_label, 'bhf'], df.at[row_label, 'abtrain'])

    for prefix, info in (('an', aninfo), ('ab', abinfo)):
        if not info:
            # get_pred_data returned False: leave this train's columns NaN.
            continue
        for feature in FEATURE_NAMES:
            # The "ab" columns are filled from abinfo (previously they were
            # filled from aninfo by mistake).
            df.at[row_label, prefix + '_' + feature] = info[feature]

In [None]:
# dropna() returns a new frame and leaves df untouched, so the result has to
# be assigned for the incomplete rows to be absent from the file saved below.
df = df.dropna()
df

In [None]:
# Write the enriched table as a zip archive, without the index column.
output_path = '../data/connecting_trains_full.csv'
df.to_csv(output_path, compression='zip', index=False)