<a href="https://colab.research.google.com/github/RemyaVKarthikeyan/AA-Stagecoach-Project/blob/main/04_07_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from IPython.display import clear_output
import pandas as pd
import requests
from difflib import SequenceMatcher
from datetime import datetime, timedelta
import pytz
import time

# Function to normalize stop names
def normalize_stop_name(name):
    return ' '.join(name.lower().split())

# Function to fetch data from the API
def fetch_data(url):
    response = requests.get(url)
    return response.json()

# Function to extract schedule names
def extract_schedule_names(data, schedule_names_dict={}):
    if isinstance(data, dict):
        if data.get('$type') == "Tfl.Api.Presentation.Entities.Schedule, Tfl.Api.Presentation.Entities" and 'knownJourneys' in data:
            if 'name' in data:
                schedule_names_dict[data['name']] = data['knownJourneys']
        for key, value in data.items():
            extract_schedule_names(value, schedule_names_dict)
    elif isinstance(data, list):
        for item in data:
            extract_schedule_names(item, schedule_names_dict)
    return schedule_names_dict

# Function to categorize journeys into hourly slots
def categorize_into_slots(timetable):
    slots = [[] for _ in range(24)]
    for journey in timetable:
        hour = int(journey['hour'])  # Convert hour to integer
        if 0 <= hour < 24:  # Ensure hour is within the valid range
            slots[hour].append(journey)
    return slots

# Function to fetch the current day of the week
def get_day_of_week():
    bst = pytz.timezone('Europe/London')
    now = datetime.now(bst)
    return now.strftime('%A')  # %A gives full weekday name (e.g., 'Monday')

# Function to retrieve stop names from TfL API and match with Route_Dir_QSI_No
def find_route_details(lineID, df):
    # Ensure the 'Route_Dir_QSI_No' column exists
    if 'Route_Dir_QSI_No' not in df.columns:
        print("The 'Route_Dir_QSI_No' column is not present in the provided file.")
        return

    # Convert the lineID to uppercase to ensure case-insensitivity
    lineID = lineID.upper()

    # Convert the 'Route_Dir_QSI_No' column to uppercase for comparison
    df['Route_Dir_QSI_No'] = df['Route_Dir_QSI_No'].str.upper()

    # Normalize the stop names in the DataFrame
    df['STOP_NAME'] = df['STOP_NAME'].apply(normalize_stop_name)

    # Filter the DataFrame based on the lineID
    pattern_A = f"^{lineID}_A\\d+$"  # Regular expression for D7_A**
    pattern_B = f"^{lineID}_B\\d+$"  # Regular expression for D7_B**

    # Filter rows where the 'Route_Dir_QSI_No' column matches the pattern
    filtered_df_A = df[df['Route_Dir_QSI_No'].str.match(pattern_A, na=False)]
    filtered_df_B = df[df['Route_Dir_QSI_No'].str.match(pattern_B, na=False)]

    # Function to fetch and process route sequence data from TfL API
    def fetch_and_process_route_data(route_type, pattern, filtered_df):
        api_url = f"https://api.tfl.gov.uk/Line/{lineID}/Route/Sequence/{route_type}"
        response = requests.get(api_url)

        results_list = []

        if response.status_code == 200:
            route_data = response.json()

            # Iterate through each stop in the route data
            for stop in route_data['stopPointSequences'][0]['stopPoint']:
                stop_name_api = normalize_stop_name(stop['name'])
                stop_id = stop['id']

                # Check if the stop_name_api exists in the filtered DataFrame for the correct direction
                matched_row = filtered_df[(filtered_df['STOP_NAME'] == stop_name_api) &
                                          (filtered_df['Route_Dir_QSI_No'].str.match(pattern))]

                if not matched_row.empty:
                    route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                    results_list.append({
                        'Route_Dir_QSI_No': route_dir_qsi_no,
                        'STOP_Name': stop['name'],
                        'ID': stop_id
                    })
                else:
                    # If exact match not found, try partial matching based on words before and after '/'
                    api_stop_name_parts = stop_name_api.split('/')
                    for index, row in filtered_df.iterrows():
                        df_stop_name_parts = row['STOP_NAME'].split('/')
                        for api_part in api_stop_name_parts:
                            for df_part in df_stop_name_parts:
                                if SequenceMatcher(None, df_part.strip(), api_part.strip()).ratio() > 0.8:
                                    matched_row = pd.DataFrame([row])
                                    break
                            if not matched_row.empty:
                                break
                        if not matched_row.empty:
                            break

                    if not matched_row.empty:
                        route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                        results_list.append({
                            'Route_Dir_QSI_No': route_dir_qsi_no,
                            'STOP_Name': stop['name'],
                            'ID': stop_id
                        })
        else:
            print(f"Failed to fetch route sequence data from TfL API for {route_type} route. Status code: {response.status_code}")

        return results_list

    # Fetch and process outbound route data for _A**
    matched_results_A = fetch_and_process_route_data('outbound', pattern_A, filtered_df_A)

    # Fetch and process inbound route data for _B**
    matched_results_B = fetch_and_process_route_data('inbound', pattern_B, filtered_df_B)

    # Create DataFrames from the matched results for each direction
    matched_results_df_A = pd.DataFrame(matched_results_A)
    matched_results_df_B = pd.DataFrame(matched_results_B)

    # Function to remove partial matches if exact matches are found
    def remove_partial_matches(exact_df, matched_df):
        for index, row in exact_df.iterrows():
            exact_stop_name = row['STOP_NAME']
            route_dir_qsi_no = row['Route_Dir_QSI_No']
            # Find exact matches in matched_df
            exact_matches = matched_df[(matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                       (matched_df['STOP_Name'].apply(normalize_stop_name) == exact_stop_name)]
            if not exact_matches.empty:
                # Remove partial matches
                matched_df = matched_df[~((matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                          (matched_df['STOP_Name'].apply(normalize_stop_name) != exact_stop_name))]
        return matched_df

    # Remove partial matches for direction A
    matched_results_df_A = remove_partial_matches(filtered_df_A, matched_results_df_A)

    # Remove partial matches for direction B
    matched_results_df_B = remove_partial_matches(filtered_df_B, matched_results_df_B)

    # Remove duplicate stop names with the same Route_Dir_QSI_No and different IDs
    matched_results_df_A = matched_results_df_A.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')
    matched_results_df_B = matched_results_df_B.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')

    # Print the matched results for direction A
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_A\033[0m\n")
    matched_results_df_A = matched_results_df_A[matched_results_df_A['Route_Dir_QSI_No'].str.match(pattern_A)]
    print(matched_results_df_A[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Print the matched results for direction B
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_B\033[0m\n")
    matched_results_df_B = matched_results_df_B[matched_results_df_B['Route_Dir_QSI_No'].str.match(pattern_B)]
    print(matched_results_df_B[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Concatenate the matched results DataFrames for directions A and B
    combined_df = pd.concat([matched_results_df_A, matched_results_df_B], ignore_index=True)

    # DataFrames to store SWT data
    swt_data = {
        'Route_Dir_QSI_No': [],
        'ID': [],
        'SWT_minutes': [],
        'Number_of_buses': []
    }

    # Fetch timetable for each stop point ID and calculate SWT
    bst = pytz.timezone('Europe/London')
    current_hour = datetime.now(bst).hour
    day_of_week = get_day_of_week()

    # Store selected schedule name to ensure it's printed only once
    selected_schedule_name = None
    printed_schedule_name = False

    # Track printed timetable stop IDs
    printed_timetable_stop_ids = []

    while True:
        # Update current time and hour
        now = datetime.now(bst)
        current_hour = now.hour
        day_of_week = get_day_of_week()

        # Clear previous SWT data
        swt_data = {
            'Route_Dir_QSI_No': [],
            'ID': [],
            'SWT_minutes': [],
            'Number_of_buses': []
        }

        for index, row in combined_df.iterrows():
            qsi_number = row['Route_Dir_QSI_No']
            stop_point_id = row['ID']

            # Fetch timetable data
            timetable_url = f"https://api.tfl.gov.uk/Line/{lineID}/Timetable/{stop_point_id}"
            timetable_data = fetch_data(timetable_url)

            # Extract schedule names and find the selected schedule name
            schedule_names_dict = extract_schedule_names(timetable_data)
            if schedule_names_dict:
                for name in schedule_names_dict.keys():
                    if day_of_week in name:
                        selected_schedule_name = name
                        break

            if selected_schedule_name:
                if not printed_schedule_name:
                    print(f"\033[1mSelected schedule name: {selected_schedule_name}\033[0m")
                    printed_schedule_name = True

                timetable = schedule_names_dict[selected_schedule_name]

                # Iterate through the timetable and calculate the number of buses and SWT
                buses_count = 0
                swt_minutes = None
                current_hour_buses = []

                for bus in timetable:
                    # Extract and convert bus time to datetime
                    bus_time_str = bus['time']
                    bus_time = datetime.strptime(bus_time_str, '%H:%M:%S').time()
                    bus_datetime = datetime.combine(now.date(), bus_time, bst)

                    # Calculate the difference in minutes
                    minutes_diff = (bus_datetime - now).total_seconds() / 60

                    if minutes_diff >= 0:
                        buses_count += 1
                        if swt_minutes is None or minutes_diff < swt_minutes:
                            swt_minutes = minutes_diff

                swt_data['Route_Dir_QSI_No'].append(qsi_number)
                swt_data['ID'].append(stop_point_id)
                swt_data['SWT_minutes'].append(swt_minutes if swt_minutes is not None else 'N/A')
                swt_data['Number_of_buses'].append(buses_count)

                # Print timetable if not already printed for this stop ID
                if stop_point_id not in printed_timetable_stop_ids:
                    printed_timetable_stop_ids.append(stop_point_id)
                    print(f"\n\n\033[1mTimetable for stop point ID {stop_point_id} (Route_Dir_QSI_No: {qsi_number}):\033[0m\n")
                    timetable_df = pd.DataFrame(timetable)
                    timetable_df['time'] = timetable_df['hour'] + ':' + timetable_df['minute']
                    timetable_df['time'] = timetable_df['time'].apply(lambda x: datetime.strptime(x, '%H:%M').time())
                    print(timetable_df[['time']])

        # Print SWT data
        swt_df = pd.DataFrame(swt_data)
        print("\n\n\033[1mStop Waiting Time (SWT) Data:\033[0m")
        print(swt_df[['Route_Dir_QSI_No', 'ID', 'SWT_minutes', 'Number_of_buses']])

        # Sleep for 5 minutes
        time.sleep(300)

# Example usage
if __name__ == "__main__":
    # Load the Excel file into a DataFrame
    file_path = '/content/QSI points.xlsx'  # Modify this path accordingly
    df = pd.read_excel(file_path)

    # Ask the user to enter a lineID
    line_id_input = input("Please enter the lineID: ")

    # Find and display the route details
    find_route_details(line_id_input, df)


Please enter the lineID: d7


[1m[4mQSI stop points for direction D7_A[0m

  Route_Dir_QSI_No                    STOP_Name          ID
0            D7_A1   Poplar / All Saints Church  490011107G
1            D7_A2               Stewart Street  490013513S
2            D7_A3       Island Gardens Station  490002048Z
3            D7_A4  Arnhem Wharf Primary School  490006092N
5            D7_A5         East India Dock Road  490004584N


[1m[4mQSI stop points for direction D7_B[0m

  Route_Dir_QSI_No                    STOP_Name          ID
0            D7_B1  Mile End Station / Bow Road  490015151H
2            D7_B2         East India Dock Road  490004584S
3            D7_B3         Canary Wharf Station  490000038F
5            D7_B4  Arnhem Wharf Primary School  490006092S
6            D7_B5       Island Gardens Station  490002048X
7            D7_B6               Stewart Street  490013513N


[1mStop Waiting Time (SWT) Data:[0m
Empty DataFrame
Columns: [Route_Dir_QSI_No, ID, SWT_

KeyboardInterrupt: 

In [None]:
from IPython.display import clear_output
import pandas as pd
import requests
from difflib import SequenceMatcher
from datetime import datetime, timedelta
import pytz
import time

# Function to normalize stop names
def normalize_stop_name(name):
    return ' '.join(name.lower().split())

# Function to fetch data from the API
def fetch_data(url):
    response = requests.get(url)
    return response.json()

# Function to extract schedule names
def extract_schedule_names(data, schedule_names_dict={}):
    if isinstance(data, dict):
        if data.get('$type') == "Tfl.Api.Presentation.Entities.Schedule, Tfl.Api.Presentation.Entities" and 'knownJourneys' in data:
            if 'name' in data:
                schedule_names_dict[data['name']] = data['knownJourneys']
        for key, value in data.items():
            extract_schedule_names(value, schedule_names_dict)
    elif isinstance(data, list):
        for item in data:
            extract_schedule_names(item, schedule_names_dict)
    return schedule_names_dict

# Function to categorize journeys into hourly slots
def categorize_into_slots(timetable):
    slots = [[] for _ in range(24)]
    for journey in timetable:
        hour = int(journey['hour'])  # Convert hour to integer
        if 0 <= hour < 24:  # Ensure hour is within the valid range
            slots[hour].append(journey)
    return slots

# Function to fetch the current day of the week
def get_day_of_week():
    bst = pytz.timezone('Europe/London')
    now = datetime.now(bst)
    return now.strftime('%A')  # %A gives full weekday name (e.g., 'Monday')

# Function to retrieve stop names from TfL API and match with Route_Dir_QSI_No
def find_route_details(lineID, df):
    # Ensure the 'Route_Dir_QSI_No' column exists
    if 'Route_Dir_QSI_No' not in df.columns:
        print("The 'Route_Dir_QSI_No' column is not present in the provided file.")
        return

    # Convert the lineID to uppercase to ensure case-insensitivity
    lineID = lineID.upper()

    # Convert the 'Route_Dir_QSI_No' column to uppercase for comparison
    df['Route_Dir_QSI_No'] = df['Route_Dir_QSI_No'].str.upper()

    # Normalize the stop names in the DataFrame
    df['STOP_NAME'] = df['STOP_NAME'].apply(normalize_stop_name)

    # Filter the DataFrame based on the lineID
    pattern_A = f"^{lineID}_A\\d+$"  # Regular expression for D7_A**
    pattern_B = f"^{lineID}_B\\d+$"  # Regular expression for D7_B**

    # Filter rows where the 'Route_Dir_QSI_No' column matches the pattern
    filtered_df_A = df[df['Route_Dir_QSI_No'].str.match(pattern_A, na=False)]
    filtered_df_B = df[df['Route_Dir_QSI_No'].str.match(pattern_B, na=False)]

    # Function to fetch and process route sequence data from TfL API
    def fetch_and_process_route_data(route_type, pattern, filtered_df):
        api_url = f"https://api.tfl.gov.uk/Line/{lineID}/Route/Sequence/{route_type}"
        response = requests.get(api_url)

        results_list = []

        if response.status_code == 200:
            route_data = response.json()

            # Iterate through each stop in the route data
            for stop in route_data['stopPointSequences'][0]['stopPoint']:
                stop_name_api = normalize_stop_name(stop['name'])
                stop_id = stop['id']

                # Check if the stop_name_api exists in the filtered DataFrame for the correct direction
                matched_row = filtered_df[(filtered_df['STOP_NAME'] == stop_name_api) &
                                          (filtered_df['Route_Dir_QSI_No'].str.match(pattern))]

                if not matched_row.empty:
                    route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                    results_list.append({
                        'Route_Dir_QSI_No': route_dir_qsi_no,
                        'STOP_Name': stop['name'],
                        'ID': stop_id
                    })
                else:
                    # If exact match not found, try partial matching based on words before and after '/'
                    api_stop_name_parts = stop_name_api.split('/')
                    for index, row in filtered_df.iterrows():
                        df_stop_name_parts = row['STOP_NAME'].split('/')
                        for api_part in api_stop_name_parts:
                            for df_part in df_stop_name_parts:
                                if SequenceMatcher(None, df_part.strip(), api_part.strip()).ratio() > 0.8:
                                    matched_row = pd.DataFrame([row])
                                    break
                            if not matched_row.empty:
                                break
                        if not matched_row.empty:
                            break

                    if not matched_row.empty:
                        route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                        results_list.append({
                            'Route_Dir_QSI_No': route_dir_qsi_no,
                            'STOP_Name': stop['name'],
                            'ID': stop_id
                        })
        else:
            print(f"Failed to fetch route sequence data from TfL API for {route_type} route. Status code: {response.status_code}")

        return results_list

    # Fetch and process outbound route data for _A**
    matched_results_A = fetch_and_process_route_data('outbound', pattern_A, filtered_df_A)

    # Fetch and process inbound route data for _B**
    matched_results_B = fetch_and_process_route_data('inbound', pattern_B, filtered_df_B)

    # Create DataFrames from the matched results for each direction
    matched_results_df_A = pd.DataFrame(matched_results_A)
    matched_results_df_B = pd.DataFrame(matched_results_B)

    # Function to remove partial matches if exact matches are found
    def remove_partial_matches(exact_df, matched_df):
        for index, row in exact_df.iterrows():
            exact_stop_name = row['STOP_NAME']
            route_dir_qsi_no = row['Route_Dir_QSI_No']
            # Find exact matches in matched_df
            exact_matches = matched_df[(matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                       (matched_df['STOP_Name'].apply(normalize_stop_name) == exact_stop_name)]
            if not exact_matches.empty:
                # Remove partial matches
                matched_df = matched_df[~((matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                          (matched_df['STOP_Name'].apply(normalize_stop_name) != exact_stop_name))]
        return matched_df

    # Remove partial matches for direction A
    matched_results_df_A = remove_partial_matches(filtered_df_A, matched_results_df_A)

    # Remove partial matches for direction B
    matched_results_df_B = remove_partial_matches(filtered_df_B, matched_results_df_B)

    # Remove duplicate stop names with the same Route_Dir_QSI_No and different IDs
    matched_results_df_A = matched_results_df_A.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')
    matched_results_df_B = matched_results_df_B.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')

    # Print the matched results for direction A
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_A\033[0m\n")
    matched_results_df_A = matched_results_df_A[matched_results_df_A['Route_Dir_QSI_No'].str.match(pattern_A)]
    print(matched_results_df_A[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Print the matched results for direction B
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_B\033[0m\n")
    matched_results_df_B = matched_results_df_B[matched_results_df_B['Route_Dir_QSI_No'].str.match(pattern_B)]
    print(matched_results_df_B[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Concatenate the matched results DataFrames for directions A and B
    combined_df = pd.concat([matched_results_df_A, matched_results_df_B], ignore_index=True)

    # DataFrames to store SWT data
    swt_data = {
        'Route_Dir_QSI_No': [],
        'ID': [],
        'SWT_minutes': [],
        'Number_of_buses': []
    }

    # Fetch timetable for each stop point ID and calculate SWT
    bst = pytz.timezone('Europe/London')
    current_hour = datetime.now(bst).hour
    day_of_week = get_day_of_week()

    # Store selected schedule name to ensure it's printed only once
    selected_schedule_name = None
    printed_schedule_name = False

    # Track printed timetable stop IDs
    printed_timetable_stop_ids = []

    while True:
        # Update current time and hour
        now = datetime.now(bst)
        current_hour = now.hour
        day_of_week = get_day_of_week()

        # Clear previous SWT data
        swt_data = {
            'Route_Dir_QSI_No': [],
            'ID': [],
            'SWT_minutes': [],
            'Number_of_buses': []
        }

        for index, row in combined_df.iterrows():
            stop_point_id = row['ID']
            route_dir_qsi_no = row['Route_Dir_QSI_No']

            if f"{lineID}_A" in route_dir_qsi_no:
                direction = 'outbound'
            elif f"{lineID}_B" in route_dir_qsi_no:
                direction = 'inbound'
            else:
                print(f"Invalid route direction for Route_Dir_QSI_No: {route_dir_qsi_no}")
                continue

            url = f'https://api.tfl.gov.uk/Line/{lineID}/Timetable/{stop_point_id}?direction={direction}'
            data = fetch_data(url)

            schedule_names_dict = extract_schedule_names(data)

            if not selected_schedule_name:
                if day_of_week.lower() in ['monday', 'tuesday', 'wednesday', 'thursday']:
                    preferred_schedule_names = ['Mon-Fri Schooldays', 'Monday to Thursday', 'Monday to Friday']
                elif day_of_week.lower() == 'friday':
                    preferred_schedule_names = ['Mon-Fri Schooldays', 'Monday to Friday', 'Friday']
                elif day_of_week.lower() == 'saturday':
                    preferred_schedule_names = ['Saturday']
                elif day_of_week.lower() == 'sunday':
                    preferred_schedule_names = ['Sunday']
                else:
                    preferred_schedule_names = [day_of_week]

                for preferred_name in preferred_schedule_names:
                    if preferred_name in schedule_names_dict:
                        selected_schedule_name = preferred_name
                        break

            if selected_schedule_name and not printed_schedule_name:
                print(f"\nToday is {day_of_week}. The selected Schedule name is {selected_schedule_name}.")
                printed_schedule_name = True

            if selected_schedule_name:
                timetable = schedule_names_dict[selected_schedule_name]
                slots = categorize_into_slots(timetable)

                # Calculate Scheduled Wait Time (SWT)
                total_buses_this_hour = len(slots[current_hour])
                if total_buses_this_hour > 0:
                    scheduled_wait_time = 60 / (total_buses_this_hour * 2)  # SWT formula
                else:
                    scheduled_wait_time = float('inf')  # Handle division by zero scenario (though unlikely)

                # Store SWT data
                swt_data['Route_Dir_QSI_No'].append(route_dir_qsi_no)
                swt_data['ID'].append(stop_point_id)
                swt_data['SWT_minutes'].append(scheduled_wait_time)
                swt_data['Number_of_buses'].append(total_buses_this_hour)

                # Fetch arrival predictions based on SWT data for printed timetable stop IDs
                if stop_point_id in printed_timetable_stop_ids:
                    # Fetch arrival predictions
                    arrival_predictions_df, station_name = fetch_arrival_predictions(lineID, stop_point_id, direction)

                    if arrival_predictions_df is not None and not arrival_predictions_df.empty:
                        #print(f"\nArrival Predictions for stop point {stop_point_id} ({station_name}):")
                        #print(arrival_predictions_df.to_string(index=False))
                        print(f"   ")
                        # Calculating summary metrics
                        total_wawt = arrival_predictions_df['WAWT'].sum()
                        min_arrival = arrival_predictions_df['Expected Arrival (BST)'].min().replace(second=0, microsecond=0)
                        max_arrival = arrival_predictions_df['Expected Arrival (BST)'].max().replace(second=0, microsecond=0)
                        time_diff_minutes = (max_arrival - min_arrival).total_seconds() / 60
                        num_buses_observed = arrival_predictions_df['Vehicle ID'].nunique()

                        # Calculating AWT, SWT, and EWT
                        nbph = swt_data['Number_of_buses'][swt_data['ID'].index(stop_point_id)]
                        swt = swt_data['SWT_minutes'][swt_data['ID'].index(stop_point_id)]
                        awt = round(total_wawt / time_diff_minutes, 2) if time_diff_minutes > 0 else 0
                        ewt = round(awt - swt, 2)

                        summary_df = pd.DataFrame({
                            'Metric': ['Number of buses scheduled per hour (nbph)', 'Number of buses observed', 'Total WAWT (minutes)',
                                       'Time difference between 1st and last observed buses (minutes)', 'AWT (minutes)', 'SWT (minutes)', 'EWT (minutes)'],
                            'Value': [nbph, num_buses_observed, total_wawt, time_diff_minutes, awt, swt, ewt]
                        })

                        print(f"\nSummary Metrics:{swt_data['Route_Dir_QSI_No'][swt_data['ID'].index(stop_point_id)]}     {stop_point_id}")
                        print(summary_df)

                    # Remove the stop ID from printed_timetable_stop_ids to avoid redundant fetches
                    printed_timetable_stop_ids.remove(stop_point_id)

        # Update printed timetable stop IDs
        printed_timetable_stop_ids = swt_data['ID']

        # Create DataFrame for SWT data
        swt_df = pd.DataFrame(swt_data)

        # Print the SWT DataFrame
        print(f"\n\nSWT DataFrame at hour {current_hour}")
        print(swt_df)

        # Wait for the next 30 seconds
        #clear_output()
        print("\n\nWaiting to fetch updated data...\n\n")
        time.sleep(30)

# Function to fetch arrival predictions with error handling
def fetch_arrival_predictions(line_id, stop_point_id, direction):
    try:
        base_url = f"https://api.tfl.gov.uk/Line/{line_id}/Arrivals/{stop_point_id}"
        params = {'direction': direction}
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        data = response.json()
        if len(data) == 0:
            return pd.DataFrame(), None  # No data available
        station_name = data[0]['stationName']
        predictions = []
        for item in data:
            arrival_time = datetime.strptime(item['expectedArrival'], '%Y-%m-%dT%H:%M:%SZ')
            arrival_time_bst = arrival_time + timedelta(hours=1)
            predictions.append({
                'Line': item['lineName'],
                'Vehicle ID': item['vehicleId'],
                'Stop Point': stop_point_id,
                'Direction': direction,
                'Expected Arrival (BST)': arrival_time_bst,
                'Expected Arrival (HM)': arrival_time_bst.strftime('%H:%M')
            })
        df = pd.DataFrame(predictions)
        df = df.sort_values(by='Expected Arrival (BST)', ascending=True)
        df['Expected Arrival (BST)'] = pd.to_datetime(df['Expected Arrival (BST)'])  # Convert to datetime
        df['Expected Arrival (HM)'] = pd.to_datetime(df['Expected Arrival (HM)'], format='%H:%M')
        df['Headway (minutes)'] = df['Expected Arrival (HM)'].diff().fillna(pd.Timedelta(seconds=0)).dt.total_seconds() / 60
        df['AWT/bus (minutes)'] = (df['Headway (minutes)'] / 2).round(2)
        df['WAWT'] = (df['Headway (minutes)'] * df['AWT/bus (minutes)']).round(2)
        return df, station_name
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None, None

# Main loop to fetch and display data
def main_loop(line_id, direction, qsi_stop_ids):

    while True:
        qsi_data = []  # List to accumulate data for all QSI stop points
        for idx, stop_point_id in enumerate(qsi_stop_ids):
            df, station_name = fetch_arrival_predictions(line_id, stop_point_id, direction)
            if df is not None and station_name is not None:
                total_wawt = df['WAWT'].sum()
                min_arrival = df['Expected Arrival (BST)'].min().replace(second=0, microsecond=0)
                max_arrival = df['Expected Arrival (BST)'].max().replace(second=0, microsecond=0)
                time_diff_minutes = (max_arrival - min_arrival).total_seconds() / 60
                awt = round(total_wawt / time_diff_minutes, 2) if time_diff_minutes > 0 else 0
                num_buses_observed = df['Vehicle ID'].nunique()
                summary_df = pd.DataFrame({
                    'Metric': ['Number of buses observed', 'Total WAWT (minutes)', 'Time difference between 1st and last observed buses (minutes)', 'AWT (minutes)'],
                    'Value': [num_buses_observed, total_wawt, time_diff_minutes, awt]
                })
                print(f"\nSummary Metrics for QSI stop point {stop_point_id}:")
                print(summary_df)
            qsi_data.append({
                'Stop Point': stop_point_id,
                'Arrival Predictions': df,
                'Station Name': station_name,
                'Summary Metrics': summary_df
            })
            time.sleep(30)  # Wait for 30 seconds before fetching data again

# Example usage
if __name__ == "__main__":
    # Load the Excel file into a DataFrame
    file_path = '/content/QSI points.xlsx'  # Modify this path accordingly
    df = pd.read_excel(file_path)

    # Ask the user to enter a lineID
    line_id_input = input("Please enter the lineID: ")

    # Find and display the route details
    find_route_details(line_id_input, df)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
6                                      EWT (minutes)    0.0
   

Summary Metrics:D7_B4     490006092S
                                              Metric  Value
0          Number of buses scheduled per hour (nbph)    5.0
1                           Number of buses observed    3.0
2                               Total WAWT (minutes)  144.0
3  Time difference between 1st and last observed ...   24.0
4                                      AWT (minutes)    6.0
5                                      SWT (minutes)    6.0
6                                      EWT (minutes)    0.0
   

Summary Metrics:D7_B5     490002048X
                                              Metric  Value
0          Number of buses scheduled per hour (nbph)    5.0
1                           Number of buses observed    2.0
2                               Total WAWT (minutes)   72.0
3  Time difference between 1st and last observed ...   12.0
4          

KeyboardInterrupt: 

In [None]:
from IPython.display import clear_output
import pandas as pd
import requests
from difflib import SequenceMatcher
from datetime import datetime, timedelta
import pytz
import time

# Function to normalize stop names
def normalize_stop_name(name):
    return ' '.join(name.lower().split())

# Function to fetch data from the API
def fetch_data(url):
    response = requests.get(url)
    return response.json()

# Function to extract schedule names
def extract_schedule_names(data, schedule_names_dict={}):
    if isinstance(data, dict):
        if data.get('$type') == "Tfl.Api.Presentation.Entities.Schedule, Tfl.Api.Presentation.Entities" and 'knownJourneys' in data:
            if 'name' in data:
                schedule_names_dict[data['name']] = data['knownJourneys']
        for key, value in data.items():
            extract_schedule_names(value, schedule_names_dict)
    elif isinstance(data, list):
        for item in data:
            extract_schedule_names(item, schedule_names_dict)
    return schedule_names_dict

# Function to categorize journeys into hourly slots
def categorize_into_slots(timetable):
    slots = [[] for _ in range(24)]
    for journey in timetable:
        hour = int(journey['hour'])  # Convert hour to integer
        if 0 <= hour < 24:  # Ensure hour is within the valid range
            slots[hour].append(journey)
    return slots

# Function to fetch the current day of the week
def get_day_of_week():
    bst = pytz.timezone('Europe/London')
    now = datetime.now(bst)
    return now.strftime('%A')  # %A gives full weekday name (e.g., 'Monday')

# Function to retrieve stop names from TfL API and match with Route_Dir_QSI_No
def find_route_details(lineID, df):
    # Ensure the 'Route_Dir_QSI_No' column exists
    if 'Route_Dir_QSI_No' not in df.columns:
        print("The 'Route_Dir_QSI_No' column is not present in the provided file.")
        return

    # Convert the lineID to uppercase to ensure case-insensitivity
    lineID = lineID.upper()

    # Convert the 'Route_Dir_QSI_No' column to uppercase for comparison
    df['Route_Dir_QSI_No'] = df['Route_Dir_QSI_No'].str.upper()

    # Normalize the stop names in the DataFrame
    df['STOP_NAME'] = df['STOP_NAME'].apply(normalize_stop_name)

    # Filter the DataFrame based on the lineID
    pattern_A = f"^{lineID}_A\\d+$"  # Regular expression for D7_A**
    pattern_B = f"^{lineID}_B\\d+$"  # Regular expression for D7_B**

    # Filter rows where the 'Route_Dir_QSI_No' column matches the pattern
    filtered_df_A = df[df['Route_Dir_QSI_No'].str.match(pattern_A, na=False)]
    filtered_df_B = df[df['Route_Dir_QSI_No'].str.match(pattern_B, na=False)]

    # Function to fetch and process route sequence data from TfL API
    def fetch_and_process_route_data(route_type, pattern, filtered_df):
        api_url = f"https://api.tfl.gov.uk/Line/{lineID}/Route/Sequence/{route_type}"
        response = requests.get(api_url)

        results_list = []

        if response.status_code == 200:
            route_data = response.json()

            # Iterate through each stop in the route data
            for stop in route_data['stopPointSequences'][0]['stopPoint']:
                stop_name_api = normalize_stop_name(stop['name'])
                stop_id = stop['id']

                # Check if the stop_name_api exists in the filtered DataFrame for the correct direction
                matched_row = filtered_df[(filtered_df['STOP_NAME'] == stop_name_api) &
                                          (filtered_df['Route_Dir_QSI_No'].str.match(pattern))]

                if not matched_row.empty:
                    route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                    results_list.append({
                        'Route_Dir_QSI_No': route_dir_qsi_no,
                        'STOP_Name': stop['name'],
                        'ID': stop_id
                    })
                else:
                    # If exact match not found, try partial matching based on words before and after '/'
                    api_stop_name_parts = stop_name_api.split('/')
                    for index, row in filtered_df.iterrows():
                        df_stop_name_parts = row['STOP_NAME'].split('/')
                        for api_part in api_stop_name_parts:
                            for df_part in df_stop_name_parts:
                                if SequenceMatcher(None, df_part.strip(), api_part.strip()).ratio() > 0.8:
                                    matched_row = pd.DataFrame([row])
                                    break
                            if not matched_row.empty:
                                break
                        if not matched_row.empty:
                            break

                    if not matched_row.empty:
                        route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                        results_list.append({
                            'Route_Dir_QSI_No': route_dir_qsi_no,
                            'STOP_Name': stop['name'],
                            'ID': stop_id
                        })
        else:
            print(f"Failed to fetch route sequence data from TfL API for {route_type} route. Status code: {response.status_code}")

        return results_list

    # Fetch and process outbound route data for _A**
    matched_results_A = fetch_and_process_route_data('outbound', pattern_A, filtered_df_A)

    # Fetch and process inbound route data for _B**
    matched_results_B = fetch_and_process_route_data('inbound', pattern_B, filtered_df_B)

    # Create DataFrames from the matched results for each direction
    matched_results_df_A = pd.DataFrame(matched_results_A)
    matched_results_df_B = pd.DataFrame(matched_results_B)

    # Function to remove partial matches if exact matches are found
    def remove_partial_matches(exact_df, matched_df):
        for index, row in exact_df.iterrows():
            exact_stop_name = row['STOP_NAME']
            route_dir_qsi_no = row['Route_Dir_QSI_No']
            # Find exact matches in matched_df
            exact_matches = matched_df[(matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                       (matched_df['STOP_Name'].apply(normalize_stop_name) == exact_stop_name)]
            if not exact_matches.empty:
                # Remove partial matches
                matched_df = matched_df[~((matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                          (matched_df['STOP_Name'].apply(normalize_stop_name) != exact_stop_name))]
        return matched_df

    # Remove partial matches for direction A
    matched_results_df_A = remove_partial_matches(filtered_df_A, matched_results_df_A)

    # Remove partial matches for direction B
    matched_results_df_B = remove_partial_matches(filtered_df_B, matched_results_df_B)

    # Remove duplicate stop names with the same Route_Dir_QSI_No and different IDs
    matched_results_df_A = matched_results_df_A.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')
    matched_results_df_B = matched_results_df_B.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')

    # Print the matched results for direction A
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_A\033[0m\n")
    matched_results_df_A = matched_results_df_A[matched_results_df_A['Route_Dir_QSI_No'].str.match(pattern_A)]
    print(matched_results_df_A[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Print the matched results for direction B
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_B\033[0m\n")
    matched_results_df_B = matched_results_df_B[matched_results_df_B['Route_Dir_QSI_No'].str.match(pattern_B)]
    print(matched_results_df_B[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Concatenate the matched results DataFrames for directions A and B
    combined_df = pd.concat([matched_results_df_A, matched_results_df_B], ignore_index=True)

    # DataFrames to store SWT data
    swt_data = {
        'Route_Dir_QSI_No': [],
        'ID': [],
        'SWT_minutes': [],
        'Number_of_buses': []
    }

    # Initialize the awt_data DataFrame
    awt_data = pd.DataFrame(columns=['stop_point_id', 'awt', 'num_bus_observed'])

    # Fetch timetable for each stop point ID and calculate SWT
    bst = pytz.timezone('Europe/London')
    current_hour = datetime.now(bst).hour
    day_of_week = get_day_of_week()

    # Store selected schedule name to ensure it's printed only once
    selected_schedule_name = None
    printed_schedule_name = False

    # Track printed timetable stop IDs
    printed_timetable_stop_ids = []

    while True:
        # Update current time and hour
        now = datetime.now(bst)
        current_hour = now.hour

        # If we've already printed the timetable for the current hour, wait until the next hour
        if any(stop_id in printed_timetable_stop_ids for stop_id in combined_df['ID']):
            time.sleep(60)  # Sleep for 1 minute before checking again
            continue

        for stop_id in combined_df['ID']:
            timetable_url = f"https://api.tfl.gov.uk/StopPoint/{stop_id}/timetable/{lineID}"
            timetable_response = requests.get(timetable_url)

            if timetable_response.status_code == 200:
                timetable_data = timetable_response.json()
                schedule_names_dict = extract_schedule_names(timetable_data)

                if schedule_names_dict:
                    for schedule_name, known_journeys in schedule_names_dict.items():
                        if day_of_week in schedule_name:
                            selected_schedule_name = schedule_name
                            selected_timetable = known_journeys
                            break
                    else:
                        print(f"No timetable found for the current day at stop {stop_id}")
                        continue

                    if not printed_schedule_name:
                        print(f"\n\033[1mTimetable schedule for {day_of_week} at stop {stop_id} - Schedule Name: {selected_schedule_name}\033[0m\n")
                        printed_schedule_name = True

                    hourly_slots = categorize_into_slots(selected_timetable)

                    # Print the timetable for the current hour
                    print(f"\033[1mTimetable for {day_of_week} at stop {stop_id} - Hour: {current_hour}\033[0m")
                    for journey in hourly_slots[current_hour]:
                        print(f"Destination: {journey['destination']}, Time: {journey['time']}")

                    # Update printed_timetable_stop_ids to prevent re-printing for this hour
                    printed_timetable_stop_ids.append(stop_id)

                    # Calculate AWT for the selected timetable at the current hour
                    arrival_times = [
                        datetime.strptime(journey['time'], '%H:%M') for journey in hourly_slots[current_hour]
                    ]

                    if len(arrival_times) >= 2:
                        intervals = [
                            (arrival_times[i + 1] - arrival_times[i]).total_seconds() / 60
                            for i in range(len(arrival_times) - 1)
                        ]
                        awt = sum(intervals) / len(intervals)
                        num_buses_observed = len(arrival_times)

                        # Update the swt_data dictionary
                        swt_data['Route_Dir_QSI_No'].append(combined_df.loc[combined_df['ID'] == stop_id, 'Route_Dir_QSI_No'].values[0])
                        swt_data['ID'].append(stop_id)
                        swt_data['SWT_minutes'].append(awt)
                        swt_data['Number_of_buses'].append(num_buses_observed)

                        # Add the calculated AWT to awt_data DataFrame
                        awt_data = awt_data.append({'stop_point_id': stop_id, 'awt': awt, 'num_bus_observed': num_buses_observed}, ignore_index=True)

        # Print the SWT DataFrame
        swt_df = pd.DataFrame(swt_data)
        print("\n\033[1m\033[4mSWT Data\033[0m\n")
        print(swt_df)

        # Print the summary metrics
        print("\n\033[1m\033[4mSummary Metrics\033[0m\n")
        print(f"Number of Stops Processed: {len(combined_df)}")
        print(f"Total SWT Entries: {len(swt_df)}")

        # Print the combined AWT DataFrame after all the summary metrics are computed
        print("\n\033[1m\033[4mCombined AWT Data\033[0m\n")
        print(awt_data)

        # Wait until the next hour to fetch and print new data
        while datetime.now(bst).hour == current_hour:
            time.sleep(60)  # Sleep for 1 minute before checking again

# Example usage
# lineID = "D7"
# df = pd.read_excel("route_data.xlsx")
# find_route_details(lineID, df)


In [None]:
from IPython.display import clear_output
import pandas as pd
import requests
from difflib import SequenceMatcher
from datetime import datetime, timedelta
import pytz
import time

# Function to normalize stop names
def normalize_stop_name(name):
    return ' '.join(name.lower().split())

# Function to fetch data from the API
def fetch_data(url):
    response = requests.get(url)
    return response.json()

# Function to extract schedule names
def extract_schedule_names(data, schedule_names_dict={}):
    if isinstance(data, dict):
        if data.get('$type') == "Tfl.Api.Presentation.Entities.Schedule, Tfl.Api.Presentation.Entities" and 'knownJourneys' in data:
            if 'name' in data:
                schedule_names_dict[data['name']] = data['knownJourneys']
        for key, value in data.items():
            extract_schedule_names(value, schedule_names_dict)
    elif isinstance(data, list):
        for item in data:
            extract_schedule_names(item, schedule_names_dict)
    return schedule_names_dict

# Function to categorize journeys into hourly slots
def categorize_into_slots(timetable):
    slots = [[] for _ in range(24)]
    for journey in timetable:
        hour = int(journey['hour'])  # Convert hour to integer
        if 0 <= hour < 24:  # Ensure hour is within the valid range
            slots[hour].append(journey)
    return slots

# Function to fetch the current day of the week
def get_day_of_week():
    bst = pytz.timezone('Europe/London')
    now = datetime.now(bst)
    return now.strftime('%A')  # %A gives full weekday name (e.g., 'Monday')

# Function to retrieve stop names from TfL API and match with Route_Dir_QSI_No
def find_route_details(lineID, df):
    # Ensure the 'Route_Dir_QSI_No' column exists
    if 'Route_Dir_QSI_No' not in df.columns:
        print("The 'Route_Dir_QSI_No' column is not present in the provided file.")
        return

    # Convert the lineID to uppercase to ensure case-insensitivity
    lineID = lineID.upper()

    # Convert the 'Route_Dir_QSI_No' column to uppercase for comparison
    df['Route_Dir_QSI_No'] = df['Route_Dir_QSI_No'].str.upper()

    # Normalize the stop names in the DataFrame
    df['STOP_NAME'] = df['STOP_NAME'].apply(normalize_stop_name)

    # Filter the DataFrame based on the lineID
    pattern_A = f"^{lineID}_A\\d+$"  # Regular expression for D7_A**
    pattern_B = f"^{lineID}_B\\d+$"  # Regular expression for D7_B**

    # Filter rows where the 'Route_Dir_QSI_No' column matches the pattern
    filtered_df_A = df[df['Route_Dir_QSI_No'].str.match(pattern_A, na=False)]
    filtered_df_B = df[df['Route_Dir_QSI_No'].str.match(pattern_B, na=False)]

    # Function to fetch and process route sequence data from TfL API
    def fetch_and_process_route_data(route_type, pattern, filtered_df):
        api_url = f"https://api.tfl.gov.uk/Line/{lineID}/Route/Sequence/{route_type}"
        response = requests.get(api_url)

        results_list = []

        if response.status_code == 200:
            route_data = response.json()

            # Iterate through each stop in the route data
            for stop in route_data['stopPointSequences'][0]['stopPoint']:
                stop_name_api = normalize_stop_name(stop['name'])
                stop_id = stop['id']

                # Check if the stop_name_api exists in the filtered DataFrame for the correct direction
                matched_row = filtered_df[(filtered_df['STOP_NAME'] == stop_name_api) &
                                          (filtered_df['Route_Dir_QSI_No'].str.match(pattern))]

                if not matched_row.empty:
                    route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                    results_list.append({
                        'Route_Dir_QSI_No': route_dir_qsi_no,
                        'STOP_Name': stop['name'],
                        'ID': stop_id
                    })
                else:
                    # If exact match not found, try partial matching based on words before and after '/'
                    api_stop_name_parts = stop_name_api.split('/')
                    for index, row in filtered_df.iterrows():
                        df_stop_name_parts = row['STOP_NAME'].split('/')
                        for api_part in api_stop_name_parts:
                            for df_part in df_stop_name_parts:
                                if SequenceMatcher(None, df_part.strip(), api_part.strip()).ratio() > 0.8:
                                    matched_row = pd.DataFrame([row])
                                    break
                            if not matched_row.empty:
                                break
                        if not matched_row.empty:
                            break

                    if not matched_row.empty:
                        route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                        results_list.append({
                            'Route_Dir_QSI_No': route_dir_qsi_no,
                            'STOP_Name': stop['name'],
                            'ID': stop_id
                        })
        else:
            print(f"Failed to fetch route sequence data from TfL API for {route_type} route. Status code: {response.status_code}")

        return results_list

    # Fetch and process outbound route data for _A**
    matched_results_A = fetch_and_process_route_data('outbound', pattern_A, filtered_df_A)

    # Fetch and process inbound route data for _B**
    matched_results_B = fetch_and_process_route_data('inbound', pattern_B, filtered_df_B)

    # Create DataFrames from the matched results for each direction
    matched_results_df_A = pd.DataFrame(matched_results_A)
    matched_results_df_B = pd.DataFrame(matched_results_B)

    # Function to remove partial matches if exact matches are found
    def remove_partial_matches(exact_df, matched_df):
        for index, row in exact_df.iterrows():
            exact_stop_name = row['STOP_NAME']
            route_dir_qsi_no = row['Route_Dir_QSI_No']
            # Find exact matches in matched_df
            exact_matches = matched_df[(matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                       (matched_df['STOP_Name'].apply(normalize_stop_name) == exact_stop_name)]
            if not exact_matches.empty:
                # Remove partial matches
                matched_df = matched_df[~((matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                          (matched_df['STOP_Name'].apply(normalize_stop_name) != exact_stop_name))]
        return matched_df

    # Remove partial matches for direction A
    matched_results_df_A = remove_partial_matches(filtered_df_A, matched_results_df_A)

    # Remove partial matches for direction B
    matched_results_df_B = remove_partial_matches(filtered_df_B, matched_results_df_B)

    # Remove duplicate stop names with the same Route_Dir_QSI_No and different IDs
    matched_results_df_A = matched_results_df_A.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')
    matched_results_df_B = matched_results_df_B.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')

    # Print the matched results for direction A
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_A\033[0m\n")
    matched_results_df_A = matched_results_df_A[matched_results_df_A['Route_Dir_QSI_No'].str.match(pattern_A)]
    print(matched_results_df_A[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Print the matched results for direction B
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_B\033[0m\n")
    matched_results_df_B = matched_results_df_B[matched_results_df_B['Route_Dir_QSI_No'].str.match(pattern_B)]
    print(matched_results_df_B[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Concatenate the matched results DataFrames for directions A and B
    combined_df = pd.concat([matched_results_df_A, matched_results_df_B], ignore_index=True)

    # DataFrames to store SWT data
    swt_data = {
        'Route_Dir_QSI_No': [],
        'ID': [],
        'SWT_minutes': [],
        'Number_of_buses': []
    }

    # Initialize the awt_data DataFrame
    awt_data = pd.DataFrame(columns=['stop_point_id', 'awt', 'num_bus_observed'])

    # Fetch timetable for each stop point ID and calculate SWT
    bst = pytz.timezone('Europe/London')
    current_hour = datetime.now(bst).hour
    day_of_week = get_day_of_week()

    # Store selected schedule name to ensure it's printed only once
    selected_schedule_name = None
    printed_schedule_name = False

    # Track printed timetable stop IDs
    printed_timetable_stop_ids = []

    while True:
        # Update current time and hour
        now = datetime.now(bst)
        current_hour = now.hour

        # If we've already printed the timetable for the current hour, wait until the next hour
        if any(stop_id in printed_timetable_stop_ids for stop_id in combined_df['ID']):
            time.sleep(60)  # Sleep for 1 minute before checking again
            continue

        for stop_id in combined_df['ID']:
            timetable_url = f"https://api.tfl.gov.uk/StopPoint/{stop_id}/timetable/{lineID}"
            timetable_response = requests.get(timetable_url)

            if timetable_response.status_code == 200:
                timetable_data = timetable_response.json()
                schedule_names_dict = extract_schedule_names(timetable_data)

                # Select preferred schedule name based on the day of the week
                if not selected_schedule_name:
                    if day_of_week.lower() in ['monday', 'tuesday', 'wednesday', 'thursday']:
                        preferred_schedule_names = ['Mon-Fri Schooldays', 'Monday to Thursday', 'Monday to Friday']
                    elif day_of_week.lower() == 'friday':
                        preferred_schedule_names = ['Mon-Fri Schooldays', 'Monday to Friday', 'Friday']
                    elif day_of_week.lower() == 'saturday':
                        preferred_schedule_names = ['Saturday']
                    elif day_of_week.lower() == 'sunday':
                        preferred_schedule_names = ['Sunday']
                    else:
                        preferred_schedule_names = [day_of_week]

                    for preferred_name in preferred_schedule_names:
                        if preferred_name in schedule_names_dict:
                            selected_schedule_name = preferred_name
                            break

                # Fetch timetable for the selected schedule name
                if selected_schedule_name in schedule_names_dict:
                    selected_timetable = schedule_names_dict[selected_schedule_name]

                    if not printed_schedule_name:
                        print(f"\n\033[1mTimetable schedule for {day_of_week} at stop {stop_id} - Schedule Name: {selected_schedule_name}\033[0m\n")
                        printed_schedule_name = True

                    hourly_slots = categorize_into_slots(selected_timetable)

                    # Print the timetable for the current hour
                    print(f"\033[1mTimetable for {day_of_week} at stop {stop_id} - Hour: {current_hour}\033[0m")
                    for journey in hourly_slots[current_hour]:
                        print(f"Destination: {journey['destination']}, Time: {journey['time']}")

                    # Update printed_timetable_stop_ids to prevent re-printing for this hour
                    printed_timetable_stop_ids.append(stop_id)

                    # Calculate AWT for the selected timetable at the current hour
                    arrival_times = [
                        datetime.strptime(journey['time'], '%H:%M') for journey in hourly_slots[current_hour]
                    ]

                    if len(arrival_times) >= 2:
                        intervals = [
                            (arrival_times[i + 1] - arrival_times[i]).total_seconds() / 60
                            for i in range(len(arrival_times) - 1)
                        ]
                        awt = sum(intervals) / len(intervals)
                        num_buses_observed = len(arrival_times)

                        # Update the swt_data dictionary
                        swt_data['Route_Dir_QSI_No'].append(combined_df.loc[combined_df['ID'] == stop_id, 'Route_Dir_QSI_No'].values[0])
                        swt_data['ID'].append(stop_id)
                        swt_data['SWT_minutes'].append(awt)
                        swt_data['Number_of_buses'].append(num_buses_observed)

                        # Add the calculated AWT to awt_data DataFrame
                        awt_data = awt_data.append({'stop_point_id': stop_id, 'awt': awt, 'num_bus_observed': num_buses_observed}, ignore_index=True)

        # Print the SWT DataFrame
        swt_df = pd.DataFrame(swt_data)
        clear_output(wait=True)
        print("\n\033[1m\033[4mSWT Data\033[0m\n")
        print(swt_df)

        # Print the summary metrics
        print("\n\033[1m\033[4mSummary Metrics\033[0m\n")
        print(f"Number of Stops Processed: {len(combined_df)}")
        print(f"Total SWT Entries: {len(swt_df)}")

        # Print the combined AWT DataFrame after all the summary metrics are computed
        print("\n\033[1m\033[4mCombined AWT Data\033[0m\n")
        print(awt_data)

        # Wait until the next hour to fetch and print new data
        while datetime.now(bst).hour == current_hour:
            time.sleep(60)  # Sleep for 1 minute before checking again

# Example usage
if __name__ == "__main__":
    # Load the Excel file into a DataFrame
    file_path = '/content/QSI points.xlsx'  # Modify this path accordingly
    df = pd.read_excel(file_path)

    # Ask the user to enter a lineID
    line_id_input = input("Please enter the lineID: ")

    # Find and display the route details
    find_route_details(line_id_input, df)



[1m[4mSWT Data[0m

Empty DataFrame
Columns: [Route_Dir_QSI_No, ID, SWT_minutes, Number_of_buses]
Index: []

[1m[4mSummary Metrics[0m

Number of Stops Processed: 11
Total SWT Entries: 0

[1m[4mCombined AWT Data[0m

Empty DataFrame
Columns: [stop_point_id, awt, num_bus_observed]
Index: []


KeyboardInterrupt: 

In [None]:
from IPython.display import clear_output
import pandas as pd
import requests
from difflib import SequenceMatcher
from datetime import datetime, timedelta
import pytz
import time

# Function to normalize stop names
def normalize_stop_name(name):
    return ' '.join(name.lower().split())

# Function to fetch data from the API
def fetch_data(url):
    response = requests.get(url)
    return response.json()

# Function to extract schedule names
def extract_schedule_names(data, schedule_names_dict={}):
    if isinstance(data, dict):
        if data.get('$type') == "Tfl.Api.Presentation.Entities.Schedule, Tfl.Api.Presentation.Entities" and 'knownJourneys' in data:
            if 'name' in data:
                schedule_names_dict[data['name']] = data['knownJourneys']
        for key, value in data.items():
            extract_schedule_names(value, schedule_names_dict)
    elif isinstance(data, list):
        for item in data:
            extract_schedule_names(item, schedule_names_dict)
    return schedule_names_dict

# Function to categorize journeys into hourly slots
def categorize_into_slots(timetable):
    slots = [[] for _ in range(24)]
    for journey in timetable:
        hour = int(journey['hour'])  # Convert hour to integer
        if 0 <= hour < 24:  # Ensure hour is within the valid range
            slots[hour].append(journey)
    return slots

# Function to fetch the current day of the week
def get_day_of_week():
    bst = pytz.timezone('Europe/London')
    now = datetime.now(bst)
    return now.strftime('%A')  # %A gives full weekday name (e.g., 'Monday')

# Function to retrieve stop names from TfL API and match with Route_Dir_QSI_No
def find_route_details(lineID, df):
    # Ensure the 'Route_Dir_QSI_No' column exists
    if 'Route_Dir_QSI_No' not in df.columns:
        print("The 'Route_Dir_QSI_No' column is not present in the provided file.")
        return

    # Convert the lineID to uppercase to ensure case-insensitivity
    lineID = lineID.upper()

    # Convert the 'Route_Dir_QSI_No' column to uppercase for comparison
    df['Route_Dir_QSI_No'] = df['Route_Dir_QSI_No'].str.upper()

    # Normalize the stop names in the DataFrame
    df['STOP_NAME'] = df['STOP_NAME'].apply(normalize_stop_name)

    # Filter the DataFrame based on the lineID
    pattern_A = f"^{lineID}_A\\d+$"  # Regular expression for D7_A**
    pattern_B = f"^{lineID}_B\\d+$"  # Regular expression for D7_B**

    # Filter rows where the 'Route_Dir_QSI_No' column matches the pattern
    filtered_df_A = df[df['Route_Dir_QSI_No'].str.match(pattern_A, na=False)]
    filtered_df_B = df[df['Route_Dir_QSI_No'].str.match(pattern_B, na=False)]

    # Function to fetch and process route sequence data from TfL API
    def fetch_and_process_route_data(route_type, pattern, filtered_df):
        api_url = f"https://api.tfl.gov.uk/Line/{lineID}/Route/Sequence/{route_type}"
        response = requests.get(api_url)

        results_list = []

        if response.status_code == 200:
            route_data = response.json()

            # Iterate through each stop in the route data
            for stop in route_data['stopPointSequences'][0]['stopPoint']:
                stop_name_api = normalize_stop_name(stop['name'])
                stop_id = stop['id']

                # Check if the stop_name_api exists in the filtered DataFrame for the correct direction
                matched_row = filtered_df[(filtered_df['STOP_NAME'] == stop_name_api) &
                                          (filtered_df['Route_Dir_QSI_No'].str.match(pattern))]

                if not matched_row.empty:
                    route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                    results_list.append({
                        'Route_Dir_QSI_No': route_dir_qsi_no,
                        'STOP_Name': stop['name'],
                        'ID': stop_id
                    })
                else:
                    # If exact match not found, try partial matching based on words before and after '/'
                    api_stop_name_parts = stop_name_api.split('/')
                    for index, row in filtered_df.iterrows():
                        df_stop_name_parts = row['STOP_NAME'].split('/')
                        for api_part in api_stop_name_parts:
                            for df_part in df_stop_name_parts:
                                if SequenceMatcher(None, df_part.strip(), api_part.strip()).ratio() > 0.8:
                                    matched_row = pd.DataFrame([row])
                                    break
                            if not matched_row.empty:
                                break
                        if not matched_row.empty:
                            break

                    if not matched_row.empty:
                        route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                        results_list.append({
                            'Route_Dir_QSI_No': route_dir_qsi_no,
                            'STOP_Name': stop['name'],
                            'ID': stop_id
                        })
        else:
            print(f"Failed to fetch route sequence data from TfL API for {route_type} route. Status code: {response.status_code}")

        return results_list

    # Fetch and process outbound route data for _A**
    matched_results_A = fetch_and_process_route_data('outbound', pattern_A, filtered_df_A)

    # Fetch and process inbound route data for _B**
    matched_results_B = fetch_and_process_route_data('inbound', pattern_B, filtered_df_B)

    # Create DataFrames from the matched results for each direction
    matched_results_df_A = pd.DataFrame(matched_results_A)
    matched_results_df_B = pd.DataFrame(matched_results_B)

    # Function to remove partial matches if exact matches are found
    def remove_partial_matches(exact_df, matched_df):
        for index, row in exact_df.iterrows():
            exact_stop_name = row['STOP_NAME']
            route_dir_qsi_no = row['Route_Dir_QSI_No']
            # Find exact matches in matched_df
            exact_matches = matched_df[(matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                       (matched_df['STOP_Name'].apply(normalize_stop_name) == exact_stop_name)]
            if not exact_matches.empty:
                # Remove partial matches
                matched_df = matched_df[~((matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                          (matched_df['STOP_Name'].apply(normalize_stop_name) != exact_stop_name))]
        return matched_df

    # Remove partial matches for direction A
    matched_results_df_A = remove_partial_matches(filtered_df_A, matched_results_df_A)

    # Remove partial matches for direction B
    matched_results_df_B = remove_partial_matches(filtered_df_B, matched_results_df_B)

    # Remove duplicate stop names with the same Route_Dir_QSI_No and different IDs
    matched_results_df_A = matched_results_df_A.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')
    matched_results_df_B = matched_results_df_B.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')

    # Print the matched results for direction A
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_A\033[0m\n")
    matched_results_df_A = matched_results_df_A[matched_results_df_A['Route_Dir_QSI_No'].str.match(pattern_A)]
    print(matched_results_df_A[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Print the matched results for direction B
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_B\033[0m\n")
    matched_results_df_B = matched_results_df_B[matched_results_df_B['Route_Dir_QSI_No'].str.match(pattern_B)]
    print(matched_results_df_B[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Concatenate the matched results DataFrames for directions A and B
    combined_df = pd.concat([matched_results_df_A, matched_results_df_B], ignore_index=True)

    # DataFrames to store SWT data
    swt_data = {
        'Route_Dir_QSI_No': [],
        'ID': [],
        'SWT_minutes': [],
        'Number_of_buses': []
    }

    # Fetch timetable for each stop point ID and calculate SWT
    bst = pytz.timezone('Europe/London')
    current_hour = datetime.now(bst).hour
    day_of_week = get_day_of_week()

    # Store selected schedule name to ensure it's printed only once
    selected_schedule_name = None
    printed_schedule_name = False

    # Track printed timetable stop IDs
    printed_timetable_stop_ids = []

    while True:
        # Update current time and hour
        now = datetime.now(bst)
        current_hour = now.hour
        day_of_week = get_day_of_week()

        # Clear previous SWT data
        swt_data = {
            'Route_Dir_QSI_No': [],
            'ID': [],
            'SWT_minutes': [],
            'Number_of_buses': []
        }

        for index, row in combined_df.iterrows():
            stop_point_id = row['ID']
            route_dir_qsi_no = row['Route_Dir_QSI_No']

            if f"{lineID}_A" in route_dir_qsi_no:
                direction = 'outbound'
            elif f"{lineID}_B" in route_dir_qsi_no:
                direction = 'inbound'
            else:
                print(f"Invalid route direction for Route_Dir_QSI_No: {route_dir_qsi_no}")
                continue

            url = f'https://api.tfl.gov.uk/Line/{lineID}/Timetable/{stop_point_id}?direction={direction}'
            data = fetch_data(url)

            schedule_names_dict = extract_schedule_names(data)

            if not selected_schedule_name:
                if day_of_week.lower() in ['monday', 'tuesday', 'wednesday', 'thursday']:
                    preferred_schedule_names = ['Mon-Fri Schooldays', 'Monday to Thursday', 'Monday to Friday']
                elif day_of_week.lower() == 'friday':
                    preferred_schedule_names = ['Mon-Fri Schooldays', 'Monday to Friday', 'Friday']
                elif day_of_week.lower() == 'saturday':
                    preferred_schedule_names = ['Saturday']
                elif day_of_week.lower() == 'sunday':
                    preferred_schedule_names = ['Sunday']
                else:
                    preferred_schedule_names = [day_of_week]

                for preferred_name in preferred_schedule_names:
                    if preferred_name in schedule_names_dict:
                        selected_schedule_name = preferred_name
                        break

            if selected_schedule_name and not printed_schedule_name:
                print(f"\nToday is {day_of_week}. The selected Schedule name is {selected_schedule_name}.")
                printed_schedule_name = True

            if selected_schedule_name:
                timetable = schedule_names_dict[selected_schedule_name]
                slots = categorize_into_slots(timetable)

                # Calculate Scheduled Wait Time (SWT)
                total_buses_this_hour = len(slots[current_hour])
                if total_buses_this_hour > 0:
                    scheduled_wait_time = 60 / (total_buses_this_hour * 2)  # SWT formula
                else:
                    scheduled_wait_time = float('inf')  # Handle division by zero scenario (though unlikely)

                # Store SWT data
                swt_data['Route_Dir_QSI_No'].append(route_dir_qsi_no)
                swt_data['ID'].append(stop_point_id)
                swt_data['SWT_minutes'].append(scheduled_wait_time)
                swt_data['Number_of_buses'].append(total_buses_this_hour)

                # Fetch arrival predictions based on SWT data for printed timetable stop IDs
                if stop_point_id in printed_timetable_stop_ids:
                    # Fetch arrival predictions
                    arrival_predictions_df, station_name = fetch_arrival_predictions(lineID, stop_point_id, direction)

                    if arrival_predictions_df is not None and not arrival_predictions_df.empty:
                        #print(f"\nArrival Predictions for stop point {stop_point_id} ({station_name}):")
                        #print(arrival_predictions_df.to_string(index=False))
                        print(f"   ")
                        # Calculating summary metrics
                        total_wawt = arrival_predictions_df['WAWT'].sum()
                        min_arrival = arrival_predictions_df['Expected Arrival (BST)'].min().replace(second=0, microsecond=0)
                        max_arrival = arrival_predictions_df['Expected Arrival (BST)'].max().replace(second=0, microsecond=0)
                        time_diff_minutes = (max_arrival - min_arrival).total_seconds() / 60
                        num_buses_observed = arrival_predictions_df['Vehicle ID'].nunique()

                        # Calculating AWT, SWT, and EWT
                        nbph = swt_data['Number_of_buses'][swt_data['ID'].index(stop_point_id)]
                        swt = swt_data['SWT_minutes'][swt_data['ID'].index(stop_point_id)]
                        awt = round(total_wawt / time_diff_minutes, 2) if time_diff_minutes > 0 else 0
                        ewt = round(awt - swt, 2)

                        summary_df = pd.DataFrame({
                            'Metric': ['Number of buses scheduled per hour (nbph)', 'Number of buses observed', 'Total WAWT (minutes)',
                                       'Time difference between 1st and last observed buses (minutes)', 'AWT (minutes)', 'SWT (minutes)', 'EWT (minutes)'],
                            'Value': [nbph, num_buses_observed, total_wawt, time_diff_minutes, awt, swt, ewt]
                        })

                        print(f"\nSummary Metrics:{swt_data['Route_Dir_QSI_No'][swt_data['ID'].index(stop_point_id)]}     {stop_point_id}")
                        print(summary_df)

                    # Remove the stop ID from printed_timetable_stop_ids to avoid redundant fetches
                    printed_timetable_stop_ids.remove(stop_point_id)

        # Update printed timetable stop IDs
        printed_timetable_stop_ids = swt_data['ID']

        # Create DataFrame for SWT data
        swt_df = pd.DataFrame(swt_data)

        # Print the SWT DataFrame
        print(f"\n\nSWT DataFrame at hour {current_hour}")
        print(swt_df)

        # Wait for the next 30 seconds
        #clear_output()
        print("\n\nWaiting to fetch updated data...\n\n")
        time.sleep(30)

# Function to fetch arrival predictions with error handling
def fetch_arrival_predictions(line_id, stop_point_id, direction):
    try:
        base_url = f"https://api.tfl.gov.uk/Line/{line_id}/Arrivals/{stop_point_id}"
        params = {'direction': direction}
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        data = response.json()
        if len(data) == 0:
            return pd.DataFrame(), None  # No data available
        station_name = data[0]['stationName']
        predictions = []
        for item in data:
            arrival_time = datetime.strptime(item['expectedArrival'], '%Y-%m-%dT%H:%M:%SZ')
            arrival_time_bst = arrival_time + timedelta(hours=1)
            predictions.append({
                'Line': item['lineName'],
                'Vehicle ID': item['vehicleId'],
                'Stop Point': stop_point_id,
                'Direction': direction,
                'Expected Arrival (BST)': arrival_time_bst,
                'Expected Arrival (HM)': arrival_time_bst.strftime('%H:%M')
            })
        df = pd.DataFrame(predictions)
        df = df.sort_values(by='Expected Arrival (BST)', ascending=True)
        df['Expected Arrival (BST)'] = pd.to_datetime(df['Expected Arrival (BST)'])  # Convert to datetime
        df['Expected Arrival (HM)'] = pd.to_datetime(df['Expected Arrival (HM)'], format='%H:%M')
        df['Headway (minutes)'] = df['Expected Arrival (HM)'].diff().fillna(pd.Timedelta(seconds=0)).dt.total_seconds() / 60
        df['AWT/bus (minutes)'] = (df['Headway (minutes)'] / 2).round(2)
        df['WAWT'] = (df['Headway (minutes)'] * df['AWT/bus (minutes)']).round(2)
        return df, station_name
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None, None
        time.sleep(30)  # Wait for 30 seconds before fetching data again

# Example usage
if __name__ == "__main__":
    # Load the Excel file into a DataFrame
    file_path = '/content/QSI points.xlsx'  # Modify this path accordingly
    df = pd.read_excel(file_path)

    # Ask the user to enter a lineID
    line_id_input = input("Please enter the lineID: ")

    # Find and display the route details
    find_route_details(line_id_input, df)


Please enter the lineID: d7


[1m[4mQSI stop points for direction D7_A[0m

  Route_Dir_QSI_No                    STOP_Name          ID
0            D7_A1   Poplar / All Saints Church  490011107G
1            D7_A2               Stewart Street  490013513S
2            D7_A3       Island Gardens Station  490002048Z
3            D7_A4  Arnhem Wharf Primary School  490006092N
5            D7_A5         East India Dock Road  490004584N


[1m[4mQSI stop points for direction D7_B[0m

  Route_Dir_QSI_No                    STOP_Name          ID
0            D7_B1  Mile End Station / Bow Road  490015151H
2            D7_B2         East India Dock Road  490004584S
3            D7_B3         Canary Wharf Station  490000038F
5            D7_B4  Arnhem Wharf Primary School  490006092S
6            D7_B5       Island Gardens Station  490002048X
7            D7_B6               Stewart Street  490013513N

Today is Thursday. The selected Schedule name is Monday to Friday.


SWT DataFrame at hour 

KeyboardInterrupt: 

In [None]:
import pandas as pd
import requests
from difflib import SequenceMatcher
from datetime import datetime, timedelta
import pytz
import time

# Initialize empty DataFrame for AWT data
awt_data = pd.DataFrame(columns=['Stop Point ID', 'Number of Buses Observed', 'AWT (minutes)'])

# Function to normalize stop names
def normalize_stop_name(name):
    return ' '.join(name.lower().split())

# Function to fetch data from the API
def fetch_data(url):
    response = requests.get(url)
    return response.json()

# Function to extract schedule names
def extract_schedule_names(data, schedule_names_dict={}):
    if isinstance(data, dict):
        if data.get('$type') == "Tfl.Api.Presentation.Entities.Schedule, Tfl.Api.Presentation.Entities" and 'knownJourneys' in data:
            if 'name' in data:
                schedule_names_dict[data['name']] = data['knownJourneys']
        for key, value in data.items():
            extract_schedule_names(value, schedule_names_dict)
    elif isinstance(data, list):
        for item in data:
            extract_schedule_names(item, schedule_names_dict)
    return schedule_names_dict

# Function to categorize journeys into hourly slots
def categorize_into_slots(timetable):
    slots = [[] for _ in range(24)]
    for journey in timetable:
        hour = int(journey['hour'])  # Convert hour to integer
        if 0 <= hour < 24:  # Ensure hour is within the valid range
            slots[hour].append(journey)
    return slots

# Function to fetch the current day of the week
def get_day_of_week():
    bst = pytz.timezone('Europe/London')
    now = datetime.now(bst)
    return now.strftime('%A')  # %A gives full weekday name (e.g., 'Monday')

# Function to retrieve stop names from TfL API and match with Route_Dir_QSI_No
def find_route_details(lineID, df):
    # Ensure the 'Route_Dir_QSI_No' column exists
    if 'Route_Dir_QSI_No' not in df.columns:
        print("The 'Route_Dir_QSI_No' column is not present in the provided file.")
        return

    # Convert the lineID to uppercase to ensure case-insensitivity
    lineID = lineID.upper()

    # Convert the 'Route_Dir_QSI_No' column to uppercase for comparison
    df['Route_Dir_QSI_No'] = df['Route_Dir_QSI_No'].str.upper()

    # Normalize the stop names in the DataFrame
    df['STOP_NAME'] = df['STOP_NAME'].apply(normalize_stop_name)

    # Filter the DataFrame based on the lineID
    pattern_A = f"^{lineID}_A\\d+$"  # Regular expression for D7_A**
    pattern_B = f"^{lineID}_B\\d+$"  # Regular expression for D7_B**

    # Filter rows where the 'Route_Dir_QSI_No' column matches the pattern
    filtered_df_A = df[df['Route_Dir_QSI_No'].str.match(pattern_A, na=False)]
    filtered_df_B = df[df['Route_Dir_QSI_No'].str.match(pattern_B, na=False)]

    # Function to fetch and process route sequence data from TfL API
    def fetch_and_process_route_data(route_type, pattern, filtered_df):
        api_url = f"https://api.tfl.gov.uk/Line/{lineID}/Route/Sequence/{route_type}"
        response = requests.get(api_url)

        results_list = []

        if response.status_code == 200:
            route_data = response.json()

            # Iterate through each stop in the route data
            for stop in route_data['stopPointSequences'][0]['stopPoint']:
                stop_name_api = normalize_stop_name(stop['name'])
                stop_id = stop['id']

                # Check if the stop_name_api exists in the filtered DataFrame for the correct direction
                matched_row = filtered_df[(filtered_df['STOP_NAME'] == stop_name_api) &
                                          (filtered_df['Route_Dir_QSI_No'].str.match(pattern))]

                if not matched_row.empty:
                    route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                    results_list.append({
                        'Route_Dir_QSI_No': route_dir_qsi_no,
                        'STOP_Name': stop['name'],
                        'ID': stop_id
                    })
                else:
                    # If exact match not found, try partial matching based on words before and after '/'
                    api_stop_name_parts = stop_name_api.split('/')
                    for index, row in filtered_df.iterrows():
                        df_stop_name_parts = row['STOP_NAME'].split('/')
                        for api_part in api_stop_name_parts:
                            for df_part in df_stop_name_parts:
                                if SequenceMatcher(None, df_part.strip(), api_part.strip()).ratio() > 0.8:
                                    matched_row = pd.DataFrame([row])
                                    break
                            if not matched_row.empty:
                                break
                        if not matched_row.empty:
                            break

                    if not matched_row.empty:
                        route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                        results_list.append({
                            'Route_Dir_QSI_No': route_dir_qsi_no,
                            'STOP_Name': stop['name'],
                            'ID': stop_id
                        })
        else:
            print(f"Failed to fetch route sequence data from TfL API for {route_type} route. Status code: {response.status_code}")

        return results_list

    # Fetch and process outbound route data for _A**
    matched_results_A = fetch_and_process_route_data('outbound', pattern_A, filtered_df_A)

    # Fetch and process inbound route data for _B**
    matched_results_B = fetch_and_process_route_data('inbound', pattern_B, filtered_df_B)

    # Create DataFrames from the matched results for each direction
    matched_results_df_A = pd.DataFrame(matched_results_A)
    matched_results_df_B = pd.DataFrame(matched_results_B)

    # Function to remove partial matches if exact matches are found
    def remove_partial_matches(exact_df, matched_df):
        for index, row in exact_df.iterrows():
            exact_stop_name = row['STOP_NAME']
            route_dir_qsi_no = row['Route_Dir_QSI_No']
            # Find exact matches in matched_df
            exact_matches = matched_df[(matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                       (matched_df['STOP_Name'].apply(normalize_stop_name) == exact_stop_name)]
            if not exact_matches.empty:
                # Remove partial matches
                matched_df = matched_df[~((matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                          (matched_df['STOP_Name'].apply(normalize_stop_name) != exact_stop_name))]
        return matched_df

    # Remove partial matches for direction A
    matched_results_df_A = remove_partial_matches(filtered_df_A, matched_results_df_A)

    # Remove partial matches for direction B
    matched_results_df_B = remove_partial_matches(filtered_df_B, matched_results_df_B)

    # Remove duplicate stop names with the same Route_Dir_QSI_No and different IDs
    matched_results_df_A = matched_results_df_A.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')
    matched_results_df_B = matched_results_df_B.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')

    # Print the matched results for direction A
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_A\033[0m\n")
    matched_results_df_A = matched_results_df_A[matched_results_df_A['Route_Dir_QSI_No'].str.match(pattern_A)]
    print(matched_results_df_A[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Print the matched results for direction B
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_B\033[0m\n")
    matched_results_df_B = matched_results_df_B[matched_results_df_B['Route_Dir_QSI_No'].str.match(pattern_B)]
    print(matched_results_df_B[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Concatenate the matched results DataFrames for directions A and B
    combined_df = pd.concat([matched_results_df_A, matched_results_df_B], ignore_index=True)

    # DataFrames to store SWT data
    swt_data = {
        'Route_Dir_QSI_No': [],
        'ID': [],
        'SWT_minutes': [],
        'Number_of_buses': []
    }

    # Fetch timetable for each stop point ID and calculate SWT
    bst = pytz.timezone('Europe/London')
    current_hour = datetime.now(bst).hour
    day_of_week = get_day_of_week()

    # Store selected schedule name to ensure it's printed only once
    selected_schedule_name = None
    printed_schedule_name = False

    # Track printed timetable stop IDs
    printed_timetable_stop_ids = []

    while True:
        # Update current time and hour
        now = datetime.now(bst)
        current_hour = now.hour
        day_of_week = get_day_of_week()

        # Clear previous SWT data
        swt_data = {
            'Route_Dir_QSI_No': [],
            'ID': [],
            'SWT_minutes': [],
            'Number_of_buses': []
        }

        for index, row in combined_df.iterrows():
            stop_point_id = row['ID']
            route_dir_qsi_no = row['Route_Dir_QSI_No']

            if f"{lineID}_A" in route_dir_qsi_no:
                direction = 'outbound'
            elif f"{lineID}_B" in route_dir_qsi_no:
                direction = 'inbound'
            else:
                print(f"Invalid route direction for Route_Dir_QSI_No: {route_dir_qsi_no}")
                continue

            url = f'https://api.tfl.gov.uk/Line/{lineID}/Timetable/{stop_point_id}?direction={direction}'
            data = fetch_data(url)

            schedule_names_dict = extract_schedule_names(data)

            if not selected_schedule_name:
                if day_of_week.lower() in ['monday', 'tuesday', 'wednesday', 'thursday']:
                    preferred_schedule_names = ['Mon-Fri Schooldays', 'Monday to Thursday', 'Monday to Friday']
                elif day_of_week.lower() == 'friday':
                    preferred_schedule_names = ['Mon-Fri Schooldays', 'Monday to Friday', 'Friday']
                elif day_of_week.lower() == 'saturday':
                    preferred_schedule_names = ['Saturday']
                elif day_of_week.lower() == 'sunday':
                    preferred_schedule_names = ['Sunday']
                else:
                    preferred_schedule_names = [day_of_week]

                for preferred_name in preferred_schedule_names:
                    if preferred_name in schedule_names_dict:
                        selected_schedule_name = preferred_name
                        break

            if selected_schedule_name and not printed_schedule_name:
                print(f"\nToday is {day_of_week}. The selected Schedule name is {selected_schedule_name}.")
                printed_schedule_name = True

            if selected_schedule_name:
                timetable = schedule_names_dict[selected_schedule_name]
                slots = categorize_into_slots(timetable)

                # Calculate Scheduled Wait Time (SWT)
                total_buses_this_hour = len(slots[current_hour])
                if total_buses_this_hour > 0:
                    scheduled_wait_time = 60 / (total_buses_this_hour * 2)  # SWT formula
                else:
                    scheduled_wait_time = float('inf')  # Handle division by zero scenario (though unlikely)

                # Store SWT data
                swt_data['Route_Dir_QSI_No'].append(route_dir_qsi_no)
                swt_data['ID'].append(stop_point_id)
                swt_data['SWT_minutes'].append(scheduled_wait_time)
                swt_data['Number_of_buses'].append(total_buses_this_hour)

                # Fetch arrival predictions based on SWT data for printed timetable stop IDs
                if stop_point_id in printed_timetable_stop_ids:
                    # Fetch arrival predictions
                    arrival_predictions_df, station_name = fetch_arrival_predictions(lineID, stop_point_id, direction)

                    if arrival_predictions_df is not None and not arrival_predictions_df.empty:
                        #print(f"\nArrival Predictions for stop point {stop_point_id} ({station_name}):")
                        #print(arrival_predictions_df.to_string(index=False))
                        print(f"   ")
                        # Calculating summary metrics
                        total_wawt = arrival_predictions_df['WAWT'].sum()
                        min_arrival = arrival_predictions_df['Expected Arrival (BST)'].min().replace(second=0, microsecond=0)
                        max_arrival = arrival_predictions_df['Expected Arrival (BST)'].max().replace(second=0, microsecond=0)
                        time_diff_minutes = (max_arrival - min_arrival).total_seconds() / 60
                        num_buses_observed = arrival_predictions_df['Vehicle ID'].nunique()

                        # Calculating AWT, SWT, and EWT
                        nbph = swt_data['Number_of_buses'][swt_data['ID'].index(stop_point_id)]
                        swt = swt_data['SWT_minutes'][swt_data['ID'].index(stop_point_id)]
                        awt = round(total_wawt / time_diff_minutes, 2) if time_diff_minutes > 0 else 0
                        ewt = round(awt - swt, 2)

                        summary_df = pd.DataFrame({
                            'Metric': ['Number of buses scheduled per hour (nbph)', 'Number of buses observed', 'Total WAWT (minutes)',
                                       'Time difference between 1st and last observed buses (minutes)', 'AWT (minutes)', 'SWT (minutes)', 'EWT (minutes)'],
                            'Value': [nbph, num_buses_observed, total_wawt, time_diff_minutes, awt, swt, ewt]
                        })

                        print(f"\nSummary Metrics for {stop_point_id}:")
                        print(summary_df)

                        # Append to awt_data DataFrame
                        awt_data.loc[len(awt_data)] = [stop_point_id, num_buses_observed, awt]

                    # Remove the stop ID from printed_timetable_stop_ids to avoid redundant fetches
                    printed_timetable_stop_ids.remove(stop_point_id)

        # Update printed timetable stop IDs
        printed_timetable_stop_ids = swt_data['ID']

        # Create DataFrame for SWT data
        swt_df = pd.DataFrame(swt_data)

        # Print the SWT DataFrame
        print(f"\n\nSWT DataFrame at hour {current_hour}")
        print(swt_df)


        # Wait for the next 30 seconds
        #clear_output()
        print("\n\nWaiting to fetch updated data...\n\n")
        time.sleep(30)

# Function to fetch arrival predictions with error handling
def fetch_arrival_predictions(line_id, stop_point_id, direction):
    try:
        base_url = f"https://api.tfl.gov.uk/Line/{line_id}/Arrivals/{stop_point_id}"
        params = {'direction': direction}
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        data = response.json()
        if len(data) == 0:
            return pd.DataFrame(), None  # No data available
        station_name = data[0]['stationName']
        predictions = []
        for item in data:
            arrival_time = datetime.strptime(item['expectedArrival'], '%Y-%m-%dT%H:%M:%SZ')
            arrival_time_bst = arrival_time + timedelta(hours=1)
            predictions.append({
                'Line': item['lineName'],
                'Vehicle ID': item['vehicleId'],
                'Stop Point': stop_point_id,
                'Direction': direction,
                'Expected Arrival (BST)': arrival_time_bst,
                'Expected Arrival (HM)': arrival_time_bst.strftime('%H:%M')
            })
        df = pd.DataFrame(predictions)
        df = df.sort_values(by='Expected Arrival (BST)', ascending=True)
        df['Expected Arrival (BST)'] = pd.to_datetime(df['Expected Arrival (BST)'])  # Convert to datetime
        df['Expected Arrival (HM)'] = pd.to_datetime(df['Expected Arrival (HM)'], format='%H:%M')
        df['Headway (minutes)'] = df['Expected Arrival (HM)'].diff().fillna(pd.Timedelta(seconds=0)).dt.total_seconds() / 60
        df['AWT/bus (minutes)'] = (df['Headway (minutes)'] / 2).round(2)
        df['WAWT'] = (df['Headway (minutes)'] * df['AWT/bus (minutes)']).round(2)
        return df, station_name
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None, None
        time.sleep(30)  # Wait for 30 seconds before fetching data again

# Example usage
if __name__ == "__main__":
    # Load the Excel file into a DataFrame
    file_path = '/content/QSI points.xlsx'  # Modify this path accordingly
    df = pd.read_excel(file_path)

    # Ask the user to enter a lineID
    line_id_input = input("Please enter the lineID: ")

    # Find and display the route details
    find_route_details(line_id_input, df)


Please enter the lineID: w14


[1m[4mQSI stop points for direction W14_A[0m

  Route_Dir_QSI_No                          STOP_Name          ID
0           W14_A1                         Cross Road  490004984N
1           W14_A2                     Orchard Estate  490010612W
2           W14_A3             South Woodford Station  490000217D
4           W14_A4   Wanstead Station  / George Green  490015125C
5           W14_A5                Leytonstone Station  490000137W
7           W14_A6  Leyton High Road / Leyton Station  490009136C


[1m[4mQSI stop points for direction W14_B[0m

  Route_Dir_QSI_No                          STOP_Name          ID
0           W14_B1                       Leyton Mills  490015670N
1           W14_B2  Leyton High Road / Leyton Station  490009136D
2           W14_B3                        Harvey Road  490007882H
3           W14_B4                   Wanstead Station  490015125B
4           W14_B5             South Woodford Station  490015122C
5          

KeyboardInterrupt: 

In [None]:
    # Print the awt_data DataFrame after all summary metrics have been printed
    print("\n\nAWT Data:")
    print(awt_data)



AWT Data:
   Stop Point ID  Number of Buses Observed  AWT (minutes)
0     490010612W                         2            7.5
1     490000217D                         2            9.0
2     490015125C                         2            6.5
3     490000137W                         2            7.0
4     490009136C                         2            8.0
5     490015670N                         1            0.0
6     490009136D                         1            0.0
7     490007882H                         1            0.0
8     490015125B                         2            8.0
9     490015122C                         2            8.5
10    490010612E                         2            7.0


In [1]:
import pandas as pd
import requests
from difflib import SequenceMatcher
from datetime import datetime, timedelta
import pytz
import time

# Initialize empty DataFrame for AWT data
awt_data = pd.DataFrame(columns=['Stop Point ID', 'Number of Buses Observed', 'AWT (minutes)'])

# Function to normalize stop names
def normalize_stop_name(name):
    return ' '.join(name.lower().split())

# Function to fetch data from the API
def fetch_data(url):
    response = requests.get(url)
    return response.json()

# Function to extract schedule names
def extract_schedule_names(data, schedule_names_dict={}):
    if isinstance(data, dict):
        if data.get('$type') == "Tfl.Api.Presentation.Entities.Schedule, Tfl.Api.Presentation.Entities" and 'knownJourneys' in data:
            if 'name' in data:
                schedule_names_dict[data['name']] = data['knownJourneys']
        for key, value in data.items():
            extract_schedule_names(value, schedule_names_dict)
    elif isinstance(data, list):
        for item in data:
            extract_schedule_names(item, schedule_names_dict)
    return schedule_names_dict

# Function to categorize journeys into hourly slots
def categorize_into_slots(timetable):
    slots = [[] for _ in range(24)]
    for journey in timetable:
        hour = int(journey['hour'])  # Convert hour to integer
        if 0 <= hour < 24:  # Ensure hour is within the valid range
            slots[hour].append(journey)
    return slots

# Function to fetch the current day of the week
def get_day_of_week():
    bst = pytz.timezone('Europe/London')
    now = datetime.now(bst)
    return now.strftime('%A')  # %A gives full weekday name (e.g., 'Monday')

# Function to retrieve stop names from TfL API and match with Route_Dir_QSI_No
def find_route_details(lineID, df):
    # Ensure the 'Route_Dir_QSI_No' column exists
    if 'Route_Dir_QSI_No' not in df.columns:
        print("The 'Route_Dir_QSI_No' column is not present in the provided file.")
        return

    # Convert the lineID to uppercase to ensure case-insensitivity
    lineID = lineID.upper()

    # Convert the 'Route_Dir_QSI_No' column to uppercase for comparison
    df['Route_Dir_QSI_No'] = df['Route_Dir_QSI_No'].str.upper()

    # Normalize the stop names in the DataFrame
    df['STOP_NAME'] = df['STOP_NAME'].apply(normalize_stop_name)

    # Filter the DataFrame based on the lineID
    pattern_A = f"^{lineID}_A\\d+$"  # Regular expression for D7_A**
    pattern_B = f"^{lineID}_B\\d+$"  # Regular expression for D7_B**

    # Filter rows where the 'Route_Dir_QSI_No' column matches the pattern
    filtered_df_A = df[df['Route_Dir_QSI_No'].str.match(pattern_A, na=False)]
    filtered_df_B = df[df['Route_Dir_QSI_No'].str.match(pattern_B, na=False)]

    # Function to fetch and process route sequence data from TfL API
    def fetch_and_process_route_data(route_type, pattern, filtered_df):
        api_url = f"https://api.tfl.gov.uk/Line/{lineID}/Route/Sequence/{route_type}"
        response = requests.get(api_url)

        results_list = []

        if response.status_code == 200:
            route_data = response.json()

            # Iterate through each stop in the route data
            for stop in route_data['stopPointSequences'][0]['stopPoint']:
                stop_name_api = normalize_stop_name(stop['name'])
                stop_id = stop['id']

                # Check if the stop_name_api exists in the filtered DataFrame for the correct direction
                matched_row = filtered_df[(filtered_df['STOP_NAME'] == stop_name_api) &
                                          (filtered_df['Route_Dir_QSI_No'].str.match(pattern))]

                if not matched_row.empty:
                    route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                    results_list.append({
                        'Route_Dir_QSI_No': route_dir_qsi_no,
                        'STOP_Name': stop['name'],
                        'ID': stop_id
                    })
                else:
                    # If exact match not found, try partial matching based on words before and after '/'
                    api_stop_name_parts = stop_name_api.split('/')
                    for index, row in filtered_df.iterrows():
                        df_stop_name_parts = row['STOP_NAME'].split('/')
                        for api_part in api_stop_name_parts:
                            for df_part in df_stop_name_parts:
                                if SequenceMatcher(None, df_part.strip(), api_part.strip()).ratio() > 0.8:
                                    matched_row = pd.DataFrame([row])
                                    break
                            if not matched_row.empty:
                                break
                        if not matched_row.empty:
                            break

                    if not matched_row.empty:
                        route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                        results_list.append({
                            'Route_Dir_QSI_No': route_dir_qsi_no,
                            'STOP_Name': stop['name'],
                            'ID': stop_id
                        })
        else:
            print(f"Failed to fetch route sequence data from TfL API for {route_type} route. Status code: {response.status_code}")

        return results_list

    # Fetch and process outbound route data for _A**
    matched_results_A = fetch_and_process_route_data('outbound', pattern_A, filtered_df_A)

    # Fetch and process inbound route data for _B**
    matched_results_B = fetch_and_process_route_data('inbound', pattern_B, filtered_df_B)

    # Create DataFrames from the matched results for each direction
    matched_results_df_A = pd.DataFrame(matched_results_A)
    matched_results_df_B = pd.DataFrame(matched_results_B)

    # Function to remove partial matches if exact matches are found
    def remove_partial_matches(exact_df, matched_df):
        for index, row in exact_df.iterrows():
            exact_stop_name = row['STOP_NAME']
            route_dir_qsi_no = row['Route_Dir_QSI_No']
            # Find exact matches in matched_df
            exact_matches = matched_df[(matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                       (matched_df['STOP_Name'].apply(normalize_stop_name) == exact_stop_name)]
            if not exact_matches.empty:
                # Remove partial matches
                matched_df = matched_df[~((matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                          (matched_df['STOP_Name'].apply(normalize_stop_name) != exact_stop_name))]
        return matched_df

    # Remove partial matches for direction A
    matched_results_df_A = remove_partial_matches(filtered_df_A, matched_results_df_A)

    # Remove partial matches for direction B
    matched_results_df_B = remove_partial_matches(filtered_df_B, matched_results_df_B)

    # Remove duplicate stop names with the same Route_Dir_QSI_No and different IDs
    matched_results_df_A = matched_results_df_A.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')
    matched_results_df_B = matched_results_df_B.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')

    # Print the matched results for direction A
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_A\033[0m\n")
    matched_results_df_A = matched_results_df_A[matched_results_df_A['Route_Dir_QSI_No'].str.match(pattern_A)]
    print(matched_results_df_A[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Print the matched results for direction B
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_B\033[0m\n")
    matched_results_df_B = matched_results_df_B[matched_results_df_B['Route_Dir_QSI_No'].str.match(pattern_B)]
    print(matched_results_df_B[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Concatenate the matched results DataFrames for directions A and B
    combined_df = pd.concat([matched_results_df_A, matched_results_df_B], ignore_index=True)

    # DataFrames to store SWT data
    swt_data = {
        'Route_Dir_QSI_No': [],
        'ID': [],
        'SWT_minutes': [],
        'Number_of_buses': []
    }

    # Fetch timetable for each stop point ID and calculate SWT
    bst = pytz.timezone('Europe/London')
    current_hour = datetime.now(bst).hour
    day_of_week = get_day_of_week()

    # Store selected schedule name to ensure it's printed only once
    selected_schedule_name = None
    printed_schedule_name = False

    # Track printed timetable stop IDs
    printed_timetable_stop_ids = []

    while True:
        # Update current time and hour
        now = datetime.now(bst)
        current_hour = now.hour
        day_of_week = get_day_of_week()

        # Clear previous SWT data
        swt_data = {
            'Route_Dir_QSI_No': [],
            'ID': [],
            'SWT_minutes': [],
            'Number_of_buses': []
        }

        for index, row in combined_df.iterrows():
            stop_point_id = row['ID']
            route_dir_qsi_no = row['Route_Dir_QSI_No']

            if f"{lineID}_A" in route_dir_qsi_no:
                direction = 'outbound'
            elif f"{lineID}_B" in route_dir_qsi_no:
                direction = 'inbound'
            else:
                print(f"Invalid route direction for Route_Dir_QSI_No: {route_dir_qsi_no}")
                continue

            url = f'https://api.tfl.gov.uk/Line/{lineID}/Timetable/{stop_point_id}?direction={direction}'
            data = fetch_data(url)

            schedule_names_dict = extract_schedule_names(data)

            if not selected_schedule_name:
                if day_of_week.lower() in ['monday', 'tuesday', 'wednesday', 'thursday']:
                    preferred_schedule_names = ['Mon-Fri Schooldays', 'Monday to Thursday', 'Monday to Friday']
                elif day_of_week.lower() == 'friday':
                    preferred_schedule_names = ['Mon-Fri Schooldays', 'Monday to Friday', 'Friday']
                elif day_of_week.lower() == 'saturday':
                    preferred_schedule_names = ['Saturday']
                elif day_of_week.lower() == 'sunday':
                    preferred_schedule_names = ['Sunday']
                else:
                    preferred_schedule_names = [day_of_week]

                for preferred_name in preferred_schedule_names:
                    if preferred_name in schedule_names_dict:
                        selected_schedule_name = preferred_name
                        break

            if selected_schedule_name and not printed_schedule_name:
                print(f"\nToday is {day_of_week}. The selected Schedule name is {selected_schedule_name}.")
                printed_schedule_name = True

            if selected_schedule_name:
                timetable = schedule_names_dict[selected_schedule_name]
                slots = categorize_into_slots(timetable)

                # Calculate Scheduled Wait Time (SWT)
                total_buses_this_hour = len(slots[current_hour])
                if total_buses_this_hour > 0:
                    scheduled_wait_time = 60 / (total_buses_this_hour * 2)  # SWT formula
                else:
                    scheduled_wait_time = float('inf')  # Handle division by zero scenario (though unlikely)

                # Store SWT data
                swt_data['Route_Dir_QSI_No'].append(route_dir_qsi_no)
                swt_data['ID'].append(stop_point_id)
                swt_data['SWT_minutes'].append(scheduled_wait_time)
                swt_data['Number_of_buses'].append(total_buses_this_hour)

                # Fetch arrival predictions based on SWT data for printed timetable stop IDs
                if stop_point_id in printed_timetable_stop_ids:
                    # Fetch arrival predictions
                    arrival_predictions_df, station_name = fetch_arrival_predictions(lineID, stop_point_id, direction)

                    if arrival_predictions_df is not None and not arrival_predictions_df.empty:
                        #print(f"\nArrival Predictions for stop point {stop_point_id} ({station_name}):")
                        #print(arrival_predictions_df.to_string(index=False))
                        print(f"   ")
                        # Calculating summary metrics
                        total_wawt = arrival_predictions_df['WAWT'].sum()
                        min_arrival = arrival_predictions_df['Expected Arrival (BST)'].min().replace(second=0, microsecond=0)
                        max_arrival = arrival_predictions_df['Expected Arrival (BST)'].max().replace(second=0, microsecond=0)
                        time_diff_minutes = (max_arrival - min_arrival).total_seconds() / 60
                        num_buses_observed = arrival_predictions_df['Vehicle ID'].nunique()

                        # Calculating AWT, SWT, and EWT
                        nbph = swt_data['Number_of_buses'][swt_data['ID'].index(stop_point_id)]
                        swt = swt_data['SWT_minutes'][swt_data['ID'].index(stop_point_id)]
                        awt = round(total_wawt / time_diff_minutes, 2) if time_diff_minutes > 0 else 0
                        ewt = round(awt - swt, 2)

                        summary_df = pd.DataFrame({
                            'Metric': ['Number of buses scheduled per hour (nbph)', 'Number of buses observed', 'Total WAWT (minutes)',
                                       'Time difference between 1st and last observed buses (minutes)', 'AWT (minutes)', 'SWT (minutes)', 'EWT (minutes)'],
                            'Value': [nbph, num_buses_observed, total_wawt, time_diff_minutes, awt, swt, ewt]
                        })

                        print(f"\nSummary Metrics for {stop_point_id}:")
                        print(summary_df)

                        # Append to awt_data DataFrame
                        awt_data.loc[len(awt_data)] = [stop_point_id, num_buses_observed, awt]

                    # Remove the stop ID from printed_timetable_stop_ids to avoid redundant fetches
                    printed_timetable_stop_ids.remove(stop_point_id)

        # Update printed timetable stop IDs
        printed_timetable_stop_ids = swt_data['ID']

        # Create DataFrame for SWT data
        swt_df = pd.DataFrame(swt_data)

        # Print the SWT DataFrame
        print(f"\n\nSWT DataFrame at hour {current_hour}")
        print(swt_df)


        # Wait for the next 30 seconds
        #clear_output()
        print("\n\nWaiting to fetch updated data...\n\n")
        time.sleep(30)

# Function to fetch arrival predictions with error handling
def fetch_arrival_predictions(line_id, stop_point_id, direction):
    try:
        base_url = f"https://api.tfl.gov.uk/Line/{line_id}/Arrivals/{stop_point_id}"
        params = {'direction': direction}
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        data = response.json()
        if len(data) == 0:
            return pd.DataFrame(), None  # No data available
        station_name = data[0]['stationName']
        predictions = []
        for item in data:
            arrival_time = datetime.strptime(item['expectedArrival'], '%Y-%m-%dT%H:%M:%SZ')
            arrival_time_bst = arrival_time + timedelta(hours=1)
            predictions.append({
                'Line': item['lineName'],
                'Vehicle ID': item['vehicleId'],
                'Stop Point': stop_point_id,
                'Direction': direction,
                'Expected Arrival (BST)': arrival_time_bst,
                'Expected Arrival (HM)': arrival_time_bst.strftime('%H:%M')
            })
        df = pd.DataFrame(predictions)
        df = df.sort_values(by='Expected Arrival (BST)', ascending=True)
        df['Expected Arrival (BST)'] = pd.to_datetime(df['Expected Arrival (BST)'])  # Convert to datetime
        df['Expected Arrival (HM)'] = pd.to_datetime(df['Expected Arrival (HM)'], format='%H:%M')
        df['Headway (minutes)'] = df['Expected Arrival (HM)'].diff().fillna(pd.Timedelta(seconds=0)).dt.total_seconds() / 60
        df['AWT/bus (minutes)'] = (df['Headway (minutes)'] / 2).round(2)
        df['WAWT'] = (df['Headway (minutes)'] * df['AWT/bus (minutes)']).round(2)
        return df, station_name
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None, None
        time.sleep(30)  # Wait for 30 seconds before fetching data again

# Example usage
if __name__ == "__main__":
    # Load the Excel file into a DataFrame
    file_path = '/content/QSI points.xlsx'  # Modify this path accordingly
    df = pd.read_excel(file_path)

    # Ask the user to enter a lineID
    line_id_input = input("Please enter the lineID: ")

    # Find and display the route details
    find_route_details(line_id_input, df)


Please enter the lineID: w14


[1m[4mQSI stop points for direction W14_A[0m

  Route_Dir_QSI_No                          STOP_Name          ID
0           W14_A1                         Cross Road  490004984N
1           W14_A2                     Orchard Estate  490010612W
2           W14_A3             South Woodford Station  490000217D
4           W14_A4   Wanstead Station  / George Green  490015125C
5           W14_A5                Leytonstone Station  490000137W
7           W14_A6  Leyton High Road / Leyton Station  490009136C


[1m[4mQSI stop points for direction W14_B[0m

  Route_Dir_QSI_No                          STOP_Name          ID
0           W14_B1                       Leyton Mills  490015670N
1           W14_B2  Leyton High Road / Leyton Station  490009136D
2           W14_B3                        Harvey Road  490007882H
3           W14_B4                   Wanstead Station  490015125B
4           W14_B5             South Woodford Station  490015122C
5          

KeyboardInterrupt: 

Working code with awt dataframe. , but appending data every iteration

In [6]:
import pandas as pd
import requests
from difflib import SequenceMatcher
from datetime import datetime, timedelta
import pytz
import time

# Initialize empty DataFrame for AWT data
awt_data = pd.DataFrame(columns=['Stop Point ID', 'Number of Buses Observed', 'AWT (minutes)'])

# Function to normalize stop names
def normalize_stop_name(name):
    return ' '.join(name.lower().split())

# Function to fetch data from the API
def fetch_data(url):
    response = requests.get(url)
    return response.json()

# Function to extract schedule names
def extract_schedule_names(data, schedule_names_dict={}):
    if isinstance(data, dict):
        if data.get('$type') == "Tfl.Api.Presentation.Entities.Schedule, Tfl.Api.Presentation.Entities" and 'knownJourneys' in data:
            if 'name' in data:
                schedule_names_dict[data['name']] = data['knownJourneys']
        for key, value in data.items():
            extract_schedule_names(value, schedule_names_dict)
    elif isinstance(data, list):
        for item in data:
            extract_schedule_names(item, schedule_names_dict)
    return schedule_names_dict

# Function to categorize journeys into hourly slots
def categorize_into_slots(timetable):
    slots = [[] for _ in range(24)]
    for journey in timetable:
        hour = int(journey['hour'])  # Convert hour to integer
        if 0 <= hour < 24:  # Ensure hour is within the valid range
            slots[hour].append(journey)
    return slots

# Function to fetch the current day of the week
def get_day_of_week():
    bst = pytz.timezone('Europe/London')
    now = datetime.now(bst)
    return now.strftime('%A')  # %A gives full weekday name (e.g., 'Monday')

# Function to retrieve stop names from TfL API and match with Route_Dir_QSI_No
def find_route_details(lineID, df):
    # Ensure the 'Route_Dir_QSI_No' column exists
    if 'Route_Dir_QSI_No' not in df.columns:
        print("The 'Route_Dir_QSI_No' column is not present in the provided file.")
        return

    # Convert the lineID to uppercase to ensure case-insensitivity
    lineID = lineID.upper()

    # Convert the 'Route_Dir_QSI_No' column to uppercase for comparison
    df['Route_Dir_QSI_No'] = df['Route_Dir_QSI_No'].str.upper()

    # Normalize the stop names in the DataFrame
    df['STOP_NAME'] = df['STOP_NAME'].apply(normalize_stop_name)

    # Filter the DataFrame based on the lineID
    pattern_A = f"^{lineID}_A\\d+$"  # Regular expression for D7_A**
    pattern_B = f"^{lineID}_B\\d+$"  # Regular expression for D7_B**

    # Filter rows where the 'Route_Dir_QSI_No' column matches the pattern
    filtered_df_A = df[df['Route_Dir_QSI_No'].str.match(pattern_A, na=False)]
    filtered_df_B = df[df['Route_Dir_QSI_No'].str.match(pattern_B, na=False)]

    # Function to fetch and process route sequence data from TfL API
    def fetch_and_process_route_data(route_type, pattern, filtered_df):
        api_url = f"https://api.tfl.gov.uk/Line/{lineID}/Route/Sequence/{route_type}"
        response = requests.get(api_url)

        results_list = []

        if response.status_code == 200:
            route_data = response.json()

            # Iterate through each stop in the route data
            for stop in route_data['stopPointSequences'][0]['stopPoint']:
                stop_name_api = normalize_stop_name(stop['name'])
                stop_id = stop['id']

                # Check if the stop_name_api exists in the filtered DataFrame for the correct direction
                matched_row = filtered_df[(filtered_df['STOP_NAME'] == stop_name_api) &
                                          (filtered_df['Route_Dir_QSI_No'].str.match(pattern))]

                if not matched_row.empty:
                    route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                    results_list.append({
                        'Route_Dir_QSI_No': route_dir_qsi_no,
                        'STOP_Name': stop['name'],
                        'ID': stop_id
                    })
                else:
                    # If exact match not found, try partial matching based on words before and after '/'
                    api_stop_name_parts = stop_name_api.split('/')
                    for index, row in filtered_df.iterrows():
                        df_stop_name_parts = row['STOP_NAME'].split('/')
                        for api_part in api_stop_name_parts:
                            for df_part in df_stop_name_parts:
                                if SequenceMatcher(None, df_part.strip(), api_part.strip()).ratio() > 0.8:
                                    matched_row = pd.DataFrame([row])
                                    break
                            if not matched_row.empty:
                                break
                        if not matched_row.empty:
                            break

                    if not matched_row.empty:
                        route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                        results_list.append({
                            'Route_Dir_QSI_No': route_dir_qsi_no,
                            'STOP_Name': stop['name'],
                            'ID': stop_id
                        })
        else:
            print(f"Failed to fetch route sequence data from TfL API for {route_type} route. Status code: {response.status_code}")

        return results_list

    # Fetch and process outbound route data for _A**
    matched_results_A = fetch_and_process_route_data('outbound', pattern_A, filtered_df_A)

    # Fetch and process inbound route data for _B**
    matched_results_B = fetch_and_process_route_data('inbound', pattern_B, filtered_df_B)

    # Create DataFrames from the matched results for each direction
    matched_results_df_A = pd.DataFrame(matched_results_A)
    matched_results_df_B = pd.DataFrame(matched_results_B)

    # Function to remove partial matches if exact matches are found
    def remove_partial_matches(exact_df, matched_df):
        for index, row in exact_df.iterrows():
            exact_stop_name = row['STOP_NAME']
            route_dir_qsi_no = row['Route_Dir_QSI_No']
            # Find exact matches in matched_df
            exact_matches = matched_df[(matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                       (matched_df['STOP_Name'].apply(normalize_stop_name) == exact_stop_name)]
            if not exact_matches.empty:
                # Remove partial matches
                matched_df = matched_df[~((matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                          (matched_df['STOP_Name'].apply(normalize_stop_name) != exact_stop_name))]
        return matched_df

    # Remove partial matches for direction A
    matched_results_df_A = remove_partial_matches(filtered_df_A, matched_results_df_A)

    # Remove partial matches for direction B
    matched_results_df_B = remove_partial_matches(filtered_df_B, matched_results_df_B)

    # Remove duplicate stop names with the same Route_Dir_QSI_No and different IDs
    matched_results_df_A = matched_results_df_A.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')
    matched_results_df_B = matched_results_df_B.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')

    # Print the matched results for direction A
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_A\033[0m\n")
    matched_results_df_A = matched_results_df_A[matched_results_df_A['Route_Dir_QSI_No'].str.match(pattern_A)]
    print(matched_results_df_A[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Print the matched results for direction B
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_B\033[0m\n")
    matched_results_df_B = matched_results_df_B[matched_results_df_B['Route_Dir_QSI_No'].str.match(pattern_B)]
    print(matched_results_df_B[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Concatenate the matched results DataFrames for directions A and B
    combined_df = pd.concat([matched_results_df_A, matched_results_df_B], ignore_index=True)

    # DataFrames to store SWT data
    swt_data = {
        'Route_Dir_QSI_No': [],
        'ID': [],
        'SWT_minutes': [],
        'Number_of_buses': []
    }

    # Fetch timetable for each stop point ID and calculate SWT
    bst = pytz.timezone('Europe/London')
    current_hour = datetime.now(bst).hour
    day_of_week = get_day_of_week()

    # Store selected schedule name to ensure it's printed only once
    selected_schedule_name = None
    printed_schedule_name = False

    # Track printed timetable stop IDs
    printed_timetable_stop_ids = []

    while True:
        # Update current time and hour
        now = datetime.now(bst)
        current_hour = now.hour
        day_of_week = get_day_of_week()

        # Clear previous SWT data
        swt_data = {
            'Route_Dir_QSI_No': [],
            'ID': [],
            'SWT_minutes': [],
            'Number_of_buses': []
        }

        for index, row in combined_df.iterrows():
            stop_point_id = row['ID']
            route_dir_qsi_no = row['Route_Dir_QSI_No']

            if f"{lineID}_A" in route_dir_qsi_no:
                direction = 'outbound'
            elif f"{lineID}_B" in route_dir_qsi_no:
                direction = 'inbound'
            else:
                print(f"Invalid route direction for Route_Dir_QSI_No: {route_dir_qsi_no}")
                continue

            url = f'https://api.tfl.gov.uk/Line/{lineID}/Timetable/{stop_point_id}?direction={direction}'
            data = fetch_data(url)

            schedule_names_dict = extract_schedule_names(data)

            if not selected_schedule_name:
                if day_of_week.lower() in ['monday', 'tuesday', 'wednesday', 'thursday']:
                    preferred_schedule_names = ['Mon-Fri Schooldays', 'Monday to Thursday', 'Monday to Friday']
                elif day_of_week.lower() == 'friday':
                    preferred_schedule_names = ['Mon-Fri Schooldays', 'Monday to Friday', 'Friday']
                elif day_of_week.lower() == 'saturday':
                    preferred_schedule_names = ['Saturday']
                elif day_of_week.lower() == 'sunday':
                    preferred_schedule_names = ['Sunday']
                else:
                    preferred_schedule_names = [day_of_week]

                for preferred_name in preferred_schedule_names:
                    if preferred_name in schedule_names_dict:
                        selected_schedule_name = preferred_name
                        break

            if selected_schedule_name and not printed_schedule_name:
                print(f"\nToday is {day_of_week}. The selected Schedule name is {selected_schedule_name}.")
                printed_schedule_name = True

            if selected_schedule_name:
                timetable = schedule_names_dict[selected_schedule_name]
                slots = categorize_into_slots(timetable)

                # Calculate Scheduled Wait Time (SWT)
                total_buses_this_hour = len(slots[current_hour])
                if total_buses_this_hour > 0:
                    scheduled_wait_time = 60 / (total_buses_this_hour * 2)  # SWT formula
                else:
                    scheduled_wait_time = float('inf')  # Handle division by zero scenario (though unlikely)

                # Store SWT data
                swt_data['Route_Dir_QSI_No'].append(route_dir_qsi_no)
                swt_data['ID'].append(stop_point_id)
                swt_data['SWT_minutes'].append(scheduled_wait_time)
                swt_data['Number_of_buses'].append(total_buses_this_hour)

                # Fetch arrival predictions based on SWT data for printed timetable stop IDs
                if stop_point_id in printed_timetable_stop_ids:
                    # Fetch arrival predictions
                    arrival_predictions_df, station_name = fetch_arrival_predictions(lineID, stop_point_id, direction)

                    if arrival_predictions_df is not None and not arrival_predictions_df.empty:
                        #print(f"\nArrival Predictions for stop point {stop_point_id} ({station_name}):")
                        #print(arrival_predictions_df.to_string(index=False))
                        print(f"   ")
                        # Calculating summary metrics
                        total_wawt = arrival_predictions_df['WAWT'].sum()
                        min_arrival = arrival_predictions_df['Expected Arrival (BST)'].min().replace(second=0, microsecond=0)
                        max_arrival = arrival_predictions_df['Expected Arrival (BST)'].max().replace(second=0, microsecond=0)
                        time_diff_minutes = (max_arrival - min_arrival).total_seconds() / 60
                        num_buses_observed = arrival_predictions_df['Vehicle ID'].nunique()

                        # Calculating AWT, SWT, and EWT
                        nbph = swt_data['Number_of_buses'][swt_data['ID'].index(stop_point_id)]
                        swt = swt_data['SWT_minutes'][swt_data['ID'].index(stop_point_id)]
                        awt = round(total_wawt / time_diff_minutes, 2) if time_diff_minutes > 0 else 0
                        ewt = round(awt - swt, 2)

                        summary_df = pd.DataFrame({
                            'Metric': ['Number of buses scheduled per hour (nbph)', 'Number of buses observed', 'Total WAWT (minutes)',
                                       'Time difference between 1st and last observed buses (minutes)', 'AWT (minutes)', 'SWT (minutes)', 'EWT (minutes)'],
                            'Value': [nbph, num_buses_observed, total_wawt, time_diff_minutes, awt, swt, ewt]
                        })

                        print(f"\nSummary Metrics for {stop_point_id}:")
                        print(summary_df)

                        # Append to awt_data DataFrame
                        awt_data.loc[len(awt_data)] = [stop_point_id, num_buses_observed, awt]

                    # Remove the stop ID from printed_timetable_stop_ids to avoid redundant fetches
                    printed_timetable_stop_ids.remove(stop_point_id)

        # Update printed timetable stop IDs
        printed_timetable_stop_ids = swt_data['ID']

        # Create DataFrame for SWT data
        swt_df = pd.DataFrame(swt_data)
        awt_df = pd.DataFrame(awt_data)
        # Print the SWT DataFrame
        print(f"\n\nSWT DataFrame at hour {current_hour}")
        print(swt_df)

        print(f"\n\nAWT DataFrame at hour {current_hour}")
        print(awt_df)

        # Wait for the next 30 seconds
        #clear_output()
        print("\n\nWaiting to fetch updated data...\n\n")
        time.sleep(30)

# Function to fetch arrival predictions with error handling
def fetch_arrival_predictions(line_id, stop_point_id, direction):
    try:
        base_url = f"https://api.tfl.gov.uk/Line/{line_id}/Arrivals/{stop_point_id}"
        params = {'direction': direction}
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        data = response.json()
        if len(data) == 0:
            return pd.DataFrame(), None  # No data available
        station_name = data[0]['stationName']
        predictions = []
        for item in data:
            arrival_time = datetime.strptime(item['expectedArrival'], '%Y-%m-%dT%H:%M:%SZ')
            arrival_time_bst = arrival_time + timedelta(hours=1)
            predictions.append({
                'Line': item['lineName'],
                'Vehicle ID': item['vehicleId'],
                'Stop Point': stop_point_id,
                'Direction': direction,
                'Expected Arrival (BST)': arrival_time_bst,
                'Expected Arrival (HM)': arrival_time_bst.strftime('%H:%M')
            })
        df = pd.DataFrame(predictions)
        df = df.sort_values(by='Expected Arrival (BST)', ascending=True)
        df['Expected Arrival (BST)'] = pd.to_datetime(df['Expected Arrival (BST)'])  # Convert to datetime
        df['Expected Arrival (HM)'] = pd.to_datetime(df['Expected Arrival (HM)'], format='%H:%M')
        df['Headway (minutes)'] = df['Expected Arrival (HM)'].diff().fillna(pd.Timedelta(seconds=0)).dt.total_seconds() / 60
        df['AWT/bus (minutes)'] = (df['Headway (minutes)'] / 2).round(2)
        df['WAWT'] = (df['Headway (minutes)'] * df['AWT/bus (minutes)']).round(2)
        return df, station_name
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None, None
        time.sleep(30)  # Wait for 30 seconds before fetching data again

# Example usage
if __name__ == "__main__":
    # Load the Excel file into a DataFrame
    file_path = '/content/QSI points.xlsx'  # Modify this path accordingly
    df = pd.read_excel(file_path)

    # Ask the user to enter a lineID
    line_id_input = input("Please enter the lineID: ")

    while True:
        try:
            # Find and display the route details
            find_route_details(line_id_input, df)

            # Print SWT DataFrame
            print(f"\n\nCurrent SWT Dataframe:\n{swt_df}")

            # Print AWT Data
            print(f"\n\nCurrent AWT Data:\n{awt_data}")

            # Wait for 30 seconds before fetching updated data
            print("\n\nWaiting for 30 seconds before fetching updated data...\n\n")
            time.sleep(30)

        except KeyboardInterrupt:
            print("\n\nExecution interrupted. Exiting the loop.\n")
            break

Please enter the lineID: 25


[1m[4mQSI stop points for direction 25_A[0m

  Route_Dir_QSI_No                             STOP_Name          ID
0            25_A1                       Hainault Street  490007657V
1            25_A2                     High Street North  490008189W
2            25_A3                          Green Street  490007497W
3            25_A4                 Stratford Bus Station  490012904T
4            25_A5                    Bow Church Station  490002019B
5            25_A6                      Mile End Station  490000146C
6            25_A7  Whitechapel Stn  / Royal London Hosp  490013541A
8            25_A8                       Aldgate Station  490000003R
9            25_A9              Bank Station  / Cornhill  490000013E


[1m[4mQSI stop points for direction 25_B[0m

  Route_Dir_QSI_No                              STOP_Name          ID
1            25_B2               Bank Station  / Cornhill  490000013D
2            25_B3                        

Modifying the above code for preventing the data appending in awt df

In [11]:
import pandas as pd
import requests
from difflib import SequenceMatcher
from datetime import datetime, timedelta
import pytz
import time

# Initialize empty DataFrame for AWT data
awt_data = pd.DataFrame(columns=['Stop Point ID', 'Number of Buses Observed', 'AWT (minutes)'])

# Function to normalize stop names
def normalize_stop_name(name):
    return ' '.join(name.lower().split())

# Function to fetch data from the API
def fetch_data(url):
    response = requests.get(url)
    return response.json()

# Function to extract schedule names
def extract_schedule_names(data, schedule_names_dict={}):
    if isinstance(data, dict):
        if data.get('$type') == "Tfl.Api.Presentation.Entities.Schedule, Tfl.Api.Presentation.Entities" and 'knownJourneys' in data:
            if 'name' in data:
                schedule_names_dict[data['name']] = data['knownJourneys']
        for key, value in data.items():
            extract_schedule_names(value, schedule_names_dict)
    elif isinstance(data, list):
        for item in data:
            extract_schedule_names(item, schedule_names_dict)
    return schedule_names_dict

# Function to categorize journeys into hourly slots
def categorize_into_slots(timetable):
    slots = [[] for _ in range(24)]
    for journey in timetable:
        hour = int(journey['hour'])  # Convert hour to integer
        if 0 <= hour < 24:  # Ensure hour is within the valid range
            slots[hour].append(journey)
    return slots

# Function to fetch the current day of the week
def get_day_of_week():
    bst = pytz.timezone('Europe/London')
    now = datetime.now(bst)
    return now.strftime('%A')  # %A gives full weekday name (e.g., 'Monday')

# Function to retrieve stop names from TfL API and match with Route_Dir_QSI_No
def find_route_details(lineID, df):
    # Ensure the 'Route_Dir_QSI_No' column exists
    if 'Route_Dir_QSI_No' not in df.columns:
        print("The 'Route_Dir_QSI_No' column is not present in the provided file.")
        return

    # Convert the lineID to uppercase to ensure case-insensitivity
    lineID = lineID.upper()

    # Convert the 'Route_Dir_QSI_No' column to uppercase for comparison
    df['Route_Dir_QSI_No'] = df['Route_Dir_QSI_No'].str.upper()

    # Normalize the stop names in the DataFrame
    df['STOP_NAME'] = df['STOP_NAME'].apply(normalize_stop_name)

    # Filter the DataFrame based on the lineID
    pattern_A = f"^{lineID}_A\\d+$"  # Regular expression for D7_A**
    pattern_B = f"^{lineID}_B\\d+$"  # Regular expression for D7_B**

    # Filter rows where the 'Route_Dir_QSI_No' column matches the pattern
    filtered_df_A = df[df['Route_Dir_QSI_No'].str.match(pattern_A, na=False)]
    filtered_df_B = df[df['Route_Dir_QSI_No'].str.match(pattern_B, na=False)]

    # Function to fetch and process route sequence data from TfL API
    def fetch_and_process_route_data(route_type, pattern, filtered_df):
        api_url = f"https://api.tfl.gov.uk/Line/{lineID}/Route/Sequence/{route_type}"
        response = requests.get(api_url)

        results_list = []

        if response.status_code == 200:
            route_data = response.json()

            # Iterate through each stop in the route data
            for stop in route_data['stopPointSequences'][0]['stopPoint']:
                stop_name_api = normalize_stop_name(stop['name'])
                stop_id = stop['id']

                # Check if the stop_name_api exists in the filtered DataFrame for the correct direction
                matched_row = filtered_df[(filtered_df['STOP_NAME'] == stop_name_api) &
                                          (filtered_df['Route_Dir_QSI_No'].str.match(pattern))]

                if not matched_row.empty:
                    route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                    results_list.append({
                        'Route_Dir_QSI_No': route_dir_qsi_no,
                        'STOP_Name': stop['name'],
                        'ID': stop_id
                    })
                else:
                    # If exact match not found, try partial matching based on words before and after '/'
                    api_stop_name_parts = stop_name_api.split('/')
                    for index, row in filtered_df.iterrows():
                        df_stop_name_parts = row['STOP_NAME'].split('/')
                        for api_part in api_stop_name_parts:
                            for df_part in df_stop_name_parts:
                                if SequenceMatcher(None, df_part.strip(), api_part.strip()).ratio() > 0.8:
                                    matched_row = pd.DataFrame([row])
                                    break
                            if not matched_row.empty:
                                break
                        if not matched_row.empty:
                            break

                    if not matched_row.empty:
                        route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                        results_list.append({
                            'Route_Dir_QSI_No': route_dir_qsi_no,
                            'STOP_Name': stop['name'],
                            'ID': stop_id
                        })
        else:
            print(f"Failed to fetch route sequence data from TfL API for {route_type} route. Status code: {response.status_code}")

        return results_list

    # Fetch and process outbound route data for _A**
    matched_results_A = fetch_and_process_route_data('outbound', pattern_A, filtered_df_A)

    # Fetch and process inbound route data for _B**
    matched_results_B = fetch_and_process_route_data('inbound', pattern_B, filtered_df_B)

    # Create DataFrames from the matched results for each direction
    matched_results_df_A = pd.DataFrame(matched_results_A)
    matched_results_df_B = pd.DataFrame(matched_results_B)

    # Function to remove partial matches if exact matches are found
    def remove_partial_matches(exact_df, matched_df):
        for index, row in exact_df.iterrows():
            exact_stop_name = row['STOP_NAME']
            route_dir_qsi_no = row['Route_Dir_QSI_No']
            # Find exact matches in matched_df
            exact_matches = matched_df[(matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                       (matched_df['STOP_Name'].apply(normalize_stop_name) == exact_stop_name)]
            if not exact_matches.empty:
                # Remove partial matches
                matched_df = matched_df[~((matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                          (matched_df['STOP_Name'].apply(normalize_stop_name) != exact_stop_name))]
        return matched_df

    # Remove partial matches for direction A
    matched_results_df_A = remove_partial_matches(filtered_df_A, matched_results_df_A)

    # Remove partial matches for direction B
    matched_results_df_B = remove_partial_matches(filtered_df_B, matched_results_df_B)

    # Remove duplicate stop names with the same Route_Dir_QSI_No and different IDs
    matched_results_df_A = matched_results_df_A.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')
    matched_results_df_B = matched_results_df_B.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')

    # Print the matched results for direction A
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_A\033[0m\n")
    matched_results_df_A = matched_results_df_A[matched_results_df_A['Route_Dir_QSI_No'].str.match(pattern_A)]
    print(matched_results_df_A[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Print the matched results for direction B
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_B\033[0m\n")
    matched_results_df_B = matched_results_df_B[matched_results_df_B['Route_Dir_QSI_No'].str.match(pattern_B)]
    print(matched_results_df_B[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Concatenate the matched results DataFrames for directions A and B
    combined_df = pd.concat([matched_results_df_A, matched_results_df_B], ignore_index=True)

    # DataFrames to store SWT data
    swt_data = {
        'Route_Dir_QSI_No': [],
        'ID': [],
        'SWT_minutes': [],
        'Number_of_buses': []
    }

    # Fetch timetable for each stop point ID and calculate SWT
    bst = pytz.timezone('Europe/London')
    current_hour = datetime.now(bst).hour
    day_of_week = get_day_of_week()

    # Store selected schedule name to ensure it's printed only once
    selected_schedule_name = None
    printed_schedule_name = False

    # Track printed timetable stop IDs
    printed_timetable_stop_ids = []

    while True:
        # Update current time and hour
        now = datetime.now(bst)
        current_hour = now.hour
        day_of_week = get_day_of_week()

        # Clear previous SWT data
        swt_data = {
            'Route_Dir_QSI_No': [],
            'ID': [],
            'SWT_minutes': [],
            'Number_of_buses': []
        }

        for index, row in combined_df.iterrows():
            stop_point_id = row['ID']
            route_dir_qsi_no = row['Route_Dir_QSI_No']

            if f"{lineID}_A" in route_dir_qsi_no:
                direction = 'outbound'
            elif f"{lineID}_B" in route_dir_qsi_no:
                direction = 'inbound'
            else:
                print(f"Invalid route direction for Route_Dir_QSI_No: {route_dir_qsi_no}")
                continue

            url = f'https://api.tfl.gov.uk/Line/{lineID}/Timetable/{stop_point_id}?direction={direction}'
            data = fetch_data(url)

            schedule_names_dict = extract_schedule_names(data)

            if not selected_schedule_name:
                if day_of_week.lower() in ['monday', 'tuesday', 'wednesday', 'thursday']:
                    preferred_schedule_names = ['Mon-Fri Schooldays', 'Monday to Thursday', 'Monday to Friday']
                elif day_of_week.lower() == 'friday':
                    preferred_schedule_names = ['Mon-Fri Schooldays', 'Monday to Friday', 'Friday']
                elif day_of_week.lower() == 'saturday':
                    preferred_schedule_names = ['Saturday']
                elif day_of_week.lower() == 'sunday':
                    preferred_schedule_names = ['Sunday']
                else:
                    preferred_schedule_names = [day_of_week]

                for preferred_name in preferred_schedule_names:
                    if preferred_name in schedule_names_dict:
                        selected_schedule_name = preferred_name
                        break

            if selected_schedule_name and not printed_schedule_name:
                print(f"\nToday is {day_of_week}. The selected Schedule name is {selected_schedule_name}.")
                printed_schedule_name = True

            if selected_schedule_name:
                timetable = schedule_names_dict[selected_schedule_name]
                slots = categorize_into_slots(timetable)

                # Calculate Scheduled Wait Time (SWT)
                total_buses_this_hour = len(slots[current_hour])
                if total_buses_this_hour > 0:
                    scheduled_wait_time = 60 / (total_buses_this_hour * 2)  # SWT formula
                else:
                    scheduled_wait_time = float('inf')  # Handle division by zero scenario (though unlikely)

                # Store SWT data
                swt_data['Route_Dir_QSI_No'].append(route_dir_qsi_no)
                swt_data['ID'].append(stop_point_id)
                swt_data['SWT_minutes'].append(scheduled_wait_time)
                swt_data['Number_of_buses'].append(total_buses_this_hour)

                # Fetch arrival predictions based on SWT data for printed timetable stop IDs
                if stop_point_id in printed_timetable_stop_ids:
                    # Fetch arrival predictions
                    arrival_predictions_df, station_name = fetch_arrival_predictions(lineID, stop_point_id, direction)

                    if arrival_predictions_df is not None and not arrival_predictions_df.empty:
                        #print(f"\nArrival Predictions for stop point {stop_point_id} ({station_name}):")
                        #print(arrival_predictions_df.to_string(index=False))
                        print(f"   ")
                        # Calculating summary metrics
                        total_wawt = arrival_predictions_df['WAWT'].sum()
                        min_arrival = arrival_predictions_df['Expected Arrival (BST)'].min().replace(second=0, microsecond=0)
                        max_arrival = arrival_predictions_df['Expected Arrival (BST)'].max().replace(second=0, microsecond=0)
                        time_diff_minutes = (max_arrival - min_arrival).total_seconds() / 60
                        num_buses_observed = arrival_predictions_df['Vehicle ID'].nunique()

                        # Calculating AWT, SWT, and EWT
                        nbph = swt_data['Number_of_buses'][swt_data['ID'].index(stop_point_id)]
                        swt = swt_data['SWT_minutes'][swt_data['ID'].index(stop_point_id)]
                        awt = round(total_wawt / time_diff_minutes, 2) if time_diff_minutes > 0 else 0
                        ewt = round(awt - swt, 2)

                        summary_df = pd.DataFrame({
                            'Metric': ['Number of buses scheduled per hour (nbph)', 'Number of buses observed', 'Total WAWT (minutes)',
                                       'Time difference between 1st and last observed buses (minutes)', 'AWT (minutes)', 'SWT (minutes)', 'EWT (minutes)'],
                            'Value': [nbph, num_buses_observed, total_wawt, time_diff_minutes, awt, swt, ewt]
                        })

                        print(f"\nSummary Metrics for {stop_point_id}:")
                        print(summary_df)

                        # Append to awt_data DataFrame
                        awt_data.loc[len(awt_data)] = [stop_point_id, num_buses_observed, awt]

                    # Remove the stop ID from printed_timetable_stop_ids to avoid redundant fetches
                    printed_timetable_stop_ids.remove(stop_point_id)

        # Update printed timetable stop IDs
        printed_timetable_stop_ids = swt_data['ID']

        # Create DataFrame for SWT data
        swt_df = pd.DataFrame(swt_data)
        awt_df = pd.DataFrame(awt_data)
        # Print the SWT DataFrame
        print(f"\n\nSWT DataFrame at hour {current_hour}")
        print(swt_df)

        print(f"\n\nAWT DataFrame at hour {current_hour}")
        print(awt_df)

        # Wait for the next 30 seconds
        #clear_output()
        print("\n\nWaiting to fetch updated data...\n\n")
        time.sleep(30)

# Function to fetch arrival predictions with error handling
def fetch_arrival_predictions(line_id, stop_point_id, direction):
    try:
        base_url = f"https://api.tfl.gov.uk/Line/{line_id}/Arrivals/{stop_point_id}"
        params = {'direction': direction}
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        data = response.json()
        if len(data) == 0:
            return pd.DataFrame(), None  # No data available
        station_name = data[0]['stationName']
        predictions = []
        for item in data:
            arrival_time = datetime.strptime(item['expectedArrival'], '%Y-%m-%dT%H:%M:%SZ')
            arrival_time_bst = arrival_time + timedelta(hours=1)
            predictions.append({
                'Line': item['lineName'],
                'Vehicle ID': item['vehicleId'],
                'Stop Point': stop_point_id,
                'Direction': direction,
                'Expected Arrival (BST)': arrival_time_bst,
                'Expected Arrival (HM)': arrival_time_bst.strftime('%H:%M')
            })
        df = pd.DataFrame(predictions)
        df = df.sort_values(by='Expected Arrival (BST)', ascending=True)
        df['Expected Arrival (BST)'] = pd.to_datetime(df['Expected Arrival (BST)'])  # Convert to datetime
        df['Expected Arrival (HM)'] = pd.to_datetime(df['Expected Arrival (HM)'], format='%H:%M')
        df['Headway (minutes)'] = df['Expected Arrival (HM)'].diff().fillna(pd.Timedelta(seconds=0)).dt.total_seconds() / 60
        df['AWT/bus (minutes)'] = (df['Headway (minutes)'] / 2).round(2)
        df['WAWT'] = (df['Headway (minutes)'] * df['AWT/bus (minutes)']).round(2)
        return df, station_name
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None, None
        time.sleep(30)  # Wait for 30 seconds before fetching data again

# Example usage
if __name__ == "__main__":
    # Load the Excel file into a DataFrame
    file_path = '/content/QSI points.xlsx'  # Modify this path accordingly
    df = pd.read_excel(file_path)

    # Ask the user to enter a lineID
    line_id_input = input("Please enter the lineID: ")

    while True:
        try:
            # Find and display the route details
            find_route_details(line_id_input, df)

            time.sleep(30)

        except KeyboardInterrupt:
            print("\n\nExecution interrupted. Exiting the loop.\n")
            break

Please enter the lineID: 425


[1m[4mQSI stop points for direction 425_A[0m

  Route_Dir_QSI_No                         STOP_Name          ID
0           425_A1                   Hainault Street  490007657V
1           425_A2                 High Street North  490008189W
2           425_A3                      Green Street  490007497W
3           425_A4             Stratford Bus Station  490012904T
4           425_A5                Bow Church Station  490002019B
6           425_A6  Mile End Station / Mile End Road  490000146G
7           425_A7                      Moulins Road  490010068N
8           425_A8                 Homerton Hospital  490008327F


[1m[4mQSI stop points for direction 425_B[0m

  Route_Dir_QSI_No                          STOP_Name          ID
0           425_B1                   Nightingale Road  490010280N
2           425_B2  Homerton Hospital / Wardle Street  490008327E
3           425_B3                     Penshurst Road  490012255S
4           425_B4  

Modification to overwrite awt