<a href="https://colab.research.google.com/github/RemyaVKarthikeyan/AA-Stagecoach-Project/blob/main/19_07_2024_07_56_Route_AWT_A_and_B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

with dataframe showing the number of buses observed in an hr and AWT calculation

18/07/2024

In [1]:
import pandas as pd
import requests
from difflib import SequenceMatcher

# Function to normalize stop names
def normalize_stop_name(name):
    return ' '.join(name.lower().split())

# Modify this path accordingly
file_path = '/content/QSI points.xlsx'

# Read the Excel file into a DataFrame
df = pd.read_excel(file_path)

# Get the lineID from the user
lineID = input("Please enter the lineID: ")

# Check if the necessary column is present
if 'Route_Dir_QSI_No' not in df.columns:
    print("The 'Route_Dir_QSI_No' column is not present in the provided file.")
else:
    # Convert the lineID to uppercase to ensure case-insensitivity
    lineID = lineID.upper()

    # Convert the 'Route_Dir_QSI_No' column to uppercase for comparison
    df['Route_Dir_QSI_No'] = df['Route_Dir_QSI_No'].str.upper()

    # Normalize the stop names in the DataFrame
    df['STOP_NAME'] = df['STOP_NAME'].apply(normalize_stop_name)

    # Create regular expressions for filtering
    pattern_A = f"^{lineID}_A\\d+$"  # Regular expression for lineID_A**
    pattern_B = f"^{lineID}_B\\d+$"  # Regular expression for lineID_B**

    # Filter rows where the 'Route_Dir_QSI_No' column matches the pattern
    filtered_df_A = df[df['Route_Dir_QSI_No'].str.match(pattern_A, na=False)][['Route_Dir_QSI_No', 'STOP_NAME']]
    filtered_df_B = df[df['Route_Dir_QSI_No'].str.match(pattern_B, na=False)][['Route_Dir_QSI_No', 'STOP_NAME']]

    # Function to fetch and process route sequence data from TfL API
    def fetch_and_process_route_data(route_type, pattern, filtered_df):
        api_url = f"https://api.tfl.gov.uk/Line/{lineID}/Route/Sequence/{route_type}"
        response = requests.get(api_url)

        results_list = []

        if response.status_code == 200:
            route_data = response.json()

            # Iterate through each stop in the route data
            for stop in route_data['stopPointSequences'][0]['stopPoint']:
                stop_name_api = normalize_stop_name(stop['name'])
                stop_id = stop['id']

                # Check if the stop_name_api exists in the filtered DataFrame for the correct direction
                matched_row = filtered_df[(filtered_df['STOP_NAME'] == stop_name_api) &
                                          (filtered_df['Route_Dir_QSI_No'].str.match(pattern))]

                if not matched_row.empty:
                    route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                    results_list.append({
                        'Route_Dir_QSI_No': route_dir_qsi_no,
                        'STOP_Name': stop['name'],
                        'ID': stop_id
                    })
                else:
                    # If exact match not found, try partial matching based on words before and after '/'
                    api_stop_name_parts = stop_name_api.split('/')
                    for index, row in filtered_df.iterrows():
                        df_stop_name_parts = row['STOP_NAME'].split('/')
                        for api_part in api_stop_name_parts:
                            for df_part in df_stop_name_parts:
                                if SequenceMatcher(None, df_part.strip(), api_part.strip()).ratio() > 0.8:
                                    matched_row = pd.DataFrame([row])
                                    break
                            if not matched_row.empty:
                                break
                        if not matched_row.empty:
                            break

                    if not matched_row.empty:
                        route_dir_qsi_no = matched_row.iloc[0]['Route_Dir_QSI_No']
                        results_list.append({
                            'Route_Dir_QSI_No': route_dir_qsi_no,
                            'STOP_Name': stop['name'],
                            'ID': stop_id
                        })
        else:
            print(f"Failed to fetch route sequence data from TfL API for {route_type} route. Status code: {response.status_code}")

        return results_list

    # Fetch and process outbound route data for _A**
    matched_results_A = fetch_and_process_route_data('outbound', pattern_A, filtered_df_A)

    # Fetch and process inbound route data for _B**
    matched_results_B = fetch_and_process_route_data('inbound', pattern_B, filtered_df_B)

    # Create DataFrames from the matched results for each direction
    matched_results_df_A = pd.DataFrame(matched_results_A)
    matched_results_df_B = pd.DataFrame(matched_results_B)

    # Function to remove partial matches if exact matches are found
    def remove_partial_matches(exact_df, matched_df):
        for index, row in exact_df.iterrows():
            exact_stop_name = row['STOP_NAME']
            route_dir_qsi_no = row['Route_Dir_QSI_No']
            # Find exact matches in matched_df
            exact_matches = matched_df[(matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                       (matched_df['STOP_Name'].apply(normalize_stop_name) == exact_stop_name)]
            if not exact_matches.empty:
                # Remove partial matches
                matched_df = matched_df[~((matched_df['Route_Dir_QSI_No'] == route_dir_qsi_no) &
                                          (matched_df['STOP_Name'].apply(normalize_stop_name) != exact_stop_name))]
        return matched_df

    # Remove partial matches for direction A
    matched_results_df_A = remove_partial_matches(filtered_df_A, matched_results_df_A)

    # Remove partial matches for direction B
    matched_results_df_B = remove_partial_matches(filtered_df_B, matched_results_df_B)

    # Remove duplicate stop names with the same Route_Dir_QSI_No and different IDs
    matched_results_df_A = matched_results_df_A.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')
    matched_results_df_B = matched_results_df_B.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')

    # Print the matched results for direction A
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_A\033[0m\n")
    matched_results_df_A = matched_results_df_A[matched_results_df_A['Route_Dir_QSI_No'].str.match(pattern_A)]
    print(matched_results_df_A[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Print the matched results for direction B
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_B\033[0m\n")
    matched_results_df_B = matched_results_df_B[matched_results_df_B['Route_Dir_QSI_No'].str.match(pattern_B)]
    print(matched_results_df_B[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])

    # Concatenate the matched results DataFrames for directions A and B
    combined_df = pd.concat([matched_results_df_A, matched_results_df_B], ignore_index=True)

    # Display the combined DataFrame
    print("\n\n\033[1m\033[4mCombined QSI stop points for directions A and B\033[0m\n")
    print(combined_df[['Route_Dir_QSI_No', 'STOP_Name', 'ID']])


Please enter the lineID: d7


[1m[4mQSI stop points for direction D7_A[0m

  Route_Dir_QSI_No                    STOP_Name          ID
0            D7_A1   Poplar / All Saints Church  490011107G
1            D7_A2               Stewart Street  490013513S
2            D7_A3       Island Gardens Station  490002048Z
3            D7_A4  Arnhem Wharf Primary School  490006092N
5            D7_A5         East India Dock Road  490004584N


[1m[4mQSI stop points for direction D7_B[0m

  Route_Dir_QSI_No                    STOP_Name          ID
0            D7_B1  Mile End Station / Bow Road  490015151H
2            D7_B2         East India Dock Road  490004584S
3            D7_B3         Canary Wharf Station  490000038F
5            D7_B4  Arnhem Wharf Primary School  490006092S
6            D7_B5       Island Gardens Station  490002048X
7            D7_B6               Stewart Street  490013513N


[1m[4mCombined QSI stop points for directions A and B[0m

   Route_Dir_QSI_No          

In [2]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import pytz
import time

# Function to fetch arrival predictions
def fetch_arrival_predictions(line_id, stop_point_id, direction):
    try:
        base_url = f"https://api.tfl.gov.uk/Line/{line_id}/Arrivals/{stop_point_id}"
        params = {'direction': direction}
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        data = response.json()
        if len(data) == 0:
            return pd.DataFrame(), None  # No data available
        station_name = data[0]['stationName']
        predictions = []
        for item in data:
            arrival_time = datetime.strptime(item['expectedArrival'], '%Y-%m-%dT%H:%M:%SZ')
            arrival_time_bst = arrival_time + timedelta(hours=1)
            predictions.append({
                'Line': item['lineName'],
                'Vehicle ID': item['vehicleId'],
                'Stop Point': stop_point_id,
                'Direction': direction,
                'Expected Arrival (BST)': arrival_time_bst,
                'Expected Arrival (HM)': arrival_time_bst.strftime('%H:%M')
            })
        df = pd.DataFrame(predictions)
        df = df.sort_values(by='Expected Arrival (BST)', ascending=True)
        df['Expected Arrival (BST)'] = pd.to_datetime(df['Expected Arrival (BST)'])  # Convert to datetime
        df['Expected Arrival (HM)'] = pd.to_datetime(df['Expected Arrival (HM)'], format='%H:%M')
        df['Gap'] = df['Expected Arrival (HM)'].diff().fillna(pd.Timedelta(seconds=0)).dt.total_seconds() / 60
        df['2_Gap'] = (df['Gap'] * 2).round(2)
        df['Gap_Sq'] = (df['Gap'] * df['Gap']).round(2)
        return df, station_name
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None, None

def main():
    # Assuming combined_df is generated from the previous cell
    # combined_df should have columns like 'Route_Dir_QSI_No', 'STOP_Name', 'ID'

    # Example dictionary to hold cumulative dataframes for each stop point
    cumulative_dataframes = {}

    # Dictionary to hold the number of buses observed per stop point
    buses_observed = {}

    # Loop through unique stop points in combined_df
    for index, row in combined_df.iterrows():
        stop_point_id = row['ID']
        direction = 'outbound' if row['Route_Dir_QSI_No'].startswith(lineID + '_A') else 'inbound'

        cumulative_df = pd.DataFrame(columns=[
            'Line', 'Vehicle ID', 'Stop Point', 'Direction',
            'Expected Arrival (BST)', 'Expected Arrival (HM)',
            'Gap', '2_Gap', 'Gap_Sq'
        ])

        cumulative_dataframes[stop_point_id] = cumulative_df  # Initialize cumulative dataframe
        buses_observed[stop_point_id] = (0, 0, 0, 0)  # Initialize with zero values

    while True:
        for stop_point_id, cumulative_df in cumulative_dataframes.items():
            direction = 'outbound' if combined_df[combined_df['ID'] == stop_point_id]['Route_Dir_QSI_No'].iloc[0].startswith(lineID + '_A') else 'inbound'

            arrival_predictions_df, station_name = fetch_arrival_predictions(lineID, stop_point_id, direction)

            if arrival_predictions_df is not None and not arrival_predictions_df.empty:
                current_hour = datetime.now(pytz.timezone('Europe/London')).hour

                for _, row in arrival_predictions_df.iterrows():
                    vehicle_id = row['Vehicle ID']
                    expected_hour = row['Expected Arrival (BST)'].hour

                    mask = cumulative_df['Vehicle ID'] == vehicle_id

                    if cumulative_df[mask].empty:
                        cumulative_df = pd.concat([cumulative_df, row.to_frame().T], ignore_index=True)
                    else:
                        existing_hour = cumulative_df.loc[mask, 'Expected Arrival (BST)'].iloc[0].hour

                        if expected_hour > existing_hour + 1:
                            cumulative_df = pd.concat([cumulative_df, row.to_frame().T], ignore_index=True)
                        else:
                            cumulative_df.loc[mask, ['Line', 'Stop Point', 'Direction', 'Expected Arrival (BST)', 'Expected Arrival (HM)', 'Gap', '2_Gap', 'Gap_Sq']] = row[['Line', 'Stop Point', 'Direction', 'Expected Arrival (BST)', 'Expected Arrival (HM)', 'Gap', '2_Gap', 'Gap_Sq']].values

                cumulative_df = cumulative_df.sort_values(by='Expected Arrival (BST)', ascending=True)
                cumulative_df['Expected Arrival (BST)'] = pd.to_datetime(cumulative_df['Expected Arrival (BST)'])
                cumulative_df['Expected Arrival (HM)'] = pd.to_datetime(cumulative_df['Expected Arrival (HM)'], format='%H:%M')

                cumulative_df['Gap'] = cumulative_df['Expected Arrival (HM)'].diff().fillna(pd.Timedelta(seconds=0)).dt.total_seconds() / 60
                cumulative_df['2_Gap'] = (cumulative_df['Gap'] * 2).round(2)
                cumulative_df['Gap_Sq'] = (cumulative_df['Gap'] * cumulative_df['Gap']).round(2)

                # Update number of buses observed in the current hour
                num_buses_observed = cumulative_df[cumulative_df['Expected Arrival (BST)'].dt.hour == current_hour].shape[0]
                total_Gap_Sq = cumulative_df[cumulative_df['Expected Arrival (BST)'].dt.hour == current_hour]['Gap_Sq'].sum()
                total_2_Gap = cumulative_df[cumulative_df['Expected Arrival (BST)'].dt.hour == current_hour]['2_Gap'].sum()
                AWT = round(total_Gap_Sq / total_2_Gap, 2) if total_2_Gap > 0 else 0
                buses_observed[stop_point_id] = (num_buses_observed, total_Gap_Sq, total_2_Gap, AWT)

                print(f"\nArrival Predictions for stop point {stop_point_id} ({station_name}):")
                print(arrival_predictions_df.to_string(index=False))
                print("\nCumulative DataFrame:")
                print(cumulative_df.to_string(index=False))
                print(f"\nNumber of buses observed in the current hour: {num_buses_observed}")
            else:
                print("No arrival predictions available.")

            print("Refreshing data in 30 seconds...\n")
            time.sleep(30)

            # Update cumulative dataframe in dictionary
            cumulative_dataframes[stop_point_id] = cumulative_df

        # Create DataFrame to show number of buses observed for each stop point
        buses_observed_df = pd.DataFrame(list(buses_observed.items()), columns=['Stop Point', 'Metrics'])

        # Split 'Metrics' into separate columns
        buses_observed_df[['Num of Buses Observed', 'Total Gap Sq', 'Total 2 Gap', 'AWT']] = pd.DataFrame(
            buses_observed_df['Metrics'].tolist(), index=buses_observed_df.index
        )

        # Calculate WAWT as the product of AWT and Num of Buses Observed
        buses_observed_df['WAWT'] = buses_observed_df['AWT'] * buses_observed_df['Num of Buses Observed']

        # Drop the 'Metrics' column
        buses_observed_df.drop(columns=['Metrics'], inplace=True)

        print("\nNumber of Buses Observed DataFrame:")
        print(buses_observed_df)

        # Calculate Route AWT A and Route AWT B
        pattern_A = f"^{lineID}_A\\d+$"
        pattern_B = f"^{lineID}_B\\d+$"

        buses_observed_df_A = buses_observed_df[buses_observed_df['Stop Point'].isin(combined_df[combined_df['Route_Dir_QSI_No'].str.match(pattern_A)]['ID'])]
        buses_observed_df_B = buses_observed_df[buses_observed_df['Stop Point'].isin(combined_df[combined_df['Route_Dir_QSI_No'].str.match(pattern_B)]['ID'])]

        sum_WAWT_A = buses_observed_df_A['WAWT'].sum()
        sum_buses_observed_A = buses_observed_df_A['Num of Buses Observed'].sum()
        route_AWT_A = round(sum_WAWT_A / sum_buses_observed_A, 2) if sum_buses_observed_A > 0 else 0

        sum_WAWT_B = buses_observed_df_B['WAWT'].sum()
        sum_buses_observed_B = buses_observed_df_B['Num of Buses Observed'].sum()
        route_AWT_B = round(sum_WAWT_B / sum_buses_observed_B, 2) if sum_buses_observed_B > 0 else 0

        current_hour = datetime.now(pytz.timezone('Europe/London')).hour
        route_awt_df = pd.DataFrame({
            'Hour': [current_hour],
            'Route AWT A': [route_AWT_A],
            'Route AWT B': [route_AWT_B]
        })

        print("\nRoute AWT DataFrame:")
        print(route_awt_df)

if __name__ == "__main__":
    main()



Arrival Predictions for stop point 490011107G (Poplar / All Saints Church):
Line Vehicle ID Stop Point Direction Expected Arrival (BST) Expected Arrival (HM)  Gap  2_Gap  Gap_Sq
  D7    LX11BJU 490011107G  outbound    2024-07-19 08:11:22   1900-01-01 08:11:00  0.0    0.0     0.0
  D7    LX11BJV 490011107G  outbound    2024-07-19 08:22:22   1900-01-01 08:22:00 11.0   22.0   121.0

Cumulative DataFrame:
Line Vehicle ID Stop Point Direction Expected Arrival (BST) Expected Arrival (HM)  Gap  2_Gap  Gap_Sq
  D7    LX11BJU 490011107G  outbound    2024-07-19 08:11:22   1900-01-01 08:11:00  0.0    0.0     0.0
  D7    LX11BJV 490011107G  outbound    2024-07-19 08:22:22   1900-01-01 08:22:00 11.0   22.0   121.0

Number of buses observed in the current hour: 2
Refreshing data in 30 seconds...


Arrival Predictions for stop point 490013513S (Stewart Street):
Line Vehicle ID Stop Point Direction Expected Arrival (BST) Expected Arrival (HM)  Gap  2_Gap  Gap_Sq
  D7    LX11BFF 490013513S  outbound  

KeyboardInterrupt: 