<a href="https://colab.research.google.com/github/RemyaVKarthikeyan/AA-Stagecoach-Project/blob/main/16_Aug_2024_Final1_Prediction_steps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import requests
import os
from datetime import datetime, timedelta
import pytz
import time
from difflib import SequenceMatcher
import numpy as np
from sklearn.linear_model import LinearRegression
from google.colab import drive
drive.mount('/content/drive')

# Function to normalize stop names
def normalize_stop_name(name):
    """Return *name* lower-cased with each run of whitespace collapsed to one space.

    Leading and trailing whitespace disappears as a consequence of the
    split/join round trip.
    """
    words = name.lower().split()
    return ' '.join(words)
from google.colab import drive
drive.mount('/content/drive')

# Load the QSI points workbook from the mounted Google Drive.
# `df` is read with read_excel's default sheet selection; `df_sheet2` comes
# from the sheet named 'Sheet2' (not used in the part of the notebook visible here).
excel_file_path = '/content/drive/MyDrive/Files/QSI points.xlsx'
df = pd.read_excel(excel_file_path)
df_sheet2 = pd.read_excel(excel_file_path, sheet_name='Sheet2')
# Directory that later cells write their CSV output to.
# NOTE(review): this path spells the Drive folder 'My Drive' while the workbook
# path above uses 'MyDrive' - confirm both resolve to the same location.
output_dir = '/content/drive/My Drive/Files/'

# Ask the user which bus line to analyse (e.g. "d7")
lineID = input("Please enter the lineID: ")

# Everything else in this cell runs only when the workbook has the QSI column
if 'Route_Dir_QSI_No' not in df.columns:
    print("The 'Route_Dir_QSI_No' column is not present in the provided file.")
else:
    # Upper-case the user's lineID so the comparison below is case-insensitive
    lineID = lineID.upper()

    # Upper-case the 'Route_Dir_QSI_No' column for the same reason
    df['Route_Dir_QSI_No'] = df['Route_Dir_QSI_No'].str.upper()

    # Normalize the stop names (lower case, single spaces) so they can be
    # compared with the normalized stop names returned by the TfL API
    df['STOP_NAME'] = df['STOP_NAME'].apply(normalize_stop_name)

    # Regular expressions selecting the QSI points of each direction.
    # NOTE(review): lineID is interpolated into the pattern unescaped, so a
    # lineID containing regex metacharacters would change the pattern.
    pattern_A = f"^{lineID}_A\\d+$"  # <lineID>_A followed by one or more digits
    pattern_B = f"^{lineID}_B\\d+$"  # <lineID>_B followed by one or more digits

    # Keep only the QSI number and stop name of the rows matching each pattern;
    # rows whose 'Route_Dir_QSI_No' is missing are treated as non-matching (na=False)
    filtered_df_A = df[df['Route_Dir_QSI_No'].str.match(pattern_A, na=False)][['Route_Dir_QSI_No', 'STOP_NAME']]
    filtered_df_B = df[df['Route_Dir_QSI_No'].str.match(pattern_B, na=False)][['Route_Dir_QSI_No', 'STOP_NAME']]

    # Function to fetch and process route data from TfL API
    def fetch_and_process_route_data(route_type, pattern, filtered_df):
        """Match the QSI points of one direction against the TfL stop sequence.

        Fetches ``/Line/{lineID}/Route/Sequence/{route_type}`` (``lineID`` is
        read from the enclosing scope) and walks the stops of the first
        stop-point sequence in the response. Each stop is matched against
        ``filtered_df`` first by exact normalized name and, failing that, by
        fuzzy comparison of the '/'-separated name parts (SequenceMatcher
        ratio above 0.8, first matching row wins).

        Args:
            route_type: Direction segment of the URL ('outbound' or 'inbound'
                at the call sites in this notebook).
            pattern: Regular expression the row's 'Route_Dir_QSI_No' must
                match for an exact-name match to count.
            filtered_df: DataFrame with 'Route_Dir_QSI_No' and normalized
                'STOP_NAME' columns.

        Returns:
            A list of dicts with keys 'Route_Dir_QSI_No', 'STOP_Name' (the
            name as returned by the API) and 'ID' (the API stop id). The list
            is empty when the request fails or the response carries no stop
            sequence.
        """

        def find_fuzzy_match(stop_name_api):
            # Return the first row sharing a sufficiently similar name part, else None.
            api_parts = [part.strip() for part in stop_name_api.split('/')]
            for _, candidate in filtered_df.iterrows():
                sheet_parts = [part.strip() for part in candidate['STOP_NAME'].split('/')]
                if any(
                    SequenceMatcher(None, sheet_part, api_part).ratio() > 0.8
                    for api_part in api_parts
                    for sheet_part in sheet_parts
                ):
                    return candidate
            return None

        api_url = f"https://api.tfl.gov.uk/Line/{lineID}/Route/Sequence/{route_type}"
        response = requests.get(api_url)

        if response.status_code != 200:
            print(f"Failed to fetch route sequence data from TfL API for {route_type} route. Status code: {response.status_code}")
            return []

        # A 200 response without any sequence used to raise IndexError/KeyError.
        sequences = response.json().get('stopPointSequences') or []
        if not sequences:
            print(f"No stop point sequences returned by TfL API for {route_type} route.")
            return []

        # The direction mask does not depend on the stop, so compute it once.
        direction_mask = filtered_df['Route_Dir_QSI_No'].str.match(pattern)

        results_list = []
        for stop in sequences[0].get('stopPoint', []):
            stop_name_api = normalize_stop_name(stop['name'])

            exact_rows = filtered_df[(filtered_df['STOP_NAME'] == stop_name_api) & direction_mask]
            if not exact_rows.empty:
                matched_row = exact_rows.iloc[0]
            else:
                matched_row = find_fuzzy_match(stop_name_api)

            if matched_row is not None:
                results_list.append({
                    'Route_Dir_QSI_No': matched_row['Route_Dir_QSI_No'],
                    'STOP_Name': stop['name'],
                    'ID': stop['id']
                })

        return results_list

    # Direction A: match the <lineID>_A* QSI points against the outbound stop sequence
    matched_results_A = fetch_and_process_route_data('outbound', pattern_A, filtered_df_A)

    # Direction B: match the <lineID>_B* QSI points against the inbound stop sequence
    matched_results_B = fetch_and_process_route_data('inbound', pattern_B, filtered_df_B)

    # Turn each list of match dicts into a DataFrame.
    # An empty list yields a DataFrame with no columns at all.
    matched_results_df_A = pd.DataFrame(matched_results_A)
    matched_results_df_B = pd.DataFrame(matched_results_B)

    # Function to remove partial matches if exact matches are found
    def remove_partial_matches(exact_df, matched_df):
        """Drop fuzzy matches for QSI points that also have an exact-name match.

        For every row of ``exact_df``: if ``matched_df`` still contains a row
        with the same 'Route_Dir_QSI_No' whose normalized 'STOP_Name' equals
        the row's 'STOP_NAME', all other rows of ``matched_df`` with that
        'Route_Dir_QSI_No' are removed.

        Args:
            exact_df: DataFrame with 'Route_Dir_QSI_No' and already normalized
                'STOP_NAME' columns.
            matched_df: DataFrame with 'Route_Dir_QSI_No' and 'STOP_Name'
                columns. May be empty, including the column-less frame that
                ``pd.DataFrame([])`` produces.

        Returns:
            ``matched_df`` without the superseded rows. An empty input is
            returned unchanged instead of raising KeyError.
        """
        if matched_df.empty:
            return matched_df

        # Normalize once (lower case, single spaces) instead of on every
        # iteration; same result as normalize_stop_name for string values.
        normalized_names = matched_df['STOP_Name'].str.lower().str.split().str.join(' ')
        qsi_numbers = matched_df['Route_Dir_QSI_No']

        keep = pd.Series(True, index=matched_df.index)
        for _, row in exact_df.iterrows():
            same_point = qsi_numbers == row['Route_Dir_QSI_No']
            same_name = normalized_names == row['STOP_NAME']
            # Only rows that survived earlier removals count as exact matches.
            if (keep & same_point & same_name).any():
                keep &= ~(same_point & ~same_name)

        return matched_df[keep.to_numpy()]

    # Direction A: drop fuzzy matches for QSI points that also matched exactly
    matched_results_df_A = remove_partial_matches(filtered_df_A, matched_results_df_A)

    # Direction B: drop fuzzy matches for QSI points that also matched exactly
    matched_results_df_B = remove_partial_matches(filtered_df_B, matched_results_df_B)

    # Where the same (Route_Dir_QSI_No, STOP_Name) pair appears more than once
    # (e.g. with different stop IDs), keep only the first occurrence
    matched_results_df_A = matched_results_df_A.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')
    matched_results_df_B = matched_results_df_B.drop_duplicates(subset=['Route_Dir_QSI_No', 'STOP_Name'], keep='first')

    # Read the bus stop reference data from 'Sheet3' of the QSI workbook.
    # The commented-out path below points at a separate bus-stops.csv that is
    # not read by this code.
    #bus_stops_file_path = '/content/drive/MyDrive/Report/bus-stops.csv'
    bus_stops_df = pd.read_excel(excel_file_path, sheet_name='Sheet3')

    # Normalize the stop names in the bus stops DataFrame (lower case, single spaces)
    bus_stops_df['Stop_Name'] = bus_stops_df['Stop_Name'].apply(normalize_stop_name)

    # Match QSI point IDs with bus stop codes
    def match_qsi_with_bus_stops(matched_df, bus_stops_df):
        """Attach the bus stop code to every matched QSI stop point.

        Each row of ``matched_df`` is looked up in ``bus_stops_df`` by
        comparing its 'ID' with the 'Naptan_Atco' column. The
        'Bus_Stop_Code' of the first hit is used; rows without any hit are
        left out of the result.

        Returns:
            A new DataFrame with columns 'Route_Dir_QSI_No', 'STOP_Name',
            'ID' and 'Bus_Stop_Code' (a column-less empty DataFrame when
            nothing matched).
        """
        enriched_rows = []
        for _, qsi_row in matched_df.iterrows():
            hits = bus_stops_df[bus_stops_df['Naptan_Atco'] == qsi_row['ID']]
            if hits.empty:
                # No reference entry for this stop id: skip it.
                continue
            first_hit = hits.iloc[0]
            enriched_rows.append({
                'Route_Dir_QSI_No': qsi_row['Route_Dir_QSI_No'],
                'STOP_Name': qsi_row['STOP_Name'],
                'ID': qsi_row['ID'],
                'Bus_Stop_Code': first_hit['Bus_Stop_Code']
            })
        return pd.DataFrame(enriched_rows)

    # Direction A: add the Bus_Stop_Code and drop stops missing from the bus stop sheet
    matched_results_df_A = match_qsi_with_bus_stops(matched_results_df_A, bus_stops_df)

    # Direction B: add the Bus_Stop_Code and drop stops missing from the bus stop sheet
    matched_results_df_B = match_qsi_with_bus_stops(matched_results_df_B, bus_stops_df)

    # Print the matched results for direction A.
    # \033[1m and \033[4m are ANSI escapes for bold and underline; \033[0m resets.
    # NOTE(review): the column selection raises KeyError when no stop matched,
    # because match_qsi_with_bus_stops then returns a column-less DataFrame.
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_A\033[0m\n")
    print(matched_results_df_A[['Route_Dir_QSI_No', 'STOP_Name', 'ID', 'Bus_Stop_Code']])

    # Print the matched results for direction B
    print(f"\n\n\033[1m\033[4mQSI stop points for direction {lineID}_B\033[0m\n")
    print(matched_results_df_B[['Route_Dir_QSI_No', 'STOP_Name', 'ID', 'Bus_Stop_Code']])

    # Stack direction A on top of direction B with a fresh 0..n-1 index.
    # combined_df is the stop list that the later cells iterate over.
    combined_df = pd.concat([matched_results_df_A, matched_results_df_B], ignore_index=True)

    # Display the combined DataFrame
    print("\n\n\033[1m\033[4mCombined QSI stop points for directions A and B\033[0m\n")
    print(combined_df[['Route_Dir_QSI_No', 'STOP_Name', 'ID', 'Bus_Stop_Code']])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Please enter the lineID: d7


[1m[4mQSI stop points for direction D7_A[0m

  Route_Dir_QSI_No                    STOP_Name          ID  Bus_Stop_Code
0            D7_A1   Poplar / All Saints Church  490011107G          73923
1            D7_A2               Stewart Street  490013513S          47950
2            D7_A3       Island Gardens Station  490002048Z          76947
3            D7_A4  Arnhem Wharf Primary School  490006092N          50948
4            D7_A5         East India Dock Road  490004584N          47475


[1m[4mQSI stop points for direction D7_B[0m

  Route_Dir_QSI_No                    STOP_Name          ID  Bus_Stop_Code
0            D7_B1  Mile End Station / Bow Road  490015151H          48439
1            D7_B2         East India Dock Road  490004584S          56224
2            D7_B3         Canary Wharf Station  490000038F         

In [None]:
# Function to normalize stop names
def normalize_stop_name(name):
    """Lower-case *name* and squeeze every whitespace run into a single space.

    Surrounding whitespace is removed because the name is split into
    tokens and re-joined.
    """
    tokens = name.lower().split()
    return ' '.join(tokens)

# Function to fetch data from the TfL API
def fetch_data(url):
    """GET *url* and return the response body parsed as JSON.

    The HTTP status is not checked: whatever body the server sends is
    handed to ``response.json()``.
    """
    return requests.get(url).json()

# Function to extract schedule names
def extract_schedule_names(data, schedule_names_dict=None):
    """Collect every named schedule found anywhere inside a TfL timetable payload.

    Walks nested dicts and lists recursively. A dict counts as a schedule
    when its '$type' is the TfL ``Schedule`` entity and it has both a
    'knownJourneys' and a 'name' key.

    Args:
        data: Decoded JSON (dict, list or scalar) to search.
        schedule_names_dict: Optional dict to add the results to. When
            omitted, a fresh dict is created for this call. The previous
            ``{}`` default was shared between calls, so schedules from one
            stop leaked into the result for the next.

    Returns:
        The dict mapping schedule name -> its 'knownJourneys' list (the same
        object as ``schedule_names_dict`` when one was passed in).
    """
    if schedule_names_dict is None:
        schedule_names_dict = {}

    if isinstance(data, dict):
        is_schedule = (
            data.get('$type') == "Tfl.Api.Presentation.Entities.Schedule, Tfl.Api.Presentation.Entities"
            and 'knownJourneys' in data
            and 'name' in data
        )
        if is_schedule:
            schedule_names_dict[data['name']] = data['knownJourneys']
        for value in data.values():
            extract_schedule_names(value, schedule_names_dict)
    elif isinstance(data, list):
        for item in data:
            extract_schedule_names(item, schedule_names_dict)
    return schedule_names_dict

# Function to categorize journeys into hourly slots
def categorize_into_slots(timetable):
    """Bucket journeys by their hour of day.

    Returns a list of 24 lists; entry ``h`` holds, in input order, every
    journey whose ``int(journey['hour'])`` equals ``h``. Journeys with an
    hour outside 0-23 are left out.
    """
    hourly = [[] for _ in range(24)]
    for entry in timetable:
        slot = int(entry['hour'])  # the hour may arrive as a string
        if slot < 0 or slot >= 24:
            continue
        hourly[slot].append(entry)
    return hourly

# Function to fetch the current day of the week
def get_day_of_week():
    """Return the full weekday name (e.g. 'Monday') of the current Europe/London date."""
    london_now = datetime.now(pytz.timezone('Europe/London'))
    # %A formats the full weekday name
    return f"{london_now:%A}"

# Function to select the preferred schedule name based on the current day of the week
def select_preferred_schedule(schedule_names_dict, day_of_week):
    """Pick the schedule name to use for *day_of_week*.

    Each weekday has an ordered list of candidate schedule names; the first
    candidate that is a key of ``schedule_names_dict`` is returned. The
    weekday is compared case-insensitively. A value that is not one of the
    seven weekday names is itself used as the only candidate.

    Returns:
        The chosen schedule name, or None when no candidate is available.
    """
    mon_to_thu = ('Mon-Fri Schooldays', 'Monday to Thursday', 'Monday to Friday')
    candidates_by_day = {
        'monday': mon_to_thu,
        'tuesday': mon_to_thu,
        'wednesday': mon_to_thu,
        'thursday': mon_to_thu,
        'friday': ('Mon-Fri Schooldays', 'Monday to Friday', 'Friday'),
        'saturday': ('Saturday',),
        'sunday': ('Sunday',),
    }
    candidates = candidates_by_day.get(day_of_week.lower(), (day_of_week,))
    return next((name for name in candidates if name in schedule_names_dict), None)


# Function to display the timetable for each stop point
def display_timetable_for_stop_points(combined_df, lineID):
    """Build today's scheduled timetable for every stop point in *combined_df*.

    For each row the direction is derived from 'Route_Dir_QSI_No'
    ('<lineID>_A' -> outbound, '<lineID>_B' -> inbound, anything else is
    skipped), the TfL timetable for that stop is fetched, and the schedule
    preferred for the current Europe/London weekday is selected. Stops for
    which no preferred schedule is available get no entry.

    Args:
        combined_df: DataFrame with 'ID' and 'Route_Dir_QSI_No' columns.
        lineID: Line identifier used in the API URL and the direction test.

    Returns:
        dict mapping stop point id -> DataFrame with columns 'Line',
        'Stop Point ID', 'Direction' and 'Scheduled Time' ('HH:MM'), ordered
        by hour and, within an hour, in the order the API listed the journeys.
    """
    timetable_dict = {}
    day_of_week = get_day_of_week()

    for _, row in combined_df.iterrows():
        stop_point_id = row['ID']
        route_dir_qsi_no = row['Route_Dir_QSI_No']

        # Determine the direction; skip rows that belong to neither direction
        if f"{lineID}_A" in route_dir_qsi_no:
            direction = 'outbound'
        elif f"{lineID}_B" in route_dir_qsi_no:
            direction = 'inbound'
        else:
            continue

        url = f'https://api.tfl.gov.uk/Line/{lineID}/Timetable/{stop_point_id}?direction={direction}'
        data = fetch_data(url)

        # Pass a fresh dict so schedules collected for an earlier stop can
        # never be mistaken for this stop's schedules.
        schedule_names_dict = extract_schedule_names(data, {})

        selected_schedule_name = select_preferred_schedule(schedule_names_dict, day_of_week)
        if not selected_schedule_name:
            continue

        # Bucketing by hour orders the journeys and drops hours outside 0-23
        slots = categorize_into_slots(schedule_names_dict[selected_schedule_name])

        timetable_list = [
            {
                'Line': lineID,
                'Stop Point ID': stop_point_id,
                'Direction': direction,
                'Scheduled Time': f"{str(journey['hour']).zfill(2)}:{str(journey['minute']).zfill(2)}"
            }
            for hour_slot in slots
            for journey in hour_slot
        ]

        timetable_dict[stop_point_id] = pd.DataFrame(timetable_list)

    return timetable_dict


# Build the timetable DataFrame of every stop point (keyed by stop point id)
timetable_dict = display_timetable_for_stop_points(combined_df, lineID)


# Save each timetable to its own CSV file and print it.
# NOTE: after this loop `stop_point_id` and `timetable_df` stay bound at module
# level to the last stop processed; main() further down reads `timetable_df`.
for stop_point_id, timetable_df in timetable_dict.items():
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Write the timetable to <output_dir>timetable_<stop id>.csv without the index column
    timetable_df.to_csv(f'{output_dir}timetable_{stop_point_id}.csv', index=False)
    # Print a bold, underlined heading followed by the full timetable
    print(f"\n\n\033[1m\033[4mTimetable for {stop_point_id}:\033[0m\n")
    print(timetable_df.to_string(index=False))




[1m[4mTimetable for 490011107G:[0m

Line Stop Point ID Direction Scheduled Time
  D7    490011107G  outbound          04:32
  D7    490011107G  outbound          04:52
  D7    490011107G  outbound          05:12
  D7    490011107G  outbound          05:32
  D7    490011107G  outbound          05:52
  D7    490011107G  outbound          06:07
  D7    490011107G  outbound          06:22
  D7    490011107G  outbound          06:32
  D7    490011107G  outbound          06:43
  D7    490011107G  outbound          06:54
  D7    490011107G  outbound          07:05
  D7    490011107G  outbound          07:16
  D7    490011107G  outbound          07:27
  D7    490011107G  outbound          07:38
  D7    490011107G  outbound          07:49
  D7    490011107G  outbound          08:00
  D7    490011107G  outbound          08:11
  D7    490011107G  outbound          08:22
  D7    490011107G  outbound          08:33
  D7    490011107G  outbound          08:45
  D7    490011107G  outbound       

In [None]:
# Function to fetch arrival predictions
def fetch_arrival_predictions(line_id, stop_point_id, direction):
    """Fetch live arrival predictions for one stop and compute the gaps between buses.

    Args:
        line_id: Line identifier used in the TfL Arrivals URL.
        stop_point_id: Stop point id used in the URL and copied into the
            'Stop Point' column.
        direction: Passed to the API as the ``direction`` query parameter and
            copied into the 'Direction' column.

    Returns:
        ``(df, station_name)`` where ``df`` is sorted by expected arrival and
        has the columns 'Line', 'Vehicle ID', 'Stop Point', 'Direction',
        'Expected Arrival (BST)' (naive London wall-clock datetime),
        'Expected Arrival (HM)' (the HH:MM part parsed as a datetime), 'Gap'
        (minutes since the previous arrival, 0 for the first row), '2_Gap'
        and 'Gap_Sq'.
        ``(empty DataFrame, None)`` when the API returns no predictions, and
        ``(None, None)`` when the request fails (the error is printed).
    """
    london = pytz.timezone('Europe/London')
    try:
        base_url = f"https://api.tfl.gov.uk/Line/{line_id}/Arrivals/{stop_point_id}"
        response = requests.get(base_url, params={'direction': direction})
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None, None

    if len(data) == 0:
        return pd.DataFrame(), None  # No data available

    station_name = data[0]['stationName']
    predictions = []
    for item in data:
        arrival_utc = datetime.strptime(item['expectedArrival'], '%Y-%m-%dT%H:%M:%SZ')
        # Convert UTC to London wall-clock time through the timezone database
        # rather than a fixed +1 hour, which is only right during summer time.
        # tzinfo is dropped afterwards so the column stays naive as before.
        arrival_local = pytz.utc.localize(arrival_utc).astimezone(london).replace(tzinfo=None)
        predictions.append({
            'Line': item['lineName'],
            'Vehicle ID': item['vehicleId'],
            'Stop Point': stop_point_id,
            'Direction': direction,
            'Expected Arrival (BST)': arrival_local,
            'Expected Arrival (HM)': arrival_local.strftime('%H:%M')
        })

    df = pd.DataFrame(predictions)
    df = df.sort_values(by='Expected Arrival (BST)', ascending=True)
    df['Expected Arrival (BST)'] = pd.to_datetime(df['Expected Arrival (BST)'])  # Convert to datetime
    df['Expected Arrival (HM)'] = pd.to_datetime(df['Expected Arrival (HM)'], format='%H:%M')
    # Whole-minute gaps, as before, but taken from the full timestamps so a
    # pair of arrivals on either side of midnight no longer yields a large
    # negative gap (the HH:MM values carry no date).
    minute_stamps = df['Expected Arrival (BST)'].dt.floor('min')
    df['Gap'] = minute_stamps.diff().fillna(pd.Timedelta(seconds=0)).dt.total_seconds() / 60
    df['2_Gap'] = (df['Gap'] * 2).round(2)
    df['Gap_Sq'] = (df['Gap'] * df['Gap']).round(2)
    return df, station_name

def fetch_current_hour_swt(route_swt_df):
    """Return the double headway of both directions for the current London hour.

    Looks up the row of ``route_swt_df`` whose 'Hour' equals the current
    Europe/London hour and derives the double headway from 'Route SWT A'
    and 'Route SWT B'. This notebook computes SWT as 60 / (2 * buses per
    hour), i.e. half the scheduled headway, so double headway is 4 * SWT.
    Direction A previously used 2 * SWT (a single headway) while direction B
    used 4 * SWT; both now use the same factor.

    Args:
        route_swt_df: DataFrame with 'Hour', 'Route SWT A' and 'Route SWT B'
            columns.

    Returns:
        ``(double_headway_a, double_headway_b)``, or None (after printing a
        message) when there is no row for the current hour.
    """
    current_hour = datetime.now(pytz.timezone('Europe/London')).hour

    current_swt = route_swt_df[route_swt_df['Hour'] == current_hour]
    if current_swt.empty:
        print(f"No SWT data available for the current hour ({current_hour}).")
        return None

    route_swt_a = current_swt['Route SWT A'].values[0]
    route_swt_b = current_swt['Route SWT B'].values[0]

    # headway = 2 * SWT, therefore double headway = 4 * SWT (both directions)
    double_headway_a = route_swt_a * 4
    double_headway_b = route_swt_b * 4

    print(f"Route SWT A for the current hour ({current_hour}): {route_swt_a}")
    print(f"Route SWT B for the current hour ({current_hour}): {route_swt_b}")
    print(f"Double Headway for Route A: {double_headway_a}")
    print(f"Double Headway for Route B: {double_headway_b}")
    return double_headway_a, double_headway_b

def check_gaps_against_double_headway(cumulative_dataframes, double_headway_a, double_headway_b, combined_df, lineID):
    """List the vehicles whose gap to the previous bus exceeds the double headway.

    Args:
        cumulative_dataframes: dict mapping stop point id -> DataFrame with
            'Vehicle ID', 'Stop Point' and 'Gap' columns.
        double_headway_a: Threshold for stops whose 'Route_Dir_QSI_No'
            starts with '<lineID>_A'.
        double_headway_b: Threshold for every other stop.
        combined_df: DataFrame with 'ID', 'Route_Dir_QSI_No' and
            'Bus_Stop_Code' columns; each stop point id must appear in 'ID'.
        lineID: Line identifier, copied into the 'Line' column.

    Returns:
        DataFrame with columns 'Line', 'Vehicle ID', 'Stop Point' and
        'Bus_Stop_Code', one row per gap above the threshold (a column-less
        empty DataFrame when there is none). The result is also printed.
    """
    stop_code_by_id = combined_df.set_index('ID')['Bus_Stop_Code'].to_dict()
    prefix_a = lineID + '_A'

    flagged = []
    for stop_point_id, stop_df in cumulative_dataframes.items():
        qsi_label = combined_df[combined_df['ID'] == stop_point_id]['Route_Dir_QSI_No'].iloc[0]
        threshold = double_headway_a if qsi_label.startswith(prefix_a) else double_headway_b

        # Rows whose gap is strictly greater than this direction's double headway
        late = stop_df[stop_df['Gap'] > threshold]
        flagged.extend(
            {'Line': lineID, 'Vehicle ID': vehicle_id, 'Stop Point': stop_point}
            for vehicle_id, stop_point in zip(late['Vehicle ID'], late['Stop Point'])
        )

    results_df = pd.DataFrame(flagged)

    if results_df.empty:
        print("\nNo Vehicle IDs and Stop Points where Gap exceeds Double Headway.")
    else:
        # Translate each stop point id into its bus stop code
        results_df['Bus_Stop_Code'] = results_df['Stop Point'].map(stop_code_by_id)
        print("\nVehicle IDs and Stop Points where Gap exceeds Double Headway:")
        print(results_df.to_string(index=False))

    return results_df

def main():

    # Function to calculate Scheduled Wait Time (SWT) for all hours
    def calculate_swt_for_all_hours(slots):
        """Return one ``(scheduled_wait_time, bus_count)`` pair per hour 0-23.

        ``slots[h]`` holds the journeys scheduled in hour ``h``. The
        scheduled wait time is ``60 / (2 * bus_count)`` minutes, or None for
        an hour without any scheduled bus.
        """
        def swt_entry(journeys):
            bus_count = len(journeys)
            wait_time = 60 / (bus_count * 2) if bus_count > 0 else None
            return (wait_time, bus_count)

        return [swt_entry(slots[hour]) for hour in range(24)]

    # Main logic to fetch and calculate SWT
    def main(combined_df, lineID):
        """Compute the route-level Scheduled Wait Time (SWT) per hour and direction.

        For every stop in ``combined_df`` the TfL timetable is fetched, the
        journeys of the selected schedule are bucketed by hour and a per-stop
        SWT is derived. The route SWT of an hour is the average of the
        per-stop SWTs weighted by each stop's number of scheduled buses.

        The schedule name is chosen once, from the first stop for which a
        preferred schedule exists, and announced once. A stop whose own
        timetable lacks that schedule is skipped with a message instead of
        raising KeyError or reusing another stop's journeys.

        Args:
            combined_df: DataFrame with 'ID' and 'Route_Dir_QSI_No' columns.
            lineID: Line identifier used in the API URL and to tell the
                '<lineID>_A' rows from the '<lineID>_B' rows.

        Returns:
            DataFrame with 24 rows and columns 'Line', 'Hour', 'Route SWT A'
            and 'Route SWT B'; an SWT is None for an hour in which no bus is
            scheduled in that direction.
        """
        swt_data = {
            'Route_Dir_QSI_No': [],
            'ID': [],
        }
        # One SWT column and one scheduled-bus-count column per hour
        for hour in range(24):
            swt_data[f'SWT_{hour}'] = []
            swt_data[f'Sch_{hour}'] = []

        day_of_week = get_day_of_week()
        selected_schedule_name = None

        for _, row in combined_df.iterrows():
            stop_point_id = row['ID']
            route_dir_qsi_no = row['Route_Dir_QSI_No']

            if f"{lineID}_A" in route_dir_qsi_no:
                direction = 'outbound'
            elif f"{lineID}_B" in route_dir_qsi_no:
                direction = 'inbound'
            else:
                continue

            url = f'https://api.tfl.gov.uk/Line/{lineID}/Timetable/{stop_point_id}?direction={direction}'
            data = fetch_data(url)

            # Fresh dict per stop: only this stop's schedules are considered
            schedule_names_dict = extract_schedule_names(data, {})

            if not selected_schedule_name:
                selected_schedule_name = select_preferred_schedule(schedule_names_dict, day_of_week)
                if selected_schedule_name:
                    # Announced exactly once, at the moment of selection
                    print(f"\n\033[1m\033[4mToday is {day_of_week}. The selected Schedule name is {selected_schedule_name}.\033[0m")

            if not selected_schedule_name:
                continue

            if selected_schedule_name not in schedule_names_dict:
                print(f"No '{selected_schedule_name}' schedule for stop point {stop_point_id}; skipping.")
                continue

            slots = categorize_into_slots(schedule_names_dict[selected_schedule_name])
            swt_per_hour = calculate_swt_for_all_hours(slots)

            swt_data['Route_Dir_QSI_No'].append(route_dir_qsi_no)
            swt_data['ID'].append(stop_point_id)
            for hour in range(24):
                swt, total_buses = swt_per_hour[hour]
                swt_data[f'SWT_{hour}'].append(swt)
                swt_data[f'Sch_{hour}'].append(total_buses)

        swt_df = pd.DataFrame(swt_data)

        # Split the stops by direction once instead of once per hour.
        # astype(str) keeps the .str accessor usable when no stop was kept;
        # regex=False makes this the same plain substring test as above.
        qsi_labels = swt_df['Route_Dir_QSI_No'].astype(str)
        rows_a = swt_df[qsi_labels.str.contains(f'{lineID}_A', regex=False)]
        rows_b = swt_df[qsi_labels.str.contains(f'{lineID}_B', regex=False)]

        def route_swt(direction_rows, hour):
            # Bus-count-weighted mean of the stops' SWT for this hour, or None.
            valid = direction_rows[pd.notna(direction_rows[f'SWT_{hour}'])]
            if valid.empty:
                return None
            total_buses = sum(valid[f'Sch_{hour}'])
            if not total_buses > 0:
                return None
            weighted_sum = sum(valid[f'SWT_{hour}'] * valid[f'Sch_{hour}'])
            return round(weighted_sum / total_buses, 2)

        route_swt_data = {
            'Line': [],
            'Hour': [],
            'Route SWT A': [],
            'Route SWT B': []
        }
        for hour in range(24):
            route_swt_data['Line'].append(lineID)
            route_swt_data['Hour'].append(hour)
            route_swt_data['Route SWT A'].append(route_swt(rows_a, hour))
            route_swt_data['Route SWT B'].append(route_swt(rows_b, hour))

        route_swt_df = pd.DataFrame(route_swt_data)
        return route_swt_df

    route_swt_df = main(combined_df, lineID)
    # Example dictionary to hold cumulative dataframes for each stop point

    cumulative_dataframes = {}
    cumulative_dataframes_new = {}
    # Dictionary to hold the number of buses observed per stop point
    buses_observed = {}
    buses_observed_new = {}
    # DataFrame to store Route AWT data
    route_awt_df = pd.DataFrame(columns=['Hour', 'Route AWT A', 'Route AWT B'])

    # Loop through unique stop points in combined_df
    for index, row in combined_df.iterrows():
        stop_point_id = row['ID']
        direction = 'outbound' if row['Route_Dir_QSI_No'].startswith(lineID + '_A') else 'inbound'

        cumulative_df = pd.DataFrame(columns=[
            'Line', 'Vehicle ID', 'Stop Point', 'Direction',
            'Expected Arrival (BST)', 'Expected Arrival (HM)',
            'Gap', '2_Gap', 'Gap_Sq'
        ])
        cumulative_df_new = cumulative_df
        cumulative_dataframes[stop_point_id] = cumulative_df  # Initialize cumulative dataframe
        cumulative_dataframes_new[stop_point_id] = cumulative_df_new
        buses_observed[stop_point_id] = (0, 0, 0, 0)  # Initialize with zero values
        buses_observed_new[stop_point_id] = (0, 0, 0, 0)  # Initialize with zero values

    while True:
        for stop_point_id, cumulative_df in cumulative_dataframes.items():
            direction = 'outbound' if combined_df[combined_df['ID'] == stop_point_id]['Route_Dir_QSI_No'].iloc[0].startswith(lineID + '_A') else 'inbound'

            arrival_predictions_df, station_name = fetch_arrival_predictions(lineID, stop_point_id, direction)

            if arrival_predictions_df is not None and not arrival_predictions_df.empty:
                current_hour = datetime.now(pytz.timezone('Europe/London')).hour

                for _, row in arrival_predictions_df.iterrows():
                    vehicle_id = row['Vehicle ID']

                    mask = cumulative_df['Vehicle ID'] == vehicle_id

                    if cumulative_df[mask].empty:
                        # If vehicle ID is not present in cumulative DataFrame, append the row
                        cumulative_df = pd.concat([cumulative_df, row.to_frame().T], ignore_index=True)
                    else:
                        # If vehicle ID is present, overwrite the row
                        cumulative_df.loc[mask, ['Line', 'Stop Point', 'Direction', 'Expected Arrival (BST)', 'Expected Arrival (HM)', 'Gap', '2_Gap', 'Gap_Sq']] = row[['Line', 'Stop Point', 'Direction', 'Expected Arrival (BST)', 'Expected Arrival (HM)', 'Gap', '2_Gap', 'Gap_Sq']].values

                # Convert to datetime
                cumulative_df['Expected Arrival (BST)'] = pd.to_datetime(cumulative_df['Expected Arrival (BST)'])
                cumulative_df['Expected Arrival (HM)'] = pd.to_datetime(cumulative_df['Expected Arrival (HM)'], format='%H:%M')

                # Sort the DataFrame
                cumulative_df = cumulative_df.sort_values(by='Expected Arrival (BST)', ascending=True).reset_index(drop=True)

                # Calculate gaps
                cumulative_df['Gap'] = (cumulative_df['Expected Arrival (BST)'].diff().dt.total_seconds() / 60).round(2)
                cumulative_df.loc[0, 'Gap'] = 0  # First row gap should be zero
                cumulative_df['2_Gap'] = (cumulative_df['Gap'] * 2).round(2)
                cumulative_df['Gap_Sq'] = (cumulative_df['Gap'] * cumulative_df['Gap']).round(2)

                # Update number of buses observed in the current hour
                num_buses_observed = len(cumulative_df[cumulative_df['Expected Arrival (BST)'].dt.hour == current_hour]['Vehicle ID'].unique())
                total_Gap_Sq = cumulative_df[cumulative_df['Expected Arrival (BST)'].dt.hour == current_hour]['Gap_Sq'].sum()
                total_2_Gap = cumulative_df[cumulative_df['Expected Arrival (BST)'].dt.hour == current_hour]['2_Gap'].sum()
                AWT = round(total_Gap_Sq / total_2_Gap, 2) if total_2_Gap > 0 else 0
                buses_observed[stop_point_id] = (num_buses_observed, total_Gap_Sq, total_2_Gap, AWT)

                print(f"\nArrival Predictions for stop point {stop_point_id} ({station_name}):")
                print(arrival_predictions_df.to_string(index=False))
                print("\nCumulative DataFrame:")
                print(cumulative_df.to_string(index=False))
                print(f"\nNumber of buses observed in the current hour: {num_buses_observed}")
            else:
                print("No arrival predictions available.")

            print("Refreshing data in 30 seconds...\n")
            time.sleep(30)

            # Update cumulative dataframe in dictionary
            cumulative_dataframes[stop_point_id] = cumulative_df
            cumulative_df_new = cumulative_df
            cumulative_dataframes_new[stop_point_id] = cumulative_df_new
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            cumulative_df.to_csv(f'{output_dir}cumulative_data_stop_{stop_point_id}.csv', index=False)

            if cumulative_df_new.empty:
                print("No data available in cumulative_df_new.")
                first_row = None  # Assign None or handle the absence of data appropriately
            else:
                first_row = cumulative_df_new.iloc[0]
                #print(first_row)
                # Retrieve the 'Expected Arrival (BST)' value and extract the time part
                expected_arrival_time = first_row['Expected Arrival (BST)'].time()
                #print(f"Expected arrival time: {expected_arrival_time}")

                timetable_dict[stop_point_id] = timetable_df
                #print(f"timetable: {timetable_df}")

                # Create a function to convert 'Scheduled Time' to a datetime.time object
                def to_time(time_str):
                    return datetime.strptime(time_str, '%H:%M').time()

                # Apply the function to the 'Scheduled Time' column
                timetable_df['Scheduled Time Only'] = timetable_df['Scheduled Time'].apply(to_time)

                # Filter the rows where 'Scheduled Time Only' is greater than 'expected_arrival_time'
                filtered_df = timetable_df[timetable_df['Scheduled Time Only'] > expected_arrival_time]

                # Select the top 10 'Scheduled Time Only' values
                top_20_scheduled_time = filtered_df['Scheduled Time Only'].head(20)
                print(f"top 20 schedule: {top_20_scheduled_time}")

                # Find the first row's scheduled time from top_20_scheduled_time
                first_time = top_20_scheduled_time.iloc[0]

                # Find the index of this time in the original timetable_df
                index_of_first_time = timetable_df[timetable_df['Scheduled Time Only'] == first_time].index[0]

                # Get the scheduled time immediately above it
                if index_of_first_time > 0:
                    previous_time = timetable_df.iloc[index_of_first_time - 1]['Scheduled Time Only']
                else:
                    # If it's the first element, there's no previous time
                    previous_time = None

                # Create a new DataFrame to include the previous time
                if previous_time:
                    top_21_scheduled_time = pd.concat([
                        pd.Series([previous_time], name='Scheduled Time Only'),
                        top_20_scheduled_time
                    ]).drop_duplicates().reset_index(drop=True)
                else:
                    top_21_scheduled_time = top_20_scheduled_time.reset_index(drop=True)

                # Print the result
                print("Updated top 20 'Scheduled Time Only' values including the time immediately above the first element:")
                print(top_21_scheduled_time)

                # Number of rows in cumulative_df
                num_rows_cumulative_new = len(cumulative_df_new)

                # Fetch the first 'num_rows_cumulative' scheduled times from top_21_scheduled_time
                scheduled_times_to_add = top_21_scheduled_time.head(num_rows_cumulative_new)

                # Add these scheduled times as a new column in cumulative_df
                cumulative_df_new['Scheduled Time'] = scheduled_times_to_add.values

                # Print the updated cumulative_df
                print("\nUpdated cumulative_df with new 'Scheduled Time' column:")
                print(cumulative_df_new)

                # Define the BST timezone
                bst = pytz.timezone('Europe/London')

                # Get the current time in BST
                current_time_bst = datetime.now(bst)

                # Fetch the current hour in BST
                current_hour_bst = current_time_bst.hour

                # Fetch the last row's 'Scheduled Time' from the updated cumulative_df
                last_row_scheduled_time = cumulative_df_new['Scheduled Time'].iloc[-1]

                # Convert the last_row_scheduled_time to BST if it's not already
                # Assuming last_row_scheduled_time is naive (no timezone info)
                last_row_scheduled_time_bst = last_row_scheduled_time.replace(tzinfo=bst)

                # Print the last row's 'Scheduled Time'
                print("\nLast row's 'Scheduled Time' in cumulative_df (BST):")
                print(last_row_scheduled_time_bst)

                # Fetch the scheduled times from timetable_df that meet the criteria
                filtered_times = timetable_df[
                    (timetable_df['Scheduled Time Only'] > last_row_scheduled_time_bst) &
                    (timetable_df['Scheduled Time Only'].apply(lambda x: x.hour).isin([current_hour_bst, current_hour_bst + 1, current_hour_bst + 2]))
                ]

                # Print the filtered scheduled times
                print("\nScheduled times greater than last_row_scheduled_time in BST and within the next 3 hours:")
                print(filtered_times['Scheduled Time Only'])

                # Step 1: Find the shape of `filtered_times['Scheduled Time Only']`
                filtered_times_shape = filtered_times['Scheduled Time Only'].shape[0]

                # Step 2: Create new rows with the filtered scheduled times
                new_rows = pd.DataFrame({
                    'Scheduled Time': filtered_times['Scheduled Time Only'].values,
                    'Line': [cumulative_df_new['Line'].iloc[-1]] * filtered_times_shape,
                    'Stop Point': [cumulative_df_new['Stop Point'].iloc[-1]] * filtered_times_shape,
                    'Direction': [cumulative_df_new['Direction'].iloc[-1]] * filtered_times_shape,
                    'Vehicle ID': [f"{str(cumulative_df_new['Stop Point'].iloc[-1])[-4:]}_{i+1}" for i in range(filtered_times_shape)],
                    'Expected Arrival (BST)': [0] * filtered_times_shape,
                    'Expected Arrival (HM)': [0] * filtered_times_shape,
                    'Gap': [0] * filtered_times_shape,
                    '2_Gap': [0] * filtered_times_shape,
                    'Gap_Sq': [0] * filtered_times_shape
                })

                # Step 3: Append the new rows to the `cumulative_df`
                cumulative_df_new = pd.concat([cumulative_df_new, new_rows], ignore_index=True)

                # Step 4: Print the updated cumulative_df
                print("\nUpdated cumulative_df after appending new rows:")
                print(cumulative_df_new)

































        print(f"\nRoute {lineID} SWT DataFrame:")
        print(route_swt_df)
        double_headway_a, double_headway_b = fetch_current_hour_swt(route_swt_df)

        if double_headway_a is not None and double_headway_b is not None:
          results_df = check_gaps_against_double_headway(cumulative_dataframes, double_headway_a, double_headway_b, combined_df, lineID)

          #print("\nVehicle IDs and Stop Points where Gap exceeds Double Headway:")
          #print(results_df.to_string(index=False))

        # Create DataFrame to show number of buses observed for each stop point
        buses_observed_df = pd.DataFrame(list(buses_observed.items()), columns=['Stop Point', 'Metrics'])

        # Split 'Metrics' into separate columns
        buses_observed_df[['Num of Buses Observed', 'Total Gap Sq', 'Total 2 Gap', 'AWT']] = pd.DataFrame(
            buses_observed_df['Metrics'].tolist(), index=buses_observed_df.index
        )

        # Calculate WAWT as the product of AWT and Num of Buses Observed
        buses_observed_df['WAWT'] = buses_observed_df['AWT'] * buses_observed_df['Num of Buses Observed']

        # Drop the 'Metrics' column
        buses_observed_df.drop(columns=['Metrics'], inplace=True)

        current_hour = datetime.now(pytz.timezone('Europe/London')).hour

        buses_observed_df['Line'] = lineID
        buses_observed_df['Hour'] = current_hour

        # Reorder columns so 'Line' comes first, then 'Hour', then the remaining columns
        columns_order = ['Line'] + ['Hour'] + [col for col in buses_observed_df.columns if col not in ['Hour', 'Line']]
        buses_observed_df = buses_observed_df[columns_order]

        print(f"\nNumber of Buses Observed DataFrame for Route:{lineID}")
        print(buses_observed_df)

        # Calculate Route AWT A and Route AWT B
        pattern_A = f"^{lineID}_A\\d+$"
        pattern_B = f"^{lineID}_B\\d+$"

        buses_observed_df_A = buses_observed_df[buses_observed_df['Stop Point'].isin(combined_df[combined_df['Route_Dir_QSI_No'].str.match(pattern_A)]['ID'])]
        buses_observed_df_B = buses_observed_df[buses_observed_df['Stop Point'].isin(combined_df[combined_df['Route_Dir_QSI_No'].str.match(pattern_B)]['ID'])]

        sum_WAWT_A = buses_observed_df_A['WAWT'].sum()
        sum_buses_observed_A = buses_observed_df_A['Num of Buses Observed'].sum()
        route_AWT_A = round(sum_WAWT_A / sum_buses_observed_A, 2) if sum_buses_observed_A > 0 else 0

        sum_WAWT_B = buses_observed_df_B['WAWT'].sum()
        sum_buses_observed_B = buses_observed_df_B['Num of Buses Observed'].sum()
        route_AWT_B = round(sum_WAWT_B / sum_buses_observed_B, 2) if sum_buses_observed_B > 0 else 0

        if 'Route' not in route_awt_df.columns:
          route_awt_df['Route'] = lineID

        # Check if the current hour's data is already present
        if current_hour in route_awt_df['Hour'].values:
            route_awt_df.loc[route_awt_df['Hour'] == current_hour, ['Route AWT A', 'Route AWT B']] = [route_AWT_A, route_AWT_B]
        else:
            new_row = pd.DataFrame({
                'Route': [lineID],
                'Hour': [current_hour],
                'Route AWT A': [route_AWT_A],
                'Route AWT B': [route_AWT_B]
            })
            route_awt_df = pd.concat([route_awt_df, new_row], ignore_index=True)


        columns_order = ['Route'] + [col for col in route_awt_df.columns if col != 'Route']
        route_awt_df = route_awt_df[columns_order]


        route_ewt_df = pd.DataFrame(columns=['Route','Hour', 'Route EWT A', 'Route EWT B'])
        current_hour = datetime.now(pytz.timezone('Europe/London')).hour
        route_ewt_df = pd.DataFrame([[lineID,current_hour, None, None]], columns=['Route','Hour', 'Route EWT A','Route EWT B'])

        # Merge route_awt_df and route_swt_df on 'Hour'
        merged_df = pd.merge(route_awt_df, route_swt_df, on='Hour')


        # Calculate the 'Route EWT A' and 'Route EWT B' columns
        merged_df['Route EWT A'] = merged_df['Route AWT A'] - merged_df['Route SWT A']
        merged_df['Route EWT B'] = merged_df['Route AWT B'] - merged_df['Route SWT B']

        lineID_sheet2 = str(lineID)
        df_sheet2['Route'] = df_sheet2['Route'].astype(str)
        MPS_data = df_sheet2[(df_sheet2['Route'] == lineID_sheet2) | (df_sheet2['Route'] == lineID)]
        MPS = MPS_data['MPS'].iloc[0]
        print(f"\n\033[1m\033[4mMPS for Route {lineID} is {MPS}\033[0m\n")

        # Calculate the 'Route EWT (var) A' and 'Route EWT (var) B' columns
        merged_df['Route EWT VAR A'] = merged_df['Route EWT A'] - MPS
        merged_df['Route EWT VAR B'] = merged_df['Route EWT B'] - MPS

        # Reorder the columns so each direction's SWT, AWT, EWT and EWT VAR are grouped together (A first, then B)
        new_columns_order = ['Route', 'Hour', 'Route SWT A', 'Route AWT A', 'Route EWT A', 'Route EWT VAR A', 'Route SWT B', 'Route AWT B', 'Route EWT B' , 'Route EWT VAR B']
        merged_df = merged_df[new_columns_order]

        # Split the DataFrame for Route A and Route B
        route_A_df = merged_df[['Route','Hour', 'Route SWT A', 'Route AWT A', 'Route EWT A', 'Route EWT VAR A']]
        route_B_df = merged_df[['Route','Hour', 'Route SWT B', 'Route AWT B', 'Route EWT B', 'Route EWT VAR B']]

        route_A_df = route_A_df.rename(columns={'Route': 'Line'})
        route_B_df = route_B_df.rename(columns={'Route': 'Line'})

        print(f"\nSWT, AWT, and EWT of Route {lineID} in direction A:")
        print(route_A_df)

        print(f"\nSWT, AWT, and EWT of Route {lineID} in direction B:")
        print(route_B_df)

        # Save DataFrames to CSV
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Save DataFrames to Google Drive
        route_swt_df.to_csv(os.path.join(output_dir, 'route_swt_df.csv'), index=False)
        results_df.to_csv(os.path.join(output_dir, 'results_df.csv'), index=False)
        route_A_df.to_csv(os.path.join(output_dir, 'route_A_df.csv'), index=False)
        route_B_df.to_csv(os.path.join(output_dir, 'route_B_df.csv'), index=False)


        print(f"CSV files saved to 'output' directory.\n")

# Entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



[1m[4mToday is Friday. The selected Schedule name is Monday to Friday.[0m

Arrival Predictions for stop point 490011107G (Poplar / All Saints Church):
Line Vehicle ID Stop Point Direction Expected Arrival (BST) Expected Arrival (HM)  Gap  2_Gap  Gap_Sq
  D7    LX11BJV 490011107G  outbound    2024-08-16 12:57:32   1900-01-01 12:57:00  0.0    0.0     0.0
  D7    LX11BFF 490011107G  outbound    2024-08-16 13:09:32   1900-01-01 13:09:00 12.0   24.0   144.0

Cumulative DataFrame:
Line Vehicle ID Stop Point Direction Expected Arrival (BST) Expected Arrival (HM)  Gap  2_Gap  Gap_Sq
  D7    LX11BJV 490011107G  outbound    2024-08-16 12:57:32   1900-01-01 12:57:00  0.0    0.0     0.0
  D7    LX11BFF 490011107G  outbound    2024-08-16 13:09:32   1900-01-01 13:09:00 12.0   24.0   144.0

Number of buses observed in the current hour: 1
Refreshing data in 30 seconds...



KeyboardInterrupt: 