<a href="https://colab.research.google.com/github/RemyaVKarthikeyan/AA-Stagecoach-Project/blob/main/15_Aug_Trial_LF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from datetime import datetime
from google.colab import drive

# Colab-only step: expose Google Drive under /content/drive
drive.mount('/content/drive')

# Read the training set CSV from the mounted Drive
file_path = '/content/drive/My Drive/Files/Training_set.csv'
df = pd.read_csv(file_path)

# Twelve-row working table: the first three rows carry arrival timestamps, the other
# nine hold the string '0' in both arrival columns and 0.0 in the gap columns.
cumulative_df = pd.DataFrame({
    'Line': ['D7'] * 12,
    'Vehicle ID': ['LX11BEY', 'LX11BJO', 'LX61DBO'] + [f'048Z_{slot}' for slot in range(1, 10)],
    'Stop Point': ['490002048Z'] * 12,
    'Direction': ['outbound'] * 12,
    'Expected Arrival (BST)': ['2024-08-15 14:43:33', '2024-08-15 14:56:33', '2024-08-15 15:07:27'] + ['0'] * 9,
    'Expected Arrival (HM)': ['1900-01-01 14:43:00', '1900-01-01 14:56:00', '1900-01-01 15:07:00'] + ['0'] * 9,
    'Gap': [0.0, 13.0, 10.9] + [0.0] * 9,
    '2_Gap': [0.0, 26.0, 21.8] + [0.0] * 9,
    'Gap_Sq': [0.0, 169.0, 118.81] + [0.0] * 9,
    'Scheduled Time': [
        '14:40:00', '14:52:00', '15:03:00', '15:15:00', '15:27:00', '15:38:00',
        '15:50:00', '16:02:00', '16:14:00', '16:26:00', '16:38:00', '16:50:00',
    ],
})

def time_to_seconds(time_str):
    """Return the number of seconds since midnight for an 'HH:MM:SS' string.

    Raises ValueError when time_str does not match the '%H:%M:%S' format.
    """
    parsed = datetime.strptime(time_str, '%H:%M:%S')
    return (parsed.hour * 60 + parsed.minute) * 60 + parsed.second

# Add the training set's 'Scheduled Time' as seconds since the start of the day
# (only the scheduled time is converted on this line)
df['Scheduled Time (seconds)'] = df['Scheduled Time'].map(time_to_seconds)

def convert_to_seconds(time_str):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string to seconds since midnight.

    Only the second whitespace-separated token (the clock time) is used.

    Returns np.nan instead of raising when the value cannot be converted:
    a non-string such as NaN/None, a value with no second token (for
    example the '0' placeholder), or a token that is not a valid
    '%H:%M:%S' time.
    """
    try:
        clock = time_str.split()[1]
        parsed = datetime.strptime(clock, '%H:%M:%S')
    except (AttributeError, IndexError, ValueError):
        # AttributeError: not a string; IndexError: no time token;
        # ValueError: time token is malformed.
        return np.nan
    return parsed.hour * 3600 + parsed.minute * 60 + parsed.second

# Convert the training-set 'Expected Arrival (BST)' values to seconds since midnight
df['Expected Arrival (BST) (seconds)'] = df['Expected Arrival (BST)'].apply(convert_to_seconds)

# Keep only training rows whose arrival converted to a number
df = df.dropna(subset=['Expected Arrival (BST) (seconds)'])

# Rows of cumulative_df whose arrival is anything other than the '0' placeholder
non_zero_df = cumulative_df[cumulative_df['Expected Arrival (BST)'] != '0']

# Copy their scheduled time and arrival into new_data
new_data = pd.DataFrame({
    'Scheduled Time': non_zero_df['Scheduled Time'],
    'Expected Arrival (BST)': non_zero_df['Expected Arrival (BST)']
})

print(new_data)

# Add the seconds-since-midnight columns used for training
new_data['Scheduled Time (seconds)'] = new_data['Scheduled Time'].apply(time_to_seconds)
new_data['Expected Arrival (BST) (seconds)'] = new_data['Expected Arrival (BST)'].apply(convert_to_seconds)

print(new_data)

# Discard rows whose arrival did not convert
new_data = new_data.dropna(subset=['Expected Arrival (BST) (seconds)'])

print(new_data)

# Train on the CSV history plus the arrivals listed in cumulative_df
df = pd.concat([df, new_data], ignore_index=True)

# Single feature (scheduled seconds) -> target (arrival seconds)
X = df[['Scheduled Time (seconds)']]
y = df['Expected Arrival (BST) (seconds)']

model = LinearRegression()
model.fit(X, y)

# Rows still holding the '0' placeholder are the ones to predict
zero_arrival_df = cumulative_df[cumulative_df['Expected Arrival (BST)'] == '0']
new_scheduled_times = zero_arrival_df['Scheduled Time'].tolist()
new_scheduled_times_seconds = [time_to_seconds(t) for t in new_scheduled_times]

if new_scheduled_times_seconds:
    predicted_seconds = model.predict(pd.DataFrame({'Scheduled Time (seconds)': new_scheduled_times_seconds}))
else:
    # No placeholder rows: skip predict(), which cannot be called on an empty frame
    predicted_seconds = []

# Post-process: every prediction must be later than its scheduled time and later than
# the previous adjusted prediction; violations are bumped by 60 seconds.
adjusted_predictions_seconds = []
for scheduled_sec, predicted_sec in zip(new_scheduled_times_seconds, predicted_seconds):
    if predicted_sec <= scheduled_sec:
        predicted_sec = scheduled_sec + 60
    if adjusted_predictions_seconds and predicted_sec <= adjusted_predictions_seconds[-1]:
        predicted_sec = adjusted_predictions_seconds[-1] + 60
    adjusted_predictions_seconds.append(predicted_sec)

# Stamp predictions with the date of the first observed arrival so the column keeps the
# 'YYYY-MM-DD HH:MM:SS' layout of the existing rows; fall back to today's date only when
# there is no observed arrival to copy the date from.
if new_data.empty:
    service_date = datetime.now().strftime('%Y-%m-%d')
else:
    service_date = new_data['Expected Arrival (BST)'].iloc[0].split()[0]

# The seconds -> timestamp conversion is done inline with pandas because no
# seconds_to_time helper is defined at this point in the notebook.
service_midnight = pd.Timestamp(service_date)
adjusted_times = [
    (service_midnight + pd.Timedelta(seconds=float(sec))).strftime('%Y-%m-%d %H:%M:%S')
    for sec in adjusted_predictions_seconds
]

# Write the predictions back into the placeholder rows
if adjusted_times:
    cumulative_df.loc[zero_arrival_df.index, 'Expected Arrival (BST)'] = adjusted_times

# Print the updated cumulative_df
print(cumulative_df)






Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
  Scheduled Time Expected Arrival (BST)
0       14:40:00    2024-08-15 14:43:33
1       14:52:00    2024-08-15 14:56:33
2       15:03:00    2024-08-15 15:07:27
  Scheduled Time Expected Arrival (BST)  Scheduled Time (seconds)  \
0       14:40:00    2024-08-15 14:43:33                     52800   
1       14:52:00    2024-08-15 14:56:33                     53520   
2       15:03:00    2024-08-15 15:07:27                     54180   

   Expected Arrival (BST) (seconds)  
0                             53013  
1                             53793  
2                             54447  
  Scheduled Time Expected Arrival (BST)  Scheduled Time (seconds)  \
0       14:40:00    2024-08-15 14:43:33                     52800   
1       14:52:00    2024-08-15 14:56:33                     53520   
2       15:03:00    2024-08-15 15:07:27                     54180   

   Ex

In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from datetime import datetime, timedelta
from google.colab import drive

# Colab-only step: expose Google Drive under /content/drive
drive.mount('/content/drive')

# Read the training set CSV from the mounted Drive
file_path = '/content/drive/My Drive/Files/Training_set.csv'
df = pd.read_csv(file_path)

# Twelve-row working table: the first three rows carry arrival timestamps, the other
# nine hold the string '0' in both arrival columns and 0.0 in the gap columns.
cumulative_df = pd.DataFrame({
    'Line': ['D7'] * 12,
    'Vehicle ID': ['LX11BEY', 'LX11BJO', 'LX61DBO'] + [f'048Z_{slot}' for slot in range(1, 10)],
    'Stop Point': ['490002048Z'] * 12,
    'Direction': ['outbound'] * 12,
    'Expected Arrival (BST)': ['2024-08-15 14:43:33', '2024-08-15 14:56:33', '2024-08-15 15:07:27'] + ['0'] * 9,
    'Expected Arrival (HM)': ['1900-01-01 14:43:00', '1900-01-01 14:56:00', '1900-01-01 15:07:00'] + ['0'] * 9,
    'Gap': [0.0, 13.0, 10.9] + [0.0] * 9,
    '2_Gap': [0.0, 26.0, 21.8] + [0.0] * 9,
    'Gap_Sq': [0.0, 169.0, 118.81] + [0.0] * 9,
    'Scheduled Time': [
        '14:40:00', '14:52:00', '15:03:00', '15:15:00', '15:27:00', '15:38:00',
        '15:50:00', '16:02:00', '16:14:00', '16:26:00', '16:38:00', '16:50:00',
    ],
})

def time_to_seconds(time_str):
    """Return the number of seconds since midnight for an 'HH:MM:SS' string.

    Raises ValueError when time_str does not match the '%H:%M:%S' format.
    """
    parsed = datetime.strptime(time_str, '%H:%M:%S')
    return (parsed.hour * 60 + parsed.minute) * 60 + parsed.second

def seconds_to_time(seconds, date_str):
    """Format midnight of date_str plus `seconds` as 'YYYY-MM-DD HH:MM:SS'.

    date_str must be in 'YYYY-MM-DD' form. Fractions of a second are not
    shown in the result, and offsets of a day or more move the date forward.
    """
    midnight = datetime.strptime(date_str, '%Y-%m-%d')
    moment = midnight + timedelta(seconds=seconds)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

# Add the training set's 'Scheduled Time' as seconds since the start of the day
# (only the scheduled time is converted on this line)
df['Scheduled Time (seconds)'] = df['Scheduled Time'].map(time_to_seconds)

def convert_to_seconds(time_str):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string to seconds since midnight.

    Only the second whitespace-separated token (the clock time) is used.

    Returns np.nan instead of raising when the value cannot be converted:
    a non-string such as NaN/None, a value with no second token (for
    example the '0' placeholder), or a token that is not a valid
    '%H:%M:%S' time.
    """
    try:
        clock = time_str.split()[1]
        parsed = datetime.strptime(clock, '%H:%M:%S')
    except (AttributeError, IndexError, ValueError):
        # AttributeError: not a string; IndexError: no time token;
        # ValueError: time token is malformed.
        return np.nan
    return parsed.hour * 3600 + parsed.minute * 60 + parsed.second

# Convert the training-set 'Expected Arrival (BST)' values to seconds since midnight
df['Expected Arrival (BST) (seconds)'] = df['Expected Arrival (BST)'].apply(convert_to_seconds)

# Keep only training rows whose arrival converted to a number
df = df.dropna(subset=['Expected Arrival (BST) (seconds)'])

# Rows of cumulative_df whose arrival is anything other than the '0' placeholder
non_zero_df = cumulative_df[cumulative_df['Expected Arrival (BST)'] != '0']

# Copy their scheduled time and arrival into new_data
new_data = pd.DataFrame({
    'Scheduled Time': non_zero_df['Scheduled Time'],
    'Expected Arrival (BST)': non_zero_df['Expected Arrival (BST)']
})

print(new_data)

# Add the seconds-since-midnight columns used for training
new_data['Scheduled Time (seconds)'] = new_data['Scheduled Time'].apply(time_to_seconds)
new_data['Expected Arrival (BST) (seconds)'] = new_data['Expected Arrival (BST)'].apply(convert_to_seconds)

print(new_data)

# Discard rows whose arrival did not convert
new_data = new_data.dropna(subset=['Expected Arrival (BST) (seconds)'])

print(new_data)

# Train on the CSV history plus the arrivals listed in cumulative_df
df = pd.concat([df, new_data], ignore_index=True)

# Single feature (scheduled seconds) -> target (arrival seconds)
X = df[['Scheduled Time (seconds)']]
y = df['Expected Arrival (BST) (seconds)']

model = LinearRegression()
model.fit(X, y)

# Rows still holding the '0' placeholder are the ones to predict
zero_arrival_df = cumulative_df[cumulative_df['Expected Arrival (BST)'] == '0']
new_scheduled_times = zero_arrival_df['Scheduled Time'].tolist()
new_scheduled_times_seconds = [time_to_seconds(t) for t in new_scheduled_times]

if new_scheduled_times_seconds:
    predicted_seconds = model.predict(pd.DataFrame({'Scheduled Time (seconds)': new_scheduled_times_seconds}))
else:
    # No placeholder rows: skip predict(), which cannot be called on an empty frame
    predicted_seconds = []

# Post-process: every prediction must be later than its scheduled time and later than
# the previous adjusted prediction; violations are bumped by 60 seconds.
adjusted_predictions_seconds = []
for scheduled_sec, predicted_sec in zip(new_scheduled_times_seconds, predicted_seconds):
    if predicted_sec <= scheduled_sec:
        predicted_sec = scheduled_sec + 60
    if adjusted_predictions_seconds and predicted_sec <= adjusted_predictions_seconds[-1]:
        predicted_sec = adjusted_predictions_seconds[-1] + 60
    adjusted_predictions_seconds.append(predicted_sec)

# Stamp predictions with the date of the first observed arrival rather than the date the
# notebook happens to be run on, so the whole column stays on one calendar day; fall back
# to today's date only when there is no observed arrival to copy the date from.
if new_data.empty:
    service_date = datetime.now().strftime('%Y-%m-%d')
else:
    service_date = new_data['Expected Arrival (BST)'].iloc[0].split()[0]

adjusted_times = [seconds_to_time(sec, service_date) for sec in adjusted_predictions_seconds]

# Write the predictions back into the placeholder rows
if adjusted_times:
    cumulative_df.loc[zero_arrival_df.index, 'Expected Arrival (BST)'] = adjusted_times

# Print the updated cumulative_df
print(cumulative_df)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
  Scheduled Time Expected Arrival (BST)
0       14:40:00    2024-08-15 14:43:33
1       14:52:00    2024-08-15 14:56:33
2       15:03:00    2024-08-15 15:07:27
  Scheduled Time Expected Arrival (BST)  Scheduled Time (seconds)  \
0       14:40:00    2024-08-15 14:43:33                     52800   
1       14:52:00    2024-08-15 14:56:33                     53520   
2       15:03:00    2024-08-15 15:07:27                     54180   

   Expected Arrival (BST) (seconds)  
0                             53013  
1                             53793  
2                             54447  
  Scheduled Time Expected Arrival (BST)  Scheduled Time (seconds)  \
0       14:40:00    2024-08-15 14:43:33                     52800   
1       14:52:00    2024-08-15 14:56:33                     53520   
2       15:03:00    2024-08-15 15:07:27                     54180   

   Ex

In [18]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from datetime import datetime, timedelta
from google.colab import drive

# Colab-only step: expose Google Drive under /content/drive
drive.mount('/content/drive')

# Read the training set CSV from the mounted Drive
file_path = '/content/drive/My Drive/Files/Training_set.csv'
df = pd.read_csv(file_path)

# Twelve-row working table: the first three rows carry arrival timestamps, the other
# nine hold the string '0' in both arrival columns and 0.0 in the gap columns.
cumulative_df = pd.DataFrame({
    'Line': ['D7'] * 12,
    'Vehicle ID': ['LX11BEY', 'LX11BJO', 'LX61DBO'] + [f'048Z_{slot}' for slot in range(1, 10)],
    'Stop Point': ['490002048Z'] * 12,
    'Direction': ['outbound'] * 12,
    'Expected Arrival (BST)': ['2024-08-15 14:43:33', '2024-08-15 14:56:33', '2024-08-15 15:07:27'] + ['0'] * 9,
    'Expected Arrival (HM)': ['1900-01-01 14:43:00', '1900-01-01 14:56:00', '1900-01-01 15:07:00'] + ['0'] * 9,
    'Gap': [0.0, 13.0, 10.9] + [0.0] * 9,
    '2_Gap': [0.0, 26.0, 21.8] + [0.0] * 9,
    'Gap_Sq': [0.0, 169.0, 118.81] + [0.0] * 9,
    'Scheduled Time': [
        '14:40:00', '14:52:00', '15:03:00', '15:15:00', '15:27:00', '15:38:00',
        '15:50:00', '16:02:00', '16:14:00', '16:26:00', '16:38:00', '16:50:00',
    ],
})

def time_to_seconds(time_str):
    """Return the number of seconds since midnight for an 'HH:MM:SS' string.

    Raises ValueError when time_str does not match the '%H:%M:%S' format.
    """
    parsed = datetime.strptime(time_str, '%H:%M:%S')
    return (parsed.hour * 60 + parsed.minute) * 60 + parsed.second

def seconds_to_time(seconds, date_str):
    """Format midnight of date_str plus `seconds` as 'YYYY-MM-DD HH:MM:SS'.

    date_str must be in 'YYYY-MM-DD' form. Fractions of a second are not
    shown in the result, and offsets of a day or more move the date forward.
    """
    midnight = datetime.strptime(date_str, '%Y-%m-%d')
    moment = midnight + timedelta(seconds=seconds)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

def format_time_str(time_str):
    """Return the clock time of time_str as 'HH:MM:00' (seconds zeroed).

    Accepts either a full 'YYYY-MM-DD HH:MM:SS' string or a bare 'HH:MM:SS'
    string: the last whitespace-separated token is treated as the clock time.
    Taking the second token instead would reject bare times, which is what
    callers that have already stripped the date pass in.

    Returns '00:00:00' when no hour/minute pair can be extracted (empty
    string, or a token without a ':' separator such as '0').
    """
    try:
        clock = time_str.split()[-1]
        hours, minutes = clock.split(':')[:2]
    except (IndexError, ValueError):
        # IndexError: nothing to split; ValueError: fewer than two ':' fields
        return '00:00:00'
    return f"{hours}:{minutes}:00"

# Add the training set's 'Scheduled Time' as seconds since the start of the day
# (only the scheduled time is converted on this line)
df['Scheduled Time (seconds)'] = df['Scheduled Time'].map(time_to_seconds)

def convert_to_seconds(time_str):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string to seconds since midnight.

    Only the second whitespace-separated token (the clock time) is used.

    Returns np.nan instead of raising when the value cannot be converted:
    a non-string such as NaN/None, a value with no second token (for
    example the '0' placeholder), or a token that is not a valid
    '%H:%M:%S' time.
    """
    try:
        clock = time_str.split()[1]
        parsed = datetime.strptime(clock, '%H:%M:%S')
    except (AttributeError, IndexError, ValueError):
        # AttributeError: not a string; IndexError: no time token;
        # ValueError: time token is malformed.
        return np.nan
    return parsed.hour * 3600 + parsed.minute * 60 + parsed.second

# Convert the training-set 'Expected Arrival (BST)' values to seconds since midnight
df['Expected Arrival (BST) (seconds)'] = df['Expected Arrival (BST)'].apply(convert_to_seconds)

# Keep only training rows whose arrival converted to a number
df = df.dropna(subset=['Expected Arrival (BST) (seconds)'])

# Rows of cumulative_df whose arrival is anything other than the '0' placeholder
non_zero_df = cumulative_df[cumulative_df['Expected Arrival (BST)'] != '0']

# Copy their scheduled time and arrival into new_data
new_data = pd.DataFrame({
    'Scheduled Time': non_zero_df['Scheduled Time'],
    'Expected Arrival (BST)': non_zero_df['Expected Arrival (BST)']
})

print(new_data)

# Add the seconds-since-midnight columns used for training
new_data['Scheduled Time (seconds)'] = new_data['Scheduled Time'].apply(time_to_seconds)
new_data['Expected Arrival (BST) (seconds)'] = new_data['Expected Arrival (BST)'].apply(convert_to_seconds)

print(new_data)

# Discard rows whose arrival did not convert
new_data = new_data.dropna(subset=['Expected Arrival (BST) (seconds)'])

print(new_data)

# Train on the CSV history plus the arrivals listed in cumulative_df
df = pd.concat([df, new_data], ignore_index=True)

# Single feature (scheduled seconds) -> target (arrival seconds)
X = df[['Scheduled Time (seconds)']]
y = df['Expected Arrival (BST) (seconds)']

model = LinearRegression()
model.fit(X, y)

# Rows still holding the '0' placeholder are the ones to predict
zero_arrival_df = cumulative_df[cumulative_df['Expected Arrival (BST)'] == '0']
new_scheduled_times = zero_arrival_df['Scheduled Time'].tolist()
new_scheduled_times_seconds = [time_to_seconds(t) for t in new_scheduled_times]

if new_scheduled_times_seconds:
    predicted_seconds = model.predict(pd.DataFrame({'Scheduled Time (seconds)': new_scheduled_times_seconds}))
else:
    # No placeholder rows: skip predict(), which cannot be called on an empty frame
    predicted_seconds = []

# Post-process: every prediction must be later than its scheduled time and later than
# the previous adjusted prediction; violations are bumped by 60 seconds.
adjusted_predictions_seconds = []
for scheduled_sec, predicted_sec in zip(new_scheduled_times_seconds, predicted_seconds):
    if predicted_sec <= scheduled_sec:
        predicted_sec = scheduled_sec + 60
    if adjusted_predictions_seconds and predicted_sec <= adjusted_predictions_seconds[-1]:
        predicted_sec = adjusted_predictions_seconds[-1] + 60
    adjusted_predictions_seconds.append(predicted_sec)

# Stamp predictions with the date of the first observed arrival rather than the date the
# notebook happens to be run on, so the whole column stays on one calendar day; fall back
# to today's date only when there is no observed arrival to copy the date from.
if new_data.empty:
    service_date = datetime.now().strftime('%Y-%m-%d')
else:
    service_date = new_data['Expected Arrival (BST)'].iloc[0].split()[0]

adjusted_times = [seconds_to_time(sec, service_date) for sec in adjusted_predictions_seconds]

# Write the predictions back into the placeholder rows
if adjusted_times:
    cumulative_df.loc[zero_arrival_df.index, 'Expected Arrival (BST)'] = adjusted_times

# Fill 'Expected Arrival (HM)' wherever it still holds the '0' placeholder.
# The date prefix is copied from the first row whose HM value is not '0'.
first_hm_date = (
    cumulative_df
    .loc[cumulative_df['Expected Arrival (HM)'] != '0', 'Expected Arrival (HM)']
    .iloc[0]
    .split()[0]
)

def update_hm(row):
    """Return the 'Expected Arrival (HM)' value to store for one row.

    Rows whose HM value is not '0' keep it unchanged. Otherwise the result is
    first_hm_date followed by format_time_str applied to the second
    whitespace-separated token of the row's 'Expected Arrival (BST)'.
    """
    existing_hm = row['Expected Arrival (HM)']
    if existing_hm != '0':
        return existing_hm
    clock = row['Expected Arrival (BST)'].split()[1]
    return f"{first_hm_date} {format_time_str(clock)}"

cumulative_df['Expected Arrival (HM)'] = cumulative_df.apply(update_hm, axis=1)

# Show the table after both arrival columns have been filled
print(cumulative_df)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
  Scheduled Time Expected Arrival (BST)
0       14:40:00    2024-08-15 14:43:33
1       14:52:00    2024-08-15 14:56:33
2       15:03:00    2024-08-15 15:07:27
  Scheduled Time Expected Arrival (BST)  Scheduled Time (seconds)  \
0       14:40:00    2024-08-15 14:43:33                     52800   
1       14:52:00    2024-08-15 14:56:33                     53520   
2       15:03:00    2024-08-15 15:07:27                     54180   

   Expected Arrival (BST) (seconds)  
0                             53013  
1                             53793  
2                             54447  
  Scheduled Time Expected Arrival (BST)  Scheduled Time (seconds)  \
0       14:40:00    2024-08-15 14:43:33                     52800   
1       14:52:00    2024-08-15 14:56:33                     53520   
2       15:03:00    2024-08-15 15:07:27                     54180   

   Ex

In [19]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from datetime import datetime, timedelta
from google.colab import drive

# Colab-only step: expose Google Drive under /content/drive
drive.mount('/content/drive')

# Read the training set CSV from the mounted Drive
file_path = '/content/drive/My Drive/Files/Training_set.csv'
df = pd.read_csv(file_path)

# Twelve-row working table: the first three rows carry arrival timestamps, the other
# nine hold the string '0' in both arrival columns and 0.0 in the gap columns.
cumulative_df = pd.DataFrame({
    'Line': ['D7'] * 12,
    'Vehicle ID': ['LX11BEY', 'LX11BJO', 'LX61DBO'] + [f'048Z_{slot}' for slot in range(1, 10)],
    'Stop Point': ['490002048Z'] * 12,
    'Direction': ['outbound'] * 12,
    'Expected Arrival (BST)': ['2024-08-15 14:43:33', '2024-08-15 14:56:33', '2024-08-15 15:07:27'] + ['0'] * 9,
    'Expected Arrival (HM)': ['1900-01-01 14:43:00', '1900-01-01 14:56:00', '1900-01-01 15:07:00'] + ['0'] * 9,
    'Gap': [0.0, 13.0, 10.9] + [0.0] * 9,
    '2_Gap': [0.0, 26.0, 21.8] + [0.0] * 9,
    'Gap_Sq': [0.0, 169.0, 118.81] + [0.0] * 9,
    'Scheduled Time': [
        '14:40:00', '14:52:00', '15:03:00', '15:15:00', '15:27:00', '15:38:00',
        '15:50:00', '16:02:00', '16:14:00', '16:26:00', '16:38:00', '16:50:00',
    ],
})

def time_to_seconds(time_str):
    """Return the number of seconds since midnight for an 'HH:MM:SS' string.

    Raises ValueError when time_str does not match the '%H:%M:%S' format.
    """
    parsed = datetime.strptime(time_str, '%H:%M:%S')
    return (parsed.hour * 60 + parsed.minute) * 60 + parsed.second

def seconds_to_time(seconds, date_str):
    """Format midnight of date_str plus `seconds` as 'YYYY-MM-DD HH:MM:SS'.

    date_str must be in 'YYYY-MM-DD' form. Fractions of a second are not
    shown in the result, and offsets of a day or more move the date forward.
    """
    midnight = datetime.strptime(date_str, '%Y-%m-%d')
    moment = midnight + timedelta(seconds=seconds)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

def format_time_str(time_str):
    """Rebuild a ':'-separated time string with its seconds replaced by '00'.

    The first two ':'-separated fields are kept as they are. When the string
    has fewer than two such fields, '00:00:00' is returned.
    """
    fields = time_str.split(':')
    if len(fields) < 2:
        return '00:00:00'
    return f"{fields[0]}:{fields[1]}:00"

# Add the training set's 'Scheduled Time' as seconds since the start of the day
# (only the scheduled time is converted on this line)
df['Scheduled Time (seconds)'] = df['Scheduled Time'].map(time_to_seconds)

def convert_to_seconds(time_str):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string to seconds since midnight.

    Only the second whitespace-separated token (the clock time) is used.

    Returns np.nan instead of raising when the value cannot be converted:
    a non-string such as NaN/None, a value with no second token (for
    example the '0' placeholder), or a token that is not a valid
    '%H:%M:%S' time.
    """
    try:
        clock = time_str.split()[1]
        parsed = datetime.strptime(clock, '%H:%M:%S')
    except (AttributeError, IndexError, ValueError):
        # AttributeError: not a string; IndexError: no time token;
        # ValueError: time token is malformed.
        return np.nan
    return parsed.hour * 3600 + parsed.minute * 60 + parsed.second

# Convert the training-set 'Expected Arrival (BST)' values to seconds since midnight
df['Expected Arrival (BST) (seconds)'] = df['Expected Arrival (BST)'].apply(convert_to_seconds)

# Keep only training rows whose arrival converted to a number
df = df.dropna(subset=['Expected Arrival (BST) (seconds)'])

# Rows of cumulative_df whose arrival is anything other than the '0' placeholder
non_zero_df = cumulative_df[cumulative_df['Expected Arrival (BST)'] != '0']

# Copy their scheduled time and arrival into new_data
new_data = pd.DataFrame({
    'Scheduled Time': non_zero_df['Scheduled Time'],
    'Expected Arrival (BST)': non_zero_df['Expected Arrival (BST)']
})

print(new_data)

# Add the seconds-since-midnight columns used for training
new_data['Scheduled Time (seconds)'] = new_data['Scheduled Time'].apply(time_to_seconds)
new_data['Expected Arrival (BST) (seconds)'] = new_data['Expected Arrival (BST)'].apply(convert_to_seconds)

print(new_data)

# Discard rows whose arrival did not convert
new_data = new_data.dropna(subset=['Expected Arrival (BST) (seconds)'])

print(new_data)

# Train on the CSV history plus the arrivals listed in cumulative_df
df = pd.concat([df, new_data], ignore_index=True)

# Single feature (scheduled seconds) -> target (arrival seconds)
X = df[['Scheduled Time (seconds)']]
y = df['Expected Arrival (BST) (seconds)']

model = LinearRegression()
model.fit(X, y)

# Rows still holding the '0' placeholder are the ones to predict
zero_arrival_df = cumulative_df[cumulative_df['Expected Arrival (BST)'] == '0']
new_scheduled_times = zero_arrival_df['Scheduled Time'].tolist()
new_scheduled_times_seconds = [time_to_seconds(t) for t in new_scheduled_times]

if new_scheduled_times_seconds:
    predicted_seconds = model.predict(pd.DataFrame({'Scheduled Time (seconds)': new_scheduled_times_seconds}))
else:
    # No placeholder rows: skip predict(), which cannot be called on an empty frame
    predicted_seconds = []

# Post-process: every prediction must be later than its scheduled time and later than
# the previous adjusted prediction; violations are bumped by 60 seconds.
adjusted_predictions_seconds = []
for scheduled_sec, predicted_sec in zip(new_scheduled_times_seconds, predicted_seconds):
    if predicted_sec <= scheduled_sec:
        predicted_sec = scheduled_sec + 60
    if adjusted_predictions_seconds and predicted_sec <= adjusted_predictions_seconds[-1]:
        predicted_sec = adjusted_predictions_seconds[-1] + 60
    adjusted_predictions_seconds.append(predicted_sec)

# Stamp predictions with the date of the first observed arrival rather than the date the
# notebook happens to be run on, so the whole column stays on one calendar day; fall back
# to today's date only when there is no observed arrival to copy the date from.
if new_data.empty:
    service_date = datetime.now().strftime('%Y-%m-%d')
else:
    service_date = new_data['Expected Arrival (BST)'].iloc[0].split()[0]

adjusted_times = [seconds_to_time(sec, service_date) for sec in adjusted_predictions_seconds]

# Write the predictions back into the placeholder rows
if adjusted_times:
    cumulative_df.loc[zero_arrival_df.index, 'Expected Arrival (BST)'] = adjusted_times

# Fill 'Expected Arrival (HM)' wherever it still holds the '0' placeholder.
# The date prefix is copied from the first row whose HM value is not '0'.
first_hm_date = (
    cumulative_df
    .loc[cumulative_df['Expected Arrival (HM)'] != '0', 'Expected Arrival (HM)']
    .iloc[0]
    .split()[0]
)

def update_hm(row):
    """Return the 'Expected Arrival (HM)' value to store for one row.

    Rows whose HM value is not '0' keep it unchanged. Otherwise the result is
    first_hm_date followed by format_time_str applied to the second
    whitespace-separated token of the row's 'Expected Arrival (BST)'.
    """
    existing_hm = row['Expected Arrival (HM)']
    if existing_hm != '0':
        return existing_hm
    clock = row['Expected Arrival (BST)'].split()[1]
    return f"{first_hm_date} {format_time_str(clock)}"

cumulative_df['Expected Arrival (HM)'] = cumulative_df.apply(update_hm, axis=1)

# Show the table after both arrival columns have been filled
print(cumulative_df)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
  Scheduled Time Expected Arrival (BST)
0       14:40:00    2024-08-15 14:43:33
1       14:52:00    2024-08-15 14:56:33
2       15:03:00    2024-08-15 15:07:27
  Scheduled Time Expected Arrival (BST)  Scheduled Time (seconds)  \
0       14:40:00    2024-08-15 14:43:33                     52800   
1       14:52:00    2024-08-15 14:56:33                     53520   
2       15:03:00    2024-08-15 15:07:27                     54180   

   Expected Arrival (BST) (seconds)  
0                             53013  
1                             53793  
2                             54447  
  Scheduled Time Expected Arrival (BST)  Scheduled Time (seconds)  \
0       14:40:00    2024-08-15 14:43:33                     52800   
1       14:52:00    2024-08-15 14:56:33                     53520   
2       15:03:00    2024-08-15 15:07:27                     54180   

   Ex

In [20]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from datetime import datetime, timedelta
from google.colab import drive

# Colab-only step: expose Google Drive under /content/drive
drive.mount('/content/drive')

# Read the training set CSV from the mounted Drive
file_path = '/content/drive/My Drive/Files/Training_set.csv'
df = pd.read_csv(file_path)

# Twelve-row working table: the first three rows carry arrival timestamps, the other
# nine hold the string '0' in both arrival columns and 0.0 in the gap columns.
cumulative_df = pd.DataFrame({
    'Line': ['D7'] * 12,
    'Vehicle ID': ['LX11BEY', 'LX11BJO', 'LX61DBO'] + [f'048Z_{slot}' for slot in range(1, 10)],
    'Stop Point': ['490002048Z'] * 12,
    'Direction': ['outbound'] * 12,
    'Expected Arrival (BST)': ['2024-08-15 14:43:33', '2024-08-15 14:56:33', '2024-08-15 15:07:27'] + ['0'] * 9,
    'Expected Arrival (HM)': ['1900-01-01 14:43:00', '1900-01-01 14:56:00', '1900-01-01 15:07:00'] + ['0'] * 9,
    'Gap': [0.0, 13.0, 10.9] + [0.0] * 9,
    '2_Gap': [0.0, 26.0, 21.8] + [0.0] * 9,
    'Gap_Sq': [0.0, 169.0, 118.81] + [0.0] * 9,
    'Scheduled Time': [
        '14:40:00', '14:52:00', '15:03:00', '15:15:00', '15:27:00', '15:38:00',
        '15:50:00', '16:02:00', '16:14:00', '16:26:00', '16:38:00', '16:50:00',
    ],
})

def time_to_seconds(time_str):
    """Return the number of seconds since midnight for an 'HH:MM:SS' string.

    Raises ValueError when time_str does not match the '%H:%M:%S' format.
    """
    parsed = datetime.strptime(time_str, '%H:%M:%S')
    return (parsed.hour * 60 + parsed.minute) * 60 + parsed.second

def seconds_to_time(seconds, date_str):
    """Format midnight of date_str plus `seconds` as 'YYYY-MM-DD HH:MM:SS'.

    date_str must be in 'YYYY-MM-DD' form. Fractions of a second are not
    shown in the result, and offsets of a day or more move the date forward.
    """
    midnight = datetime.strptime(date_str, '%Y-%m-%d')
    moment = midnight + timedelta(seconds=seconds)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

def format_time_str(time_str):
    """Rebuild a ':'-separated time string with its seconds replaced by '00'.

    The first two ':'-separated fields are kept as they are. When the string
    has fewer than two such fields, '00:00:00' is returned.
    """
    fields = time_str.split(':')
    if len(fields) < 2:
        return '00:00:00'
    return f"{fields[0]}:{fields[1]}:00"

# Add the training set's 'Scheduled Time' as seconds since the start of the day
# (only the scheduled time is converted on this line)
df['Scheduled Time (seconds)'] = df['Scheduled Time'].map(time_to_seconds)

def convert_to_seconds(time_str):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string to seconds since midnight.

    Only the second whitespace-separated token (the clock time) is used.

    Returns np.nan instead of raising when the value cannot be converted:
    a non-string such as NaN/None, a value with no second token (for
    example the '0' placeholder), or a token that is not a valid
    '%H:%M:%S' time.
    """
    try:
        clock = time_str.split()[1]
        parsed = datetime.strptime(clock, '%H:%M:%S')
    except (AttributeError, IndexError, ValueError):
        # AttributeError: not a string; IndexError: no time token;
        # ValueError: time token is malformed.
        return np.nan
    return parsed.hour * 3600 + parsed.minute * 60 + parsed.second

# Convert the training-set 'Expected Arrival (BST)' values to seconds since midnight
df['Expected Arrival (BST) (seconds)'] = df['Expected Arrival (BST)'].apply(convert_to_seconds)

# Keep only training rows whose arrival converted to a number
df = df.dropna(subset=['Expected Arrival (BST) (seconds)'])

# Rows of cumulative_df whose arrival is anything other than the '0' placeholder
non_zero_df = cumulative_df[cumulative_df['Expected Arrival (BST)'] != '0']

# Copy their scheduled time and arrival into new_data
new_data = pd.DataFrame({
    'Scheduled Time': non_zero_df['Scheduled Time'],
    'Expected Arrival (BST)': non_zero_df['Expected Arrival (BST)']
})

print(new_data)

# Add the seconds-since-midnight columns used for training
new_data['Scheduled Time (seconds)'] = new_data['Scheduled Time'].apply(time_to_seconds)
new_data['Expected Arrival (BST) (seconds)'] = new_data['Expected Arrival (BST)'].apply(convert_to_seconds)

print(new_data)

# Discard rows whose arrival did not convert
new_data = new_data.dropna(subset=['Expected Arrival (BST) (seconds)'])

print(new_data)

# Train on the CSV history plus the arrivals listed in cumulative_df
df = pd.concat([df, new_data], ignore_index=True)

# Single feature (scheduled seconds) -> target (arrival seconds)
X = df[['Scheduled Time (seconds)']]
y = df['Expected Arrival (BST) (seconds)']

model = LinearRegression()
model.fit(X, y)

# Rows still holding the '0' placeholder are the ones to predict
zero_arrival_df = cumulative_df[cumulative_df['Expected Arrival (BST)'] == '0']
new_scheduled_times = zero_arrival_df['Scheduled Time'].tolist()
new_scheduled_times_seconds = [time_to_seconds(t) for t in new_scheduled_times]

if new_scheduled_times_seconds:
    predicted_seconds = model.predict(pd.DataFrame({'Scheduled Time (seconds)': new_scheduled_times_seconds}))
else:
    # No placeholder rows: skip predict(), which cannot be called on an empty frame
    predicted_seconds = []

# Post-process: every prediction must be later than its scheduled time and later than
# the previous adjusted prediction; violations are bumped by 60 seconds.
adjusted_predictions_seconds = []
for scheduled_sec, predicted_sec in zip(new_scheduled_times_seconds, predicted_seconds):
    if predicted_sec <= scheduled_sec:
        predicted_sec = scheduled_sec + 60
    if adjusted_predictions_seconds and predicted_sec <= adjusted_predictions_seconds[-1]:
        predicted_sec = adjusted_predictions_seconds[-1] + 60
    adjusted_predictions_seconds.append(predicted_sec)

# Stamp predictions with the date of the first observed arrival rather than the date the
# notebook happens to be run on. Mixing two calendar days in this column would make any
# difference taken between the last observed and first predicted row span whole days.
# Fall back to today's date only when there is no observed arrival to copy the date from.
if new_data.empty:
    service_date = datetime.now().strftime('%Y-%m-%d')
else:
    service_date = new_data['Expected Arrival (BST)'].iloc[0].split()[0]

adjusted_times = [seconds_to_time(sec, service_date) for sec in adjusted_predictions_seconds]

# Write the predictions back into the placeholder rows
if adjusted_times:
    cumulative_df.loc[zero_arrival_df.index, 'Expected Arrival (BST)'] = adjusted_times

# Fill 'Expected Arrival (HM)' wherever it still holds the '0' placeholder.
# The date prefix is copied from the first row whose HM value is not '0'.
first_hm_date = (
    cumulative_df
    .loc[cumulative_df['Expected Arrival (HM)'] != '0', 'Expected Arrival (HM)']
    .iloc[0]
    .split()[0]
)

def update_hm(row):
    """Return the 'Expected Arrival (HM)' value to store for one row.

    Rows whose HM value is not '0' keep it unchanged. Otherwise the result is
    first_hm_date followed by format_time_str applied to the second
    whitespace-separated token of the row's 'Expected Arrival (BST)'.
    """
    existing_hm = row['Expected Arrival (HM)']
    if existing_hm != '0':
        return existing_hm
    clock = row['Expected Arrival (BST)'].split()[1]
    return f"{first_hm_date} {format_time_str(clock)}"

cumulative_df['Expected Arrival (HM)'] = cumulative_df.apply(update_hm, axis=1)

# Recompute the three gap columns from the completed arrival column.
# Parse 'Expected Arrival (BST)' into datetimes first so rows can be subtracted.
cumulative_df['Expected Arrival (BST)'] = pd.to_datetime(
    cumulative_df['Expected Arrival (BST)'],
    format='%Y-%m-%d %H:%M:%S',
)

# 'Gap': minutes between each arrival and the one in the row above, to 2 decimals
cumulative_df['Gap'] = (
    cumulative_df['Expected Arrival (BST)'].diff().dt.total_seconds() / 60
).round(2)

# The row labelled 0 has no predecessor to diff against, so its gap is set to 0
cumulative_df.loc[0, 'Gap'] = 0.0

# '2_Gap': double the gap
cumulative_df['2_Gap'] = cumulative_df['Gap'].mul(2).round(2)

# 'Gap_Sq': the gap multiplied by itself
cumulative_df['Gap_Sq'] = cumulative_df['Gap'].mul(cumulative_df['Gap']).round(2)

# Show the table with the recalculated gap columns
print(cumulative_df)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
  Scheduled Time Expected Arrival (BST)
0       14:40:00    2024-08-15 14:43:33
1       14:52:00    2024-08-15 14:56:33
2       15:03:00    2024-08-15 15:07:27
  Scheduled Time Expected Arrival (BST)  Scheduled Time (seconds)  \
0       14:40:00    2024-08-15 14:43:33                     52800   
1       14:52:00    2024-08-15 14:56:33                     53520   
2       15:03:00    2024-08-15 15:07:27                     54180   

   Expected Arrival (BST) (seconds)  
0                             53013  
1                             53793  
2                             54447  
  Scheduled Time Expected Arrival (BST)  Scheduled Time (seconds)  \
0       14:40:00    2024-08-15 14:43:33                     52800   
1       14:52:00    2024-08-15 14:56:33                     53520   
2       15:03:00    2024-08-15 15:07:27                     54180   

   Ex