In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [16]:
# Read the clustering CSV from the working directory into a DataFrame.
bus_145_data = pd.read_csv(filepath_or_buffer="bus_145_weekday_clustering.csv")
# Plain-text dump of the loaded frame (pandas truncates long frames when printing).
print(bus_145_data)

     Unnamed: 0   trip_id arrival_time  time_diff  minutes_past_midnight  \
0             0  13678472      6:08:13   0.000000             368.216667   
1             1  13678473      6:30:13  22.000000             390.216667   
2             2  13678474      6:45:13  15.000000             405.216667   
3             3  13678551      6:59:56  14.716667             419.933333   
4             4  13678514      7:10:56  11.000000             430.933333   
..          ...       ...          ...        ...                    ...   
111         111  13678581     23:26:13  30.000000            1406.216667   
112         112  13678485     23:56:13  30.000000            1436.216667   
113         113  13678582     24:26:13  30.000000              26.216667   
114         114  13678486     24:56:13  30.000000              56.216667   
115         115  13678583     25:26:13  30.000000              86.216667   

        period  
0      Morning  
1      Morning  
2      Morning  
3      Morning  
4 

In [10]:
# One-hot encode the 'period' column
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`, and 1.4
# removed the old spelling. Try the current keyword first and fall back so the
# cell runs on both old and new releases while still producing a dense array.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn (< 1.2) rejects `sparse_output` as an unknown keyword.
    encoder = OneHotEncoder(sparse=False)
# Double brackets select a one-column DataFrame (2-D) rather than a 1-D Series.
period_encoded = encoder.fit_transform(bus_145_data[['period']])



In [19]:
# Define the service frequency categories based on time_diff
def categorize_time_diff(time_diff):
    """Map a gap between consecutive arrivals to a service-frequency label.

    Args:
        time_diff: Numeric gap between arrivals, compared directly against the
            thresholds 15 and 25 (presumably minutes -- confirm against the
            code that produces the `time_diff` column).

    Returns:
        'High' when time_diff <= 15, 'Medium' when 15 < time_diff <= 25,
        and 'Low' otherwise. Values that fail both comparisons (e.g. NaN)
        also fall through to 'Low'.
    """
    # The tighter threshold must be tested first: checking `<= 25` first would
    # also capture every value `<= 15`, leaving the 'Medium' branch unreachable.
    if time_diff <= 15:
        return 'High'  # High frequency
    if time_diff <= 25:
        return 'Medium'  # Medium frequency
    return 'Low'  # Low frequency

In [12]:
# Wrap the encoded array in a DataFrame. The default integer column labels
# (0, 1, ...) are prefixed so the columns read period_0, period_1, ...
period_encoded_df = pd.DataFrame(period_encoded).add_prefix("period_")
print(period_encoded_df)

     period_0  period_1  period_2  period_3  period_4
0         0.0       0.0       1.0       0.0       0.0
1         0.0       0.0       1.0       0.0       0.0
2         0.0       0.0       1.0       0.0       0.0
3         0.0       0.0       1.0       0.0       0.0
4         0.0       0.0       1.0       0.0       0.0
..        ...       ...       ...       ...       ...
111       0.0       0.0       0.0       1.0       0.0
112       0.0       0.0       0.0       1.0       0.0
113       0.0       0.0       0.0       0.0       1.0
114       0.0       0.0       0.0       0.0       1.0
115       0.0       0.0       0.0       0.0       1.0

[116 rows x 5 columns]


In [17]:
# Place the one-hot columns side by side with the original columns.
# Rows are matched on index labels, not on position.
bus_145_data_encoded = pd.concat(
    objs=[bus_145_data, period_encoded_df],
    axis="columns",
)
print(bus_145_data_encoded)

     Unnamed: 0   trip_id arrival_time  time_diff  minutes_past_midnight  \
0             0  13678472      6:08:13   0.000000             368.216667   
1             1  13678473      6:30:13  22.000000             390.216667   
2             2  13678474      6:45:13  15.000000             405.216667   
3             3  13678551      6:59:56  14.716667             419.933333   
4             4  13678514      7:10:56  11.000000             430.933333   
..          ...       ...          ...        ...                    ...   
111         111  13678581     23:26:13  30.000000            1406.216667   
112         112  13678485     23:56:13  30.000000            1436.216667   
113         113  13678582     24:26:13  30.000000              26.216667   
114         114  13678486     24:56:13  30.000000              56.216667   
115         115  13678583     25:26:13  30.000000              86.216667   

        period  period_0  period_1  period_2  period_3  period_4  
0      Morning      

In [18]:
# Remove the raw 'period' text column along with the columns that are not
# used for training. The frame is modified in place, so running this cell a
# second time raises KeyError because the labels are already gone.
bus_145_data_encoded.drop(
    labels=['Unnamed: 0', 'trip_id', 'arrival_time', 'period'],
    axis="columns",
    inplace=True,
)
# Preview the first rows of the processed frame
bus_145_data_encoded.head()

Unnamed: 0,time_diff,minutes_past_midnight,period_0,period_1,period_2,period_3,period_4
0,0.0,368.216667,0.0,0.0,1.0,0.0,0.0
1,22.0,390.216667,0.0,0.0,1.0,0.0,0.0
2,15.0,405.216667,0.0,0.0,1.0,0.0,0.0
3,14.716667,419.933333,0.0,0.0,1.0,0.0,0.0
4,11.0,430.933333,0.0,0.0,1.0,0.0,0.0
