In [1]:
import pandas as pd
import re

path = r"../dataset/bus_gtfs_text_member4.csv"
df = pd.read_csv(path)

print("Shape:", df.shape)
print(df.columns)
display(df.head())


Shape: (100000, 11)
Index(['route_id', 'route_short_name', 'route_long_name', 'trip_id', 'stop_id',
       'stop_name', 'stop_sequence', 'arrival_time', 'departure_time',
       'direction_id', 'delay_text'],
      dtype='object')


  df = pd.read_csv(path)


Unnamed: 0,route_id,route_short_name,route_long_name,trip_id,stop_id,stop_name,stop_sequence,arrival_time,departure_time,direction_id,delay_text
0,5198_117858,6,Howth Dart Stn - Lower Abbey Street,5198_14101,8220DB007591,Abbey Street Lower,1,06:10:00,06:10:00,0,Route 6 (Howth Dart Stn - Lower Abbey Street) ...
1,5198_117858,6,Howth Dart Stn - Lower Abbey Street,5198_14101,8220DB000496,Busáras,2,06:11:17,06:11:17,0,Route 6 (Howth Dart Stn - Lower Abbey Street) ...
2,5198_117858,6,Howth Dart Stn - Lower Abbey Street,5198_14101,8220DB000515,Five Lamps,3,06:14:00,06:14:00,0,Route 6 (Howth Dart Stn - Lower Abbey Street) ...
3,5198_117858,6,Howth Dart Stn - Lower Abbey Street,5198_14101,8220DB000516,Newcomen Bridge,4,06:15:00,06:15:00,0,Route 6 (Howth Dart Stn - Lower Abbey Street) ...
4,5198_117858,6,Howth Dart Stn - Lower Abbey Street,5198_14101,8220DB000519,North Strand Fire Station,5,06:17:00,06:17:00,0,Route 6 (Howth Dart Stn - Lower Abbey Street) ...


In [2]:
def clean_text_basic(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# change column name here if yours is different
# your screenshot shows text_clean already exists, but we rebuild safely
if "text_clean" not in df.columns:
    # try common column names
    for c in ["delay_text", "text", "description", "route_text"]:
        if c in df.columns:
            df["text_clean"] = df[c].apply(clean_text_basic)
            break
else:
    df["text_clean"] = df["text_clean"].apply(clean_text_basic)

display(df[["text_clean"]].head(5))


Unnamed: 0,text_clean
0,route 6 howth dart stn lower abbey street in o...
1,route 6 howth dart stn lower abbey street in o...
2,route 6 howth dart stn lower abbey street in o...
3,route 6 howth dart stn lower abbey street in o...
4,route 6 howth dart stn lower abbey street in o...


In [3]:
def extract_route_id(txt):
    m = re.search(r"\broute\s+([a-z0-9]+)\b", txt)
    return m.group(1) if m else "unknown"

def estimate_stop_count(txt):
    # rough estimate: count how many " in " / " to " separators or place names
    # works because GTFS route strings contain many location tokens
    tokens = txt.split()
    return len(tokens)

df["route_id"] = df["text_clean"].apply(extract_route_id)
df["token_count"] = df["text_clean"].apply(estimate_stop_count)

print(df[["route_id","token_count"]].describe())
display(df[["text_clean","route_id","token_count"]].head(10))


       token_count
count  100000.0000
mean       23.8810
std         1.5671
min        21.0000
25%        22.0000
50%        24.0000
75%        25.0000
max        28.0000


Unnamed: 0,text_clean,route_id,token_count
0,route 6 howth dart stn lower abbey street in o...,6,26
1,route 6 howth dart stn lower abbey street in o...,6,25
2,route 6 howth dart stn lower abbey street in o...,6,25
3,route 6 howth dart stn lower abbey street in o...,6,25
4,route 6 howth dart stn lower abbey street in o...,6,27
5,route 6 howth dart stn lower abbey street in o...,6,24
6,route 6 howth dart stn lower abbey street in o...,6,25
7,route 6 howth dart stn lower abbey street in o...,6,25
8,route 6 howth dart stn lower abbey street in o...,6,25
9,route 6 howth dart stn lower abbey street in o...,6,24


In [4]:
# choose thresholds based on quantiles so labels are balanced
q1 = df["token_count"].quantile(0.33)
q2 = df["token_count"].quantile(0.66)

def label_length(x):
    if x <= q1:
        return "short_route"
    elif x <= q2:
        return "medium_route"
    else:
        return "long_route"

df["label"] = df["token_count"].apply(label_length)

print("Label counts:")
print(df["label"].value_counts())


Label counts:
label
medium_route    50256
short_route     37340
long_route      12404
Name: count, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X = df["text_clean"].astype(str)
y = df["label"].astype(str)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec  = tfidf.transform(X_test)

print("Train:", X_train_vec.shape, "Test:", X_test_vec.shape)
print("Train label counts:\n", y_train.value_counts())


Train: (80000, 4851) Test: (20000, 4851)
Train label counts:
 label
medium_route    40205
short_route     29872
long_route       9923
Name: count, dtype: int64


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

lr = LogisticRegression(max_iter=2000)
lr.fit(X_train_vec, y_train)

pred_lr = lr.predict(X_test_vec)

print("Logistic Regression Accuracy:", accuracy_score(y_test, pred_lr))
print(classification_report(y_test, pred_lr))


Logistic Regression Accuracy: 1.0
              precision    recall  f1-score   support

  long_route       1.00      1.00      1.00      2481
medium_route       1.00      1.00      1.00     10051
 short_route       1.00      1.00      1.00      7468

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000



In [7]:
from sklearn.svm import LinearSVC

svm = LinearSVC()
svm.fit(X_train_vec, y_train)

pred_svm = svm.predict(X_test_vec)

print("LinearSVC Accuracy:", accuracy_score(y_test, pred_svm))
print(classification_report(y_test, pred_svm))




LinearSVC Accuracy: 0.99895
              precision    recall  f1-score   support

  long_route       1.00      1.00      1.00      2481
medium_route       1.00      1.00      1.00     10051
 short_route       1.00      1.00      1.00      7468

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000



In [8]:
out_path = r"../dataset/bus_text_enriched_member4.csv"
df.to_csv(out_path, index=False)
print("Saved:", out_path)


Saved: ../dataset/bus_text_enriched_member4.csv
