### Algorithm Choice

I selected logistic regression for classification because:
- The binary label (Low vs. High ridership) is clear and easy to interpret.
- Accuracy and confusion matrix directly measure how often the model correctly distinguishes the two ridership levels.
- Logistic regression is simple, fast to train, and demonstrates core ML skills.

In [8]:
# -----------------------------
# 1. Setup and Data Preparation
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from datetime import datetime

# Load dataset with semicolon separator
df = pd.read_csv('Ruter-data.csv', sep=';')
print(f"Dataset loaded successfully with shape: {df.shape}")

# Check date format
print("Date column sample:")
print(df['Dato'].head())
print(f"Date column type: {df['Dato'].dtype}")

# Convert date column manually (Norwegian date format DD/MM/YYYY)
df['Dato'] = pd.to_datetime(df['Dato'], format='%d/%m/%Y')
print(f"Date column converted to: {df['Dato'].dtype}")

# Filter to one bus line, e.g. line '150'
df = df[df['Linjenavn'] == '150'].copy()
print(f"After filtering to line 150: {df.shape}")

# Extract date features
df['weekday'] = df['Dato'].dt.weekday      # 0=Mon, 6=Sun
df['month']   = df['Dato'].dt.month

# Create binary target: High ridership if passengers > median
median_count = df['Passasjerer_Ombord'].median()
df['high_ridership'] = (df['Passasjerer_Ombord'] > median_count).astype(int)

print(f"Median passenger count: {median_count}")
print(f"High ridership distribution:")
print(df['high_ridership'].value_counts())

# Define features and target
X = df[['weekday', 'month']]
y = df['high_ridership']

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# -----------------------------
# 2. Train Logistic Regression
# -----------------------------
model = LogisticRegression()
model.fit(X_train, y_train)

# -----------------------------
# 3. Evaluate and Display Results
# -----------------------------
y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
cm  = confusion_matrix(y_test, y_pred)

print(f'\nAccuracy: {acc:.3f}\n')
print('Confusion Matrix:')
print(cm)

Dataset loaded successfully with shape: (6000, 17)
Date column sample:
0    10/08/2020
1    15/08/2020
2    03/08/2020
3    27/07/2020
4    27/08/2020
Name: Dato, dtype: object
Date column type: object
Date column converted to: datetime64[ns]
After filtering to line 150: (110, 17)
Median passenger count: 2.0
High ridership distribution:
high_ridership
0    56
1    54
Name: count, dtype: int64
Training set size: 88
Test set size: 22

Accuracy: 0.364

Confusion Matrix:
[[ 8  0]
 [14  0]]
