In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the cleaned journeys written by the preprocessing step.
journeys = pd.read_csv('../data/train-drives-cleaned.csv', encoding='utf-8')

# Feature matrix: every column whose name starts with 'train_line',
# plus the planned departure hour and the day of the week.
is_train_line_column = journeys.columns.str.startswith('train_line')
is_time_column = journeys.columns.isin(['planned_departure_hour', 'day_of_week'])
journeys_filtered = journeys.loc[:, is_train_line_column | is_time_column]

# Target vector: the crowdedness class of each journey.
class_label_df = journeys['crowdedness']

# Hold out 20 % of the rows for testing; the fixed seed keeps the split reproducible.
(
    journeys_train,
    journeys_test,
    class_label_train,
    class_label_test,
) = train_test_split(
    journeys_filtered,
    class_label_df,
    test_size=0.2,
    random_state=123,
)

# Last expression of the cell: preview the selected feature columns.
journeys_filtered.head()


In [None]:
############################################ MANUAL CHECK REQUIRED !!! ############################################
import matplotlib.pyplot as plt
import seaborn as sns

# == Panel 1: share of each crowdedness class per planned departure hour ==

# Number of journeys for every (hour, crowdedness) combination.
journeys_visual = (
    journeys
    .groupby(['planned_departure_hour', 'crowdedness'])
    .size()
    .reset_index(name='count')
)

# Divide each count by the total of its hour so the bars of one hour sum to 1.
hourly_totals = journeys_visual.groupby('planned_departure_hour')['count'].transform('sum')
journeys_visual['normalized_count'] = journeys_visual['count'] / hourly_totals

fig = plt.figure(figsize=(7, 8))
ax_share = fig.add_subplot(2, 1, 1)
sns.barplot(
    data=journeys_visual,
    x='planned_departure_hour',
    y='normalized_count',
    hue='crowdedness',
    palette='viridis',
    ax=ax_share,
)
ax_share.set_title('Normalized crowdedness by planned departure hour')
ax_share.set_xlabel('Planned departure hour')
ax_share.set_ylabel('Normalized number of journeys')
ax_share.legend(title='Crowdedness', loc='upper left')
fig.tight_layout()


# == Panel 2: absolute number of journeys per planned departure hour ==
journeys_total = (
    journeys
    .groupby('planned_departure_hour')
    .size()
    .reset_index(name='total_count')
)
ax_total = fig.add_subplot(2, 1, 2)
sns.barplot(data=journeys_total, x='planned_departure_hour', y='total_count', ax=ax_total)
ax_total.set_title('Total number of train journeys by planned departure hour')
ax_total.set_xlabel('Planned departure hour')
ax_total.set_ylabel('Number of journeys')
fig.tight_layout()

plt.show()


In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report

## Create decision tree with gini index
decision_tree_model = DecisionTreeClassifier(criterion='gini', random_state=123)
decision_tree_model.fit(journeys_train, class_label_train)

# Display names for the classes, taken from the fitted model.
# `classes_` holds the labels in sorted order, which is the order that both
# `plot_tree(class_names=...)` and `classification_report(target_names=...)`
# expect. `Series.unique()` returns labels in order of first appearance, so
# using it here could attach the wrong name to a class.
class_labels = decision_tree_model.classes_.astype(str)

# Plot resulting tree (names are passed as plain lists, which plot_tree accepts)
plt.figure(figsize=(40, 20))
plot_tree(
    decision_tree_model,
    filled=True,
    feature_names=list(journeys_train.columns),
    class_names=list(class_labels),
    rounded=False,
)
plt.title('Decision tree classifier (gini index)')
plt.show()

# Evaluate model on the held-out test split
predictions = decision_tree_model.predict(journeys_test)
accuracy = accuracy_score(class_label_test, predictions)
print(f'Accuracy of gini index decision tree: {accuracy:.2f}')
# `labels` pins the report rows to the model's classes so that `target_names`
# always lines up, even if a class is missing from the test split.
print(classification_report(
    class_label_test,
    predictions,
    labels=decision_tree_model.classes_,
    target_names=class_labels,
    zero_division=0,
))


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored by the search (4 * 3 * 3 = 36 combinations)
param_grid = {
    'max_depth': [2, 3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Base estimator; the fixed seed keeps the fitted trees reproducible
clf = DecisionTreeClassifier(random_state=123)

# Exhaustive search over the grid, scored by accuracy
grid_search = GridSearchCV(estimator=clf,
                           param_grid=param_grid,
                           cv=5,               # 5-fold cross-validation
                           scoring='accuracy',
                           n_jobs=-1)          # Use all CPU cores

# Fit on the training split only; the test split is not touched here
grid_search.fit(journeys_train, class_label_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Plot the best tree found by the search.
# Class names come from the estimator's own `classes_` (sorted label order),
# which is the order `plot_tree` expects; a list built in order of first
# appearance could label the leaves with the wrong class.
best_tree = grid_search.best_estimator_
plt.figure(figsize=(40, 20))
plot_tree(
    best_tree,
    filled=True,
    feature_names=list(journeys_train.columns),
    class_names=list(best_tree.classes_.astype(str)),
    rounded=True,
)
plt.title('Decision Tree Classifier (Gini Index)')
plt.show()


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Scale the features (optional, but often beneficial for KNN)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# The scaler is fitted on the training split only and then applied to the test split.
journeys_train_scaled = scaler.fit_transform(journeys_train)
journeys_test_scaled = scaler.transform(journeys_test)


# The classes are imbalanced (see the plots above), so oversample with SMOTE.
# SMOTE is applied to the training set only; the test set keeps its original distribution.
smote = SMOTE(random_state=123, k_neighbors=3)
journeys_train_balanced, class_label_train_balanced = smote.fit_resample(journeys_train_scaled, class_label_train)

# Try different values of k and use cross-validation to find the best one.
# NOTE(review): the folds are drawn from the already-resampled data, so the
# validation folds contain synthetic samples and the scores are optimistic.
# They are only used to rank the candidate k values. Consider resampling
# inside each fold instead.
k_range = range(1, 21)
cv_scores = []

for k in k_range:
    candidate = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(candidate, journeys_train_balanced, class_label_train_balanced, cv=3, scoring='accuracy')
    cv_scores.append(scores.mean())

# On ties the first (smallest) k with the maximum mean score wins.
best_k = k_range[cv_scores.index(max(cv_scores))]
print(f'Best k found by cross-validation: {best_k}')

# Re-initialize KNN classifier with the best k.
# This used to be hard-coded to 3, which ignored the search result above.
knn_classifier = KNeighborsClassifier(n_neighbors=best_k)

# Fit the model
knn_classifier.fit(journeys_train_balanced, class_label_train_balanced)

# Predict on the test set
knn_predictions = knn_classifier.predict(journeys_test_scaled)

# Evaluate the KNN model
knn_accuracy = accuracy_score(class_label_test, knn_predictions)
print(f'Accuracy of KNN Classifier: {knn_accuracy:.3f}')
# Row labels and display names both come from the fitted model's sorted
# `classes_`, so each name is guaranteed to match its row. A name list built in
# order of first appearance could be misaligned. `zero_division=0` matches
# the decision-tree report.
print(classification_report(
    class_label_test,
    knn_predictions,
    labels=knn_classifier.classes_,
    target_names=knn_classifier.classes_.astype(str),
    zero_division=0,
))
