In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Load the cleaned journeys data set.
# NOTE(review): an earlier comment here said the first three lines of the file
# are invalid and must be skipped, but no `skiprows` is passed to read_csv --
# presumably the "cleaned" CSV no longer contains them; confirm.
journeys = pd.read_csv('../data/train-drives-cleaned.csv', encoding='utf-8')

# Feature matrix: every column whose name starts with 'train_line', plus the
# planned departure hour and the crowdedness column.
is_train_line_column = journeys.columns.str.startswith('train_line')
is_extra_feature_column = journeys.columns.isin(['planned_departure_hour', 'crowdedness'])
journeys_filtered = journeys.loc[:, is_train_line_column | is_extra_feature_column]

# Target: the 'ticket_checked' column.
class_label_df = journeys['ticket_checked']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
journeys_train, journeys_test, class_label_train, class_label_test = train_test_split(
    journeys_filtered,
    class_label_df,
    test_size=0.2,
    random_state=124,
)

# Preview the selected feature columns.
journeys_filtered.head()


In [None]:
############################################ MANUAL CHECK REQUIRED !!! ############################################

# Bar chart of how many training rows fall into each class; inspect it by eye
# to judge whether the classes are imbalanced.
train_label_counts = class_label_train.value_counts()
train_label_counts.plot(kind='bar', title='Class label distribution in training set')


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.under_sampling import RandomUnderSampler

# Display names for the classes, in ascending label order.
# `Series.unique()` returns values in order of first appearance, whereas
# `classification_report` pairs `target_names` with the labels in sorted order.
# Sorting before taking the unique values keeps names and report rows aligned.
class_labels = class_label_train.sort_values().unique().astype(str)

# The class distribution plot shows the training labels are imbalanced, so the
# training set is rebalanced before fitting. Only the training split is
# resampled; the test split is left untouched.
# Random undersampling is the strategy currently in use. Alternative tried
# previously: oversampling with SMOTE(random_state=123, k_neighbors=3).
undersampler = RandomUnderSampler(random_state=123)

# Resample the training features and labels together so they stay aligned.
journeys_train_balanced, class_label_train_balanced = undersampler.fit_resample(journeys_train, class_label_train)

# Re-plot the label counts to confirm the resampled training set is balanced.
class_label_train_balanced.value_counts().plot(kind='bar', title='Class label distribution in training set')



In [None]:
# Model selection: evaluate k = 1..20 with 3-fold cross-validation on the
# balanced training set and keep the mean accuracy for each k.
k_range = range(1, 21)
cv_scores = []

for k in k_range:
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn_classifier, journeys_train_balanced, class_label_train_balanced, cv=3, scoring='accuracy')
    cv_scores.append(scores.mean())

# `list.index` returns the first maximum, so ties resolve to the smallest k.
best_k = k_range[cv_scores.index(max(cv_scores))]
print(f'Best k found by cross-validation: {best_k}')

# Re-initialize the KNN classifier with the k selected above (previously this
# was hard-coded to 5, which silently discarded the cross-validation result).
knn_classifier = KNeighborsClassifier(n_neighbors=best_k)

# Fit the final model on the balanced training data
knn_classifier.fit(journeys_train_balanced, class_label_train_balanced)

# Predict on the untouched (not resampled) test set
knn_predictions = knn_classifier.predict(journeys_test)

# Evaluate the KNN model: overall accuracy plus per-class precision/recall/F1
knn_accuracy = accuracy_score(class_label_test, knn_predictions)
print(f'Accuracy of KNN Classifier: {knn_accuracy:.3f}')
print(classification_report(class_label_test, knn_predictions, target_names=class_labels))
