# Import data and libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as preproc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef
from sklearn.model_selection import GridSearchCV

# Constants and settings
# Seed passed as `random_state` to train_test_split so the train/test split is reproducible.
rand_seed = 35

In [None]:
# Column names for the header-less CSV, in file order: ten terrain/distance
# measurements, four Wilderness_Area_* columns, forty Soil_Type_* columns,
# and finally the Cover_Type target.
measurement_cols = [
    'Elevation', 'Aspect', 'Slope', 'Horizontal_To_Hydrology',
    'Vertical_To_Hydrology', 'Horizontal_To_Roadways',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_To_Fire',
]
wilderness_cols = [f'Wilderness_Area_{i}' for i in range(4)]
soil_cols = [f'Soil_Type_{i}' for i in range(40)]
tree_cols = [*measurement_cols, *wilderness_cols, *soil_cols, 'Cover_Type']

# Read the raw file, then attach the names. Assigning `.columns` raises if the
# file does not have exactly len(tree_cols) columns.
tree_data = pd.read_csv('../data/covtype.data', header=None)
tree_data.columns = tree_cols

# Quick sanity check: total number of missing cells across the whole frame.
nan_total = tree_data.isna().sum().sum()
print("Nan values in dataset:", nan_total)

# Data exploration

In [None]:
def plot_histogram(y, label=None):
    """Plot a density-normalised histogram of a discrete-valued pandas Series.

    One bin is created per distinct value in ``y``. Bin edges sit at
    ``value - 0.5`` for every distinct value plus ``max + 0.5``, so each bar
    is centred on its value when the values are consecutive integers.

    Parameters
    ----------
    y : pandas.Series
        Values to plot. Missing values are dropped before the bin edges are
        computed.
    label : str, optional
        Name used for the x-axis label and in the title. Defaults to
        ``'Value'`` when omitted or empty.

    Raises
    ------
    ValueError
        If ``y`` contains no non-missing values (there is nothing to bin).
    """
    # NaN cannot be ordered by sorted() and would yield NaN bin edges, so
    # exclude it before deriving the bins.
    bin_centers = sorted(y.dropna().unique())
    if not bin_centers:
        raise ValueError('plot_histogram requires at least one non-missing value')

    # Imported only once there is something to draw.
    import matplotlib.pyplot as plt

    edges = [x - 0.5 for x in bin_centers] + [bin_centers[-1] + 0.5]

    # Use a single fallback name for both the axis label and the title so an
    # unlabeled plot is titled "Histogram of Value", not "Histogram of Histogram".
    name = label if label else 'Value'

    # Draw on a dedicated figure/axes rather than whatever figure is current.
    fig, ax = plt.subplots()
    y.hist(bins=edges, align='mid', density=True, rwidth=0.8, ax=ax)
    ax.set_xlabel(name)
    ax.set_ylabel('Density')
    ax.set_title('Histogram of ' + name)
    plt.show()

In [None]:
# Separate the predictor columns from the Cover_Type target.
columns = tree_data.columns.drop('Cover_Type').tolist()
X_tree, y_tree = tree_data[columns], tree_data['Cover_Type']

# Inspect how the samples are distributed across the target classes.
plot_histogram(y_tree, label='Cover Type')

The label distribution is very uneven: most samples belong to cover types 1 and 2.
Stratified sampling is required to ensure that labels 3-7 are properly represented in training.

In [None]:
# Standardise every feature column of X_tree.
# Note: the scaler's statistics are learned from all rows of X_tree.
scaler = preproc.StandardScaler()
scaler.fit(X_tree)
X_scaled = scaler.transform(X_tree)

## Split training/testing data

In [None]:
# Hold out a stratified 20% test set. The split is done on the *unscaled*
# features so that the scaler fitted below never sees any test row.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    X_tree, y_tree, test_size=0.2, stratify=y_tree, random_state=rand_seed
)

# Learn the standardisation (mean/std) from the training rows only, then apply
# that same transform to both splits. Fitting on the full dataset would leak
# test-set statistics into preprocessing and bias the evaluation.
scaler = preproc.StandardScaler().fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Confirm the training labels keep the class proportions of the full dataset.
plot_histogram(y_train, label='Train Cover Type')

In [None]:
# Tune the number of neighbours for k-NN with 5-fold cross-validation on the
# training split, scoring each candidate by accuracy and using all CPU cores.
neighbor_grid = {'n_neighbors': [1, 3, 5, 7, 9]}
knn_cv = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=neighbor_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
knn_cv.fit(x_train, y_train)

# Report the winning setting and its mean cross-validated score.
print("Best K:", knn_cv.best_params_)
print("Best cross-validation score:", knn_cv.best_score_)