# Machine Learning Fundamentals - Lecture 03

This is the Jupyter notebook for Lecture 03 of the Machine Learning Fundamentals
course.

## Part 1: Load and clean dataset

In [None]:
# Import the required libraries using the commonly use short names (pd, sns, ...)
import numpy as np
import pandas as pd
import seaborn as sns

# The Path object from pathlib allows us to easily build paths in an
# OS-independent fashion
from pathlib import Path

# Load the required scikit-learn classes and functions
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from scipy import stats

# Set a nicer style for Seaborn plots
sns.set_style("darkgrid")

## Part 1: load and clean the Pokémon dataset

Here we just repeat the steps already done in the previous lectures, but in a
more succint way.

In [110]:
# Load the dataset (note the use of the Path object)
df = pd.read_csv(Path("..", "datasets", "Pokemon.csv"))

# It's not good practice to have column names with spaces and other non-standard
# characters, so let's fix this by renaming the columns to standard names
df.rename(columns={
    "Type 1" : "Type1",
    "Type 2" : "Type2",
    "Sp. Atk" : "SpAtk",
    "Sp. Def" : "SpDef",
}, inplace=True)

# Replace missing values in the "Type2" column with the string "None"
df["Type2"] = df["Type2"].fillna("None")

# Since primary and secondary types are essentially categories (and not just
# strings / objects), we can convert these columns to the category type
df["Type1"] = df["Type1"].astype("category")
df["Type2"] = df["Type2"].astype("category")

Before we proceed to the interesting part, we'll perform our data scaling and
train/test data splitting.

In [111]:
# Let's use all features except the Total, which can be considered redundant
# since it's the total of the other features
features = ["HP", "Attack", "Defense", "SpAtk", "SpDef", "Speed"]

# Get only the specified features
df_X = df[features]

# Standardize them
ss = StandardScaler()
X = ss.fit_transform(df_X)

# Our labels will be the legendary status
y = df["Legendary"].to_numpy()

# Let's split our data into training (80%) and test (20%) sets
# Change the random_state parameter do split data in different ways
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [112]:
dists = euclidean_distances(X_test, X_train)
k = 5
idx_mink = np.argpartition(dists, k, axis=1)[:, :k]

labels_mink = y_train[idx_mink]

maj_labels = np.zeros(labels_mink.shape[0], dtype=np.bool)
for i, row in enumerate(labels_mink):
    values, counts = np.unique(row, return_counts=True)
    maj_labels[i] = values[np.argmax(counts)]

# modes = np.array(modes)

#neighs
#idx_mink #.shape
#y_train[idx_mink]
#maj_labels


## Part 2: Implement our own $k$-Nearest Neighbors classifier and regressor

We'll use our implementation to classify legendary and non-legendary Pokémons,
and use it as a regressor to predict the "Total" column.

We'll also compare our results with the respective classifier and regressor
available in `scikit-learn`.

In [113]:
def knn_predict(X_train, y_train, X_test, k=5, task="class"):

    dists = euclidean_distances(X_test, X_train)
    idx_mink = np.argpartition(dists, k, axis=1)[:, :k]

    if task == "class":
        labels_mink = y_train[idx_mink]
        maj_labels = np.zeros(labels_mink.shape[0], dtype=np.bool)
        for i, row in enumerate(labels_mink):
            values, counts = np.unique(row, return_counts=True)
            maj_labels[i] = values[np.argmax(counts)]
        return maj_labels
    else:  # regression
        return X_train[idx_mink].mean(axis=1)

In [114]:
y_pred = knn_predict(X_train, y_train, X_test)

In [115]:
accuracy_score(y_pred, y_test)

0.925

In [116]:
knnClf = KNeighborsClassifier(n_neighbors=5)
knnClf.fit(X_train, y_train)
knnClf.score(X_test, y_test)

0.925

In [117]:
y_total_raw = df["Total"].to_numpy().reshape((-1, 1))
y_total = ss.fit_transform(y_total_raw)

In [118]:
y_regr = knn_predict(X_train, y_total, X_test, task="regr")

In [None]:
knnRegr = KNeighborsRegressor(n_neighbors=5)
knnRegr.fit(X_train, y_total)
knnRegr.predict(y_total)

(160, 6)