In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import numpy as np 
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [10]:
# Column names for the UCI mushroom dataset: the target ("class") comes first,
# followed by the 22 categorical predictors.
columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']

# Loads the data from the UCI Machine Learning Repo into a pandas DataFrame.
# The raw file has no header row, so header=None is required: without it
# pandas consumes the first record as column labels and that record is lost.
# "?" marks missing values in this dataset; na_values turns it into NaN at
# parse time (no np.NaN, which no longer exists in NumPy >= 2.0, and no
# in-place mutation of the frame afterwards).
mushrooms = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data",
    header=None,
    names=columns,
    na_values="?",
)

In [11]:
# Work on a copy of the data that leaves out "stalk-root", the feature
# flagged as problematic (presumably because of its missing values -- confirm).
mushrooms_no_stalk_root = mushrooms.drop(columns="stalk-root")

# Predictors: every remaining column except the label. They are all
# categorical, so they are one-hot encoded and handed over as a plain array.
X = pd.get_dummies(mushrooms_no_stalk_root.drop(columns=["class"])).values

# Target: the binary "class" column, mapped to integer codes. The fitted
# encoder is kept in `le` so the codes can be mapped back to labels.
le = LabelEncoder()
y = le.fit_transform(mushrooms_no_stalk_root["class"].values)

# 70/30 train/test split with a fixed seed, stratified on the target so both
# parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

In [12]:
# Single-step pipeline; the step name 'clf' is what the 'clf__*' keys of the
# hyper-parameter grid refer to.
pipe = Pipeline(
    steps=[
        ('clf', KNeighborsClassifier()),
    ]
)

In [13]:
# Hyper-parameter grid for the 'clf' pipeline step. Each entry currently
# lists a single candidate: 5 neighbours and Minkowski power p=2.
param_grid = dict(
    clf__n_neighbors=[5],
    clf__p=[2],
)

In [14]:
# 10-fold stratified cross-validation, shuffled with a fixed seed so the
# folds are the same on every run.
cv_folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Grid search over `param_grid`, scored by accuracy, fitted on the training
# split only. fit() returns the search object itself, so the call is chained.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv_folds,
).fit(X_train, y_train)

# Report the best cross-validated score, the parameters that achieved it and
# the corresponding pipeline.
print(gs.best_score_, gs.best_params_, gs.best_estimator_)

1.0 {'clf__n_neighbors': 5, 'clf__p': 2} Pipeline(memory=None,
     steps=[('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))])


In [16]:
# Score of the best pipeline found by the grid search on the held-out test
# split (left as the last expression so the notebook displays it).
best_pipe = gs.best_estimator_
best_pipe.score(X_test, y_test)

1.0

## TODO: Use fewer features and/or dimensionality reduction, and make some plots?