In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import numpy as np 
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# 1. 

In [2]:
# Loads the data from UCI Machine Learning Repo into a pandas DataFrame
mushrooms = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data")
columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']
# Names the columns 
mushrooms.columns = columns
# Replace "?" (represents missing values on UCI Machine Learning Repo) witn np.NaN
mushrooms.replace('?', np.NaN, inplace = True)

In [83]:
# Gets a dataset without the problematic feature
mushrooms_no_stalk_root = mushrooms.drop("stalk-root", axis = 1)

# Gets the predictor features 
X = mushrooms_no_stalk_root.drop(["class"], axis=1)
# And encodes them (they are all categorical)
X = pd.get_dummies(X).values
# Gets the target feature (binary)
y = mushrooms_no_stalk_root["class"].values

le = LabelEncoder()
# Encodes the target
y = le.fit_transform(y)

# Splits between training and testing, stratifying by the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify = y)

# Gets the indexes of the unique class labels
classes, indices = np.unique(y_train, return_index=True)
# Removes lables from y_train minus two unique ones
y_train = np.array([y_train[i] if i in indices else -1 for i in range(y_train.shape[0])])

In [84]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=2, random_state=2)   
y_train_pred = km.fit_predict(X_train)
dict_ = {}
#for i in range(y_train.shape[0]):
#    if y_train[i] != -1:
#        key = y_train_pred[i]
#        val = y_train[i]
#        dict_[key] = val
dict_[0] = 1
dict_[1] = 0
y_test_pred = km.predict(X_test)
y_test_pred = np.array([dict_[key] for key in y_test_pred])
from sklearn.metrics import precision_recall_fscore_support
print('Accuracy:', end=' ')
print(precision_recall_fscore_support(y_test_pred, y_test, average='micro')[0])

Accuracy: 0.9043906442347148


## Using only most significant features

In [85]:
# Gets the predictor features 
X = mushrooms[['odor','stalk-surface-above-ring','gill-color','gill-size',
                                  'spore-print-color','stalk-surface-below-ring','ring-type']]
# And encodes them (they are all categorical)
X = pd.get_dummies(X).values
# Gets the target feature (binary)
y = mushrooms["class"].values

le = LabelEncoder()
# Encodes the target
y = le.fit_transform(y)

# Splits between training and testing, stratifying by the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify = y)

# Gets the indexes of the unique class labels
classes, indices = np.unique(y_train, return_index=True)
# Removes lables from y_train minus two unique ones
y_train = np.array([y_train[i] if i in indices else -1 for i in range(y_train.shape[0])])

In [86]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=2, random_state=2)   
y_train_pred = km.fit_predict(X_train)
print(y_train[:5])
print(y_train_pred[:5])
dict_ = {}
#for i in range(y_train.shape[0]):
#    if y_train[i] != -1:
#        key = y_train_pred[i]
#        val = y_train[i]
#        dict_[key] = val
dict_[0] = 1
dict_[1] = 0
y_test_pred = km.predict(X_test)
y_test_pred = np.array([dict_[key] for key in y_test_pred])
from sklearn.metrics import precision_recall_fscore_support
print('Accuracy:', end=' ')
print(precision_recall_fscore_support(y_test_pred, y_test, average='micro')[0])

[ 0 -1 -1  1 -1]
[0 1 1 0 1]
Accuracy: 0.9006975789905621


# 4.

In [8]:
# Gets a dataset without the problematic feature
mushrooms_without_NaNs = mushrooms.dropna()
# Gets the predictor features 
X = mushrooms_without_NaNs.drop(["class"], axis=1)
# And encodes them (they are all categorical)
X = pd.get_dummies(X).values
# Gets the target feature (binary)
y = mushrooms_without_NaNs["class"].values
le = LabelEncoder()
# Encodes the target
y = le.fit_transform(y)

# Splits between training and testing, stratifying by the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify = y)

# Gets the indexes of the unique class labels
classes, indices = np.unique(y_train, return_index=True)
# Removes lables from y_train minus two unique ones
y_train = np.array([y_train[i] if i in indices else -1 for i in range(y_train.shape[0])])

In [12]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=2, random_state=2)   
y_train_pred = km.fit_predict(X_train)
dict_ = {}
#for i in range(y_train.shape[0]):
#    if y_train[i] != -1:
#        key = y_train_pred[i]
#        val = y_train[i]
#        dict_[key] = val
dict_[0] = 0
dict_[1] = 1
y_test_pred = km.predict(X_test)
y_test_pred = np.array([dict_[key] for key in y_test_pred])
from sklearn.metrics import precision_recall_fscore_support
print('Accuracy:', end=' ')
print(precision_recall_fscore_support(y_test_pred, y_test, average='micro')[0])

Accuracy: 0.8629651506202008


In [13]:
# Gets the predictor features 
X = mushrooms.drop(["class"], axis=1)
# And encodes them (they are all categorical)
X = pd.get_dummies(X).values
# Gets the target feature (binary)
y = mushrooms["class"].values
le = LabelEncoder()
# Encodes the target
y = le.fit_transform(y)

# Splits between training and testing, stratifying by the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify = y)

# Gets the indexes of the unique class labels
classes, indices = np.unique(y_train, return_index=True)
# Removes lables from y_train minus two unique ones
y_train = np.array([y_train[i] if i in indices else -1 for i in range(y_train.shape[0])])

In [14]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=2, random_state=2)   
y_train_pred = km.fit_predict(X_train)
dict_ = {}
#for i in range(y_train.shape[0]):
#    if y_train[i] != -1:
#        key = y_train_pred[i]
#        val = y_train[i]
#        dict_[key] = val
dict_[0] = 0
dict_[1] = 1
y_test_pred = km.predict(X_test)
y_test_pred = np.array([dict_[key] for key in y_test_pred])
from sklearn.metrics import precision_recall_fscore_support
print('Accuracy:', end=' ')
print(precision_recall_fscore_support(y_test_pred, y_test, average='micro')[0])

Accuracy: 0.9043906442347148
