In [1]:
# Load libraries and dataset
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set(style='ticks', color_codes=True)

from os.path import isfile

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Load the dataset
data_path = './datasets/Mushroom_Dataset.csv'

df = pd.DataFrame()

if isfile(data_path):
    df = pd.read_csv(data_path)
else:
    print("Dataset not found. Please check that the dataset exists and the path is correct.")
    
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [2]:
# Check data

# Duplicates? (Nope)
df['duplicate'] = df.duplicated()

if len(df[df['duplicate'] == True]) > 0:
    print(df[df['duplicate'] == True])
    
    dups = df[df['duplicate'] == True].index
    df.drop(dups, inplace=True)
    
df.drop(columns='duplicate', inplace=True)

# Nulls? (Nope)
df['null'] = df.isnull().any()

if len(df[df['null'] == True]) > 0:
    print(df[df['null'] == True])
    
    nu = df[df['null'] == True].index
    
df.drop(columns='null', inplace=True)

# The stalk root has a '?' class, but it's huge so I don't want to replace it with the mode.
# Might drop later to see if that helps the models.
# (Later) Default parameters on the models get close to 100% w/ the missing data, so . . . I'm going to leave it.


In [3]:
# Discretize the data

# Taken from module 3 notebook
def encode_onehot(_df, f):
    _df2 = pd.get_dummies(_df[f], prefix='', prefix_sep='').max(level=0, axis=1).add_prefix(f+' - ')
    df3 = pd.concat([_df, _df2], axis=1)
    df3 = df3.drop([f], axis=1)
    return df3

# all data is nominal and none of it is derived, so let's onehot encode everything except 
# for mushroom class (dep. var.)
df_o = df.copy()
for col in df.columns:
    # don't descretize our dependent variable
    if col == 'class':
        continue
    df_o = encode_onehot(df_o, col)
    
df_o.head()

Unnamed: 0,class,cap-shape - b,cap-shape - c,cap-shape - f,cap-shape - k,cap-shape - s,cap-shape - x,cap-surface - f,cap-surface - g,cap-surface - s,...,population - s,population - v,population - y,habitat - d,habitat - g,habitat - l,habitat - m,habitat - p,habitat - u,habitat - w
0,p,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,1,0
1,e,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,e,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,p,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,e,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [6]:
# Get models and data ready

# Divide data up
x = df_o.loc[:, df_o.columns != 'class'].values
y = df_o.loc[:, df_o.columns == 'class'].values.ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=None)

# Adapted from module 3 notebook
def train_test(classifier, _x_tr, _x_ts, _y_tr, _y_ts):
    # Train on training data
    model = classifier.fit(_x_tr, _y_tr)
    # Test on training data
    y_pred = model.predict(_x_ts)
    # Return accuracy
    return accuracy_score(_y_ts, y_pred)

# Decision Tree Classifiers
dts = [DecisionTreeClassifier()]
for md in range(2, 6):
    for min_samp_leaf in range(100, 600, 100):
        dts.append(DecisionTreeClassifier(max_depth=md, min_samples_leaf=min_samp_leaf))
                   

# Random Forest Classifiers
rfs = [RandomForestClassifier()]
for est in range(100, 600, 100):
    rfs.append(RandomForestClassifier(n_estimators=est, max_depth=2))

# RBF SVM Classifiers
rbfs = [SVC()]
for g in [x for x in range(1, 21)]:
    rbfs.append(SVC(gamma=g))

In [None]:
# Decision Tree Classifiers
print("Decision Tree Classifiers:")
for dt in dts:
    print(train_test(dt, x_train, x_test, y_train, y_test))

# Random Forest Classifiers
print("Random Forest Classifiers:")
for rf in rfs:
    print(train_test(rf, x_train, x_test, y_train, y_test))

# RBF SVM Classifiers
print("RBF SVMs")
for rbf in rbfs:
    print(train_test(rbf, x_train, x_test, y_train, y_test))


Decision Tree Classifiers:
1.0
0.9507692307692308
0.9507692307692308
0.9507692307692308
0.9507692307692308
0.904
0.9686153846153847
0.9507692307692308
0.9507692307692308
0.9507692307692308
0.924923076923077
0.9686153846153847
0.9513846153846154
0.9507692307692308
0.9507692307692308
0.924923076923077
0.9686153846153847
0.9513846153846154
0.9507692307692308
0.9507692307692308
0.924923076923077
Random Forest Classifiers:
1.0
0.9052307692307693
0.896
0.9452307692307692
0.9070769230769231
0.9353846153846154
RBF SVMs
1.0
1.0
1.0
0.5150769230769231
0.5150769230769231
0.5150769230769231
0.5150769230769231
0.5150769230769231
0.5150769230769231
0.5150769230769231
0.5150769230769231
0.5150769230769231
0.5150769230769231


In [None]:
# Display OP
def annot(opi, x, y):
    plt.annotate(f"OP{opi}", xy=(x, y), xytext=(.90*x+.1, .80*y), arrowprops=dict(facecolor='lightgray', shrink=1))

# Parameter to vary for Logistic Regression
C = (2e-1, 0.5, 0.8, 1, 2, 5, 1e1, 2e1, 1e2)

# Let's vary C and generate training/testing sessions to collect data for ROC
FPR, TPR = [], []
for c in C:
    pipe_lr = make_pipeline(StandardScaler(),
                            LogisticRegression(random_state=14,
                                               penalty='l1',
                                               solver='liblinear',
                                               class_weight='balanced',
                                               C=c,
                                               multi_class='auto',
                                               max_iter=10000))
    pipe_lr.fit(x_train, y_train)
    y_pred = pipe_lr.predict(x_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    TPR += [tp/(tp+fn)]  # Pd
    FPR += [fp/(fp+tn)]  # Pf
    #
    print(f'Test Accuracy= {pipe_lr.score(x_test, y_test):.3f}, C={c:13.5f}, TPR {TPR[-1]:.3f}, FPR {FPR[-1]:.3f}')

In [None]:
# Sorts the points to display nicely on ROC
FPR, TPR = zip(*sorted(zip(FPR, TPR)))
fpr = [0.]+list(FPR)+[1.]; tpr = [0.]+list(TPR)+[1.]

# Plot
fig, ax = plt.subplots()
plt.plot(fpr, tpr, ':', label='ROC')
plt.scatter(FPR, TPR, 50, color='red', marker='o', label='operating points')
plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6), label='coin flip')

# Annotate certain operating points
annot(1, fpr[1], tpr[1])
annot(2, fpr[4], tpr[4])
annot(3, fpr[8], tpr[8])
annot(4, fpr[9], tpr[9])

# Labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid()
plt.show()