In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Use matplotlib's built-in 'ggplot' style sheet for any figures drawn later.
plt.style.use('ggplot')

# Data Loading

In [3]:
# Read the dataset from 'mushrooms.csv', resolved relative to the notebook's
# working directory. Later cells treat the 'class' column as the target.
data = pd.read_csv('mushrooms.csv')

## Label Encoder

In [4]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every column of the frame. One encoder object is re-fitted
# on each column in turn, so once this cell has run it only remembers the
# classes of the last column it processed.
labelencoder = LabelEncoder()
data = data.apply(lambda column: labelencoder.fit_transform(column))

# Machine Learning

## Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Target is the 'class' column; the feature matrix is every other column.
y = data['class']
X = data.drop(columns=['class'])

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X=scaler.fit_transform(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

## Models

In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [59]:
# Two-step model: PCA for dimensionality reduction, then a random forest.
# The step names ('pca', 'randomforestclassifier') are the prefixes used by the
# hyper-parameter grid, so they must not change.
# random_state is pinned on both steps so that repeated runs of the grid search
# select the same model; without it the forest is re-seeded on every fit.
pipeline = Pipeline(steps=[
    ('pca', PCA(random_state=42)),
    ('randomforestclassifier', RandomForestClassifier(random_state=42)),
])

In [62]:
# Hyper-parameter grid for the pipeline. Each key is written as
# "<step name>__<parameter>", matching the 'pca' and 'randomforestclassifier'
# step names used when the pipeline was built.
parameters = dict(
    pca__n_components=[5, 10, 15, 20],
    randomforestclassifier__bootstrap=[True],
    randomforestclassifier__max_depth=[80, 90, 100, 110],
    randomforestclassifier__max_features=[2, 3],
    randomforestclassifier__min_samples_leaf=[3, 4, 5],
    randomforestclassifier__min_samples_split=[8, 10, 12],
    randomforestclassifier__n_estimators=[100, 200, 300, 1000],
)

In [63]:
# Exhaustive search over `parameters` with 5-fold cross-validation, ranking
# candidates by F1 and using every available core (n_jobs=-1).
grid = GridSearchCV(
    pipeline,
    parameters,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_no...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'pca__n_components': [5, 10, 15, 20], 'randomforestclassifier__bootstrap': [True], 'randomforestclassifier__max_depth': [80, 90, 100, 110], 'randomforestclassifier__max_features': [2, 3], 'randomforestclassifier__min_samples_leaf': [3, 4, 5], 'randomforestclassifier__min_samples_split': [8, 10, 12], 'randomforestclassifier__n_estimators': [100, 200, 300, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='war

## Best parameters

In [64]:
# The search was configured with scoring='f1', so both grid.best_score_ and
# grid.score(...) are F1 values. The labels say so instead of "accuracy".
print("Best cross-validation F1 score: {:.2f}".format(grid.best_score_))
print("Test set F1 score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))

Best cross-validation accuracy: 1.00
Test set score: 1.00
Best parameters: {'pca__n_components': 20, 'randomforestclassifier__bootstrap': True, 'randomforestclassifier__max_depth': 80, 'randomforestclassifier__max_features': 2, 'randomforestclassifier__min_samples_leaf': 3, 'randomforestclassifier__min_samples_split': 8, 'randomforestclassifier__n_estimators': 100}
