# Library

In [20]:
import numpy as np
import matplotlib as mpl
import pandas as pd
import scipy as sc
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, precision_score, recall_score, auc,roc_curve

from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC

import scipy.stats
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score

# Dataset

In [21]:
dataset = pd.read_csv('./mush.csv')
print("data set : ", dataset.shape)

data set :  (8416, 23)


# Data Cleaning

In [22]:
edible = dataset[dataset['edibility']=='EDIBLE'].shape[0]
poison = dataset[dataset['edibility']=='POISONOUS'].shape[0]
print("edible count: "+str(edible) + "(" + str(edible/dataset.shape[0] * 100) + "%)")
print("poison count: "+str(poison) + "(" + str(poison/dataset.shape[0] * 100) + "%)")

dataset = dataset.drop(['veil-type'], axis = 1)
dataset = dataset.replace("?", np.NaN)
dataset = dataset.dropna(axis=1)

# Binary Nominal Attributes
edibility_map_dict = {"EDIBLE":1,"POISONOUS":0}
bruises_map_dict = {"BRUISES":1,"NO":0}
gill_attachment_map_dict = {"FREE":0,"ATTACHED":1}
gill_size_map_dict = {'BROAD':0,'NARROW':1}
stalk_shape_map_dict = {'ENLARGING':0,'TAPERING':1}
# Ordinal Attributes
gill_spacing_map_dict = {'CLOSE':0,'CROWDED':1,'DISTANT':2}
ring_number_map_dict = {'NONE':0,'ONE':1,'TWO':2}
population_map_dict = {'ABUNDANT':6,'CLUSTERED':5,'NUMEROUS':4,'SCATTERED':3,'SEVERAL':2,'SOLITARY':1}

mapping_dicts = [edibility_map_dict, bruises_map_dict, gill_attachment_map_dict, gill_size_map_dict, stalk_shape_map_dict, gill_spacing_map_dict, ring_number_map_dict, population_map_dict]
mapping_columns = ['edibility','bruises?','gill-attachment','gill-size','stalk-shape','gill-spacing','ring-number','population']

for i in range(len(mapping_columns)):
    dataset[mapping_columns[i]] = dataset[mapping_columns[i]].map(mapping_dicts[i])
    
onehot_columns = ['cap-shape','cap-surface','cap-color','odor','gill-color','stalk-surface-above-ring','stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring','veil-color','ring-type','spore-print-color','habitat']
cleanDataset = pd.get_dummies(dataset, columns=onehot_columns)

cleanDataset.shape
cleanDataset.describe()

edible count: 4488(53.326996197718636%)
poison count: 3928(46.67300380228137%)


Unnamed: 0,edibility,bruises?,gill-attachment,gill-spacing,gill-size,stalk-shape,ring-number,population,cap-shape_BELL,cap-shape_CONICAL,...,spore-print-color_PURPLE,spore-print-color_WHITE,spore-print-color_YELLOW,habitat_GRASSES,habitat_LEAVES,habitat_MEADOWS,habitat_PATHS,habitat_URBAN,habitat_WASTE,habitat_WOODS
count,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,...,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0
mean,0.53327,0.401141,0.025665,0.189163,0.301331,0.577947,1.065589,2.423954,0.053707,0.000475,...,0.005703,0.288023,0.005703,0.285646,0.101711,0.034696,0.135932,0.043726,0.022814,0.375475
std,0.498922,0.490159,0.158144,0.391662,0.458863,0.493916,0.269635,1.314272,0.225452,0.021797,...,0.07531,0.452869,0.07531,0.451749,0.302286,0.183019,0.342736,0.204497,0.149318,0.484274
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,2.0,6.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [23]:
y = cleanDataset['edibility'].values
X = cleanDataset.iloc[:, 1:].values




X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify = y)


# Logreg

In [24]:
LogReg = LogisticRegression(solver='lbfgs', max_iter=1000) 
LogReg.fit(X_train, y_train)
y_pred=LogReg.predict(X_test)
train_acc = LogReg.score(X_train, y_train) 
print("The Accuracy for Training Set is {}".format(train_acc*100)) 
test_acc = accuracy_score(y_test, y_pred) 
print("The Accuracy for Test Set is {}".format(test_acc*100)) 
print("The Precision for Test Set is {}".format(precision_score(y_test, y_pred)*100))
print(classification_report(y_test, y_pred))

The Accuracy for Training Set is 99.96604990663724
The Accuracy for Test Set is 99.92079207920793
The Precision for Test Set is 99.85174203113417
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1178
           1       1.00      1.00      1.00      1347

    accuracy                           1.00      2525
   macro avg       1.00      1.00      1.00      2525
weighted avg       1.00      1.00      1.00      2525



# Cross Validation

In [25]:
scores = cross_val_score(LogReg, X, y, cv=10)

print('Cross-Validation scores: {}'.format(scores))
print('Average score: {}'.format(np.mean(scores)))

Cross-Validation scores: [0.87173397 0.98931116 1.         1.         1.         0.86817102
 1.         1.         0.71700357 1.        ]
Average score: 0.9446219719200929


# Hypertuning perameter

In [None]:
#grid search
C = np.logspace(0, 4, num=10)
penalty = ['l1', 'l2']
solver = ['liblinear', 'saga']
hyperparameters = dict(C=C, penalty=penalty, solver=solver)
logistic =LogisticRegression(max_iter=3000)
gridsearch = GridSearchCV(logistic, hyperparameters, cv=10)
best_model_grid = gridsearch.fit(X_train,y_train)
print(best_model_grid.best_params_)

In [None]:
LogReg2=LogisticRegression(C=1,penalty="l1",solver='liblinear')
LogReg2.fit(X_train,y_train)

scores = cross_val_score(LogReg2, X, y, cv=10)

print('Cross-Validation scores: {}'.format(scores))
print('Average score: {}'.format(np.mean(scores)))

In [None]:
#random search
C = np.logspace(0, 4, num=10)
penalty = ['l1', 'l2']
solver = ['liblinear', 'saga']
hyperparameters = dict(C=C, penalty=penalty, solver=solver)
logistic =LogisticRegression(max_iter=3000)

randomizedsearch = RandomizedSearchCV(logistic, hyperparameters)
best_model_random = randomizedsearch.fit(X_train,y_train)
print(best_model_random.best_estimator_)

In [None]:
LogReg3=LogisticRegression(C=2.7825594022071245, max_iter=3000, solver='liblinear')

LogReg3.fit(X_train,y_train)

scores = cross_val_score(LogReg3, X, y, cv=10)

print('Cross-Validation scores: {}'.format(scores))
print('Average score: {}'.format(np.mean(scores)))

In [None]:
LogReg4=LogisticRegression(C=1,penalty="l2",solver='liblinear')
LogReg4.fit(X_train,y_train)

scores = cross_val_score(LogReg4, X, y, cv=10)

print('Cross-Validation scores: {}'.format(scores))
print('Average score: {}'.format(np.mean(scores)))

https://medium.com/@jackstalfort/hyperparameter-tuning-using-grid-search-and-random-search-f8750a464b35