<a href="https://www.kaggle.com/code/theanjalirai/binary-classification-mushroom-dataset?scriptVersionId=195632402" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Binary classification using logistic regression

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("/kaggle/input/mushroom-dataset/mushroom_cleaned.csv")

In [3]:
df.head()

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
0,1372,2,2,10,3.807467,1545,11,1.804273,1
1,1461,2,2,10,3.807467,1557,11,1.804273,1
2,1371,2,2,10,3.612496,1566,11,1.804273,1
3,1261,6,2,10,3.787572,1566,11,1.804273,1
4,1305,6,2,10,3.711971,1464,11,0.943195,1


In [4]:
df.describe()

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
count,54035.0,54035.0,54035.0,54035.0,54035.0,54035.0,54035.0,54035.0,54035.0
mean,567.257204,4.000315,2.142056,7.329509,0.75911,1051.081299,8.418062,0.952163,0.549181
std,359.883763,2.160505,2.228821,3.200266,0.650969,782.056076,3.262078,0.305594,0.49758
min,0.0,0.0,0.0,0.0,0.000426,0.0,0.0,0.027372,0.0
25%,289.0,2.0,0.0,5.0,0.270997,421.0,6.0,0.88845,0.0
50%,525.0,5.0,1.0,8.0,0.593295,923.0,11.0,0.943195,1.0
75%,781.0,6.0,4.0,10.0,1.054858,1523.0,11.0,0.943195,1.0
max,1891.0,6.0,6.0,11.0,3.83532,3569.0,12.0,1.804273,1.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54035 entries, 0 to 54034
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   cap-diameter     54035 non-null  int64  
 1   cap-shape        54035 non-null  int64  
 2   gill-attachment  54035 non-null  int64  
 3   gill-color       54035 non-null  int64  
 4   stem-height      54035 non-null  float64
 5   stem-width       54035 non-null  int64  
 6   stem-color       54035 non-null  int64  
 7   season           54035 non-null  float64
 8   class            54035 non-null  int64  
dtypes: float64(2), int64(7)
memory usage: 3.7 MB


In [6]:
df.isnull().sum()

cap-diameter       0
cap-shape          0
gill-attachment    0
gill-color         0
stem-height        0
stem-width         0
stem-color         0
season             0
class              0
dtype: int64

In [7]:
y = df['class']

In [8]:
X = df.iloc[:,:-1]

In [9]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size = 0.33,random_state = 42)

In [10]:
from sklearn.linear_model import LogisticRegression
classifier  = LogisticRegression()

In [11]:
classifier.fit(X_train , y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Predicted probabilities for test dataset

In [12]:
classifier.predict_proba(X_test)

array([[0.66982474, 0.33017526],
       [0.48099228, 0.51900772],
       [0.67653708, 0.32346292],
       ...,
       [0.50714875, 0.49285125],
       [0.45598153, 0.54401847],
       [0.45576908, 0.54423092]])

# Prediction

In [13]:
y_pred = classifier.predict(X_test)
y_pred

array([0, 1, 0, ..., 0, 1, 1])

# Confusion matrix, accuracy, classification report

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
print(confusion_matrix(y_pred , y_test))
print(accuracy_score(y_pred,y_test))
print(classification_report(y_pred,y_test))

[[4496 3169]
 [3569 6598]]
0.6221399730820996
              precision    recall  f1-score   support

           0       0.56      0.59      0.57      7665
           1       0.68      0.65      0.66     10167

    accuracy                           0.62     17832
   macro avg       0.62      0.62      0.62     17832
weighted avg       0.62      0.62      0.62     17832



# Hyperparameter Tuning - GridSearchCV

In [15]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [16]:
parameters = {'penalty' : ['l1','l2','elasticnet' ,'None'], 'solver' : ['lbfgs' ,'liblinear','saga','sag'] ,'C' : [1,10,20]}
             

In [17]:
clf = GridSearchCV(classifier , param_grid = parameters,cv = 5)

In [18]:
clf.fit(X_train,y_train)

In [19]:
clf.best_params_

{'C': 20, 'penalty': 'l1', 'solver': 'liblinear'}

In [20]:
classifier2 = LogisticRegression(C = 20,penalty = 'l1' , solver = 'liblinear')
classifier2.fit(X_train,y_train)

In [21]:
y_pred2 = classifier2.predict(X_test)

In [22]:
print(confusion_matrix(y_pred2 , y_test))
print(accuracy_score(y_pred2,y_test))
print(classification_report(y_pred2,y_test))

[[4316 2786]
 [3749 6981]]
0.6335240017945267
              precision    recall  f1-score   support

           0       0.54      0.61      0.57      7102
           1       0.71      0.65      0.68     10730

    accuracy                           0.63     17832
   macro avg       0.62      0.63      0.63     17832
weighted avg       0.64      0.63      0.64     17832



*See you soon...*