In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# print the full path of every file found below the Kaggle input directory
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt

# Requirements
1. Load the data and check its correctness
2. Explore the basic parameters: how many data points do we have? What are the targets and what is their distribution? Any kind of exploratory data analysis is welcome
3. Identify the problem: is it regression? classification?
4. Identify metric you're going to use
5. Design and run the experiment: train and validate your model
6. Compare your results with some kind of baseline (simplest possible solution to the problem)
7. (Optional) estimate feature importances and select the most important features


# Task 1 ✅
Load the data and check its correctness


In [None]:
# read the mushroom dataset from the Kaggle input directory into a DataFrame
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/mushroom-classification/mushrooms.csv",
)

In [None]:
# preview the first five rows to sanity-check the loaded content
df.head(n=5)

=> All features are categorical. 

# Task 2 ✅
Explore the basic parameters: how many data points do we have? What are the targets and what is their distribution? Any kind of exploratory data analysis is welcome

In [None]:
# number of rows (data points) in the dataset
df.shape[0]

=> There are 8124 datapoints.

In [None]:
# describe data: simple per-column summary statistics
# (for categorical/object columns pandas reports count, unique, top and freq)
df.describe()

In [None]:
# data type of every column
# (presumably all object/string, since every column is treated as categorical — confirm in the output)
df.dtypes

In [None]:
# distributions of features and target (categorical)
categorical_features = df.columns # all columns are categorical
# one bar chart per column, stacked vertically with 5 inches of height each
fig, ax = plt.subplots(len(categorical_features), 1, figsize=(6,len(categorical_features)*5))
for i, categorical_feature in enumerate(categorical_features):
    # value_counts() gives the frequency of each category, most frequent first
    df[categorical_feature].value_counts().plot(kind="bar", ax=ax[i]).set_title(categorical_feature)
# plt.show() instead of fig.show(): notebooks render through a non-GUI inline
# backend, where Figure.show() only emits a "cannot show the figure" warning
plt.show()

=> The target class can be "e" or "p". Both "e" and "p" seem to be about equally likely.

In [None]:
# correlation matrix of the integer-coded columns
# (.corr() defaults to Pearson with min_periods=1)
# NOTE(review): factorize assigns codes in order of appearance, so for nominal
# categories the coefficients are only a rough indication — confirm before relying on them
df.apply(lambda column: pd.factorize(column)[0]).corr()

# Task 3 ✅
Identify the problem: is it regression? classification?

In [None]:
# how many rows carry each value of the target column
df.loc[:, "class"].value_counts()

=> The target is the `class` column. It is categorical (two values), therefore this is a (binary) classification task.

# Task 4 ✅
Identify metric you're going to use

We will use the F1 score, since it considers both precision and recall.

Comparison of classification metrics:
https://towardsdatascience.com/the-5-classification-evaluation-metrics-you-must-know-aa97784ff226

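We can alter the β parameter to weight precision and recall differently, because wrongly predicting a poisonous ("p") mushroom as edible ("e") is worse than predicting an edible ("e") mushroom as poisonous ("p"). Note that with "p" encoded as the positive class, a poisonous mushroom predicted as edible is a false negative, which lowers recall (not precision); this goal therefore calls for valuing recall over precision, i.e. β > 1.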

"The beta parameter determines the weight of recall in the combined score. beta < 1 lends more weight to precision, while beta > 1 favors recall (beta -> 0 considers only precision, beta -> +inf only recall)." - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.fbeta_score.html

In [None]:
# beta > 1 weights recall above precision. Poisonous ("p") is encoded as the
# positive class, so a poisonous mushroom predicted as edible is a false
# negative, which lowers recall. That is the costly mistake, hence recall is
# given twice the importance of precision.
beta = 2.0

In [None]:
from sklearn.metrics import fbeta_score

# fbeta_score example on a toy problem: 4 positives, of which 2 are predicted
# correctly and 2 are missed (no false positives)

y_true = [0, 1, 1, 0, 1, 1]
y_pred = [0, 0, 1, 0, 0, 1]

# use the configured beta so the example shows the metric that is actually applied later
fbeta_score(y_true, y_pred, beta=beta)

# Task 5 ✅
Design and run the experiment: train and validate your model

## 5.1 Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# encode target: poisonous ('p') is the positive class 1, edible ('e') is class 0
y = df["class"].map(dict(p=1, e=0))
y

In [None]:
# encode features: every column except the target is turned into integer
# labels, with a fresh encoder fitted per column
X = df.drop("class", axis=1).apply(lambda column: LabelEncoder().fit_transform(column))
X.head()

## 5.2 Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# report the shape of each partition
for split_name, split in [
    ("X_train", X_train),
    ("X_test", X_test),
    ("Y_train", Y_train),
    ("Y_test", Y_test),
]:
    print(f"{split_name}: ", split.shape)

## 5.3 Train Model

We choose XGBoost because this is a classification task with only categorical features, and gradient-boosted trees typically give a good indication of what is achievable on this type of tabular dataset.

In [None]:
from xgboost import XGBClassifier

In [None]:
# eval_metric is configured on the estimator: passing it to fit() has been
# deprecated since XGBoost 1.6 and is rejected by newer releases
model = XGBClassifier(eval_metric="auc")
# eval_set is only monitored here (no early stopping is configured), so the
# test rows do not influence the fitted trees
model.fit(X_train, Y_train, eval_set=[(X_test, Y_test)], verbose=False) # TODO use fbeta_score for eval_metric

## 5.4 Evaluate

In [None]:
# score on the held-out split only: X also contains the rows the model was
# trained on, which would make the result overly optimistic
pred = model.predict(X_test)
# use the configured beta rather than a hard-coded copy of its value
fbeta_score(Y_test, pred, beta=beta)

=> We are able to achieve perfect classification 🎉.
Even with AUC rather than fbeta_score as the evaluation metric during training.

# Task 6 ✅
Compare your results with some kind of baseline (simplest possible solution to the problem)

We choose the baseline of predicting all mushrooms as poisonous (class=1).

In [None]:
# baseline: flag every mushroom as poisonous (class=1)
pred_baseline = np.full(len(y), 1.0)
fbeta_score(y_true=y, y_pred=pred_baseline, beta=beta)

The baseline of predicting all mushrooms as edible (class=0) leads to the worst possible score (0), because not a single poisonous mushroom is detected.

In [None]:
# baseline: predict every mushroom as edible (class=0)
pred_baseline = np.zeros(y.shape) # class=0
# with no positive predictions precision is undefined (0/0); zero_division=0
# scores it as 0 explicitly instead of relying on the warning fallback
fbeta_score(y, pred_baseline, beta=beta, zero_division=0)

# Task 7 ✅
(Optional) estimate feature importances and select the most important features

XGBoost allows us to display feature importance for the trained model. More details: https://machinelearningmastery.com/feature-importance-and-feature-selection-with-xgboost-in-python/

In [None]:
from xgboost import plot_importance

In [None]:
# bar chart of per-feature importance for the fitted XGBoost model
# NOTE(review): relies on plot_importance's default importance type —
# presumably split counts ("weight"); confirm against the xgboost docs
plot_importance(model)

=> The three most important features according to the trained XGBoost model are:
1. spore-print-color
2. odor
3. gill-size