In [1]:
!python -V

Python 3.11.10


In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier

In [3]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("fungi-incognita-experiment")

<Experiment: artifact_location='/Users/tillmeineke/ML/ML_Zoomcamp2024_hw/fungiIncognita/notebooks/mlruns/1', creation_time=1733079958846, experiment_id='1', last_update_time=1733079958846, lifecycle_stage='active', name='fungi-incognita-experiment', tags={}>

In [4]:
# load the data
df = pd.read_csv("../data/secondary_data_generated_with_names.csv", sep=";")

# shuffle the data
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# drop columns with missing values
df_no_missing = df.dropna(axis=1, inplace=False)
df_no_missing = df_no_missing.drop(columns=["family"], inplace=False)
df_no_missing = df_no_missing.drop(columns=["class"], inplace=False)

In [5]:
df_full_train, df_test = train_test_split(
    df_no_missing, test_size=0.2, random_state=42, stratify=df_no_missing["name"]
)
df_train, df_val = train_test_split(
    df_full_train, test_size=0.25, random_state=42, stratify=df_full_train["name"]
)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.name.values
y_val = df_val.name.values
y_test = df_test.name.values

del df_train["name"]
del df_val["name"]
del df_test["name"]

In [40]:
cat_features = [
    #    "family",
    #    "class",
    "cap-shape",
    "cap-color",
#    "does-bruise-or-bleed",
    "gill-color",
    "stem-color",
#    "has-ring",
    "habitat",
    "season",
]

In [41]:
num_features = [
    "cap-diameter",
    "stem-height",
    "stem-width",
]

In [42]:
dv = DictVectorizer(sparse=False)

# Convert categorical features to dictionary format
train_dicts = df_train[cat_features + num_features].to_dict(orient="records")
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[cat_features + num_features].to_dict(orient="records")
X_val = dv.transform(val_dicts)


In [43]:
dv.get_feature_names_out()

array(['cap-color=b', 'cap-color=e', 'cap-color=g', 'cap-color=k',
       'cap-color=l', 'cap-color=n', 'cap-color=o', 'cap-color=p',
       'cap-color=r', 'cap-color=u', 'cap-color=w', 'cap-color=y',
       'cap-diameter', 'cap-shape=b', 'cap-shape=c', 'cap-shape=f',
       'cap-shape=o', 'cap-shape=p', 'cap-shape=s', 'cap-shape=x',
       'gill-color=b', 'gill-color=e', 'gill-color=f', 'gill-color=g',
       'gill-color=k', 'gill-color=n', 'gill-color=o', 'gill-color=p',
       'gill-color=r', 'gill-color=u', 'gill-color=w', 'gill-color=y',
       'habitat=d', 'habitat=g', 'habitat=h', 'habitat=l', 'habitat=m',
       'habitat=p', 'habitat=u', 'habitat=w', 'season=a', 'season=s',
       'season=u', 'season=w', 'stem-color=b', 'stem-color=e',
       'stem-color=f', 'stem-color=g', 'stem-color=k', 'stem-color=l',
       'stem-color=n', 'stem-color=o', 'stem-color=p', 'stem-color=r',
       'stem-color=u', 'stem-color=w', 'stem-color=y', 'stem-height',
       'stem-width'], dtype=object

In [44]:
with mlflow.start_run():
    dt_model = DecisionTreeClassifier()
    dt_model.fit(X_train, y_train)

    y_pred = dt_model.predict(X_val)

    accuracy = (y_pred == y_val).mean()

In [45]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

ovr_clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=42)).fit(X_train, y_train)
predictions = ovr_clf.predict(X_val)
pred_probas = ovr_clf.predict_proba(X_val)
score = ovr_clf.score(X_val, y_val)

print(f"Score: {score}")

Score: 0.9435074504666776


  Y /= np.sum(Y, axis=1)[:, np.newaxis]


In [46]:
report = classification_report(y_val, predictions)
print(report)

                          precision    recall  f1-score   support

       Amethyst Deceiver       1.00      0.96      0.98        70
      Aniseed Funnel Cap       1.00      1.00      1.00        71
          Apricot Fungus       1.00      0.99      0.99        71
    Bare-toothed Russula       0.88      0.90      0.89        71
              Bay Bolete       1.00      0.99      0.99        71
      Beechwood Sickener       1.00      0.99      0.99        70
           Birch Russula       1.00      1.00      1.00        71
           Bitter Bolete       1.00      0.89      0.94        70
      Blackening Russula       1.00      0.90      0.95        70
      Blackening Wax Cap       1.00      0.92      0.96        71
 Blackish Purple Russula       1.00      0.97      0.99        70
 Bleeding Brown Mushroom       1.00      0.99      0.99        71
   Blood-red Cortinarius       1.00      1.00      1.00        71
           Blue Leptonia       1.00      0.99      0.99        70
         

In [28]:
# Check data type
print("y_val dtype:", y_val.dtype)

y_val dtype: object


In [47]:
# Get unique classes
classes = np.unique(y_test)
print("Unique classes:", len(classes))

# Convert labels to numeric using mapping
label_map = {label: idx for idx, label in enumerate(classes)}
y_test_numeric = np.array([label_map[label] for label in y_test])

# Check for NaN values and handle them
if np.isnan(y_test_numeric).sum() > 0 or np.isnan(pred_probas).sum() > 0:
    print("NaN values found in y_test_numeric or pred_probas. Handling NaNs...")
    # Remove NaN values
    valid_indices = ~np.isnan(y_test_numeric) & ~np.isnan(pred_probas).any(axis=1)
    y_test_numeric = y_test_numeric[valid_indices]
    pred_probas = pred_probas[valid_indices]

# Ensure the lengths of y_test_numeric and pred_probas are consistent
min_length = min(len(y_test_numeric), len(pred_probas))
y_test_numeric = y_test_numeric[:min_length]
pred_probas = pred_probas[:min_length]

# Calculate ROC AUC score
auc = roc_auc_score(y_test_numeric, pred_probas, multi_class="ovr", average="macro")

print(f"ROC AUC Score: {auc:.4f}")

Unique classes: 173
NaN values found in y_test_numeric or pred_probas. Handling NaNs...
ROC AUC Score: 0.5003


In [27]:
# Check y_val for NaNs
print("NaNs in y_val:", np.isnan(y_val).sum())

# Check pred_probas for NaNs
print("NaNs in pred_probas:", np.isnan(pred_probas).sum())


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [24]:
auc = roc_auc_score(y_val, pred_probas, multi_class='ovr')
auc

ValueError: Input contains NaN.