In [None]:
import pandas as pd 
import os 
import numpy as np 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline

In [2]:
from ucimlrepo import fetch_ucirepo


def load_data(dataset_id=848):
    """Load a UCI ML Repository dataset (secondary mushroom by default).

    Parameters
    ----------
    dataset_id : int, default 848
        Identifier handed to ``fetch_ucirepo``. The default, 848, is the id
        this notebook uses for the secondary mushroom dataset.

    Returns
    -------
    X : pandas.DataFrame
        Features.
    y : pandas.DataFrame
        Target(s), returned exactly as ``data.targets`` provides them. No
        conversion to a Series is performed; in this notebook the target is
        a single-column frame (shape ``(n_samples, 1)``).
    """
    # fetch dataset
    dataset = fetch_ucirepo(id=dataset_id)

    # data (as pandas dataframes)
    X = dataset.data.features
    y = dataset.data.targets  # kept as a DataFrame, not squeezed to a Series

    return X, y


In [3]:
# Fetch the features and target, then show the feature matrix dimensions.
X, y = load_data()
X.shape

(61069, 20)

# Plan
- How will I approach this project?
- I will start by splitting the data into training, validation, and test sets.
- Then I will explore the training data.
- Build a pipeline containing all the preprocessing steps.
- Based on my analysis, I will choose the best features and the best model.
- I will use MLflow to track my experiments.
- I will build an API around my model using FastAPI.
- I will dockerize my API.
- I will set up a CI/CD pipeline using GitHub Actions.

In [4]:
# Step 1: carve the data into training / validation / test sets (60% / 20% / 20%).
# First hold out 20% of all rows as the test set, stratified on the target.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Then take 25% of the remaining 80% as validation (0.25 x 0.8 = 0.2 of the total).
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=42,
    stratify=y_temp,
)
# Report the size of every split, one line per split.
print(
    f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}",
    f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}",
    f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}",
    sep="\n",
)


X_train shape: (36641, 20), y_train shape: (36641, 1)
X_val shape: (12214, 20), y_val shape: (12214, 1)
X_test shape: (12214, 20), y_test shape: (12214, 1)


Now we save these splits to disk so that the following steps can start working from them.

In [6]:
# Persist every split so later phases can reload them without re-fetching or
# re-splitting the data. to_csv does not create missing folders, so make sure
# the output directory exists before writing.
output_dir = '../data/preprocessed'
os.makedirs(output_dir, exist_ok=True)

splits_to_save = {
    'X_train': X_train,
    'y_train': y_train,
    'X_val': X_val,
    'y_val': y_val,
    'X_test': X_test,
    'y_test': y_test,
}
for split_name, split_frame in splits_to_save.items():
    # index=False: the original row labels are not written to the file.
    split_frame.to_csv(f'{output_dir}/{split_name}.csv', index=False)

# Phase 1: Data exploration and preprocessing

In [3]:
# Read the training features and target back from their CSV files.
X_train, y_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/preprocessed/X_train.csv",
        "../data/preprocessed/y_train.csv",
    )
)

In [4]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,6.52,x,i,n,f,e,c,p,4.43,12.75,,,w,,,t,l,,w,u
1,8.02,x,d,n,t,p,,y,7.03,15.75,,,y,,,f,f,,d,u
2,12.99,f,,n,t,,,w,12.03,18.19,b,,w,u,w,t,g,,d,u
3,9.0,x,e,o,f,a,,o,10.32,17.98,r,,o,,,t,r,n,d,a
4,11.46,f,,l,f,s,c,u,8.45,21.54,,,u,,,f,f,,l,w


In [7]:
# Summarise the training features: column names, non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36641 entries, 0 to 36640
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   cap-diameter          36641 non-null  float64
 1   cap-shape             36641 non-null  object 
 2   cap-surface           28225 non-null  object 
 3   cap-color             36641 non-null  object 
 4   does-bruise-or-bleed  36641 non-null  object 
 5   gill-attachment       30784 non-null  object 
 6   gill-spacing          21558 non-null  object 
 7   gill-color            36641 non-null  object 
 8   stem-height           36641 non-null  float64
 9   stem-width            36641 non-null  float64
 10  stem-root             5642 non-null   object 
 11  stem-surface          13759 non-null  object 
 12  stem-color            36641 non-null  object 
 13  veil-type             1937 non-null   object 
 14  veil-color            4501 non-null   object 
 15  has-ring           

To start our investigation, we split the data into numerical and categorical features and then perform univariate, bivariate, and multivariate analysis on it.

In [8]:
# Partition the training columns by dtype for the upcoming analysis.
# "number" matches every numeric dtype (not only int64/float64), and the
# categorical frame is defined as its complement, so each column lands in
# exactly one of the two groups whatever dtype pandas inferred on load.
numerical_features = X_train.select_dtypes(include="number")
categorical_features = X_train.select_dtypes(exclude="number")
categorical_features.shape, numerical_features.shape

((36641, 17), (36641, 3))