In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import mutual_info_classif
from ipywidgets import interact, Dropdown


## Task 1.
Randomly sample 10,000 rows out of the roughly 61,000 rows available.

## Implementation
- read the CSV into a data frame
- use the built-in `sample` function
    - 10,000 rows
    - fixed random seed, so the sample is reproducible
    - sample without replacement, so no row is selected more than once
- output the results


In [2]:
#generation of sample
DataPath = "MushroomDataset/Secondary_data.csv"
RandomSeed = 27  # placeholder for a random integer later; fixes which rows get drawn

# the source file uses semicolons as separators rather than commas
DataFrame = pd.read_csv(DataPath, sep=';')

# draw 10,000 rows without replacement, so no row appears twice in the sample
DataFrameSample = DataFrame.sample(
    n=10_000,
    replace=False,
    random_state=RandomSeed,
)

#output
print("Full dataset shape:", DataFrame.shape)
print("Sampled dataset shape:", DataFrameSample.shape)
DataFrameSample.head()

Full dataset shape: (61069, 21)
Sampled dataset shape: (10000, 21)


Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
58793,p,26.17,f,g,n,f,p,,y,7.77,...,,,n,,,f,f,,d,w
10787,p,2.16,b,,n,f,a,,g,5.54,...,,s,n,,,f,f,,d,a
60682,e,4.65,o,,g,f,f,f,f,5.12,...,,,n,,,f,f,,l,s
11922,e,1.09,f,g,e,t,a,,p,5.67,...,,,n,,,f,f,,d,a
37572,p,1.86,p,t,n,f,a,c,n,5.12,...,,t,y,,,t,e,,m,a


## Task 2
Exploratory data analysis (EDA)



In [3]:

plt.style.use('default')
sns.set_theme()

# basic info
DataFrameSample.info()
print(DataFrameSample.describe())

# class labels for plots
ClassMap = {'e': 'Edible', 'p': 'Poisonous'}
DataFrameSample['class_mapped'] = DataFrameSample['class'].map(ClassMap)

# Columns that carry the label. 'class_mapped' is only a relabelled copy of
# 'class', so both must be kept out of the feature set; otherwise the target
# leaks into the mutual-information ranking and dominates it.
TargetCols = ['class', 'class_mapped']

# missing values
MissingCounts = DataFrameSample.isna().sum().sort_values(ascending=False)
print(MissingCounts[MissingCounts > 0])

# column types (feature columns only, label columns excluded)
CatCols = [
    c for c in DataFrameSample.select_dtypes(include=['object', 'category']).columns
    if c not in TargetCols
]
NumCols = [
    c for c in DataFrameSample.select_dtypes(include=['number']).columns
    if c not in TargetCols
]
print("Categorical:", CatCols)
print("Numeric:", NumCols)

# mutual information
X = DataFrameSample.drop(columns=TargetCols)
Y = DataFrameSample['class']
try:
    OHE = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # fallback for scikit-learn versions that only accept the `sparse` keyword
    OHE = OneHotEncoder(handle_unknown='ignore', sparse=False)

XCat = OHE.fit_transform(X[CatCols]) if CatCols else np.empty((len(X), 0))
XNum = X[NumCols].to_numpy() if NumCols else np.empty((len(X), 0))
XEncoded = np.hstack([XCat, XNum])

# Per-column flag for the estimator: the one-hot indicator columns are
# discrete, the numeric columns are continuous measurements. Passing
# discrete_features=True for everything would treat each distinct float
# value as its own category and distort the numeric scores.
DiscreteMask = np.concatenate([
    np.ones(XCat.shape[1], dtype=bool),
    np.zeros(XNum.shape[1], dtype=bool),
])

MIScores = mutual_info_classif(XEncoded, Y, discrete_features=DiscreteMask, random_state=42)
FeatureNamesOHE = OHE.get_feature_names_out(CatCols) if CatCols else np.array([])
AllFeatureNames = list(FeatureNamesOHE) + NumCols
# note: categorical scores are per one-hot level (e.g. "<column>_<value>"), not per original column
MISeries = pd.Series(MIScores, index=AllFeatureNames).sort_values(ascending=False)
print(MISeries.head(20))

# build dropdown options
SelectedCats = ['odor','spore-print-color','gill-color','cap-color','does-bruise-or-bleed','habitat','season']
# keep only the requested columns that actually exist in this dataset
ColsToPlotCat = [c for c in SelectedCats if c in CatCols]
ColsToPlotNum = NumCols

PlotOptions = ['Class distribution', 'Missing values', 'Top MI features']
PlotOptions += [f'Categorical: {c}' for c in ColsToPlotCat]
PlotOptions += [f'Numeric: {c}' for c in ColsToPlotNum]

def ShowPlot(Choice):
    """Draw the EDA plot selected in the dropdown.

    Reads the module-level ``DataFrameSample``, ``MissingCounts`` and
    ``MISeries`` objects; nothing is returned, the figure is shown directly.

    Parameters
    ----------
    Choice : str
        One of the entries of ``PlotOptions``: 'Class distribution',
        'Missing values', 'Top MI features', 'Categorical: <column>' or
        'Numeric: <column>'. Any other value yields an empty figure.
    """
    # One explicit figure/axes pair that every branch draws onto. Passing
    # ax=Ax matters for DataFrame.plot, which otherwise opens its own new
    # figure and leaves the one created here blank.
    Fig, Ax = plt.subplots(figsize=(6, 4))

    if Choice == 'Class distribution':
        DataFrameSample['class_mapped'].value_counts().plot(kind='bar', ax=Ax)
        Ax.set_title('Class Distribution')
        Ax.set_xlabel('Class')
        Ax.set_ylabel('Count')

    elif Choice == 'Missing values':
        Missing = MissingCounts[MissingCounts > 0]
        if Missing.empty:
            # plotting an empty series raises, so show a message instead
            Ax.text(0.5, 0.5, 'No missing values', ha='center', va='center',
                    transform=Ax.transAxes)
            Ax.set_xticks([])
            Ax.set_yticks([])
        else:
            Missing.plot(kind='bar', ax=Ax)
            plt.setp(Ax.get_xticklabels(), rotation=45, ha='right')
        Ax.set_title('Missing Values')

    elif Choice == 'Top MI features':
        MISeries.head(15).plot(kind='bar', ax=Ax)
        Ax.set_title('Top MI Features')
        plt.setp(Ax.get_xticklabels(), rotation=45, ha='right')

    elif Choice.startswith('Categorical: '):
        Col = Choice.split(': ', 1)[1]
        # normalize='index' turns each row into class proportions summing to 1
        Tab = pd.crosstab(DataFrameSample[Col], DataFrameSample['class_mapped'], normalize='index')
        Tab.plot(kind='bar', stacked=True, ax=Ax)
        Ax.set_title(f'{Col} vs Class')
        Ax.set_ylabel('Proportion')
        plt.setp(Ax.get_xticklabels(), rotation=45, ha='right')

    elif Choice.startswith('Numeric: '):
        Col = Choice.split(': ', 1)[1]
        sns.boxplot(x='class_mapped', y=Col, data=DataFrameSample, ax=Ax)
        Ax.set_title(f'{Col} by Class')

    Fig.tight_layout()
    plt.show()

# Dropdown listing every available plot; interact() calls ShowPlot with the
# selected entry as its Choice argument.
DropdownWidget = Dropdown(options=PlotOptions, description='Plot:')
# The trailing ';' stops the notebook from echoing interact()'s return value
# (the ShowPlot function repr) underneath the widget.
interact(ShowPlot, Choice=DropdownWidget);


<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 58793 to 20396
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   class                 10000 non-null  object 
 1   cap-diameter          10000 non-null  float64
 2   cap-shape             10000 non-null  object 
 3   cap-surface           7754 non-null   object 
 4   cap-color             10000 non-null  object 
 5   does-bruise-or-bleed  10000 non-null  object 
 6   gill-attachment       8366 non-null   object 
 7   gill-spacing          5900 non-null   object 
 8   gill-color            10000 non-null  object 
 9   stem-height           10000 non-null  float64
 10  stem-width            10000 non-null  float64
 11  stem-root             1580 non-null   object 
 12  stem-surface          3812 non-null   object 
 13  stem-color            10000 non-null  object 
 14  veil-type             510 non-null    object 
 15  veil-color          



interactive(children=(Dropdown(description='Plot:', options=('Class distribution', 'Missing values', 'Top MI f…

<function __main__.ShowPlot(Choice)>

### Task 3.
Model Shortlisting based on EDA: 
Based on your findings from the above EDA task, shortlist
three classifiers from the classifiers you learnt in the lab classes. Explain your choice of
classifiers in terms of your findings from the above EDA task. [5 marks]

### Response
Suitability of each classifier we learnt in the lab classes:
- **Logistic regression**: not suitable, because the relationship between the features and the class does not appear to be linear
- **KNN**: 



