# Weka machine learning toolkit

* [Download Weka](https://www.cs.waikato.ac.nz/~ml/weka/)
* [Data mining with Weka video series](https://www.youtube.com/user/WekaMOOC)

# Exercise 6

For this exercise you can use either Python with sklearn or Weka.

* Using the UCI mushroom dataset from the last exercise, perform a feature selection using a classifier evaluator. Which features are most discriminative?
* Use principal components analysis to construct a reduced space. Which combination of features explains the most variance in the dataset?
* Do you see any overlap between the PCA features and those obtained from feature selection?

In [366]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
# NOTE(review): the next three imports look like accidental IDE auto-imports; they are
# not used in the visible cells. Kept in case later cells rely on them.
from debugpy.launcher.debuggee import describe
from matplotlib.pyplot import margins
from numpy.ma.core import indices
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import VarianceThreshold, chi2, f_classif
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [367]:
# Load the mushroom CSV. describe() on this frame reports count/unique/top/freq
# per column, i.e. the columns are treated as categorical.
mushroom_csv = "../data/mushrooms/mushroomdata.csv"
df = pd.read_csv(mushroom_csv)
df.describe()

Unnamed: 0,edibility,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [368]:
# Since roughly 80% of the one-hot encoded matrix is empty (zeros), it looks like a
# "sparse" matrix (around 80% is apparently a commonly used threshold).

def getSparsity(dataframe: pd.DataFrame) -> str:
    """Describe how much of the one-hot encoded version of ``dataframe`` is zero.

    The frame is expanded with ``pd.get_dummies`` and the share of cells that
    compare equal to 0 is reported as a percentage.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame to encode and inspect.

    Returns
    -------
    str
        A message such as ``"80.672% of the matrix is 0's"`` (three decimals).
    """
    encoded = pd.get_dummies(dataframe)
    # Two-step sum: first per column, then across the per-column counts.
    nonzero_count = encoded.ne(0).sum().sum()
    zero_percentage = (1 - (nonzero_count / encoded.size)) * 100
    return f"{zero_percentage:.3f}% of the matrix is 0's"


# Show the sparsity message for the whole frame (the 'edibility' target column is
# still part of df here, so it is one-hot encoded along with the features).
getSparsity(df)


"80.672% of the matrix is 0's"

# NOTATER
Hver principal component består av et sett med vekter som ganges med feature-verdiene i datapunktet. Med 3 komponenter får vi dermed 3 sett med vekter, og hvert sett er like langt som antall features etter one-hot-koding (117 kolonner).


In [369]:
# Separate the predictors from the target column.
features_raw = df.drop(columns=['edibility'])
target_raw = df['edibility']

# One-hot encode the predictors and turn the target into 0/1 ('p' -> 0, 'e' -> 1).
# TODO: confirm that 'e' really means edible (the PCA plot further down labels 1 as 'Edible').
x = pd.get_dummies(features_raw)
y = target_raw.map({'p': 0, 'e': 1})

# Show, in order: feature-matrix shape, target shape, per-column summary, encoded target.
for summary in (x.shape, y.shape, x.describe(), y):
    display(summary)


(8124, 117)

(8124,)

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
top,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
freq,7672,8120,4972,7296,8092,4468,5804,8120,5568,4880,...,6876,4084,6412,4976,5976,7292,7832,6980,7756,7932


0       0
1       1
2       1
3       0
4       1
       ..
8119    1
8120    1
8121    1
8122    0
8123    1
Name: edibility, Length: 8124, dtype: int64

# Feature selection with a classifier evaluator

In [370]:


# Univariate chi-squared feature selection on the one-hot encoded features.
# SelectKBest and chi2 are imported in the import cell at the top of the notebook.
N_SELECTED_FEATURES = 5  # number of top-scoring dummy columns to keep

algorithm = SelectKBest(chi2, k=N_SELECTED_FEATURES)
# fit_transform scores every column of x against y and returns only the k best columns.
x_transformed = algorithm.fit_transform(x, y)

# get_support() is a boolean mask over x's columns marking the features that were kept;
# indexing x.columns with it preserves the original column order.
selected = list(x.columns[algorithm.get_support()])
print("Selected features:", ", ".join(selected))


Selected features: odor_f, odor_n, gill-color_b, stalk-surface-above-ring_k, stalk-surface-below-ring_k


# PRINCIPAL COMPONENTS ANALYSIS

In [371]:

# Every feature in this dataset is categorical and was already one-hot encoded into `x`
# with pd.get_dummies, so this pipeline only scales the dummy columns and applies PCA.
# If the dataset had also contained numeric features, they should not be encoded the same
# way; one would add a separate transformer (e.g. 'num') for those columns alongside the
# categorical ('cat') one.

# Fixed seed so that "Restart Kernel -> Run All" reproduces the same components even when
# scikit-learn's default svd_solver='auto' ends up using the randomized solver.
PCA_RANDOM_STATE = 42

pipeline = Pipeline(steps=[
    # with_mean=False: only divide by the standard deviation, do not centre in the scaler.
    ('scaler', StandardScaler(with_mean=False)),
    ('pca', PCA(n_components=3, random_state=PCA_RANDOM_STATE))
])

X_pca = pipeline.fit_transform(x)

plotting_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2', 'PC3'])

# Human-readable class label per row, used to colour the points in the plot.
plotting_df['Color'] = y.map({1: 'Edible', 0: 'Not Edible'})

fig = px.scatter_3d(
    plotting_df,
    x='PC1',
    y='PC2',
    z='PC3',
    title='3D PCA of Mushroom Dataset',
    labels={'PC1': 'Principal Component 1', 'PC2': 'Principal Component 2',
            'PC3': 'Principal Component 3'},
    color='Color',
    opacity=0.8
)

fig.update_layout(
    height=800,
    margin=dict(b=0)
)

# Small markers: there are several thousand points in the scatter.
fig.update_traces(marker=dict(size=1))

fig.show()


# Evaluating the PCA model

In [373]:
# Pull the fitted PCA step out of the pipeline so its components can be inspected.
pca_model = pipeline.named_steps.pca


In [374]:
# Loadings table: one row per principal component, one column per one-hot encoded feature.
component_labels = [f'PC{i + 1}' for i in range(pca_model.n_components_)]
pca_loadings = pd.DataFrame(pca_model.components_,
                            columns=x.columns,
                            index=component_labels)

# Sum of the absolute loadings per feature across the components, as a single 'Sum' row.
sum_row = pca_loadings.abs().sum(axis=0).to_frame(name='Sum').T

pca_loadings_with_sum = pd.concat([pca_loadings, sum_row])

with pd.option_context('display.max_columns', None):  # Show all columns
    display(pca_loadings_with_sum)

explained_variance = pca_model.explained_variance_ratio_
print(
    f"Explained variance by each component: {explained_variance}, cumulative: {explained_variance.cumsum()}")

# Rank the features by summed absolute loading instead of hard-coding the answer in a
# comment. NOTE(review): the previous note listed habitat_p, population_v and cap-shape_k
# as most important, but those are not the largest entries of the 'Sum' row in the
# recorded output, so the ranking is now computed from the data.
N_TOP_LOADINGS = 10
top_loading_features = sum_row.loc['Sum'].nlargest(N_TOP_LOADINGS)
display(top_loading_features)

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_b,cap-color_c,cap-color_e,cap-color_g,cap-color_n,cap-color_p,cap-color_r,cap-color_u,cap-color_w,cap-color_y,bruises?_f,bruises?_t,odor_a,odor_c,odor_f,odor_l,odor_m,odor_n,odor_p,odor_s,odor_y,gill-attachment_a,gill-attachment_f,gill-spacing_c,gill-spacing_w,gill-size_b,gill-size_n,gill-color_b,gill-color_e,gill-color_g,gill-color_h,gill-color_k,gill-color_n,gill-color_o,gill-color_p,gill-color_r,gill-color_u,gill-color_w,gill-color_y,stalk-shape_e,stalk-shape_t,stalk-root_?,stalk-root_b,stalk-root_c,stalk-root_e,stalk-root_r,stalk-surface-above-ring_f,stalk-surface-above-ring_k,stalk-surface-above-ring_s,stalk-surface-above-ring_y,stalk-surface-below-ring_f,stalk-surface-below-ring_k,stalk-surface-below-ring_s,stalk-surface-below-ring_y,stalk-color-above-ring_b,stalk-color-above-ring_c,stalk-color-above-ring_e,stalk-color-above-ring_g,stalk-color-above-ring_n,stalk-color-above-ring_o,stalk-color-above-ring_p,stalk-color-above-ring_w,stalk-color-above-ring_y,stalk-color-below-ring_b,stalk-color-below-ring_c,stalk-color-below-ring_e,stalk-color-below-ring_g,stalk-color-below-ring_n,stalk-color-below-ring_o,stalk-color-below-ring_p,stalk-color-below-ring_w,stalk-color-below-ring_y,veil-type_p,veil-color_n,veil-color_o,veil-color_w,veil-color_y,ring-number_n,ring-number_o,ring-number_t,ring-type_e,ring-type_f,ring-type_l,ring-type_n,ring-type_p,spore-print-color_b,spore-print-color_h,spore-print-color_k,spore-print-color_n,spore-print-color_o,spore-print-color_r,spore-print-color_u,spore-print-color_w,spore-print-color_y,population_a,population_c,population_n,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
PC1,0.079834,0.001893,-0.013437,-0.085837,0.012201,0.026957,0.032678,0.005794,0.017089,-0.046603,0.02736,-0.003184,-0.064564,0.000874,-0.007553,0.02831,-0.002792,-0.002792,0.107566,-0.044337,-0.220238,0.220238,0.082955,0.026776,-0.19049,0.082955,-0.007172,0.18218,0.051116,-0.106285,-0.106285,0.022149,-0.022149,-0.089435,0.089435,0.14117,-0.14117,-0.20027,0.01771,-0.037798,-0.051972,0.071764,0.102913,0.013815,0.019071,0.020728,0.05764,0.088335,0.008439,0.001892,-0.001892,-0.151946,0.008616,0.100799,0.097054,0.044537,0.051176,-0.235773,0.196487,-0.00288,0.048387,-0.234316,0.179601,0.028665,-0.093314,-0.007172,0.017549,0.070155,-0.091735,0.024508,-0.118529,0.138128,-0.002633,-0.093574,-0.007172,0.017522,0.070113,-0.089846,0.024508,-0.118759,0.140847,-0.009162,-4.700178e-16,0.017226,0.017226,-0.023492,-0.002633,-0.007172,-0.048018,0.05114,-0.120514,-0.003886,-0.169457,-0.007172,0.240026,0.011791,-0.142414,0.143039,0.146803,0.011791,0.034036,0.020057,-0.161358,0.011791,0.051951,0.028572,0.073051,0.11401,-0.150387,0.003753,0.020477,0.075409,-0.080517,0.080901,-0.135526,0.044891,0.025837
PC2,0.016743,0.008192,-0.041887,0.131896,0.001769,-0.047485,-0.118133,0.005171,0.14496,-0.028729,-0.000624,0.013337,0.090348,-0.126871,0.139249,0.014261,0.00011,0.00011,0.000858,-0.140707,0.054993,-0.054993,-0.020545,-0.001546,-0.153039,-0.020545,0.024852,0.033601,0.012787,0.109481,0.109481,0.148478,-0.148478,-0.023905,0.023905,-0.179802,0.179802,0.191789,0.038037,-0.103392,-0.104088,0.005134,0.006484,0.088469,-0.091201,-0.001539,-0.023796,-0.00493,0.086781,-0.112015,0.112015,0.242009,-0.223955,-0.013691,0.021121,-0.023536,-0.003046,-0.097539,0.093417,0.003628,-0.003002,-0.103551,0.099688,-0.00662,-0.130262,0.024852,0.037138,-0.033106,-0.128881,0.148225,0.013315,0.066086,0.015208,-0.131075,0.024852,0.037177,-0.033031,-0.115214,0.148225,0.013026,0.06301,0.014946,-1.760072e-14,0.104182,0.104182,-0.148382,0.015208,0.024852,-0.040734,0.035526,0.19597,0.000191,-0.229793,0.024852,-0.02094,0.075537,-0.224336,-0.030044,-0.015195,0.075537,-0.003724,0.002011,0.201668,0.075537,0.012341,0.107838,-0.008043,-0.009443,0.0838,-0.149493,-0.049328,-0.064072,0.176586,-0.015291,-0.013922,-0.006366,0.051098
PC3,0.058664,0.008551,-0.006413,-0.014504,-0.005943,-0.011561,0.002503,0.000489,0.001449,-0.003704,0.002536,0.028116,-0.078835,0.021482,0.01648,0.001911,-0.002981,-0.002981,-0.039559,0.073847,0.02637,-0.02637,0.003219,-0.015279,0.060202,0.003219,0.093014,0.021226,-0.026836,-0.073405,-0.073405,0.287641,-0.287641,0.072978,-0.072978,0.137528,-0.137528,-0.129362,0.013819,0.054638,0.02952,-0.034688,0.008557,0.165685,0.006161,0.003865,-0.016877,0.001163,0.178166,0.185684,-0.185684,-0.018435,0.053534,0.036915,-0.083998,0.009395,-0.053343,0.054593,-0.026297,0.022969,-0.052696,0.040915,-0.027669,0.048185,0.068857,0.093014,0.015616,-0.011992,0.070175,0.276981,-0.037847,-0.126774,0.02175,0.069325,0.093014,0.015599,-0.012028,0.063599,0.276981,-0.037664,-0.125754,0.012488,-2.441986e-14,0.194681,0.194681,-0.275925,0.02175,0.093014,-0.038475,0.015896,-0.154002,-0.005331,0.1197,0.093014,0.046893,0.141642,0.103444,-0.047914,-0.023948,0.141642,0.006901,-0.017045,-0.094202,0.141642,-0.056824,0.156581,0.007602,-0.039752,-0.052532,0.048202,-0.029516,-0.02242,0.087838,0.009525,-0.00709,-0.021553,0.018501
Sum,0.155241,0.018636,0.061736,0.232236,0.019913,0.086003,0.153314,0.011454,0.163498,0.079037,0.03052,0.044638,0.233747,0.149227,0.163283,0.044482,0.005883,0.005883,0.147984,0.258892,0.301601,0.301601,0.106719,0.043601,0.403732,0.106719,0.125038,0.237007,0.090739,0.289171,0.289171,0.458268,0.458268,0.186317,0.186317,0.4585,0.4585,0.521421,0.069567,0.195828,0.18558,0.111586,0.117954,0.26797,0.116432,0.026132,0.098312,0.094428,0.273386,0.299591,0.299591,0.41239,0.286105,0.151405,0.202173,0.077467,0.107565,0.387905,0.316201,0.029477,0.104085,0.378782,0.306959,0.08347,0.292433,0.125038,0.070304,0.115253,0.290792,0.449714,0.16969,0.330988,0.039591,0.293974,0.125038,0.070298,0.115171,0.268658,0.449714,0.169448,0.329611,0.036596,4.24906e-14,0.316089,0.316089,0.447798,0.039591,0.125038,0.127227,0.102562,0.470485,0.009408,0.518949,0.125038,0.30786,0.22897,0.470194,0.220998,0.185946,0.22897,0.04466,0.039113,0.457229,0.22897,0.121116,0.29299,0.088696,0.163204,0.286719,0.201448,0.099321,0.161901,0.344941,0.105718,0.156537,0.07281,0.095436


Explained variance by each component: [0.08891017 0.08125474 0.07203601], cumulative: [0.08891017 0.17016491 0.24220092]
