# AI coursework


This notebook uses the `mush.csv` mushroom dataset.
We will perform binary classification (edible vs. poisonous) with three algorithms:

- decision tree
- logistic regression
- k-nearest neighbours


## Used Libraries

In [94]:
import numpy as np
import matplotlib as mpl
import pandas as pd
import scipy as sc
import seaborn as sb
import sklearn

In [95]:
# Read the mushroom records from the CSV that sits next to this notebook.
dataset = pd.read_csv(filepath_or_buffer='./mush.csv')
# Report (rows, columns) so the load can be sanity-checked at a glance.
print("train set : ", dataset.shape)

train set :  (8416, 23)


Now that we have imported the (extended) dataset into the pandas DataFrame `dataset`, we can do some exploratory data analysis (EDA).
# Exploratory Data Analysis 

In [96]:
# For every column, show how often each category occurs and how many
# entries pandas considers null.
columns = dataset.columns
for col in columns:
    series = dataset[col]
    n_missing = series.isnull().sum()
    print("Column: " + col)
    print(series.value_counts())
    print("null values = " + str(n_missing))
    print("\n")

Column: edibility
EDIBLE       4488
POISONOUS    3928
Name: edibility, dtype: int64
null values = 0


Column: cap-shape
CONVEX     3796
FLAT       3292
KNOBBED     840
BELL        452
SUNKEN       32
CONICAL       4
Name: cap-shape, dtype: int64
null values = 0


Column: cap-surface
SCALY      3268
SMOOTH     2684
FIBROUS    2460
GROOVES       4
Name: cap-surface, dtype: int64
null values = 0


Column: cap-color
BROWN       2320
GRAY        2096
RED         1500
YELLOW      1072
WHITE       1040
BUFF         168
PINK         144
CINNAMON      44
PURPLE        16
GREEN         16
Name: cap-color, dtype: int64
null values = 0


Column: bruises?
NO         5040
BRUISES    3376
Name: bruises?, dtype: int64
null values = 0


Column: odor
NONE        3808
FOUL        2160
FISHY        576
SPICY        576
ALMOND       400
ANISE        400
PUNGENT      256
CREOSOTE     192
MUSTY         48
Name: odor, dtype: int64
null values = 0


Column: gill-attachment
FREE        8200
ATTACHED     216
Nam

Going through the output, we can see that attribute #16 `veil-type` carries no information, since all $8416$ rows have the same value. We can therefore remove this column, as it would only add processing time. This leaves us with 22 columns (including the `edibility` column).

In [97]:
# Remove the constant 'veil-type' column.
# errors="ignore" keeps this cell idempotent: `dataset` is reassigned in place,
# so re-running the cell would otherwise raise KeyError on the missing column.
dataset = dataset.drop(columns=['veil-type'], errors='ignore')

Above, we printed the counts of each value in every column, as well as the number of null values in each.
Since the missing values in attribute #11 are represented by the string `'?'`, they are not registered as null values by pandas.

In [98]:
# Convert the '?' placeholder into a real missing value so pandas can count it.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
dataset = dataset.replace("?", np.nan)
# Per-column count of missing entries after the replacement.
dataset.isnull().sum()

edibility                      0
cap-shape                      0
cap-surface                    0
cap-color                      0
bruises?                       0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64

Now that the missing values have been correctly replaced with the pandas `NaN` indicator, we can deal with these data points as we see fit.


Two options would be to either delete the column with the missing data (the `stalk-root` attribute, #11) or delete the rows with the missing data.

By deleting the Column with the missing data, we will still have $8416$ entries, and only $21$ columns to work with (including edibility)

On the other hand, by deleting the rows with the missing values, we will keep $22$ columns but will go down to $5936$ rows.

In [99]:
# Alternative (not used): drop every row that contains a missing value
# dataset.dropna(axis=0)

# Chosen approach: drop every column that contains a missing value,
# i.e. attribute #11
dataset = dataset.dropna(axis="columns")
dataset.shape


(8416, 21)

Now that we have fixed up our dataset, we must now do some Data Transformation
# Data Transformation
All $21$ of our remaining columns (including `edibility`) can be divided into three categories: ordinal, nominal and binary nominal attributes.

Binary nominal attributes have only two values, so a single binary flag (0 or 1) is enough to represent their data.
- #0 'edibility': {'EDIBLE':1,'POISONOUS':0}
- #4 'bruises?' : {'BRUISES':1,'NO':0}
- #6 'gill-attachment' : {'FREE':0,'ATTACHED':1}
- #8 'gill-size' : {'BROAD':0,'NARROW':1}
- #10 'stalk-shape' : {'ENLARGING':0,'TAPERING':1}


Ordinal attributes can use a mapping dictionary to map each category to an integer.
- #7 'gill-spacing' : {'CLOSE':0,'CROWDED':1,'DISTANT':2}
- #18 'ring-number' : {'NONE':0,'ONE':1,'TWO':2}
- #21 'population' : {'ABUNDANT':6,'CLUSTERED':5,'NUMEROUS':4,'SCATTERED':3,'SEVERAL':2,'SOLITARY':1}

The rest are nominal categorical attributes, so they will need to be one-hot encoded.




In [100]:
# Encode the categorical columns of `dataset` as numbers.
#
# Binary nominal attributes: a single 0/1 flag each.
edibility_map_dict = {"EDIBLE":1,"POISONOUS":0}
bruises_map_dict = {"BRUISES":1,"NO":0}
gill_attachment_map_dict = {"FREE":0,"ATTACHED":1}
gill_size_map_dict = {'BROAD':0,'NARROW':1}
stalk_shape_map_dict = {'ENLARGING':0,'TAPERING':1}
# Ordinal attributes: each category is mapped to an integer rank.
gill_spacing_map_dict = {'CLOSE':0,'CROWDED':1,'DISTANT':2}
ring_number_map_dict = {'NONE':0,'ONE':1,'TWO':2}
population_map_dict = {'ABUNDANT':6,'CLUSTERED':5,'NUMEROUS':4,'SCATTERED':3,'SEVERAL':2,'SOLITARY':1}

# Parallel lists: mapping_dicts[k] is applied to the column mapping_columns[k].
mapping_dicts = [edibility_map_dict, bruises_map_dict, gill_attachment_map_dict, gill_size_map_dict, stalk_shape_map_dict, gill_spacing_map_dict, ring_number_map_dict, population_map_dict]
mapping_columns = ['edibility','bruises?','gill-attachment','gill-size','stalk-shape','gill-spacing','ring-number','population']

for column, mapping in zip(mapping_columns, mapping_dicts):
    values = dataset[column]
    # `dataset` is modified in place, so a second run of this cell would see
    # the integer codes instead of the labels. Series.map would then turn the
    # whole column into NaN; skip columns that are already encoded.
    if values.isin(list(mapping.values())).all():
        continue
    # Series.map silently yields NaN for labels missing from the dictionary;
    # fail loudly instead so an unexpected category cannot corrupt the data.
    unknown = sorted(set(values.dropna().unique()) - set(mapping), key=str)
    if unknown:
        raise ValueError(f"Unmapped categories in column {column!r}: {unknown}")
    dataset[column] = values.map(mapping)

# Remaining nominal attributes are one-hot encoded.
onehot_columns = ['cap-shape','cap-surface','cap-color','odor','gill-color','stalk-surface-above-ring','stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring','veil-color','ring-type','spore-print-color','habitat']
# dtype is pinned because the get_dummies default changed from uint8 to bool in
# pandas 2.0; numeric dummies keep describe() covering every column.
cleanDataset = pd.get_dummies(dataset, columns=onehot_columns, dtype="uint8")

Now, the dataset is cleaned, and stored in the new Pandas DF `cleanDataset`

In [106]:
# (rows, columns) of the encoded frame, to confirm the one-hot expansion.
cleanDataset.shape

(8416, 100)

In [107]:
# List every column with its non-null count and dtype.
cleanDataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8416 entries, 0 to 8415
Data columns (total 100 columns):
 #   Column                            Non-Null Count  Dtype
---  ------                            --------------  -----
 0   edibility                         8416 non-null   int64
 1   bruises?                          8416 non-null   int64
 2   gill-attachment                   8416 non-null   int64
 3   gill-spacing                      8416 non-null   int64
 4   gill-size                         8416 non-null   int64
 5   stalk-shape                       8416 non-null   int64
 6   ring-number                       8416 non-null   int64
 7   population                        8416 non-null   int64
 8   cap-shape_BELL                    8416 non-null   uint8
 9   cap-shape_CONICAL                 8416 non-null   uint8
 10  cap-shape_CONVEX                  8416 non-null   uint8
 11  cap-shape_FLAT                    8416 non-null   uint8
 12  cap-shape_KNOBBED                

In [108]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
cleanDataset.describe()

Unnamed: 0,edibility,bruises?,gill-attachment,gill-spacing,gill-size,stalk-shape,ring-number,population,cap-shape_BELL,cap-shape_CONICAL,...,spore-print-color_PURPLE,spore-print-color_WHITE,spore-print-color_YELLOW,habitat_GRASSES,habitat_LEAVES,habitat_MEADOWS,habitat_PATHS,habitat_URBAN,habitat_WASTE,habitat_WOODS
count,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,...,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0,8416.0
mean,0.53327,0.401141,0.025665,0.189163,0.301331,0.577947,1.065589,2.423954,0.053707,0.000475,...,0.005703,0.288023,0.005703,0.285646,0.101711,0.034696,0.135932,0.043726,0.022814,0.375475
std,0.498922,0.490159,0.158144,0.391662,0.458863,0.493916,0.269635,1.314272,0.225452,0.021797,...,0.07531,0.452869,0.07531,0.451749,0.302286,0.183019,0.342736,0.204497,0.149318,0.484274
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,2.0,6.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
