In [4]:
import pandas as pd
from features import FEATURE_NAMES, EDIBILITY_CLASS
from typing import List
from sklearn.model_selection import train_test_split
import random

In [5]:
# Location of the raw dataset file; the path is relative, so it is resolved
# against the current working directory of the kernel.
FILE_PATH = 'data/mushroom/agaricus-lepiota.data'

In [6]:
class ReadDatasetFile:
    """Load a dataset file into a pandas DataFrame.

    Files whose name ends in ``.csv`` are handed to ``pandas.read_csv``; every
    other file is read line by line and split on commas.

    Attributes:
        file_path: Path of the file to read.
        columns: Column names applied when the file is parsed as plain
            comma-separated text. ``None`` lets pandas number the columns.
            Not used on the ``.csv`` branch, where ``read_csv`` takes the
            names from the file itself.
    """

    def __init__(self, file_path: str, columns: List[str] = None):
        self.file_path = file_path
        self.columns = columns

    def check_file_extension_csv(self):
        """Return True when ``file_path`` ends with ``.csv`` (case-insensitive)."""
        # The previous check compared split('.')[-1] -- which never contains
        # the dot -- against '.csv', so it could not match any file name.
        return self.file_path.lower().endswith('.csv')

    def convert_text_file_to_dataframe(self):
        """Parse the file as comma-separated text, one record per line.

        Every field is kept as a string. Lines that are empty after stripping
        whitespace are skipped so they do not become junk rows.

        Returns:
            A DataFrame with one row per non-blank line, labelled with
            ``self.columns``.
        """
        rows = []
        with open(self.file_path, 'r') as file:
            for line in file:
                record = line.strip()
                if not record:
                    continue
                rows.append(record.split(','))
        return pd.DataFrame(rows, columns=self.columns)

    def file_converter_to_dataframe(self):
        """Read ``file_path`` with the loader that matches its extension.

        Returns:
            The result of ``pandas.read_csv`` for ``.csv`` files, otherwise
            the result of ``convert_text_file_to_dataframe``.
        """
        if self.check_file_extension_csv():
            return pd.read_csv(self.file_path)
        return self.convert_text_file_to_dataframe()

class MushroomEdibilityModel():
    """Hold a labelled dataset together with a train/test split of its rows.

    Attributes:
        label: The ``label_column`` Series taken from ``data``.
        data: ``data`` with ``label_column`` removed.
        train: Training portion of the rows of ``data``.
        test: Held-out portion of the rows of ``data``.
    """

    def __init__(self, data: pd.DataFrame, label_column: str):
        """Separate the label from the features and split the rows.

        Args:
            data: Full dataset, including the label column.
            label_column: Name of the column in ``data`` that holds the label.

        Raises:
            KeyError: If ``label_column`` is not a column of ``data``.
        """
        seed = 4
        # The stdlib function is random.seed(); random.set_seed does not exist.
        random.seed(seed)
        self.label = data[label_column]
        # drop() targets row labels by default; columns= removes the label column.
        self.data = data.drop(columns=label_column)
        # TODO: one hot encoding of values
        # train_test_split needs the data to split, and its own random_state:
        # seeding the stdlib `random` module does not make the split reproducible.
        # NOTE(review): the whole frame is split so each row keeps its label
        # alongside its features -- confirm this is the intended train/test shape.
        self.train, self.test = train_test_split(data, random_state=seed)

# TODO: plot the distribution of the individual classes (as percentages or as counts)
# TODO: train 3 model types and compare their metrics and accuracies
# TODO: write the report
              

In [7]:
# FILE_PATH does not end in .csv, so the reader parses the file as
# comma-separated text and labels the columns with FEATURE_NAMES.
dataset_reader = ReadDatasetFile(FILE_PATH, FEATURE_NAMES)
df = dataset_reader.file_converter_to_dataframe()

In [8]:
# Count how many rows carry each value of the 'poisonous' column.
df['poisonous'].value_counts()

poisonous
e    4208
p    3916
Name: count, dtype: int64

In [10]:
# Count null cells per column.
# NOTE(review): the text loader keeps every field as a string, so a textual
# placeholder for a missing value (if the file uses one) would not be counted
# here -- confirm against the dataset description.
df.isnull().sum()

poisonous                   0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [12]:
# Print the frequency table of every column, one column after another.
column_names = df.columns.to_list()
print("Value Distribution:\n")
for col in column_names:
    # Same layout as print(col, "\n", counts, "\n\n"): single spaces between parts.
    print(f"{col} \n {df[col].value_counts()} \n\n")

Value Distribution:

poisonous 
 poisonous
e    4208
p    3916
Name: count, dtype: int64 


cap-shape 
 cap-shape
x    3656
f    3152
k     828
b     452
s      32
c       4
Name: count, dtype: int64 


cap-surface 
 cap-surface
y    3244
s    2556
f    2320
g       4
Name: count, dtype: int64 


cap-color 
 cap-color
n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
u      16
r      16
Name: count, dtype: int64 


bruises 
 bruises
f    4748
t    3376
Name: count, dtype: int64 


odor 
 odor
n    3528
f    2160
y     576
s     576
a     400
l     400
p     256
c     192
m      36
Name: count, dtype: int64 


gill-attachment 
 gill-attachment
f    7914
a     210
Name: count, dtype: int64 


gill-spacing 
 gill-spacing
c    6812
w    1312
Name: count, dtype: int64 


gill-size 
 gill-size
b    5612
n    2512
Name: count, dtype: int64 


gill-color 
 gill-color
b    1728
p    1492
w    1202
n    1048
g     752
h     732
u     492
k     408
e      96
y      8

In [2]:
# Use the %pip magic so the install targets the running kernel's environment,
# and pin the version so a re-run resolves the same release.
%pip install pandas==2.2.3

Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.1.1-cp312-cp312-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl (11.4 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.4/11.4 MB 65.0 MB/s eta 0:00:00
Downloading numpy-2.1.1-cp312-cp312-macosx_14_0_arm64.whl (5.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.1/5.1 MB 42.7 MB/s eta 0:00:00
Downloading pytz-2024.2-py2.py3-none-any.whl (508 kB)
Downloading tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed num