# Analisis Awal e-Nose Kakao

> 30 Mei 2022

![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)

In [1]:
import os
import re
from dataclasses import asdict, dataclass

import numpy as np
import pandas as pd
from snhlib.multitools import FindData

## Global variables

In [2]:
# Root folder that is scanned for the raw e-Nose measurement files.
# Set the ENOSE_DATA_PATH environment variable to point at a local copy of the
# data; the original author's OneDrive location is kept as the fallback so
# existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "ENOSE_DATA_PATH",
    "/Users/shidiq/Library/CloudStorage/OneDrive-Personal/Personal Projects/2022-enose kakao/resources/Hasil Pengukuran e-Nose Sample Butter",
)

In [3]:
def get_label(item: str) -> int:
    """Return 1 when ``item`` contains "accepted" (case-insensitive), else 0.

    Args:
        item: Text to inspect; the notebook passes a file path.

    Returns:
        1 if the lower-cased text contains ``"accepted"``, otherwise 0.
    """
    # The keyword has no regex metacharacters, so a substring test matches
    # exactly the same inputs as a regex search would.
    return int("accepted" in item.lower())


def get_group(item: str) -> str:
    """Map ``item`` to a group code based on keywords found in it.

    The text is lower-cased and checked for these keywords in order; the
    first one found decides the result:

    * ``"kebawah"`` -> ``"A"``
    * ``"400-600"`` -> ``"B"``
    * ``"keatas"``  -> ``"C"``

    Args:
        item: Text to inspect; the notebook passes a file path.

    Returns:
        ``"A"``, ``"B"`` or ``"C"`` as above, or ``"X"`` when no keyword is
        present.
    """
    lowered = item.lower()
    # Order matters: an earlier keyword wins when several are present.
    keyword_to_code = (
        ("kebawah", "A"),
        ("400-600", "B"),
        ("keatas", "C"),
    )
    for keyword, code in keyword_to_code:
        if keyword in lowered:
            return code
    return "X"


def get_names(item: str) -> str:
    """Return the file name of ``item`` without directories or its last extension.

    Args:
        item: A file path.

    Returns:
        The final path component with only its last extension removed,
        e.g. ``"/a/b/TK65-1.csv"`` -> ``"TK65-1"``.
    """
    _directory, filename = os.path.split(item)
    stem, _extension = os.path.splitext(filename)
    return stem


@dataclass
class DataItem:
    """Metadata record for a single file found under the data folder.

    Attributes:
        path: Path of the file, as passed in by the caller.
        label: 1 or 0, as returned by ``get_label``.
        group: Group code, as returned by ``get_group``.
        names: File name without its extension, as returned by ``get_names``.
    """

    path: str
    # Plain ``int``: ``get_label`` returns a Python int, and dataclasses do not
    # coerce values, so the previous ``np.int8`` annotation never matched.
    label: int
    group: str
    names: str


def dataclass_to_df(item: str, index: int) -> pd.DataFrame:
    """Build a one-row DataFrame describing the file at ``item``.

    Args:
        item: File path; also used to derive the label, group and name.
        index: Index value assigned to the single row.

    Returns:
        A DataFrame with one row and the columns ``path``, ``label``,
        ``group`` and ``names``. The values pass through a mixed-type Series,
        so every column ends up with ``object`` dtype.
    """
    record = asdict(
        DataItem(
            path=item,
            label=get_label(item),
            group=get_group(item),
            names=get_names(item),
        )
    )
    # A Series becomes a single column; transposing turns it into a single row.
    row = pd.DataFrame(pd.Series(record)).T
    row.index = [index]
    return row

## List data

In [4]:
list_data = FindData(path=DATA_PATH)

# Collect the one-row frames first and concatenate once. Calling pd.concat
# inside the loop re-copies every previous row on each iteration.
rows = []
for i, item in enumerate(list_data.get_files):
    info = dataclass_to_df(item, i)
    rows.append(info)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no files were found.
data_df = pd.concat(rows, axis=0) if rows else pd.DataFrame()

# to_csv does not create missing folders; make sure the output folder exists.
os.makedirs("data", exist_ok=True)
data_df.to_csv("data/list_rawdata_butter.csv", index=False)
data_df.head()

Unnamed: 0,path,label,group,names
0,/Users/shidiq/Library/CloudStorage/OneDrive-Personal/Personal Projects/2022-enose kakao/resources/Hasil Pengukuran e-Nose Sample Butter/Rejected/400-600/TK65-1.csv,0,B,TK65-1
1,/Users/shidiq/Library/CloudStorage/OneDrive-Personal/Personal Projects/2022-enose kakao/resources/Hasil Pengukuran e-Nose Sample Butter/Rejected/400-600/TK39-1.csv,0,B,TK39-1
2,/Users/shidiq/Library/CloudStorage/OneDrive-Personal/Personal Projects/2022-enose kakao/resources/Hasil Pengukuran e-Nose Sample Butter/Rejected/400-600/TK41-1.csv,0,B,TK41-1
3,/Users/shidiq/Library/CloudStorage/OneDrive-Personal/Personal Projects/2022-enose kakao/resources/Hasil Pengukuran e-Nose Sample Butter/Rejected/400-600/TK26-1.csv,0,B,TK26-1
4,/Users/shidiq/Library/CloudStorage/OneDrive-Personal/Personal Projects/2022-enose kakao/resources/Hasil Pengukuran e-Nose Sample Butter/Rejected/400-600/TK42-1.csv,0,B,TK42-1


In [5]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 135 entries, 0 to 134
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   path    135 non-null    object
 1   label   135 non-null    object
 2   group   135 non-null    object
 3   names   135 non-null    object
dtypes: object(4)
memory usage: 5.3+ KB
None


In [6]:
# Summary statistics for every column after the first one (the file path).
summary_columns = data_df.iloc[:, 1:]
summary_columns.describe()

Unnamed: 0,label,group,names
count,135,135,135
unique,2,3,135
top,0,C,TK65-1
freq,69,80,1


In [7]:
# Number of distinct values in each remaining column, per label.
by_label = data_df.groupby("label")
by_label.nunique()

Unnamed: 0_level_0,path,group,names
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,69,3,69
1,66,3,66


In [8]:
# Number of distinct values in each remaining column, per group.
by_group = data_df.groupby("group")
by_group.nunique()

Unnamed: 0_level_0,path,label,names
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,33,2,33
B,22,2,22
C,80,2,80


In [9]:
# How many files carry each label within every group.
data_df.groupby("group")["label"].value_counts()

group  label
A      1        23
       0        10
B      0        16
       1         6
C      0        43
       1        37
Name: label, dtype: int64