# Initialization

### Module Import

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

### Parameter Setting

In [None]:
# Placeholder: this cell currently defines no parameters.
pass

# Kaggle Data - Groceries Market Basket Dataset
* Source: [click me](https://www.kaggle.com/irfanasrullah/groceries?select=groceries+-+groceries.csv)

### Read Dataset
* `./Kaggle_GMB/groceries - groceries.csv` -> Suitable for using pandas dataframe processing
* `./Kaggle_GMB/groceries.csv` -> Suitable for using Python Standard Data Structure
    * It is recommended to use this dataset for convenience in processing items

In [None]:
# Load the "groceries - groceries.csv" variant of the dataset into a DataFrame.
# The bare name on the last line makes the notebook render the frame.
gmb_csv_path = "../dataset/Kaggle_GMB/groceries - groceries.csv"
gmb_pd_data = pd.read_csv(gmb_csv_path)
gmb_pd_data

In [None]:
# Read the raw transaction file into a list holding the text of each line.
# The `with` block closes the file even if reading raises. Only the line
# terminator is stripped, so a final line without a trailing newline keeps
# its last character (a plain `line[:-1]` slice would drop it).
with open('../dataset/Kaggle_GMB/groceries.csv', 'r') as gmb_list_fd:
    gmb_list_data = [line.rstrip('\n') for line in gmb_list_fd]
print(f"Read {len(gmb_list_data)} Transactions")

### Item Count Description

In [None]:
# Summary statistics of the 'Item(s)' column, kept as a one-column DataFrame.
# The bare name on the last line makes the notebook render the result.
item_count_summary = gmb_pd_data[['Item(s)']].describe()
item_count_summary

# IBM Data - Synthetic Data

## Synthetic Data Command
```shell=
& '.\IBM Quest Data Generator.exe' lit -fname './dataset/IBM/syndata' -npats 2500
```

In [None]:

# NOTE(review): as a Python expression this is an attribute lookup on the name
# `IBM_Quest_Data_Generator`, which is not defined in the visible cells.
# Presumably a leftover reference to the generator executable; confirm intent.
IBM_Quest_Data_Generator.exe

# Testing Dataset from Moodle
* Source: on Moodle Page

# Kaggle Data - Heart Disease (Deprecated)
* Source: [Click Me](https://www.kaggle.com/ronitf/heart-disease-uci)

## Read Data

In [None]:
# Load the heart-disease CSV into a DataFrame.
# The bare name on the last line makes the notebook render the frame.
heart_csv_path = "../dataset/Kaggle_Heart/heart_disease.csv"
uci_data = pd.read_csv(heart_csv_path)
uci_data

## Data Description
* Schema Description
    * Age - 年齡 (Unit: years)
    * Sex - 性別 (1 for male, 0 for female)
    * cp - 胸痛 (Chest Pain) 類別
    * trestbps - 入住醫院時量測的 Resting Blood Pressure
    * chol - 血清膽固醇 (Unit: mg/dl)
    * fbs - 快速血糖檢測值 (fasting blood sugar) 是否大於 120 mg/dl (1 = true; 0 = false)
    * restecg - resting electrocardiographic(心電圖) results
        * 0 = Nothing to note
        * 1 = ST-T Wave abnormality
            * can range from mild symptoms(輕微症狀) to severe problems(嚴重問題)
            * signals non-normal heart beat (心臟發出非正常心跳)
        * 2 = Possible or definite left ventricular hypertrophy(可能的或是明確的左心室肥大)
            * Enlarged heart's main pumping chamber (擴大的主心室)
    * thalach - maximum heart rate achieved (量測最大心率值)
    * exang - exercise induced angina(運動誘發的心絞痛)
    * oldpeak - ST depression induced by exercise relative to rest (相對於休息，由運動引發的ST下降程度)
* Reference: [Click Here](https://www.kaggle.com/ronitf/heart-disease-uci/discussion/273496)

In [None]:
# For every column of `uci_data`, show one figure with three side-by-side
# panels: values against row position, a box plot, and a histogram.
row_positions = np.arange(len(uci_data))  # x values shared by every scatter panel
panel_titles = ("Index-Value Distribution", "Boxplot", "Histogram")
# (label shown in the box-plot name, key in the `describe()` result)
box_label_keys = (
    ("max", "max"),
    ("Q3", "75%"),
    ("Q2", "50%"),
    ("Q1", "25%"),
    ("min", "min"),
)

for field_name in uci_data.columns.values:
    column_values = uci_data[field_name]
    col_description = column_values.describe()

    # The five-number summary is shown as the box trace's name, one entry per line.
    box_label = "<br>".join(
        f"{label}: {col_description[key]}" for label, key in box_label_keys
    )

    scatter_trace = go.Scatter(
        x=row_positions,
        y=column_values,
        mode='markers',
        name=f'(Index, {field_name})',
    )
    box_trace = go.Box(y=column_values, name=box_label)
    histogram_trace = go.Histogram(x=column_values, name='(Range, Count)')

    fig = make_subplots(rows=1, cols=3, subplot_titles=panel_titles)
    fig.add_trace(scatter_trace, row=1, col=1)
    fig.add_trace(box_trace, row=1, col=2)
    fig.add_trace(histogram_trace, row=1, col=3)

    # Axis titles carry the remaining summary statistics: the count under the
    # scatter panel, mean and std under the histogram panel.
    fig.layout.xaxis.title = f"Count: {col_description['count']}"
    fig.layout.xaxis3.title = (
        f"mean: {col_description['mean']:.2f}, std: {col_description['std']:.2f}"
    )

    fig.update_layout(
        title_text=f"Field <{field_name}> Values Visualization",
        title_x=0.5,
        showlegend=False,
    )

    fig.show()