# Initialization

### Module Import

In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

### Parameter Setting

In [3]:
# Placeholder cell: no parameters are defined here; `pass` is a no-op.
pass

# Kaggle Data - Groceries Market Basket Dataset
* Source: [click me](https://www.kaggle.com/irfanasrullah/groceries?select=groceries+-+groceries.csv)

### Read Dataset
* `../dataset/Kaggle_GMB/groceries - groceries.csv` -> Suitable for processing as a pandas DataFrame
* `../dataset/Kaggle_GMB/groceries.csv` -> Suitable for processing with Python standard data structures
    * It is recommended to use this file, since it makes the items more convenient to process

In [4]:
# Load the wide-format groceries CSV into a DataFrame and show it as the
# cell's rich output.
gmb_pd_data = pd.read_csv(
    filepath_or_buffer="../dataset/Kaggle_GMB/groceries - groceries.csv",
)
gmb_pd_data

Unnamed: 0,Item(s),Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,...,Item 23,Item 24,Item 25,Item 26,Item 27,Item 28,Item 29,Item 30,Item 31,Item 32
0,4,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,...,,,,,,,,,,
1,3,tropical fruit,yogurt,coffee,,,,,,,...,,,,,,,,,,
2,1,whole milk,,,,,,,,,...,,,,,,,,,,
3,4,pip fruit,yogurt,cream cheese,meat spreads,,,,,,...,,,,,,,,,,
4,4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9830,17,sausage,chicken,beef,hamburger meat,citrus fruit,grapes,root vegetables,whole milk,butter,...,,,,,,,,,,
9831,1,cooking chocolate,,,,,,,,,...,,,,,,,,,,
9832,10,chicken,citrus fruit,other vegetables,butter,yogurt,frozen dessert,domestic eggs,rolls/buns,rum,...,,,,,,,,,,
9833,4,semi-finished bread,bottled water,soda,bottled beer,,,,,,...,,,,,,,,,,


In [5]:
# Read the plain-text groceries file; every line becomes one transaction string.
# The `with` block closes the handle even if reading raises, and rstrip("\n")
# removes only the line terminator, so a final line that lacks a trailing
# newline keeps its last character (a blind `line[:-1]` slice would cut it off).
with open('../dataset/Kaggle_GMB/groceries.csv', 'r') as gmb_list_fd:
    gmb_list_data = [line.rstrip("\n") for line in gmb_list_fd]
print(f"Read {len(gmb_list_data)} Transactions")

Read 9835 Transactions


### Item Count Description

In [6]:
# Summary statistics for the 'Item(s)' column, kept as a one-column DataFrame
# so the result renders as a table.
gmb_pd_data.loc[:, ['Item(s)']].describe()

Unnamed: 0,Item(s)
count,9835.0
mean,4.409456
std,3.589385
min,1.0
25%,2.0
50%,3.0
75%,6.0
max,32.0


# IBM Data - Synthetic Data for testing

### Read Dataset

In [26]:
# Read the IBM synthetic data file and split each line into its
# whitespace-separated fields.
# TODO: Replace Testing File Name for verification
# The `with` block closes the handle even if reading raises. str.split() with
# no arguments already discards the trailing newline, so no slicing is needed;
# a blind `line[:-1]` would truncate the last field of a final line that has
# no trailing newline.
with open('../dataset/IBM/ibm-2021.txt', 'r') as ibm_qsdg_list_fd:
    ibm_qsdg_list_data = [line.split() for line in ibm_qsdg_list_fd]
print(f"Read {len(ibm_qsdg_list_data)} Raws")

Read 24827 Raws


### Integrate to Transaction Format

In [30]:
# Group the parsed rows into transactions: field 1 of each row is the grouping
# key and field 2 is appended to that key's list, preserving file order.
new_transactions = {}
for fields in ibm_qsdg_list_data:
    # Unpack both fields before touching the dict so a short row raises
    # without leaving a partially created entry behind.
    transaction_key, item = fields[1], fields[2]
    new_transactions.setdefault(transaction_key, []).append(item)
print(f"Figure out {len(new_transactions)} transactions")

Figure out 4114 transactions


### Save preprocessed transactions to a new CSV file

In [35]:
# Write one comma-separated line per transaction, in the dictionary's
# insertion order.
with open("../dataset/IBM/ibm-2021_preprocessed.csv", "w+") as writer:
    writer.writelines(
        ",".join(items) + "\n" for items in new_transactions.values()
    )

['9192', '31651', '59344', '80129']
['26134', '57515', '60732', '74524']
['13453', '14148', '44776', '47548', '54782', '59246', '60443', '61339', '62105', '64594', '96328']
['26134', '37096', '48693', '57515', '60732', '74524']
['2228', '33193', '52460', '55047', '56699', '84472']
['65309', '75284', '92628']
['2114', '39749', '54494', '64408', '74994']
['8721', '26134', '57515', '60732', '66529', '67940', '74524']
['23397', '31651', '44439', '70751', '71932', '95771']
['5185', '10109', '23040', '24554', '72358', '99941']
['8096', '34560', '40313', '44413', '55020', '55776', '72695']
['74735', '93735']
['67822', '72374', '79533']
['10413', '15755', '31717', '60042', '62583', '78656', '83249', '86958']
['26134', '57515', '60732', '74524']
['216', '11496', '80479']
['18409', '24238', '61779', '64217', '72949', '74096', '75620', '83045']
['2343', '7300', '59199', '66646', '74735', '93735']
['75619', '83228', '94110']
['7190', '12305', '51657', '88631', '92490', '95398']
['37096', '48693', 

# Kaggle Data - Heart Disease (Deprecated)
* Source: [Click Me](https://www.kaggle.com/ronitf/heart-disease-uci)

## Read Data

In [8]:
# # Read the source UCI heart-disease data (deprecated: this cell is intentionally left commented out)
# uci_data = pd.read_csv("../dataset/Kaggle_Heart/heart_disease.csv")
# uci_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


## Data Description
* Schema Description
    * age - 年齡 (Unit: years)
    * sex - 性別 (1 for male, 0 for female)
    * cp - 胸痛 (Chest Pain) 類別
    * trestbps - 入住醫院時量測的 Resting Blood Pressure
    * chol - 血清膽固醇 (Unit: mg/dl)
    * fbs - 快速血糖檢測值 (fasting blood sugar) 是否大於 120 mg/dl (1 = true; 0 = false)
    * restecg - resting electrocardiographic(心電圖) results
        * 0 = Nothing to note
        * 1 = ST-T Wave abnormality
            * can range from mild symptoms(輕微症狀) to severe problems(嚴重問題)
            * signals non-normal heart beat (心臟發出非正常心跳)
        * 2 = Possible or definite left ventricular hypertrophy(可能的或是明確的左心室肥大)
            * Enlarged heart's main pumping chamber (擴大的主心室)
    * thalach - maximum heart rate achieved (量測最大心率值)
    * exang - exercise induced angina(運動誘發的心絞痛)
    * oldpeak - ST depression induced by exercise relative to rest (相對於休息，由運動引發的ST下降程度)
* Reference: [Click Here](https://www.kaggle.com/ronitf/heart-disease-uci/discussion/273496)

In [9]:
# for field_name in uci_data.columns.values:
#     col_description = uci_data[field_name].describe()

#     fig = make_subplots(
#         rows=1,
#         cols=3,
#         subplot_titles=("Index-Value Distribution", "Boxplot", "Histogram")
#     )

#     fig.add_trace(
#         go.Scatter(
#             x=np.arange(0, len(uci_data), 1),
#             y=uci_data[field_name],
#             mode='markers',
#             name=f'(Index, {field_name})'
#         ),
#         row=1,
#         col=1,
#     )

#     fig.add_trace(
#         go.Box(
#             y=uci_data[field_name],
#             name=f"max: {col_description['max']}<br>Q3: {col_description['75%']}<br>Q2: {col_description['50%']}<br>Q1: {col_description['25%']}<br>min: {col_description['min']}",
#         ),
#         row=1,
#         col=2,
#     )

#     fig.add_trace(
#         go.Histogram(
#             x=uci_data[field_name],
#             name=f'(Range, Count)'
#         ),
#         row=1,
#         col=3,
#     )

#     fig['layout']['xaxis']['title'] = f"Count: {col_description['count']}"
#     fig['layout']['xaxis3']['title'] = f"mean: {col_description['mean']:.2f}, std: {col_description['std']:.2f}"

#     fig.update_layout(
#         title_text=f"Field <{field_name}> Values Visualization",
#         title_x=0.5,
#         showlegend=False,
#     )

#     fig.show()