In [8]:
import pandas as pd
import numpy as np

from mod.logs import Logger
from mod.dao import MyConn, SqlStatement
from data_lake import PT_TABLE, OP_TABLE, DX_TABLE


In [9]:
# Prepare modules: logger, database connection and SQL statement builder.
# Connection literals are named so they are easy to spot and change.
DB_HOST = "127.0.0.1"
DB_NAME = "hecon"  # presumably the schema/database name; the logged queries read from hecon.*

l = Logger()
# The trailing positional False is forwarded as-is; its meaning is defined by MyConn (not visible here).
conn = MyConn(DB_HOST, DB_NAME, l, False)
builder = SqlStatement()

In [7]:
# Load data from database: one read per source table, issued in PT, OP, DX order.
pt, op, dx = (
    conn.wrap(builder.read_data, table=source_table)
    for source_table in (PT_TABLE, OP_TABLE, DX_TABLE)
)


        select * from hecon.pt_dl
        

        select * from hecon.op_dl
        

        select * from hecon.dx_dl
        


In [10]:
# Remove duplicates?
# Let's write a simple duplicate removal code. 

def remove_duplicates(data: np.ndarray, pivot_order=None) -> pd.DataFrame:
    """Pivot long-format records into a wide table and drop repeated rows.

    ``data`` is loaded into a ``DataFrame`` and reshaped with
    ``DataFrame.pivot``. Every pivoted row whose values repeat an earlier row
    is removed, so exactly one row (the first occurrence) of each group of
    identical rows survives.

    Parameters
    ----------
    data
        Anything accepted by the ``pd.DataFrame`` constructor. With the
        default ``pivot_order`` it must yield positional columns 0, 1 and 2.
    pivot_order
        Keyword arguments forwarded to ``DataFrame.pivot``. Defaults to
        ``{"index": 1, "columns": 0, "values": 2}``.

    Returns
    -------
    pd.DataFrame
        The de-duplicated wide table. The pivot index is moved back into a
        leading column and the rows are renumbered 0..n-1, which is the same
        layout as ``pd.DataFrame(data).pivot(**pivot_order).reset_index()``.

    Raises
    ------
    KeyError
        If a column named in ``pivot_order`` is missing from ``data``.
    ValueError
        Raised by ``DataFrame.pivot`` when ``data`` contains more than one
        value for the same (index, columns) pair.

    Notes
    -----
    Earlier revisions built the filtered frame but returned the unfiltered
    pivot (including a temporary ``is_dup`` column), so no row was ever
    removed. The filtered frame is returned now.
    """
    if pivot_order is None:
        pivot_order = {"index": 1, "columns": 0, "values": 2}

    wide = pd.DataFrame(data).pivot(**pivot_order)

    # Compare rows on every value column; a column literally named "dates"
    # (if one exists) is excluded. The pivot index is never part of the
    # comparison because it is not a column at this point.
    compare_cols = wide.columns.difference(["dates"])

    # True for every row that repeats an earlier row; the first member of each
    # group of identical rows stays False and is therefore kept.
    repeats_earlier_row = wide.duplicated(subset=compare_cols, keep="first")

    # Keep the row labels: reset_index() turns the pivot index into a column
    # instead of discarding it.
    return wide.loc[~repeats_earlier_row].reset_index()


def _pivot_with_index(records) -> pd.DataFrame:
    """Build the "before" frame used for the shape comparison.

    Positional column 1 becomes the index, column 0 the column labels and
    column 2 the values; the index is then moved back into a regular column.
    """
    return pd.DataFrame(records).pivot(index=1, columns=0, values=2).reset_index()


# PT Data
pt_original = _pivot_with_index(pt)
pt_remove_dup = remove_duplicates(pt)
print(f"PT data: Before {pt_original.shape} / After Duplicate Removal {pt_remove_dup.shape}")

# OP Data
op_original = _pivot_with_index(op)
op_remove_dup = remove_duplicates(op)
print(f"OP data: Before {op_original.shape} / After Duplicate Removal {op_remove_dup.shape}")

# DX Data
dx_original = _pivot_with_index(dx)
dx_remove_dup = remove_duplicates(dx)
print(f"DX data: Before {dx_original.shape} / After Duplicate Removal {dx_remove_dup.shape}")


PT data: Before (243, 16) / After Duplicate Removal (243, 16)
OP data: Before (243, 43) / After Duplicate Removal (243, 43)
DX data: Before (243, 46) / After Duplicate Removal (243, 46)


# 1. Regarding Duplication Removal?

```python
PT data: Before (243, 16) / After Duplicate Removal (243, 16)
OP data: Before (243, 43) / After Duplicate Removal (243, 43)
DX data: Before (243, 46) / After Duplicate Removal (243, 46)
```

<h3><i>No datapoints were removed</i></h3>

# 2. Regarding Zero Inflation ?
|	| 0개수 | 전체 개수 | 0 비율 | 연구자 자료 |
|---|------|--------|----|-----------|
|OP | 7095 | 10206 | 0.6952 | 약 70% |
|DX | 7467 | 10935 | 0.682853224 | 약 69% |

* 0 비율이 90%를 넘는 물품만 제거한다고 하였기 때문에, 0 비율이 약 70%인 물품은 제거하지 않음.
* 시점 개수가 243개인 것으로 보아, 이미 정리된 자료를 받은 것으로 추정됨.