In [1]:
# Open analysis duckdb database
import duckdb

# Location of the analysis DuckDB database file opened by this notebook.
analysis_db_path = '/data/vgribanov/data/a1c1/final_data/analyze_rnn.duckdb'
conn = duckdb.connect(analysis_db_path)

In [5]:
# Create the analysis DuckDB database (requires a trained model and its dataset)
from analysis import load_model, predict, create_analysis_db

# Restore the trained model together with its dataset.
model_file = "/data/vgribanov/a1c1/models/final/rnn.model"
model, ds = load_model(model_file)

# Run prediction over the whole dataset.
dfResults = predict(model, ds)

# Build the analysis DuckDB database from the prediction results and keep the
# returned connection for the queries below.
conn = create_analysis_db(
    "/data/vgribanov/data/a1c1/final_data",
    "analyze_rnn",
    dfResults,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 657/657 [06:58<00:00,  1.57it/s]


In [7]:
# Build per-sample ("individual") reports and save them.
#
# The query below:
#   * takes up to `individual_limit` rows per target class (1 and 0) from
#     `results` where `result` is true, ordered by logit descending
#     (presumably `result` flags a prediction that matched the target - confirm
#     against create_analysis_db);
#   * decodes every static feature id through `static_data_vocab`;
#   * unnests `months` / `dynamic_data` in parallel, decodes every dynamic
#     feature id through `dynamic_data_vocab` (+ optional `codes` description)
#     and re-aggregates them per month;
#   * returns one row per sample with nested `static_data` / `dynamic_data`.
#
# NOTE(review): both classes are ordered by `logit desc`; if logit is the raw
# score for target = 1, the target = 0 branch selects the *least* confident
# correct negatives - confirm this is intended.
# NOTE(review): `codes` is joined here on `codes.code` alone, while the
# aggregate queries later in this notebook join on
# `codes.code || '_' || codes.type` - confirm which key matches
# `dynamic_data_vocab.code`.

# Number of samples kept per target class.
individual_limit = 10
# Output files for the per-sample report.
result_json_file = "/data/vgribanov/data/a1c1/final_data/analyze_rnn_individual.json"
result_yaml_file = "/data/vgribanov/data/a1c1/final_data/analyze_rnn_individual.yaml"

# Doubled braces ({{ }}) are f-string escapes for DuckDB struct literals.
dfi = conn.query(f"""
    with target_1 as (
        select results.idx, results.target, results.logit, data.static_data, data.dynamic_data, data.months
        from results
                inner join data on results.idx = data.idx
        where result and target = 1
        order by logit desc
        limit {individual_limit}
    ), target_0 as (
        select results.idx, results.target, results.logit, data.static_data, data.dynamic_data, data.months
        from results
                inner join data on results.idx = data.idx
        where result and target = 0
        order by logit desc
        limit {individual_limit}
    ), target as (
        from target_1
        union all
        from target_0
    ), target_static_data as (
        with target_static_data as (
            select idx, unnest(static_data) as static_data_id
            from target
        )
        select idx, array_agg({{ id: static_data_id, feature: static_data_vocab.type, description: static_data_vocab.value }}) as static_data
        from target_static_data
                inner join static_data_vocab on target_static_data.static_data_id = static_data_vocab.id
        group by all
    ), target_dynamic_data as (
        with target_dynamic_data_m as (
            select idx, unnest(months) as month, unnest(dynamic_data) as dynamic_data_vec
            from target
        ), target_dynamic_data_unnested as (
            select idx, month, unnest(dynamic_data_vec) as dynamic_data_id
            from target_dynamic_data_m
        ), target_dynamic_data_mu as (
            select idx, target_dynamic_data_unnested.month,
                array_agg({{ 
                    id: dynamic_data_id, 
                    type: dynamic_data_vocab.type, 
                    code: dynamic_data_vocab.code,
                    description: coalesce(
                        codes.description, 
                        dynamic_data_vocab.text_res
                    )
                }}) as dynamic_data
            from target_dynamic_data_unnested
                    inner join dynamic_data_vocab on target_dynamic_data_unnested.dynamic_data_id = dynamic_data_vocab.id
                    left join codes on dynamic_data_vocab.code = codes.code
            group by all
       )
        select idx, array_agg({{ month: target_dynamic_data_mu.month,  data: dynamic_data }}) as dynamic_data
        from target_dynamic_data_mu
        group by all
    )
    select target.idx, target.target, logit, target_static_data.static_data, target_dynamic_data.dynamic_data
    from target
            inner join target_static_data on target.idx = target_static_data.idx
            inner join target_dynamic_data on target.idx = target_dynamic_data.idx
    order by logit desc
""").df()

import yaml
import numpy as np

# Serialise the per-sample frame using pandas' "table" layout and store it as UTF-8.
json_data = dfi.to_json(orient="table")
with open(result_json_file, mode="w", encoding="utf-8") as json_out:
    json_out.write(json_data)


def convert_numpy(obj):
    """Recursively convert NumPy containers and scalars into plain Python objects.

    Args:
        obj: Any object. ``numpy.ndarray`` instances, NumPy scalars, ``dict``
            and ``list`` values are converted; everything else is returned
            unchanged.

    Returns:
        The same structure built only from built-in containers: arrays become
        (nested) lists, NumPy scalars become the matching Python scalar, and
        dict / list values are converted recursively.
    """
    if isinstance(obj, np.ndarray):
        # tolist() leaves the elements of object-dtype arrays untouched (they
        # may be dicts holding further arrays) and returns a bare scalar for
        # 0-d arrays, so pass its result through the converter again.
        return convert_numpy(obj.tolist())
    if isinstance(obj, np.generic):
        # Any NumPy scalar (ints, floats, bool_, str_, ...) -> Python scalar.
        return obj.item()
    if isinstance(obj, dict):
        # Recurse into dictionary values.
        return {k: convert_numpy(v) for k, v in obj.items()}
    if isinstance(obj, list):
        # Recurse into list items.
        return [convert_numpy(v) for v in obj]
    return obj

import json

# `json_data` is a JSON *string*; dumping it directly would emit a single quoted
# YAML scalar. Parse it back into Python containers first so the YAML file
# mirrors the structure of the JSON file. (convert_numpy is a no-op on parsed
# JSON; it is kept as a safeguard should the payload source change.)
yaml_payload = convert_numpy(json.loads(json_data))
yaml_data = yaml.dump(yaml_payload, allow_unicode=True, default_flow_style=False, indent=4)
with open(result_yaml_file, "w", encoding="utf-8") as file:
    # `yaml_data` is already YAML text: write it as-is instead of passing it
    # through yaml.dump a second time (which would encode it again).
    file.write(yaml_data)

In [8]:
# Preparing most significant procedures and diagnoses data for plotting

limit = 10


def query_top_dynamic_features(feature_type, limit):
    """Count dynamic features of one vocabulary type among top-ranked samples.

    The query
      * ranks rows of `results` where `result` is true inside each target
        class by logit (descending) and keeps the first `limit` rows by rank;
      * unnests `months` / `dynamic_data` in parallel and keeps months <= 12,
        reported as `37 - month`;
      * joins `dynamic_data_vocab` (filtered to `feature_type`) and `codes`
        and counts occurrences per month / feature, overall and per target.

    NOTE(review): `LIMIT` is applied after ranking both classes together, so
    it keeps `limit` rows in total, not `limit` rows per target class -
    confirm this is intended.

    Args:
        feature_type: value of `dynamic_data_vocab.type` to keep, e.g.
            "procedures" or "diagnoses". It is interpolated into the SQL text,
            so pass trusted constants only.
        limit: number of ranked result rows to keep.

    Returns:
        pandas.DataFrame with columns month, feature, code, description,
        count_all, count_1, count_0, ratio_1, ratio_0, diff, ratio_diff.
    """
    # A `HAVING count_all > N` filter (N = 30 for procedures, 15 for diagnoses)
    # was tried in earlier revisions and is intentionally not applied.
    return conn.query(f"""
    WITH res AS (
        SELECT idx, target, prediction, logit, result, res,
            rank() OVER (PARTITION BY target ORDER BY logit DESC) AS rank
        FROM results
        WHERE result
        ORDER BY rank
        LIMIT {limit}
    ), mdynamic AS (
        SELECT res.target, unnest(months) AS month, unnest(dynamic_data) as feature_vec
        FROM res
                INNER JOIN data USING (idx)
        ORDER BY rank
    ), gd AS (
        select target, month, unnest(feature_vec) as feature
        from mdynamic
        where month <=12
    )
    SELECT
        37-month as month,
        feature,
        dynamic_data_vocab.code,
        dynamic_data_vocab.code || ' - ' || COALESCE(codes.description, 'none') AS description,
        COUNT(*) as count_all,
        COALESCE(count(*) FILTER (WHERE target = 1),0) as count_1,
        COALESCE(count(*) FILTER (WHERE target = 0),0) as count_0,
        count_1 / count_all as ratio_1,
        count_0 / count_all as ratio_0,
        count_1 - count_0 as diff,
        diff / count_all as ratio_diff
    FROM gd
            INNER JOIN dynamic_data_vocab ON dynamic_data_vocab.id = gd.feature
            LEFT JOIN codes ON dynamic_data_vocab.code = codes.code || '_' || codes.type
    WHERE dynamic_data_vocab.type = '{feature_type}'
    GROUP BY ALL
    ORDER BY count_all desc, feature, month
    """).df()


df_procs = query_top_dynamic_features("procedures", limit)
df_diags = query_top_dynamic_features("diagnoses", limit)

In [9]:
# Materialise the aggregated frames as DuckDB tables used by the heatmap queries.
# Existing tables are dropped first so the cell can be re-run.
for stale_table in ("procs_results", "diags_results"):
    conn.execute(f"DROP TABLE IF EXISTS {stale_table};")
# The in-scope DataFrames are referenced by their variable names inside the SQL.
for new_table, source_frame in (("procs_results", "df_procs"), ("diags_results", "df_diags")):
    conn.execute(f"CREATE TABLE {new_table} AS SELECT * FROM {source_frame};")

def prepare_hitmaps_data(t, target):
    """Pivot one metric of `{t}_results` into a description-by-month matrix.

    Args:
        t: table prefix; the query reads from the `{t}_results` table.
        target: name of the column that is summed into the pivot cells
            (e.g. "count_1", "count_0", "diff", "ratio_diff").

    Returns:
        pandas.DataFrame with a `description` column followed by one column
        per month, rows ordered by the total `count_all` (descending).
    """
    pivot_sql = f"""
        WITH all_data AS (
            SELECT description, feature, sum(count_all) as count_all, sum(count_1) as count_1, sum(count_0) as count_0
            FROM {t}_results
            GROUP BY ALL 
            ORDER BY count_all desc
        ), p_data AS (
            PIVOT {t}_results
            ON month
            USING SUM({target})
            GROUP BY description
        )
        SELECT p_data.*
        FROM p_data
            INNER JOIN all_data USING (description)
        ORDER BY all_data.count_all desc
        """
    pivoted = conn.query(pivot_sql)
    return pivoted.df()

# One pivot per metric, in the order: target 1 counts, target 0 counts,
# count difference, difference ratio.
df_procs_1, df_procs_0, df_procs_diff, df_procs_diff_ratio = (
    prepare_hitmaps_data("procs", metric)
    for metric in ("count_1", "count_0", "diff", "ratio_diff")
)

df_diags_1, df_diags_0, df_diags_diff, df_diags_diff_ratio = (
    prepare_hitmaps_data("diags", metric)
    for metric in ("count_1", "count_0", "diff", "ratio_diff")
)

def hitmaps_value_ranges(t):
    """Return the colour-scale bounds for the `{t}_results` table.

    Values are first summed per (description, month); the bounds are taken
    over those sums.

    Args:
        t: table prefix; the query reads from the `{t}_results` table.

    Returns:
        1-D array with, in order: min(count_all), max(count_all),
        max(count_1), max(count_0), min(diff), max(diff), min(ratio_diff),
        max(ratio_diff).
    """
    return conn.query(f"""
        WITH all_data AS (
            SELECT description, month, sum(count_all) as count_all, sum(count_1) as count_1, sum(count_0) as count_0, sum(diff) as diff, sum(ratio_diff) as ratio_diff
            FROM {t}_results
            GROUP BY ALL
        )
        SELECT min(count_all), max(count_all),
            max(count_1) AS m_count_1, max(count_0) AS m_count_0, min(diff) AS min_diff, max(diff) AS max_diff, min(ratio_diff) AS min_ratio, max(ratio_diff) AS max_ratio
        FROM all_data
    """).df().values[0]


(proc_min, proc_max, proc_1_max, proc_0_max, proc_min_diff, proc_max_diff, proc_min_dr, proc_max_dr) = hitmaps_value_ranges("procs")

(diag_min, diag_max, diag_1_max, diag_0_max, diag_min_diff, diag_max_diff, diag_min_dr, diag_max_dr) = hitmaps_value_ranges("diags")
    

In [None]:
# create diagnoses and procedures heatmaps charts
import seaborn as sns
import matplotlib.pyplot as plt

def print_heatmaps(name, df_diff, df_0, df_1, max0, max1, min_diff, max_diff, page_size=20):
    """Draw paged heatmaps of feature counts per month.

    For every page of `page_size` rows three stacked heatmaps are shown:
    `df_0`, `df_1` and `df_diff`.

    Args:
        name: label used in the plot titles.
        df_diff: frame with a `description` column plus one column per month,
            holding the count differences.
        df_0: frame of the same layout plotted in the first panel.
        df_1: frame of the same layout plotted in the second panel.
        max0: upper colour bound for the `df_0` panel.
        max1: upper colour bound for the `df_1` panel.
        min_diff: lower colour bound for the difference panel.
        max_diff: upper colour bound for the difference panel.
        page_size: number of rows per figure (default 20).

    The three frames are sliced by row position, so they are expected to list
    the descriptions in the same order.
    """
    diff = df_diff.set_index("description")
    df0 = df_0.set_index("description")
    df1 = df_1.set_index("description")
    total = len(diff)
    for i in range(0, total, page_size):
        # Last page may be shorter: report the real upper bound in the titles.
        end = min(i + page_size, total)
        fig, axes = plt.subplots(3, figsize=(18, 18))
        axes = axes.flatten()
        sns.heatmap(df0.iloc[i:i + page_size],annot=False,cmap="YlGnBu",ax=axes[0],cbar=True, vmin=0, vmax=max0)
        axes[0].set_title(f"{name} from {i} to {end} for A1 less or equal than 7")
        axes[0].set_ylabel("Feature")
        axes[0].set_xlabel("Month")
        sns.heatmap(df1.iloc[i:i + page_size],annot=False,cmap="YlGnBu",ax=axes[1],cbar=True, vmin=0, vmax=max1)
        axes[1].set_title(f"{name} {i} to {end}  for A1 greater than 7")
        axes[1].set_ylabel("Feature")
        axes[1].set_xlabel("Month")
        sns.heatmap(diff.iloc[i:i + page_size],annot=False,cmap="RdBu", center=0,ax=axes[2],cbar=True, vmin=min_diff, vmax=max_diff)
        axes[2].set_title(f"{name} {i} to {end} difference")
        axes[2].set_ylabel("Feature")
        axes[2].set_xlabel("Month")
        plt.show()
        # Release the figure so many pages do not accumulate in memory.
        plt.close(fig)

# Render the paged heatmaps for procedures and diagnoses; the colour bounds come
# from the min/max values computed over the corresponding *_results tables.
print_heatmaps("procedures", df_procs_diff, df_procs_0, df_procs_1, proc_0_max, proc_1_max, proc_min_diff, proc_max_diff)
# Title label fixed: "diagonses" -> "diagnoses".
print_heatmaps("diagnoses", df_diags_diff, df_diags_0, df_diags_1, diag_0_max, diag_1_max, diag_min_diff, diag_max_diff)

In [11]:
# Prepare labs data for plotting.
#
# df_group_labs: one row per lab `code` with the number of occurrences among
# the top-ranked result rows (all months), ordered by that count descending.
# The ranking CTEs (res / mdynamic / gd) take rows of `results` where `result`
# is true, rank them per target by logit descending, keep the first `limit`
# rows by rank and unnest their monthly dynamic feature ids.
#
# NOTE(review): `codes` is LEFT JOINed but none of its columns is selected; if
# a vocab code matched several `codes` rows the join would inflate COUNT(*) -
# confirm the join is needed.
limit = 10
df_group_labs = conn.query(f"""
WITH res AS (
    SELECT idx, target, prediction, logit, result, res,
        rank() OVER (PARTITION BY target ORDER BY logit DESC) AS rank
    FROM results
    WHERE result
    ORDER BY rank
    LIMIT {limit}
), mdynamic AS (
    SELECT res.target, unnest(months) AS month, unnest(dynamic_data) as feature_vec
    FROM res
            INNER JOIN data USING (idx)
    ORDER BY rank
), gd AS (
    select target, month, unnest(feature_vec) as feature
    from mdynamic
), pre_data AS (
    SELECT 
        dynamic_data_vocab.code,
        COUNT(*) as count
    FROM gd
            INNER JOIN dynamic_data_vocab ON dynamic_data_vocab.id = gd.feature
            LEFT JOIN codes ON dynamic_data_vocab.code = codes.code || '_' || codes.type
    WHERE dynamic_data_vocab.type = 'labs'
    GROUP BY ALL
)
SELECT *
FROM pre_data
ORDER BY count desc
""").df()

# df_labs: per target / month / lab value-range counts for every lab code
# listed in df_group_labs.
#   * res / mdynamic / gd: same ranking and unnesting as for df_group_labs.
#   * pre_agg: occurrences per (target, month, feature).
#   * groupped: every code of the in-scope `df_group_labs` DataFrame (referenced
#     by name inside the SQL) paired with both targets 0 and 1.
#   * pre_data: one row per lab vocab entry of those codes; `description` is
#     "<code> <lower_bound> - <upper_bound>", `month` is 37 - month (1 when the
#     entry never occurred) and `count` is the summed occurrences (0 if none).
#   * dd: generated month numbers LEFT JOINed to pre_data; `rate` is the total
#     count per (target, code) and drives the ordering.
#
# NOTE(review): `range(1, 36)` yields 1..35 (the stop value is excluded), while
# `37 - month` can reach 36 if `months` starts at 1 - month 36 would then be
# dropped by the join; confirm whether `range(1, 37)` was intended.
df_labs = conn.query(f"""
WITH res AS (
    SELECT idx, target, prediction, logit, result, res,
        rank() OVER (PARTITION BY target ORDER BY logit DESC) AS rank
    FROM results
    WHERE result
    ORDER BY rank
    LIMIT {limit}
), mdynamic AS (
    SELECT res.target, unnest(months) AS month, unnest(dynamic_data) as feature_vec
    FROM res
            INNER JOIN data USING (idx)
    ORDER BY rank
), gd AS (
    select target, month, unnest(feature_vec) as feature
    from mdynamic
), pre_agg AS (
    SELECT target, month, feature, count(*) as count
    FROM gd
    GROUP BY ALL
), groupped AS (
    SELECT code, unnest(array[0,1]) as target
    FROM df_group_labs
), pre_data AS (
    SELECT 
        groupped.target, 
        groupped.code,
        dynamic_data_vocab.code || ' ' || dynamic_data_vocab.lower_bound || ' - ' || dynamic_data_vocab.upper_bound AS description,
        COALESCE(37-month, 1) as month, 
        COALESCE(SUM(pre_agg.count),0) as count
    FROM groupped
            INNER JOIN dynamic_data_vocab ON dynamic_data_vocab.code = groupped.code
            LEFT JOIN pre_agg ON dynamic_data_vocab.id = pre_agg.feature and groupped.target = pre_agg.target
    WHERE dynamic_data_vocab.type = 'labs'
    GROUP BY ALL
), dd AS (
    SELECT target, months.month, code, description, SUM(coalesce(count, 0)) OVER (PARTITION BY target, code) AS rate, coalesce(count, 0) as count
    FROM (select unnest(range(1, 36))::integer as month) as months
            left join pre_data on months.month = pre_data.month
        
    ORDER BY rate desc, code, months.month
)
FROM dd
""").df()

In [None]:
# plot labs data
import matplotlib.pyplot as plt
import seaborn as sns

def _lab_pivot(lab_code, target_value):
    """Description-by-month count matrix of `df_labs` for one lab code and target."""
    selected = df_labs[(df_labs["target"] == target_value) & (df_labs["code"] == lab_code)]
    return selected.pivot_table(index="description", columns="month", values="count", aggfunc="sum", fill_value=0)


# One figure per lab code: target 0 counts, target 1 counts and their difference.
for _, lab_row in df_group_labs.iterrows():
    lab_code, lab_count = lab_row["code"], lab_row["count"]
    lab_1 = _lab_pivot(lab_code, 1)
    lab_0 = _lab_pivot(lab_code, 0)
    lab_diff = lab_1 - lab_0

    fig, axes = plt.subplots(3, figsize=(18, 18))
    panels = [
        (lab_0, {"cmap": "YlGnBu"}, f"labs {lab_code} rank {lab_count} for A1 less or equal than 7"),
        (lab_1, {"cmap": "YlGnBu"}, f"labs {lab_code} rank {lab_count} for A1 greater than 7"),
        (lab_diff, {"cmap": "RdBu", "center": 0}, f"labs {lab_code} rank {lab_count} differences between A1C1 greater and less"),
    ]
    for ax, (matrix, heatmap_kwargs, title) in zip(axes.flatten(), panels):
        sns.heatmap(matrix, annot=False, ax=ax, cbar=True, **heatmap_kwargs)
        ax.set_title(title)
        ax.set_ylabel("Feature")
        ax.set_xlabel("Month")
    plt.show()

In [None]:
# Plot static features chart.
#
# df_static: number of occurrences of every static feature among the top-ranked
# result rows, per target class, with the decoded vocabulary type / value.
# Rows of `results` where `result` is true are ranked per target by logit
# descending and the first `limit` rows by rank are kept.
# NOTE(review): LIMIT is applied across both target classes together, so it
# keeps `limit` rows in total rather than `limit` per class - confirm.
limit = 1000

import matplotlib.pyplot as plt
import numpy as np


df_static = conn.query(f"""
WITH res AS (
    SELECT idx, target, prediction, logit, result, res,
        rank() OVER (PARTITION BY target ORDER BY logit DESC) AS rank
    FROM results
    WHERE result
    ORDER BY rank
    LIMIT {limit}
), static AS (
    SELECT res.target, unnest(static_data) as feature
    FROM res
            INNER JOIN data USING (idx)
    ORDER BY rank
)
SELECT target, feature, static_data_vocab.type, static_data_vocab.value, COUNT(*) as count
FROM static
        INNER JOIN static_data_vocab ON static_data_vocab.id = static.feature
GROUP BY ALL
ORDER BY static_data_vocab.type, static_data_vocab.value
""").df()

# Label every row as "<type> <value>" once; rows whose label is missing can
# never be matched by a label lookup and are left out.
static_labelled = df_static.assign(
    label=df_static["type"] + " " + df_static["value"]
).dropna(subset=["label"])

# Distinct labels in (type, value) order - this is the y axis of the chart.
static_sorted = df_static.sort_values(by=["type", "value"])
unique_labels = (static_sorted["type"] + " " + static_sorted["value"]).unique()


def _static_counts_for(target_value):
    """Return the count of each label in `unique_labels` for one target (0 if absent)."""
    per_target = static_labelled[static_labelled["target"] == target_value]
    # Keep the first row per label, matching the previous `.values[0]` lookup.
    first_rows = per_target.drop_duplicates(subset="label", keep="first")
    count_by_label = dict(zip(first_rows["label"], first_rows["count"]))
    return [count_by_label.get(label, 0) for label in unique_labels]


counts_target_0 = _static_counts_for(0)
counts_target_1 = _static_counts_for(1)

# Grouped horizontal bars: one pair (target 0 / target 1) per static feature label.
y_positions = np.arange(len(unique_labels))
fig, ax = plt.subplots(figsize=(14, 10))
ax.barh(y_positions - 0.2, counts_target_0, height=0.4, label="A1 less or equal than 7", color="skyblue", alpha=0.8)
ax.barh(y_positions + 0.2, counts_target_1, height=0.4, label="A1 greater than 7", color="salmon", alpha=0.8)
ax.set_yticks(y_positions)
ax.set_yticklabels(unique_labels, rotation=0)
ax.set_xlabel("Count")
ax.set_ylabel("Type and Value")
ax.set_title("Static Features")
ax.legend()
fig.tight_layout()

plt.show()