In [None]:
import pandas as pd
import plotly.express as px

In [None]:
import numpy as np

In [None]:
from collections import defaultdict

In [None]:
# Load the dataset; later cells read its 'seq', 'stem_bb_bps' and 'target_bps' columns.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# when it comes from a trusted source.
df = pd.read_pickle('data/human_transcriptome_segment_high_mfe_freq_training_len20_200_5000_pred_stem_bps.pkl.gz')

In [None]:
# Take the first record of df as the worked example for the heat map built in the following cells.
row = df.iloc[0]

In [None]:
# Sequence of the example record; its length sets the size of the pair matrix below.
seq = row['seq']

In [None]:
# Square len(seq) x len(seq) matrix of zeros. Later cells set entries to 1 for
# 'stem_bb_bps' index pairs and to 2 for 'target_bps' index pairs.
x = np.zeros((len(seq), len(seq)))

In [None]:
# Mark every index pair listed in 'stem_bb_bps' with 1.
# Each pair is unpacked and used as two explicit scalar indices: a bare
# `x[pair]` only addresses a single cell when the pair is a tuple, whereas a
# list or array pair is interpreted as fancy indexing and overwrites two rows.
for i, j in row['stem_bb_bps']:
    x[i, j] = 1

In [None]:
# Mark every index pair listed in 'target_bps' with 2. This runs after the
# 'stem_bb_bps' pass, so a pair present in both lists ends up as 2.
# Each pair is unpacked and used as two explicit scalar indices: a bare
# `x[pair]` only addresses a single cell when the pair is a tuple, whereas a
# list or array pair is interpreted as fancy indexing and overwrites two rows.
for i, j in row['target_bps']:
    x[i, j] = 2

In [None]:
# Heat map of the pair matrix (0 = unmarked, 1 = 'stem_bb_bps', 2 = 'target_bps').
px.imshow(x)

In [None]:
def _stem_bp_stats(seq, bps):
    """Summarise the proposed base pairs of a single sequence.

    Args:
        seq: the sequence; only its length is used.
        bps: iterable of index pairs; element 0 and element 1 of each pair are
            the two paired positions (the record's 'stem_bb_bps').

    Returns:
        dict with the keys
            'len'        -- len(seq)
            'idx2ct'     -- position -> number of pairs touching it; holds only
                            positions that take part in at least one pair
            'n_wo_bp'    -- number of positions without any proposed pair
            'bp_density' -- distinct pairs divided by the number of entries in
                            the upper triangle (diagonal excluded); NaN when
                            the sequence is shorter than 2
    """
    n = len(seq)

    # number of edges per node (for nodes with at least 1 edge)
    # defaultdict(int) instead of a lambda factory keeps the stored dict picklable.
    idx2ct = defaultdict(int)
    # Distinct (i, j) pairs: same count as filling an n x n 0/1 matrix and
    # summing it, without allocating the matrix.
    distinct_bps = set()
    for bp in bps:
        i, j = bp[0], bp[1]
        idx2ct[i] += 1
        idx2ct[j] += 1
        distinct_bps.add((i, j))

    # number of bases without any proposed bp
    n_wo_bp = n - len(idx2ct)

    # connection density
    # denominator: number of entries in the upper triangle (diagonal excluded)
    n_possible = (n**2 - n) / 2
    # Guard the degenerate n < 2 case explicitly instead of dividing by zero.
    bp_density = len(distinct_bps) / n_possible if n_possible else np.nan

    return {
        'len': n,
        'idx2ct': idx2ct,
        'n_wo_bp': n_wo_bp,
        'bp_density': bp_density,
    }


# One stats record per row of df. Iterating the two needed columns directly
# keeps the loop variables local, so the notebook-level `row` and `seq` used by
# the single-example cells are not overwritten.
stats_records = [
    _stem_bp_stats(rec_seq, rec_bps)
    for rec_seq, rec_bps in zip(df['seq'], df['stem_bb_bps'])
]

df_stats = pd.DataFrame(stats_records)

In [None]:
# Distribution of the per-sequence 'bp_density' values in df_stats.
px.histogram(df_stats['bp_density'])

In [None]:
# 'bp_density' against sequence length ('len'), one point per row of df_stats.
px.scatter(df_stats, x='len', y='bp_density')

In [None]:
# Distribution of 'n_wo_bp': the number of bases without any proposed base pair, per sequence.
px.histogram(df_stats['n_wo_bp'])

In [None]:
# Pool the edges-per-node counts of every sequence and plot their distribution.
# A comprehension keeps the loop variables local, so the notebook-level `row`
# used by the single-example cells is not overwritten.
n_edge_per_node = [
    n_edges
    for idx2ct in df_stats['idx2ct']
    for n_edges in idx2ct.values()
]
fig = px.histogram(n_edge_per_node)
fig.update_layout(width=800, height=300, title='all', xaxis_title='number of edges per node')

In [None]:
# Edges-per-node distribution restricted to sequences with 20 <= len < 50.
# A comprehension keeps the loop variables local, so the notebook-level `row`
# used by the single-example cells is not overwritten.
n_edge_per_node = [
    n_edges
    for seq_len, idx2ct in zip(df_stats['len'], df_stats['idx2ct'])
    if 20 <= seq_len < 50
    for n_edges in idx2ct.values()
]
fig = px.histogram(n_edge_per_node)
fig.update_layout(width=800, height=300, title='len in [20, 50)', xaxis_title='number of edges per node')

In [None]:
# Edges-per-node distribution restricted to sequences with 50 <= len < 100.
# A comprehension keeps the loop variables local, so the notebook-level `row`
# used by the single-example cells is not overwritten.
n_edge_per_node = [
    n_edges
    for seq_len, idx2ct in zip(df_stats['len'], df_stats['idx2ct'])
    if 50 <= seq_len < 100
    for n_edges in idx2ct.values()
]
fig = px.histogram(n_edge_per_node)
fig.update_layout(width=800, height=300, title='len in [50, 100)', xaxis_title='number of edges per node')

In [None]:
# Edges-per-node distribution restricted to sequences with 100 <= len <= 200
# (upper bound inclusive, unlike the shorter-length bins).
# A comprehension keeps the loop variables local, so the notebook-level `row`
# used by the single-example cells is not overwritten.
n_edge_per_node = [
    n_edges
    for seq_len, idx2ct in zip(df_stats['len'], df_stats['idx2ct'])
    if 100 <= seq_len <= 200
    for n_edges in idx2ct.values()
]
fig = px.histogram(n_edge_per_node)
fig.update_layout(width=800, height=300, title='len in [100, 200]', xaxis_title='number of edges per node')