In [3]:
%pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
[K     |████████████████████████████████| 249 kB 16.8 MB/s eta 0:00:01
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
from pgmpy.models import BayesianModel
from pgmpy.estimators import HillClimbSearch, K2Score, ExhaustiveSearch, BDeuScore
from pgmpy.sampling import BayesianModelSampling
import pandas as pd
import numpy as np
from pgmpy.inference.CausalInference import CausalInference
from pyvis import network as net

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# 連続値データの読み込み
data = pd.read_excel('../data/230724_0804_AI用qPCRデータ.xlsx', index_col=0, header=1)
data = data.dropna()
data = data[['bglB', 'malx', 'yihR', 'chbF', 'ascF']]
# data = data[['bglB', 'malx', 'yihR', 'chbF']]
data

Unnamed: 0_level_0,bglB,malx,yihR,chbF,ascF
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2.389907e-07,9.403729e-07,4.792578e-07,4.533360e-07,1.842627e-06
2,3.838396e-06,3.847846e-06,3.848767e-06,1.920287e-06,3.815250e-06
3,4.884611e-07,9.615077e-07,4.864569e-07,4.830404e-07,9.581183e-07
4,1.217226e-07,4.807837e-07,1.207673e-07,2.398760e-07,4.799480e-07
5,2.417564e-07,9.593193e-07,2.428970e-07,4.823107e-07,9.566899e-07
...,...,...,...,...,...
107,4.848761e-07,3.886102e-06,9.700637e-07,9.729974e-07,1.929397e-06
108,2.408789e-07,1.906135e-06,9.624883e-07,9.566223e-07,9.527273e-07
109,4.815630e-07,3.875600e-06,1.944778e-06,9.663708e-07,1.900486e-06
110,1.881602e-06,1.900545e-06,1.885412e-06,9.396897e-07,1.901576e-06


In [5]:
# 全探索
est = ExhaustiveSearch(data, scoring_method=K2Score(data))
# est = ExhaustiveSearch(data, scoring_method=BDeuScore(data))
best_model = est.estimate()

In [5]:
# BDeu usage
edges = best_model.edges()
nodes = best_model.nodes()

print(edges)
print(nodes)

[('bglB', 'chbF'), ('bglB', 'malx'), ('bglB', 'yihR')]
['bglB', 'chbF', 'malx', 'yihR']


In [6]:
# K2 usage
edges = best_model.edges()
nodes = best_model.nodes()

print(edges)
print(nodes)

[('bglB', 'chbF'), ('bglB', 'malx'), ('bglB', 'yihR')]
['bglB', 'chbF', 'malx', 'yihR']


In [7]:
# draw network
edge_list = list(edges)
edge_list = [i + tuple([0.1]) for i in edge_list]  # ウェイトの値を適当に追加

# jupyter中に表示させる場合、notebook=True, →にする場合directed=True
got_net = net.Network(notebook=True, directed=True)

for e in edge_list:
    src = e[0]
    dst = e[1]
    w = e[2]

    got_net.add_node(src, title=src, shape='ellipse', label=src)
    got_net.add_node(dst, title=dst, shape='ellipse', label=dst)
    got_net.add_edge(src, dst, width=3)

    # print(src)
    # print(dst)
    # print(w)
    # print('='*10)

neighbor_map = got_net.get_adj_list()

for node in got_net.nodes:
    node['title'] += ' Neighbors:<br>' + '<br>'.join(neighbor_map[node['id']])
    node['value'] = len(neighbor_map[node['id']])
    # node['label'] = node['label']

got_net.show_buttons(False)

got_net.show('../work/output/network_usage_K2Score.html')


../work/output/network_usage_K2Score.html


In [7]:
# DAGオブジェクトからBayesianModelオブジェクトへ変換
best_model_bayesian = BayesianModel(best_model)



In [8]:
# 探索したネットワークのエッジからネットワークのモデルを構築してCPDを求める
model = BayesianModel(list(edges))
# 独立なノードが入っていない場合にも対応
model.add_nodes_from(list(nodes))
model.fit(data)  # cpds を計算
cpds = model.get_cpds()
# for cpd in cpds:
#     print(cpd, '\n')

  tabular_cpd.values = (cpd / cpd.sum(axis=0)).reshape(tabular_cpd.cardinality)
  tabular_cpd.values = (cpd / cpd.sum(axis=0)).reshape(tabular_cpd.cardinality)


In [11]:
print(cpds)

[<TabularCPD representing P(ascF:71 | malx:71, yihR:71) at 0x7f56d50c36d0>, <TabularCPD representing P(bglB:71 | ascF:71, malx:71, yihR:71) at 0x7f56d5147370>, <TabularCPD representing P(chbF:71 | ascF:71, bglB:71, malx:71, yihR:71) at 0x7f56d50c2320>, <TabularCPD representing P(malx:71) at 0x7f56d5147070>, <TabularCPD representing P(yihR:71 | malx:71) at 0x7f56d51476d0>]


In [20]:
# # HillClimbSearch
# # 全探索
# hc = HillClimbSearch(data)
# hc_best_model = hc.estimate()

# hc_edges = hc_best_model.edges()
# hc_nodes = hc_best_model.nodes()

# print("="*20)
# print(hc_edges)
# print(hc_nodes)

# # DAGオブジェクトからBayesianModelオブジェクトへ変換
# hc_best_model_bayesian = BayesianModel(hc_best_model)

# # 探索したネットワークのエッジからネットワークのモデルを構築してCPDを求める
# hc_model = BayesianModel(list(hc_edges))
# # 独立なノードが入っていない場合にも対応
# hc_model.add_nodes_from(list(hc_nodes))
# hc_model.fit(data)  # cpds を計算
# hc_cpds = hc_model.get_cpds()
# # for cpd in cpds:
# #     print(cpd, '\n')

# print("="*20)
# print(hc_cpds)

# print("="*20)
# for cpd in hc_cpds:
#     print(cpd)


In [12]:
for cpd in cpds:
    best_model_bayesian.add_cpds(cpd)

TypeError: 'TabularCPD' object is not iterable

In [19]:
# パラメータ推定（確率分布の推定）
sampler = BayesianModelSampling(best_model_bayesian)
samples = sampler.forward_sample(size=1000)

ValueError: Sum or integral of conditional probabilities for node ascF is not equal to 1.

In [21]:
# from scipy.stats import norm
# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.set()

# # 可視化関数
# def norm_plot(node, min_value, max_value, mu, std):
#     X = np.arange(start=min_value, stop=max_value, step=0.000000001)

#     # pdfで確率密度関数を生成
#     norm_pdf = norm.pdf(x=X, loc=mu, scale=std)

#     fig = plt.figure(figsize=(15, 8))

#     plt.plot(X, norm_pdf)
#     plt.title(node)
#     plt.show()
#     fig.savefig('./output/pdf/{}.png'.format(node))

In [22]:
# # 推定されたパラメータの確率分布を表示
# for node, min_value, max_value in zip(best_model_bayesian.nodes(), data.min(), data.max()):
#     variable_samples = samples[node]
#     variable_mean = np.mean(variable_samples)
#     variable_std = np.std(variable_samples)
#     print(f"{node}: Normal(mu={variable_mean}, sd={variable_std})")
#     norm_plot(node, min_value, max_value, variable_mean, variable_std)

In [11]:
from pyvis import network as net

edge_list = list(best_network.edges())
edge_list = [i + tuple([0.1]) for i in edge_list]  # ウェイトの値を適当に追加

# jupyter中に表示させる場合、notebook=True, →にする場合directed=True
got_net = net.Network(notebook=True, directed=True)

for e in edge_list:
    src = e[0]
    dst = e[1]
    w = e[2]

    got_net.add_node(src, title=src, shape='ellipse', label=src)
    got_net.add_node(dst, title=dst, shape='ellipse', label=dst)
    got_net.add_edge(src, dst, width=3)

    # print(src)
    # print(dst)
    # print(w)
    # print('='*10)

neighbor_map = got_net.get_adj_list()

for node in got_net.nodes:
    node['title'] += ' Neighbors:<br>' + '<br>'.join(neighbor_map[node['id']])
    node['value'] = len(neighbor_map[node['id']])
    # node['label'] = node['label']

got_net.show_buttons(False)

got_net.show('../work/output/network.html')


../work/output/network.html
