In [1]:
import os

import cdt
import networkx as nx
from cdt.causality.graph import SAM
from cdt.data import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import japanize_matplotlib

# Notebook-wide display settings: 5 significant decimals for NumPy arrays,
# and show up to 1000 columns / rows when a DataFrame is rendered.
np.set_printoptions(precision=5)
for _option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_option_name, 1000)
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# --- Paths and run identifiers -------------------------------------------
DATA_PATH = './data/CTU_30m_pH_with_Baseline.csv'
VERSION = 'v13_20220809'
SAM_RESULT = './sam_result'    # name of the folder that receives the SAM results
SEM_RESULT = './sem_graph'
FIT_INDEX_RESULT = './df_fit_index'

# --- Hyperparameter grids ------------------------------------------------
# Earlier grids, kept for reference:
# lambda1_list = [0.0, 0.001, 0.002, 0.01, 0.02, 0.1, 0.2, 1.0, 2.0] # λs
# lambda2_list = [0.002, 0.001, 0.0002, 0.0001, 2e-05, 1e-05, 2e-06, 1e-06, 2e-07] # λf
# lambda1_list = [0.5, 1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0] # λs
# lambda2_list = [1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 5e-1, 1.0] # λf
lambda1_list = np.round(np.arange(5, 15, 0.1), 2)
# String form of each λs value with '.' replaced by '-' (e.g. 5.0 -> '5-0').
lambda1_str_list = [label.replace('.', '-') for label in lambda1_list.astype('str')]
nruns_list = [32, 64, 96] # number of SAM runs that are averaged together
threshold_list = np.round(np.arange(0.5, 0.75, 0.05), 3) # thresholds used in the path analysis

# --- Training budget -----------------------------------------------------
TRAIN_EPOCHS = 10000
# The constant was originally misspelled; the old name stays as an alias so
# cells that still reference it keep working.
TRAIN_EOPCHS = TRAIN_EPOCHS
TEST_EPOCHS = 2000
NJOBS = 16

# lambda1_str_list = list(map(lambda x:str(x).replace('.', '-'), lambda1_list))
# lambda2_str_list = list(map(lambda x:str(x).replace('.', '-'), lambda2_list))

No GPU automatically detected. Setting SETTINGS.GPU to 0, and SETTINGS.NJOBS to cpu_count.


In [2]:
def run_SAM(data, nruns, lam1, lam2, njobs, train_epochs=10000, test_epochs=2000, gpus=2):
    '''
    Run SAM on ``data`` and return the predicted graph as a matrix.

    Parameters
    ----------
    data: DataFrame
        Data to train on.

    nruns: int
        Number of runs that are averaged (forwarded to ``SAM(nruns=...)``).

    lam1: float
        SAM parameter λs (forwarded as ``lambda1``).

    lam2: float
        SAM parameter λf (forwarded as ``lambda2``).

    njobs: int
        Number of jobs (forwarded to ``SAM(njobs=...)``); the original note
        describes it as the job allocation for the GPUs.

    train_epochs: int, default 10000
        Forwarded to ``SAM(train_epochs=...)``.

    test_epochs: int, default 2000
        Forwarded to ``SAM(test_epochs=...)``.

    gpus: int, default 2
        Forwarded to ``SAM(gpus=...)``.

    Returns
    -------
    matrix: DataFrame
        The SAM result converted to a matrix by ``convert_matrix``.
    '''
    sam_obj = SAM(lambda1=lam1, lambda2=lam2,
                  train_epochs=train_epochs, test_epochs=test_epochs,
                  nruns=nruns, njobs=njobs, gpus=gpus)
    output = sam_obj.predict(data, graph=None, return_list_results=False)

    # predict() may hand back a tuple whose first item is the graph; only the
    # graph is needed here, so any extra items are ignored.
    out_nx = output[0] if isinstance(output, tuple) else output

    return convert_matrix(out_nx)

def convert_matrix(edges_nx):
    '''
    Convert the graph returned by SAM into a weighted adjacency matrix.

    The node set is taken from ``edges_nx.nodes()`` rather than from the edge
    sources, so nodes that only appear as an edge target (or have no edges at
    all) still get a row and a column instead of raising ``ValueError`` or
    being dropped.

    Parameters
    ----------
    edges_nx: object
        Graph object output by SAM. It must provide ``nodes()`` and
        ``edges(data=True)``, and every edge must carry a ``'weight'``
        attribute.

    Returns
    -------
    matrix: DataFrame
        Square matrix of the causal structure: entry ``[source, target]`` is
        the edge weight, 0 where there is no edge. The diagonal is always 0.
        Rows and columns follow the node order of ``edges_nx.nodes()``.

    Raises
    ------
    KeyError
        If an edge has no ``'weight'`` attribute.
    '''
    # Kept from the original implementation: this changes NumPy's global
    # print options as a side effect of the call.
    np.set_printoptions(precision=5, suppress=True)

    nodes = list(edges_nx.nodes())
    position = {node: k for k, node in enumerate(nodes)}

    mat = np.zeros((len(nodes), len(nodes)))
    for source, target, attrs in edges_nx.edges(data=True):
        mat[position[source], position[target]] = attrs['weight']

    # Self-loops are not reported: force the diagonal to zero.
    np.fill_diagonal(mat, 0)

    return pd.DataFrame(mat, index=nodes, columns=nodes)

def show_heatmap(result_name, sam_result=SAM_RESULT, version=VERSION, origin_data_path=DATA_PATH,
                 figsize=(12,10), font_scale=1.2, linewidths=0.3, cmap='summer_r', 
                 annot=True, annot_kws={'size': 13}, fmt='1.3f'):
    '''
    Show a heatmap of a saved SAM result.

    Parameters
    ----------
    result_name: str
        Name of the result file (without the ``.csv`` extension).

    sam_result: str
        Folder that holds the SAM results.

    version: str
        Sub-folder of ``sam_result`` to read from.

    origin_data_path: str
        CSV file whose column names are used as the tick labels on both axes.
        Only its header row is read.

    figsize: tuple
        Figure size passed to ``plt.figure``.

    font_scale: float
        Passed to ``sns.set``.

    linewidths, cmap, annot, annot_kws, fmt
        Passed through to ``sns.heatmap``.
    '''
    df_result = pd.read_csv(f'{sam_result}/{version}/{result_name}.csv')

    # Use the path given by the caller (previously the global DATA_PATH was
    # read regardless of this argument). nrows=0 loads the header only.
    columns = pd.read_csv(origin_data_path, nrows=0).columns

    # The default annot_kws dict is shared between calls; hand seaborn a copy.
    heatmap_annot_kws = dict(annot_kws) if annot_kws is not None else None

    plt.figure(figsize=figsize)
    sns.set(font_scale=font_scale)
    sns.heatmap(df_result, linewidths=linewidths, cmap=cmap, xticklabels=columns,
                yticklabels=columns, annot=annot, annot_kws=heatmap_annot_kws, fmt=fmt)
    # plt.show() returns None, so wrapping it in display() only added a stray
    # "None" to the cell output.
    plt.show()

In [3]:
# Load the dataset from DATA_PATH; this frame is what gets passed to run_SAM.
df_data = pd.read_csv(DATA_PATH)
# df_data.tail(2)
len(df_data)  # last expression of the cell: displays the number of rows

495

In [None]:
# One SAM run with explicit, named hyperparameters.
mat = run_SAM(
    df_data,
    nruns=32,
    lam1=0.1,
    lam2=0.001,
    njobs=1,
    train_epochs=TRAIN_EOPCHS,
    test_epochs=TEST_EPOCHS,
    gpus=2,
)

In [None]:
# Save the result under <SAM_RESULT>/<VERSION>/.
# SAM_RESULT already starts with './', so no extra prefix is added, and the
# folder is created first because to_csv does not create missing directories.
# NOTE(review): `file_name` is not defined in any cell visible here; it must
# be assigned before this cell runs.
output_dir = os.path.join(SAM_RESULT, VERSION)
os.makedirs(output_dir, exist_ok=True)
mat.to_csv(os.path.join(output_dir, f'{file_name}.csv'), index=False)