# Test BBBC022/36/37 Dataset

In [None]:
# Follow main.py to get /data/boom/cpg0001/2020_08_11_Stain3_Yokogawa/images/BR00115125/Step4_Features/

In [1]:
import os
import pandas as pd
import numpy as np
import torch
from skimage.transform import resize
from torchvision import transforms
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
from utils import *

# Read 5 files to merage, single-cell features

### 先合并 plate 里面的所有特征，得带着位置一起， plate - well - feature

In [None]:

# import numpy as np
# import pandas as pd

# save_path = '../bbbc022/'
# file_path = '/data/boom/bbbc022/original/images/'

# REG_PARAM = 1e-2
# num_features = 422

# files_folder = ['20585', '20586', '20589', '20590', '20591', '20592', '20593', '20594', '20595', '20596', '20607', '20608', '20625', '20626', '20630', '20633', '20639', '20640', '20641', '20646']

# # 定义文件列表
# files = [
#     'texture_features.csv',
#     'intensity_features.csv',
#     'shape_features.csv',
#     'bbox_features.csv',
#     'misc_features.csv'
# ]


# # 遍历文件列表，读取并合并
# for i in files_folder:
#     merged_df = pd.DataFrame()
#     for file in files:
#         df = pd.read_csv(file_path+i+'/Step4_Features/'+file)
#         merged_df = pd.concat([merged_df, df], axis=1)
    
#     # save merged_df in '/data/boom/bbbc022/original/images/'

# # 检查合并后的维度
# print(f"Merged DataFrame shape: {merged_df.shape}")

# # merged_df.head()

In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

# 配置参数
FILE_PATH = '/data/boom/bbbc022/original/images/'
SAVE_PATH = '/data/boom/bbbc022/original/images/'  # 保存路径与原始路径相同
REG_PARAM = 1e-2
NUM_FEATURES = 422

# 文件夹和文件定义
FILES_FOLDER = [
    '20585', '20586', '20589', '20590', '20591', '20592',
    '20593', '20594', '20595', '20596', '20607', '20608',
    '20625', '20626', '20630', '20633', '20639', '20640', '20641', '20646'
]

FEATURE_FILES = [
    'texture_features.csv',
    'intensity_features.csv',
    'shape_features.csv',
    'bbox_features.csv',
    'misc_features.csv'
]

def merge_and_save_features(file_path, folders, feature_files, save_path):
    """
    为每个文件夹合并所有特征文件并单独保存
    
    Args:
        file_path (str): 基础路径
        folders (list): 包含特征的文件夹列表
        feature_files (list): 特征文件名列表
        save_path (str): 保存合并后文件的路径
    """
    print(f"开始处理 {len(folders)} 个文件夹的特征合并...")
    
    for folder in tqdm(folders, desc="处理文件夹"):
        merged_df = pd.DataFrame()
        folder_path = os.path.join(file_path, folder, 'Step4_Features')
        
        # 检查文件夹是否存在
        if not os.path.exists(folder_path):
            print(f"警告: 文件夹 {folder_path} 不存在，跳过...")
            continue
            
        # 合并当前文件夹的所有特征文件
        for feature_file in feature_files:
            file_full_path = os.path.join(folder_path, feature_file)
            
            try:
                df = pd.read_csv(file_full_path)
                merged_df = pd.concat([merged_df, df], axis=1)
            except FileNotFoundError:
                print(f"警告: 文件 {file_full_path} 未找到，跳过...")
                continue
            except Exception as e:
                print(f"读取 {file_full_path} 时出错: {str(e)}")
                continue
        
        # 如果成功合并了数据，则保存
        if not merged_df.empty:
            # 创建保存路径（如果不存在）
            os.makedirs(save_path, exist_ok=True)
            
            # 保存文件名格式：merged_features_<文件夹名>.csv
            save_file = os.path.join(save_path, f'merged_features_{folder}.csv')
            merged_df.to_csv(save_file, index=False)
            print(f"\n已保存 {folder} 的合并特征到 {save_file}")
            print(f"合并后的DataFrame形状: {merged_df.shape}")
        else:
            print(f"警告: 文件夹 {folder} 没有可用的特征数据")

if __name__ == "__main__":
    try:
        merge_and_save_features(
            file_path=FILE_PATH,
            folders=FILES_FOLDER,
            feature_files=FEATURE_FILES,
            save_path=SAVE_PATH
        )
    except Exception as e:
        print(f"特征合并过程中出错: {str(e)}")

开始处理 19 个文件夹的特征合并...


处理文件夹:   5%|▌         | 1/19 [00:01<00:35,  1.98s/it]


已保存 20585 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20585.csv
合并后的DataFrame形状: (16985, 422)


处理文件夹:  11%|█         | 2/19 [00:04<00:35,  2.09s/it]


已保存 20586 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20586.csv
合并后的DataFrame形状: (19439, 422)


处理文件夹:  16%|█▌        | 3/19 [00:06<00:36,  2.27s/it]


已保存 20589 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20589.csv
合并后的DataFrame形状: (22189, 422)


处理文件夹:  21%|██        | 4/19 [00:08<00:30,  2.02s/it]


已保存 20590 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20590.csv
合并后的DataFrame形状: (14183, 422)


处理文件夹:  26%|██▋       | 5/19 [00:10<00:28,  2.00s/it]


已保存 20591 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20591.csv
合并后的DataFrame形状: (17070, 422)


处理文件夹:  32%|███▏      | 6/19 [00:12<00:25,  2.00s/it]


已保存 20592 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20592.csv
合并后的DataFrame形状: (13103, 422)


处理文件夹:  37%|███▋      | 7/19 [00:13<00:21,  1.78s/it]


已保存 20593 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20593.csv
合并后的DataFrame形状: (11393, 422)


处理文件夹:  42%|████▏     | 8/19 [00:15<00:19,  1.76s/it]


已保存 20594 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20594.csv
合并后的DataFrame形状: (14980, 422)


处理文件夹:  47%|████▋     | 9/19 [00:17<00:18,  1.82s/it]


已保存 20595 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20595.csv
合并后的DataFrame形状: (17163, 422)


处理文件夹:  53%|█████▎    | 10/19 [00:19<00:18,  2.03s/it]


已保存 20596 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20596.csv
合并后的DataFrame形状: (21937, 422)


处理文件夹:  58%|█████▊    | 11/19 [00:21<00:14,  1.85s/it]


已保存 20607 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20607.csv
合并后的DataFrame形状: (12630, 422)


处理文件夹:  63%|██████▎   | 12/19 [00:22<00:11,  1.71s/it]


已保存 20608 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20608.csv
合并后的DataFrame形状: (11617, 422)


处理文件夹:  68%|██████▊   | 13/19 [00:24<00:11,  1.87s/it]


已保存 20625 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20625.csv
合并后的DataFrame形状: (19354, 422)


处理文件夹:  74%|███████▎  | 14/19 [00:26<00:09,  1.93s/it]


已保存 20630 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20630.csv
合并后的DataFrame形状: (18022, 422)


处理文件夹:  79%|███████▉  | 15/19 [00:28<00:07,  1.76s/it]


已保存 20633 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20633.csv
合并后的DataFrame形状: (12069, 422)


处理文件夹:  84%|████████▍ | 16/19 [00:30<00:05,  1.88s/it]


已保存 20639 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20639.csv
合并后的DataFrame形状: (19258, 422)


处理文件夹:  89%|████████▉ | 17/19 [00:32<00:03,  1.99s/it]


已保存 20640 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20640.csv
合并后的DataFrame形状: (19996, 422)


处理文件夹:  95%|█████████▍| 18/19 [00:34<00:01,  1.97s/it]


已保存 20641 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20641.csv
合并后的DataFrame形状: (16839, 422)


处理文件夹: 100%|██████████| 19/19 [00:36<00:00,  1.92s/it]


已保存 20646 的合并特征到 /data/boom/bbbc022/original/images/merged_features_20646.csv
合并后的DataFrame形状: (16602, 422)





### 加入位置，从前往后开始排列匹配（和处理顺序一致）

In [None]:
# 合并文件

import pandas as pd
import os

# List of folder IDs to process
files_folder = ['20585', '20586', '20589', '20590', '20591', '20592', '20593', 
                '20594', '20595', '20596', '20607', '20608', '20625', '20630', 
                '20633', '20639', '20640', '20641', '20646']

# Initialize an empty list to store all DataFrames
all_dataframes = []

for file in files_folder:
    # Construct the file path
    meta_file = f"/data/boom/bbbc022/original/images/merged_features_{file}.csv"
    
    # Check if file exists before reading
    if os.path.exists(meta_file):
        # Read the CSV file
        df = pd.read_csv(meta_file)
        print(f"File {file} shape: {df.shape}")
        
        # Add the DataFrame to our list
        all_dataframes.append(df)
    else:
        print(f"Warning: File not found - {meta_file}")

# all_dataframes

In [6]:
combined_df = pd.concat(all_dataframes, axis=0, ignore_index=True)
combined_df.shape

(314829, 422)

In [10]:
combined_df

Unnamed: 0,Texture_W1-img-Nucleus-Mask_Contrast_25%,Texture_W1-img-Nucleus-Mask_Contrast_50%,Texture_W1-img-Nucleus-Mask_Contrast_75%,Texture_W1-img-Nucleus-Mask_Contrast_mean,Texture_W1-img-Nucleus-Mask_Contrast_std,Texture_W1-img-Nucleus-Mask_Contrast_mad,Texture_W1-img-Nucleus-Mask_Dissimilarity_25%,Texture_W1-img-Nucleus-Mask_Dissimilarity_50%,Texture_W1-img-Nucleus-Mask_Dissimilarity_75%,Texture_W1-img-Nucleus-Mask_Dissimilarity_mean,...,Cell_BBox-x1,Misc_Count_# nucleoli,Misc_Area-Ratio_nucleus/cell,Misc_Area-Ratio_cyto/cell,Misc_Area-Ratio_nucleoli/cell,Misc_Area-Ratio_mito/cell,Misc_Area-Ratio_nucleus/cyto,Misc_Area-Ratio_mito/cyto,Misc_Area-Ratio_nucleoli/cyto,Misc_Area-Ratio_nucleoli/nucleus
0,1.57,3.68,5.94,4.14,3.07,2.28,0.68,1.19,1.62,1.20,...,488.0,2.0,0.60,0.40,0.16,0.20,1.48,0.49,0.40,0.27
1,2.82,6.90,10.61,6.99,4.64,4.04,0.91,1.69,2.24,1.60,...,662.0,3.0,0.54,0.46,0.15,0.16,1.18,0.34,0.32,0.27
2,1.13,2.68,4.55,3.04,2.21,1.74,0.40,0.74,1.07,0.76,...,165.0,2.0,0.27,0.73,0.07,0.17,0.36,0.23,0.10,0.28
3,2.16,4.53,5.75,3.95,2.06,1.48,0.62,1.07,1.29,0.95,...,605.0,1.0,0.35,0.65,0.04,0.17,0.54,0.26,0.06,0.10
4,0.50,1.13,1.83,1.25,0.89,0.69,0.15,0.27,0.40,0.29,...,613.0,2.0,0.16,0.84,0.04,0.12,0.19,0.14,0.05,0.26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314824,1.98,3.79,5.48,3.93,2.42,1.79,0.42,0.71,0.97,0.73,...,683.0,2.0,0.19,0.81,0.05,0.35,0.24,0.44,0.06,0.25
314825,1.80,4.06,6.00,4.10,2.66,2.14,0.50,0.89,1.22,0.88,...,125.0,2.0,0.26,0.74,0.08,0.21,0.35,0.29,0.11,0.31
314826,2.62,5.45,8.29,5.49,3.37,2.88,0.64,1.16,1.65,1.14,...,614.0,4.0,0.27,0.73,0.06,0.27,0.37,0.37,0.09,0.23
314827,3.14,6.67,10.10,6.73,4.21,3.59,0.76,1.34,1.90,1.33,...,290.0,2.0,0.34,0.66,0.10,0.24,0.51,0.36,0.15,0.28


In [None]:
# merged_df是(739389, 422)的df，全是特征, meta是(739389, 8)的df，表示这些特征的信息。meta中有well-id。根据这列来groupby merged_df，得到mean之后的特征。
# SPACe中均值用的最多，EMD其次。我们采用最为广泛的均值

# 将 meta 中的分组列与 merged_df 合并
grouped_data = pd.concat([meta['well-id'], merged_df], axis=1)

# 按 'exp-id' 和 'well-id' 分组并计算均值
well_level_features = grouped_data.groupby(['well-id']).mean()

# 重置索引（可选，若需保留 'exp-id' 和 'well-id' 作为列）
well_level_features = well_level_features.reset_index()

# 输出结果形状（检查分组后的维度）
print("Well-level 特征形状:", well_level_features.shape)

well_level_features.head()

In [6]:
# 可选：保存到 CSV 文件
well_level_features.to_csv(save_path+'SPACe_BBBC022.csv', index=False)

## Generate csv like below_csv, which have plate well and treatment

In [9]:
features = np.load("/home/bob/boom/PhenoProfiler/revision/BBBC022/PhenoProfiler/PhenoProfiler_22train_22test.npy").T 

features.shape

(66558, 672)

In [7]:
path = '/home/bob/boom/PhenoProfiler/revision/BBBC022/PhenoProfiler/0_noBC_well_level.csv'

csv = pd.read_csv(path)

csv.head()

Unnamed: 0,Plate,Well,Treatment,Replicate,0,1,2,3,4,5,...,662,663,664,665,666,667,668,669,670,671
0,20585,A01,BRD-K98763141@8.858401198,1,1.045206,1.558925,2.921548,1.674438,1.106661,0.499378,...,1.281022,0.638197,-0.200087,0.608927,1.689248,0.889015,2.052059,1.928631,0.605199,0.72006
1,20585,A02,BRD-A41941932@2.312793797,1,0.949092,2.170479,2.342595,0.700147,1.181116,0.564941,...,1.687001,0.525907,-0.140222,0.536555,1.864668,0.791943,2.623167,1.794455,0.57029,0.709518
2,20585,A03,BRD-A26711594@5.213492954,1,0.882619,2.183415,2.321936,0.592616,1.224526,0.511811,...,1.728272,0.512789,-0.136531,0.508213,1.934767,0.772496,2.60398,1.798196,0.581171,0.71929
3,20585,A04,BRD-K65667145@4.646594864,1,0.965083,2.065964,2.350903,0.600704,1.236141,0.535168,...,1.63921,0.546802,-0.148175,0.55118,1.918558,0.7993,2.597351,1.841052,0.559211,0.756002
4,20585,A05,BRD-K61250553@5.240677924,1,0.921297,2.131596,2.461645,0.494211,1.297295,0.595707,...,1.710991,0.545445,-0.16157,0.526415,1.866713,0.830936,2.681955,1.884539,0.604113,0.664256


# Make GT of MOA

In [9]:
# read JUMP-MOA_compound_platemap_with_metadata.txt

import pandas as pd

meta = pd.read_csv('/home/bob/boom/PhenoProfiler/output/JUMP-MOA_compound_platemap_with_metadata.txt', sep='\t')  # assuming it's tab-delimited
print(meta.shape)  # meta['moa']
meta.head()

(384, 11)


Unnamed: 0,well_position,broad_sample,solvent,InChIKey,pert_iname,pubchem_cid,moa,smiles,mmoles_per_liter,pert_type,control_type
0,A01,BRD-K80935598-001-01-1,DMSO,ZYVXTMKTGDARKR-UHFFFAOYSA-N,AZ191,72716071.0,DYRK inhibitor,COc1cc(ccc1Nc1nccc(n1)-c1cn(C)c2cnccc12)N1CCN(...,1.5,trt,
1,A02,BRD-K85776940-001-01-9,DMSO,ODADKLYLWWCHNB-LDYBVBFYSA-N,delta-Tocotrienol,5282350.0,HMGCR inhibitor,CC(C)=CCC\C(C)=C\CC\C(C)=C\CC[C@]1(C)CCc2cc(O)...,1.5,trt,
2,A03,BRD-K25611237-001-02-1,DMSO,QDBVSOZTVKXUES-UHFFFAOYSA-N,ML324,44143209.0,histone lysine demethylase inhibitor,CN(C)CCCNC(=O)c1ccc(cc1)-c1cc(O)c2ncccc2c1,1.5,trt,
3,A04,BRD-K66430217-001-03-8,DMSO,CXJCGSPAPOTTSF-VURMDHGXSA-N,KH-CB19,44237094.0,CDC inhibitor,CCOC(=O)c1c(\C(=C/N)C#N)c2ccc(Cl)c(Cl)c2n1C,1.5,trt,
4,A05,BRD-K38627885-001-01-9,DMSO,RFZQYGBLRIKROZ-PCLIKHOPSA-N,APY0201,56927660.0,phosphoinositide dependent kinase inhibitor,Cc1cccc(\C=N\Nc2cc(N3CCOCC3)n3nc(cc3n2)-c2ccnc...,1.5,trt,


In [133]:
meta['pert_type'].unique()

array(['trt', 'control'], dtype=object)

In [25]:
# 整合成 Well Trt Features 的形式

import numpy as np
import pandas as pd

# 加载数据
REG_PARAM = 1e-2
num_features = 422

features = pd.read_csv(save_path+'SPACe_BBBC022.csv')
# meta = pd.read_csv('output/JUMP-MOA_compound_platemap_with_metadata.txt', sep='\t')

assert all(meta['well_position'] == features['well-id']), "well_position 和 well-id 不匹配"

# 将 meta 中的 pert_type 和 broad_sample 列添加到 features 的前面
result_df = pd.concat([
    meta[['well_position', 'pert_type', 'broad_sample']], 
    features.drop(columns=['well-id'])  # 移除重复的 well-id 列
], axis=1)

result_df.shape

(384, 425)

In [26]:
result_df = result_df.rename(columns={
    'well_position': 'Well',
    'pert_type': 'Treatment'
})

first_three_columns = result_df.columns[:3].tolist()

# 生成从0开始的连续数字作为新列名（从第四列开始）
new_columns = first_three_columns + list(range(len(result_df.columns) - 3))

# 重命名列
result_df.columns = new_columns

result_df.head()

Unnamed: 0,Well,Treatment,broad_sample,0,1,2,3,4,5,6,...,412,413,414,415,416,417,418,419,420,421
0,A01,trt,BRD-K80935598-001-01-1,1.645127,3.349233,5.003979,3.450767,2.147409,1.702836,0.597042,...,991.171506,4.915608,0.518285,0.481679,0.130014,0.202541,1.235808,0.420295,0.309605,0.250953
1,A02,trt,BRD-K85776940-001-01-9,1.679743,3.393648,5.042047,3.487641,2.156411,1.702634,0.619995,...,987.417164,4.759523,0.536576,0.463089,0.135392,0.188242,1.339615,0.405994,0.339174,0.251726
2,A03,trt,BRD-K25611237-001-02-1,1.570459,3.177217,4.753203,3.283472,2.04439,1.613926,0.586655,...,997.050781,4.908203,0.525312,0.474648,0.128843,0.189531,1.290771,0.400132,0.315137,0.245938
3,A04,trt,BRD-K66430217-001-03-8,1.707247,3.447403,5.109848,3.538715,2.187224,1.72651,0.616809,...,998.453959,4.865562,0.53797,0.461598,0.133955,0.190143,1.345419,0.413347,0.335373,0.248587
4,A05,trt,BRD-K38627885-001-01-9,1.976635,3.908043,5.645491,3.903183,2.312364,1.844134,0.648424,...,997.983114,2.87242,0.432702,0.567054,0.103827,0.251689,0.907842,0.450413,0.217617,0.240075


# BC Sphering

In [37]:
import scipy.linalg
import pandas as pd

class WhiteningNormalizer(object):
    def __init__(self, controls, reg_param=1e-6):
        # Whitening transform on population level data
        self.mu = controls.mean()
        self.whitening_transform(controls - self.mu, reg_param, rotate=True)
        # print(self.mu.shape, self.W.shape)
        
    def whitening_transform(self, X, lambda_, rotate=True):
        C = (1/X.shape[0]) * np.dot(X.T, X)
        s, V = scipy.linalg.eigh(C)
        D = np.diag( 1. / np.sqrt(s + lambda_) )
        W = np.dot(V, D)
        if rotate:
            W = np.dot(W, V.T)
        self.W = W

    def normalize(self, X):
        return np.dot(X - self.mu, self.W)

columns2 = [i for i in range(422)]  #　(422)
REG_PARAM = 1e-2

wells = result_df
whN = WhiteningNormalizer(wells.loc[wells["Treatment"] == "control", columns2], reg_param=REG_PARAM)

whD = whN.normalize(wells[columns2])

# Save whitened profiles
wells[columns2] = whD

wells.head()

Unnamed: 0,Well,Treatment,broad_sample,0,1,2,3,4,5,6,...,412,413,414,415,416,417,418,419,420,421
0,A01,trt,BRD-K80935598-001-01-1,-15.679659,-31.562634,-43.363066,-30.464617,-18.451878,-13.572591,-2.272549,...,77.921186,-30.572371,0.424776,-0.308855,-0.83247,0.227891,3.289882,0.434729,-1.763269,-1.574132
1,A02,trt,BRD-K85776940-001-01-9,-2.393539,-8.133598,-14.30607,-9.501977,-8.211261,-6.125357,0.041002,...,-4493.770387,-44.531471,-0.77569,0.85025,-0.53983,-0.266471,-3.348405,-1.133397,-1.357499,-0.819481
2,A03,trt,BRD-K25611237-001-02-1,-2.546098,-8.370042,-14.132594,-9.029172,-7.095279,-5.811051,-0.476102,...,-6858.85928,-24.252292,-0.957911,0.998636,-0.312647,-0.105589,-6.560419,-1.164592,-1.802056,-0.195562
3,A04,trt,BRD-K66430217-001-03-8,0.543147,-0.574102,-4.56045,-2.5558,-3.546405,-1.948124,0.307565,...,-4808.88712,-39.467985,-0.450447,0.449389,-0.10091,-0.99941,-2.616676,-1.862713,-0.518434,-0.143225
4,A05,trt,BRD-K38627885-001-01-9,71.23641,139.917208,189.151593,133.097726,80.290635,57.971009,9.963493,...,-35901.667249,23.140143,-14.555023,15.313582,-1.019785,-17.571533,-58.999237,-39.972659,-6.567744,3.380227


In [38]:
wells.shape

(384, 425)

# Treatment-level profiles / Mean Aggreagation

In [39]:
import sklearn

# Aggregate profiles
columns1 = ["Well", "broad_sample"]
columns2 = [i for i in range(422)]

wells_1 = wells.drop(columns=["Well"])

profiles = wells_1.groupby(["broad_sample", 'Treatment']).mean().reset_index()

# 将 meta 的 broad_sample 和 moa 转为字典
moa_dict = meta.set_index('broad_sample')['moa'].to_dict()

# 用 map 添加 moa 列
profiles['moa'] = profiles['broad_sample'].map(moa_dict)

# profiles = wells_1[["Treatment", "broad_sample"] + columns2]
# # profiles
print(profiles.shape)
profiles.head()

(90, 425)


Unnamed: 0,broad_sample,Treatment,0,1,2,3,4,5,6,7,...,413,414,415,416,417,418,419,420,421,moa
0,BRD-A12994259-001-02-1,trt,0.901764,3.646613,7.19978,3.885281,2.991918,3.138834,-0.039097,0.327887,...,2.879712,-0.316715,0.329772,-0.114809,-0.131103,-1.837984,-0.45212,-0.58894,0.005652,tumor necrosis factor production inhibitor
1,BRD-A22769835-300-05-7,trt,-0.44794,-0.210765,0.75507,0.51211,1.338898,0.596384,0.150469,0.291179,...,-8.201242,-0.403221,0.419396,-0.058577,0.132053,-2.937768,-0.159799,-0.499149,-0.003185,antihistamine
2,BRD-A53576514-048-14-3,trt,2.919886,6.49487,9.744995,6.341061,3.934733,3.407691,0.419374,0.846774,...,7.163786,-0.294327,0.296151,-0.106318,-0.00955,-1.434012,-0.148378,-0.402352,-0.051557,acetylcholine receptor antagonist
3,BRD-A78210457-001-01-5,trt,53.382037,125.46352,179.756159,125.84954,88.825632,65.57625,2.778214,11.559854,...,-142.884822,-10.718578,10.348481,-2.539531,-3.721066,-36.13395,-8.870613,-7.810502,-1.582795,MDM inhibitor
4,BRD-A87435144-001-01-6,trt,-5.011807,-9.879178,-12.594342,-9.129596,-4.975723,-3.829349,-1.236147,-2.047976,...,10.696112,-0.649725,0.681015,-0.183731,-0.027037,-4.071496,-0.536605,-1.041253,-0.088173,pyruvate dehydrogenase kinase inhibitor


In [46]:
# # 6. Similarity matrix
# Compute Cosine Similarities
from sklearn.metrics.pairwise import cosine_similarity

COS = cosine_similarity(profiles[columns2], profiles[columns2])
COS.shape

(90, 90)

In [41]:
# Transform to tidy format
df = pd.DataFrame(data=COS, index=list(profiles.broad_sample), columns=list(profiles.broad_sample))
#　将行索引重置为默认整数索引，并将原来的行索引 broad_sample 转换为一列，命名为 index。所以，variable　表示原来的broad_sample名。
df = df.reset_index().melt(id_vars=["index"])
# df # 其中每一行都表示 预测的Treatment和 GT 之间的概率。

# Annotate rows
df2 = pd.merge(
    df, 
    profiles[["broad_sample", "moa"]],  # 为了加 Metadata_moa.x 列数据，先用 broad_sample 建立对应关系，然后删除。
    how="left", 
    left_on="index", # <=== Rows
    right_on="broad_sample"
).drop("broad_sample",axis=1)

# Annotate columns
#　index　和 variable 是一个东西，都表示 Treatment，但是 index　对应 Metadata_moa.x_x	，variable对应Metadata_moa.x_y
df2 = pd.merge(
    df2, profiles[["broad_sample", "moa"]],
    how="left", 
    left_on="variable", # <=== Columns
    right_on="broad_sample"
).drop("broad_sample",axis=1)

df2.head()

Unnamed: 0,index,variable,value,moa_x,moa_y
0,BRD-A12994259-001-02-1,BRD-A12994259-001-02-1,1.0,tumor necrosis factor production inhibitor,tumor necrosis factor production inhibitor
1,BRD-A22769835-300-05-7,BRD-A12994259-001-02-1,0.491556,antihistamine,tumor necrosis factor production inhibitor
2,BRD-A53576514-048-14-3,BRD-A12994259-001-02-1,0.269703,acetylcholine receptor antagonist,tumor necrosis factor production inhibitor
3,BRD-A78210457-001-01-5,BRD-A12994259-001-02-1,0.453289,MDM inhibitor,tumor necrosis factor production inhibitor
4,BRD-A87435144-001-01-6,BRD-A12994259-001-02-1,0.438861,pyruvate dehydrogenase kinase inhibitor,tumor necrosis factor production inhibitor


In [42]:
df2.shape

(8100, 5)

In [43]:
# Rename columns and save
df2.columns = ["Var1", "Var2", "value", "Metadata_moa.x", "Metadata_moa.y"]

# # MOA Evaluation using enrichment analysis

SIM_MATRIX = df2  # "data/cos_efn128combinedplatesout_conv6a_1e-2_e30.csv"
OUT_RESUTS = "output/efn128combinedplatesout_conv6a_1e-2_e30"

def load_similarity_matrix(cr_mat):
    # Load matrix in triplet format and reshape
    # cr_mat = pd.read_csv(filename)
    X = cr_mat.pivot(index="Var1", columns="Var2", values="value").reset_index()
    
    # Identify annotations
    Y = cr_mat.groupby("Var1").max().reset_index()
    Y = Y[~Y["Metadata_moa.x"].isna()].sort_values(by="Var1")
    
    # Make sure the matrix is sorted by treatment
    X = X.loc[X.Var1.isin(Y.Var1), ["Var1"] + list(Y.Var1)].sort_values("Var1")
    
    return X,Y

X, Y = load_similarity_matrix(SIM_MATRIX)  # X 加载了数值, Y 加载了treatment等信息，最后变成随机量。
# Y

In [44]:
Y

Unnamed: 0,Var1,Var2,value,Metadata_moa.x,Metadata_moa.y
0,BRD-A12994259-001-02-1,Compound8,1.0,tumor necrosis factor production inhibitor,ubiquitin specific protease inhibitor
1,BRD-A22769835-300-05-7,Compound8,1.0,antihistamine,ubiquitin specific protease inhibitor
2,BRD-A53576514-048-14-3,Compound8,1.0,acetylcholine receptor antagonist,ubiquitin specific protease inhibitor
3,BRD-A78210457-001-01-5,Compound8,1.0,MDM inhibitor,ubiquitin specific protease inhibitor
4,BRD-A87435144-001-01-6,Compound8,1.0,pyruvate dehydrogenase kinase inhibitor,ubiquitin specific protease inhibitor
...,...,...,...,...,...
85,Compound4,Compound8,1.0,PARP inhibitor,ubiquitin specific protease inhibitor
86,Compound5,Compound8,1.0,IGF-1 inhibitor,ubiquitin specific protease inhibitor
87,Compound6,Compound8,1.0,tricyclic antidepressant,ubiquitin specific protease inhibitor
88,Compound7,Compound8,1.0,FGFR inhibitor,ubiquitin specific protease inhibitor


In [45]:
from utils import *

# MOA matching

Y.groupby("Metadata_moa.x")["Var1"].count()  # 找到每一种 MOA 中 Var1：Treatment 的数量

moa_matches = []
Y["Ref_moa"] = Y["Metadata_moa.x"].str.replace('|', '___')  #　potassium channel activator	
# Y['Metadata_moa.x'][63] 

# MOA 是 Metadata_moa.x 的内部结果，如果 Metadata_moa.x 包含多个预测，则 MOA 中包含多个 True
Y["Ref_moa"] = Y["Metadata_moa.x"].str.replace('|', '___')  #　内部包含多项的预测，替换后方便使用正则表达式进行匹配。 'norepinephrine reuptake inhibitor|tricyclic antidepressant'
for k,r in Y.iterrows():
    moas = r["Metadata_moa.x"].split("|")
    # print(moas)
    candidates = []
    for m in moas:
        reg = r'(^|___){}($|___)'.format(m)  
        '''
        正则表达式：
        匹配字符串 m，并确保它要么出现在字符串的开头或结尾，要么被三个下划线分隔。例如，如果 m 是 example，那么生成的正则表达式将是 (^|___)example($|___)，它可以匹配以下情况：
        example 在字符串的开头或结尾。
        example 被 ___ 分隔，如 ___example___。
        '''
        candidates.append(Y["Ref_moa"].str.contains(reg))
        # print('reg', reg, candidates[:20])
    matches = candidates[0]
    for c in candidates:
        # print("22", matches, c)
        matches = matches | c
    moa_matches.append(matches)
    # break

moa_matches = np.asarray(moa_matches)
# plt.imshow(moa_matches)


# # Enrichment analysis

# %% [markdown]
# # 输入
# 相似矩阵 (SIM)：一个表示样本或基因之间相似性的矩阵。
# 匹配数据 (moa_matches)：一个包含匹配信息的数据集。
# 阈值 (threshold)：一个数值参数，用于控制分析的严格程度。
# # 输出
# 富集结果：通常是一个包含富集分析结果的列表或数据框，可能包括显著性值、富集分数等。
# 可视化图表：一些函数可能会生成热图、条形图等用于展示富集结果的图表。

results = {}
SIM = np.asarray(X[Y.Var1])
# print("SIM:", SIM.shape)  # (995, 995)
# print(SIM)

is_query = moa_matches.sum(axis=0) > 1 
#　计算 moa_matches 每列的和，并判断是否大于1，结果存储在布尔数组 is_query 中。 大于1：表示该列中至少有两个或更多的非零值。这意味着在 moa_matches 中，该列有多个匹配项。

for i in range(SIM.shape[0]):
    if is_query[i]: #　如果 is_query 中对应位置为 True, 即大于1，有多个匹配项的情况。才能计算富集分析。
        idx = [x for x in range(SIM.shape[1]) if x != i] #　创建一个索引列表 idx，包含除了当前行 i 之外的所有列索引。除开对角线。
        results[i] = enrichment_analysis(SIM[i,idx], moa_matches[i,idx], 99.) # 确认这两个列表中，匹配情况是否高于随即情况
        # 对 SIM 的第 i 行（去掉第 i 列）和 moa_matches 的第 i 行（去掉第 i 列）进行富集分析，并将结果存储在 results 的第 i 个位置。
        if results[i]["ods_ratio"] is np.nan: # ods_ratio大于1 表明SIM[i,idx]中命中的概率高于在 moa_matches[i, idx] 中的概率
            print(results[i]["V"], i)
# results

# 计算并打印富集分析结果中 ods_ratio 的平均值
# 大于 1 则表明： SIM[i, idx] 中，该事件或特征更为显著或富集

folds = [results[x]["ods_ratio"] for x in results] # 提取所有 ods_ratio
enrichment_top_1 = np.mean(folds)
# print("Average folds of enrichment at top 1%:", enrichment_top_1)


enrichment_results = pd.DataFrame(data=results).T
# enrichment_results

# # Average precision analysis
import numpy as np
import pandas as pd

def precision_at_k(sim_matrix, moa_matches, k):
    """Calculate precision at k for each query"""
    results = {}
    is_query = moa_matches.sum(axis=0) > 1  # Only calculate for queries with multiple positives
    
    for i in range(sim_matrix.shape[0]):
        if is_query[i]:
            ranking = np.argsort(-sim_matrix[i, :])  # Descending order
            top_k_matches = moa_matches[i, ranking[1:k+1]]  # Exclude self, get top k
            pk = np.sum(top_k_matches) / k
            results[i] = {"precision_at_k": pk, "k": k}
    return results

def recall_at_k(sim_matrix, moa_matches, k):
    """Calculate recall at k for each query"""
    results = {}
    is_query = moa_matches.sum(axis=0) > 1
    total_positives = moa_matches.sum(axis=1)
    
    for i in range(sim_matrix.shape[0]):
        if is_query[i] and total_positives[i] > 0:
            ranking = np.argsort(-sim_matrix[i, :])
            top_k_matches = moa_matches[i, ranking[1:k+1]]
            recall = np.sum(top_k_matches) / total_positives[i]
            results[i] = {
                "recall_at_k": recall,
                "baseline_recall": np.mean(moa_matches),  # Random baseline
                "k": k
            }
    return results

def evaluate_model(sim_matrix, moa_matches):
    """Comprehensive evaluation of retrieval performance"""
    # Fixed evaluation points
    evaluation_points = [5, 10, 20, 50, 100]
    evaluation_percents = [1, 3, 5, 10, 20]
    
    # Calculate absolute positions for percentages
    n = sim_matrix.shape[0]
    percent_positions = [max(int(n * p/100), 1) for p in evaluation_percents]
    
    # Store all results
    results = {
        'precision': {},
        'recall': {},
        'metrics': {}
    }
    
    # Calculate precision@k
    for k in evaluation_points:
        prec_k = precision_at_k(sim_matrix, moa_matches, k)
        avg_prec = np.mean([prec_k[q]["precision_at_k"] for q in prec_k])
        results['precision'][f'P@{k}'] = avg_prec
    
    # Calculate recall@k and recall@%
    for pos, percent in zip(percent_positions, evaluation_percents):
        # Precision at percentage
        prec_p = precision_at_k(sim_matrix, moa_matches, pos)
        avg_prec_p = np.mean([prec_p[q]["precision_at_k"] for q in prec_p])
        results['precision'][f'P@{percent}%'] = avg_prec_p
        
        # Recall at percentage
        recall_p = recall_at_k(sim_matrix, moa_matches, pos)
        avg_recall_p = np.mean([recall_p[q]["recall_at_k"] for q in recall_p])
        baseline_p = np.mean([recall_p[q]["baseline_recall"] for q in recall_p])
        results['recall'][f'R@{percent}%'] = {
            'value': avg_recall_p,
            'baseline': baseline_p,
            'improvement': avg_recall_p / baseline_p if baseline_p > 0 else np.nan
        }
    
    # Calculate MAP (Mean Average Precision)
    map_score = calculate_map(sim_matrix, moa_matches)
    results['metrics']['MAP'] = map_score
    
    return results

def calculate_map(sim_matrix, moa_matches):
    """Calculate Mean Average Precision without interpolation"""
    aps = []
    is_query = moa_matches.sum(axis=0) > 1
    total_positives = moa_matches.sum(axis=1)
    
    for i in range(sim_matrix.shape[0]):
        if is_query[i] and total_positives[i] > 0:
            ranking = np.argsort(-sim_matrix[i, :])
            relevant = moa_matches[i, ranking[1:]]  # Exclude self
            
            # Calculate precision at each rank where recall increases
            precisions = []
            true_positives = 0
            for k in range(len(relevant)):
                if relevant[k]:
                    true_positives += 1
                    precisions.append(true_positives / (k + 1))
            
            if precisions:
                ap = np.sum(precisions) / total_positives[i]
                aps.append(ap)
    
    return np.mean(aps) if aps else 0

# Example usage:
results = evaluate_model(SIM, moa_matches)
print("Evaluation Results:")
print("Precision metrics:", results['precision'])
print("Recall metrics:", results['recall'])

print("MAP=", results['metrics']['MAP'])
print("FoE=", enrichment_top_1)
# print("Mean Average Precision (MAP): \t", np.mean(average_precision))

Evaluation Results:
Precision metrics: {'P@5': 0.04418604651162791, 'P@10': 0.02790697674418605, 'P@20': 0.020348837209302327, 'P@50': 0.012093023255813955, 'P@100': 0.01, 'P@1%': 0.13953488372093023, 'P@3%': 0.0872093023255814, 'P@5%': 0.05232558139534884, 'P@10%': 0.0310077519379845, 'P@20%': 0.021963824289405687}
Recall metrics: {'R@1%': {'value': 0.06976744186046512, 'baseline': 0.02172839506172839, 'improvement': 3.210887949260043}, 'R@3%': {'value': 0.0872093023255814, 'baseline': 0.02172839506172839, 'improvement': 4.013609936575054}, 'R@5%': {'value': 0.10465116279069768, 'baseline': 0.02172839506172839, 'improvement': 4.816331923890065}, 'R@10%': {'value': 0.13953488372093023, 'baseline': 0.02172839506172839, 'improvement': 6.421775898520086}, 'R@20%': {'value': 0.19767441860465115, 'baseline': 0.02172839506172839, 'improvement': 9.097515856236788}}
MAP: 0.09952650737238074
Average folds of enrichment at top 1%: 12.279069767441861


  candidates.append(Y["Ref_moa"].str.contains(reg))
