In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import sklearn
import argparse
import sklearn.metrics

from dataset import PDDDataset
from models import PhenoProfiler
from utils import *

# Inference configuration: checkpoint to restore and the prefix under which
# outputs are written.
model_path = "result/PhenoProfiler/best.pt"
save_path = "Fig3/BBBC037/PhenoProfiler/"

# Instantiate the network and move it to the GPU; the checkpoint weights are
# loaded into it later by get_image_embeddings.
model = PhenoProfiler()
model = model.cuda()

def build_loaders_inference(batch_size):
    """Create the DataLoader used for inference on the BBBC037 data.

    Args:
        batch_size: Number of samples per batch handed to the DataLoader.

    Returns:
        A DataLoader over the PDDDataset (wrapped in a single-element
        ConcatDataset) that iterates in dataset order (``shuffle=False``)
        and keeps the final partial batch (``drop_last=False``).
    """
    print("Building loaders")

    bbbc037 = PDDDataset(
        image_path="../dataset/bbbc037/images/",
        embedding_path="../dataset/bbbc037/embedding/",
        CSV_path="../dataset/bbbc037/profiling.csv",
    )

    # Order must stay deterministic so embeddings line up with the CSV rows
    # used downstream, hence shuffle=False and drop_last=False.
    loader = DataLoader(
        torch.utils.data.ConcatDataset([bbbc037]),
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
        drop_last=False,
    )

    print("Finished building loaders")
    return loader

def get_image_embeddings(model_path, model, batch_size):
    """Restore a checkpoint into ``model`` and embed every image of the inference loader.

    Args:
        model_path: Path to a state dict readable by ``torch.load``.
        model: Module exposing ``image_encoder`` and ``image_projection``.
            Its weights are overwritten by the checkpoint and it is put in
            eval mode.
        batch_size: Batch size forwarded to ``build_loaders_inference``.

    Returns:
        One tensor built by concatenating the per-batch outputs of
        ``model.image_projection`` along dim 0, on the model's device.
    """
    test_loader = build_loaders_inference(batch_size)

    # Use the device the model already lives on rather than assuming the
    # default CUDA device; fall back to CUDA (the previous behavior) only if
    # the model exposes no parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    # map_location lets a checkpoint saved from another device load here.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

    print("Finished loading model")

    test_image_embeddings = []
    with torch.no_grad():  # inference only, no autograd graph needed
        for batch in tqdm(test_loader):
            image_features = model.image_encoder(batch["image"].to(device))
            image_embeddings = model.image_projection(image_features)
            test_image_embeddings.append(image_embeddings)

    return torch.cat(test_image_embeddings)


# Run inference over the whole dataset and bring the embeddings to host memory.
img_embeddings = get_image_embeddings(model_path, model, batch_size=600)
features = img_embeddings.cpu().numpy()

# exist_ok=True creates the directory if needed without a separate
# existence check (which could race with another process).
os.makedirs(save_path, exist_ok=True)

# The array is stored transposed (features.T); os.path.join keeps the path
# valid whether or not save_path ends with a separator.
np.save(os.path.join(save_path, "PhenoProfiler_37" + ".npy"), features.T)

In [None]:
# Load the per-row metadata exactly once, from the same CSV that
# build_loaders_inference passes to the dataset. `meta` is later stacked
# row-for-row with `features`, so both must come from the same file; the
# previous second read from an absolute machine-specific path silently
# replaced this table.
meta = pd.read_csv("../dataset/bbbc037/profiling.csv")

MATRIX_FILE = save_path+"/cos_efn128combinedcellsout_conv6a_1e-2_e30.csv"
REG_PARAM = 1e-2

# features = np.load(save_path + "PhenoProfiler_alltrain_37test.npy").T

In [None]:
# Identify the control wells by row number.

# Collect the index labels of every row whose 'pert_name' equals 'EMPTY_'.
dmso_indices = meta.index[meta['pert_name'].eq('EMPTY_')].tolist()

# Debug output (disabled):
# print(len(dmso_indices), dmso_indices[:10])

1575 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [67]:
# Pull the 'Metadata_Plate' column out of meta as an (N, 1) column vector.
metadata_plate = meta['Metadata_Plate'].values.reshape(-1, 1)

# Prepend the plate column to the feature matrix.
combined_features = np.concatenate([metadata_plate, features], axis=1)

# Wrap the result in a DataFrame; feature columns are named '1', '2', ...
combined_df = pd.DataFrame(
    combined_features,
    columns=['Metadata_Plate', *map(str, range(1, features.shape[1] + 1))],
)

# Print the shape as a sanity check.
print(combined_df.shape)

# Keep only the rows selected by dmso_indices (positional lookup).
filtered_combined_df = combined_df.iloc[dmso_indices]

# Print the shape as a sanity check.
print(filtered_combined_df.shape)

(17254, 673)
(1575, 673)


In [68]:
# Mean feature vector of the control-well rows, one row per plate.
control_df = (
    filtered_combined_df
    .groupby("Metadata_Plate")
    .mean()
    .reset_index()
)
control_df

Unnamed: 0,Metadata_Plate,1,2,3,4,5,6,7,8,9,...,663,664,665,666,667,668,669,670,671,672
0,41744.0,0.84182,2.103717,2.354764,1.190593,1.247132,0.934351,1.416569,0.425345,1.502444,...,1.696647,0.833879,-0.17997,1.192771,2.344077,1.116915,2.476984,1.444754,0.912769,1.01022
1,41754.0,0.815701,2.120141,2.616484,1.241948,1.27784,0.980657,1.570254,0.431809,1.591464,...,1.694519,0.880051,-0.216271,0.980448,1.923442,1.118305,2.584423,1.54895,0.866345,1.01459
2,41755.0,0.755852,2.198217,2.412723,1.181505,1.336766,0.892649,1.483598,0.417696,1.541491,...,1.59275,0.822469,-0.164605,1.154841,2.117756,1.227755,2.473713,1.429653,0.876387,1.024025
3,41756.0,0.799631,2.080047,2.363219,1.363791,1.539583,0.850126,1.448558,0.426315,1.469189,...,1.711314,0.743297,-0.107172,1.079768,2.151962,1.253823,2.309146,1.439401,0.880909,1.222721
4,41757.0,0.79262,2.208752,2.479872,1.317758,1.58972,0.805357,1.342906,0.403564,1.401463,...,1.495716,0.651685,-0.096253,1.166865,2.359786,1.281186,2.249195,1.434779,0.927089,1.355324


In [69]:
# Subtract the scaled control features of the matching Metadata_Plate from
# every entry after the first one in the row.
def subtract_control_features(row, control_df, scale=1.5):
    """Return ``row`` with its plate's scaled control features subtracted.

    Args:
        row: Series whose 'Metadata_Plate' entry identifies the plate and
            whose entries from position 1 onward are the values to adjust.
        control_df: DataFrame with a 'Metadata_Plate' column; the columns
            from position 1 onward hold the control values for each plate.
        scale: Multiplier applied to the control values before subtraction.
            Defaults to 1.5, the factor that was previously hard-coded.

    Returns:
        A new Series with the adjustment applied when ``control_df`` has a
        row for this plate (the first match is used); otherwise ``row``
        itself, unchanged.
    """
    plate = row['Metadata_Plate']
    matches = control_df[control_df['Metadata_Plate'] == plate]
    if matches.empty:
        return row

    # Work on a copy: DataFrame.apply does not support functions that
    # mutate the object they are handed.
    adjusted = row.copy()
    adjusted.iloc[1:] = row.iloc[1:] - matches.iloc[0, 1:] * scale
    return adjusted

# Run the per-row control adjustment over every row of combined_df.
adjusted_combined_df = combined_df.apply(
    subtract_control_features,
    axis=1,
    args=(control_df,),
)
# adjusted_combined_df

In [None]:
# Optional step (disabled): drop the leading 'Metadata_Plate' column so only
# the feature columns remain. While this stays commented out, the saved array
# still contains the plate column.
# adjusted_combined_df = adjusted_combined_df.drop(columns=['Metadata_Plate'])

# Write the adjusted table to disk as a .npy file.
# NOTE(review): this rebinds save_path from the output directory prefix to a
# file path; cells that build paths from save_path will produce wrong paths
# if they are re-run after this one.
save_path = "Fig3/BBBC037/PhenoProfiler/PhenoProfiler_1.npy"
np.save(save_path, adjusted_combined_df.to_numpy())

# Shape check (disabled):
# print(adjusted_combined_df.shape)