In [1]:
#!/usr/bin/env python
import argparse
import torch
import numpy as np
from tqdm import tqdm
import mmcv
from numpy.linalg import norm, pinv
from scipy.special import softmax
from sklearn import metrics
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.covariance import EmpiricalCovariance
from os.path import basename, splitext
from scipy.special import logsumexp
import pandas as pd
import csv
import pickle




In [2]:
def parse_args():
    parser = argparse.ArgumentParser(description='Say hello')
    parser.add_argument('fc', help='Path to config')
    parser.add_argument('train_feature', help='Path to data used to train')
    parser.add_argument('feature_to_detect_data', nargs="+", help='Path to data going to be detected')

    return parser.parse_args()

def kl(p, q):
    return np.sum(np.where(p != 0, p * np.log(p / q), 0))


In [3]:
args = argparse.Namespace(
    fc='slp_fc2.pkl',
    train_feature='shhs_test.pkl',
    feature_to_detect_data=['st_temazepam.pkl']
)

input_names = [splitext(basename(data))[0] for data in args.feature_to_detect_data]
print(f"input datasets: {input_names}")

w, b = mmcv.load(args.fc)
print(f'{w.shape=}, {b.shape=}')

for feat in args.feature_to_detect_data:
    loaded_data = mmcv.load(feat)
    print(f'{feat}: {loaded_data.shape}')

print('load features')
feature_train = mmcv.load(args.train_feature).squeeze()
# 加载数据并获取唯一的 feature_to_detect_data 元素
feature_to_detect_data = mmcv.load(args.feature_to_detect_data[0]).squeeze()

print(f'{feature_train.shape=}')

print(f'input.shape= {feature_to_detect_data.shape}')

print('computing logits...')
logit_train = feature_train @ w.T + b
logit_input_data = [feat @ w.T + b for feat in feature_to_detect_data]

u = -np.matmul(pinv(w), b)

input datasets: ['st_temazepam']
w.shape=(1024, 1024), b.shape=(1024,)
st_temazepam.pkl: (265440, 1024)
load features
feature_train.shape=(1224888, 1024)
input.shape= (265440, 1024)
computing logits...


In [4]:
# ---------------------------------------
method = 'ViM'
print(f'\n{method}')

DIM = 1000 if feature_to_detect_data.shape[-1] >= 2048 else 512
print(f'{DIM=}')

# 计算 feature_to_detect_data 列表中元素的数量
print('computing num_samples...')
num_samples = len(feature_to_detect_data)
print(f'Number of Samples is {num_samples}.')

print('computing principal space...')
ec = EmpiricalCovariance(assume_centered=True)
ec.fit(feature_train - u)
print(ec.fit(feature_train - u))

eig_vals, eigen_vectors = np.linalg.eig(ec.covariance_)
NS = np.ascontiguousarray((eigen_vectors.T[np.argsort(eig_vals * -1)[DIM:]]).T)


ViM
DIM=512
computing num_samples...
Number of Samples is 265440.
computing principal space...
EmpiricalCovariance(assume_centered=True)


In [5]:
np.set_printoptions(suppress=True)
a = np.matmul(feature_train - u, NS)

In [6]:
print('computing vlogit...')
# 对每个数据取对数
a_log = np.log(np.abs(a) + 1e-12)  # 避免取对数时出现零值
vlogit_train =  np.linalg.norm(a_log, axis=1)
print(f'vlogit of train is: {vlogit_train}')
print(f'vlogit mean of train is: {vlogit_train.mean()}')

computing vlogit...
vlogit of train is: [325.71884539 327.28126639 320.43340273 ... 322.49314746 325.8244727
 325.9133953 ]
vlogit mean of train is: 318.41181346701785


In [7]:
print('computing alpha...')
alpha = logit_train.max(axis=-1).mean() / vlogit_train.mean()
print(f'{alpha=:.4f}')

computing alpha...
alpha=0.0131


In [8]:
feature_input = feature_to_detect_data[17592]
vlogit_input = np.linalg.norm(np.log(np.matmul(feature_input - u, NS)+1e-12), axis=-1) * alpha
print(f'vlogit is: {vlogit_input}')

logit_input=logit_input_data[17592]

# 将小于0.1的值设置为0
logit_input[logit_input < 1] = 0    

all_logits = np.concatenate(([vlogit_input], logit_input))

print(logit_input.max())

print(all_logits)

probabilities = softmax(all_logits, axis=-1)

print(probabilities)

p0 = probabilities[0]
print(p0)
print(probabilities.mean())

# 判断是否为 OOD 数据
is_ood = vlogit_input > logit_input.max()

if is_ood:
    print(f'Sample is OOD: {is_ood}')


vlogit is: 4.272768312308847
4.5368104
[4.27276831 0.         0.         ... 0.         0.         0.        ]
[0.03583892 0.00049971 0.00049971 ... 0.00049971 0.00049971 0.00049971]
0.0358389151554449
0.0009756097560975609


In [9]:
feature_input = feature_to_detect_data[17592]
vlogit_input = np.linalg.norm(np.log(np.matmul(feature_input - u, NS) + 1e-12), axis=-1) * alpha
print(f'vlogit is: {vlogit_input}')

logit_input = logit_input_data[17592]

# 合并 vlogit 和 归一化后的 logit 值到一个向量
all_logits = np.concatenate(([vlogit_input], logit_input))
print(all_logits)

# 对合并后的向量进行 softmax 处理
probabilities = softmax(all_logits, axis=-1)
print(probabilities)

# 获取 p0 值
p0 = probabilities[0]
print(p0)


# 判断是否为 OOD 数据
is_ood = p0 > 0.8

if is_ood:
    print(f'Sample is OOD: {is_ood}')


vlogit is: 4.272768312308847
[4.27276831 0.         0.         ... 0.         0.         0.        ]
[0.03583892 0.00049971 0.00049971 ... 0.00049971 0.00049971 0.00049971]
0.0358389151554449


In [18]:
f1ood_indices_list = []

for i in range(num_samples):
    # 获取当前样本的 feature_input
    feature_input = feature_to_detect_data[i]
    
    vlogit_input = np.linalg.norm(np.log(np.matmul(feature_input - u, NS)+1e-12), axis=-1) * alpha

    logit_input = logit_input_data[i]

    # 合并 vlogit 和 logit 值到一个向量
    all_logits = np.concatenate(([vlogit_input], logit_input))

    # 对合并后的向量进行 softmax 处理
    probabilities = softmax(all_logits, axis=-1)

    # 获取 p0 值
    p0 = probabilities[0]

    # 判断是否为 OOD 数据
    is_ood = p0 > 0.038

    # Append OOD indices to list if is_ood is True
    if is_ood:
        f1ood_indices_list.append(i)
        # Print the OOD detection result for the current sample
        # print(f'Sample {i + 1} is OOD: {is_ood}')

    i += 1  # Increment the counter

    
# Calculate the number of elements in ood_indices_list
num_ood_samples = len(f1ood_indices_list)

# Print the number of elements in ood_indices_list
print(f"Number of OOD samples: {num_ood_samples}")
    
print(f'Over! We have {num_samples} samples and run {i} samples')

Number of OOD samples: 130773
Over! We have 265440 samples and run 265440 samples


In [19]:
# Define the filename for the CSV file
csv_filename = 'f38_ood_indices_list_temazepam.csv'

# Write ood_indices_list to the CSV file
with open(csv_filename, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Index'])  # Write header

    for index in f1ood_indices_list:
        csv_writer.writerow([index])

print(f"Saved OOD indices to {csv_filename}")

Saved OOD indices to f38_ood_indices_list_temazepam.csv


In [20]:
#inlier_indices_list = []
f2ood_indices_list = []

for i in range(num_samples):
    # 获取当前样本的 feature_input
    feature_input = feature_to_detect_data[i]
    
    vlogit_input = np.linalg.norm(np.log(np.matmul(feature_input - u, NS)+1e-12), axis=-1) * alpha

    logit_input = logit_input_data[i]

    # 合并 vlogit 和 logit 值到一个向量
    all_logits = np.concatenate(([vlogit_input], logit_input))

    # 对合并后的向量进行 softmax 处理
    probabilities = softmax(all_logits, axis=-1)

    # 获取 p0 值
    p0 = probabilities[0]

    # 判断是否为 OOD 数据
    is_ood = p0 > 0.035

    # Append OOD indices to list if is_ood is True
    if is_ood:
        f2ood_indices_list.append(i)
        # Print the OOD detection result for the current sample
        # print(f'Sample {i + 1} is OOD: {is_ood}')
    #else:
        #inlier_indices_list.append(i)        
        
    i += 1  # Increment the counter

    
# Calculate the number of elements in ood_indices_list
num_ood_samples = len(f2ood_indices_list)

# Print the number of elements in ood_indices_list
print(f"Number of OOD samples: {num_ood_samples}")
    
print(f'Over! We have {num_samples} samples and run {i} samples')

Number of OOD samples: 173428
Over! We have 265440 samples and run 265440 samples


In [21]:
# Define the filename for the CSV file
csv_filename = 'f35_ood_indices_list_temazepam.csv'

# Write ood_indices_list to the CSV file
with open(csv_filename, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Index'])  # Write header

    for index in f2ood_indices_list:
        csv_writer.writerow([index])

print(f"Saved OOD indices to {csv_filename}")

Saved OOD indices to f35_ood_indices_list_temazepam.csv


In [14]:
# Create a mask to select inliers from feature_to_detect_data
#inlier_mask = np.array([i in inlier_indices_list for i in range(num_samples)])

# Select inlier data from feature_to_detect_data
#inlier_feature_data = feature_to_detect_data[inlier_mask]

In [15]:
# Save inlier_feature_data as "feature_id.pkl"
#with open('f47_feature_id.pkl', 'wb') as f:
#    pickle.dump(inlier_feature_data, f)

#print('feature_id.pkl saved successfully!')