In [17]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from efficientnet_pytorch import EfficientNet
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
from torchvision.transforms import InterpolationMode
from tqdm import tqdm
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
import h5py
import io

class CustomImageDataset(Dataset):
    def __init__(self, csv_file, hdf5_file, transform=None, mode='train', train_data=None):
        # self.img_dir = img_dir
        self.transform = transform
        self.hdf5_file = hdf5_file
        self.label_encoders = {}
        self.mode = mode
        self.categorical_vars = ['sex', 'anatom_site_general',
                             'image_type', 'tbp_tile_type', 'tbp_lv_location', 
                             'tbp_lv_location_simple', 'attribution', 'copyright_license', 
                             'lesion_id', 'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 
                             'iddx_4', 'iddx_5', 'mel_mitotic_index']
        
        self.numerical_vars = ['age_approx', 'clin_size_long_diam_mm', 'tbp_lv_A', 
                          'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 
                          'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 
                          'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 
                          'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB', 
                          'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm', 
                          'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 
                          'tbp_lv_norm_border', 'tbp_lv_norm_color', 'tbp_lv_perimeterMM', 
                          'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt', 
                          'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 
                          'tbp_lv_y', 'tbp_lv_z', 'tbp_lv_dnn_lesion_confidence']

        if mode == 'train':
            self.annotations = self.get_full_dataframe(csv_file)
            self.encode_labels()
            self.normalize_numerical_data()
        elif mode == 'val' and train_data is not None:
            self.annotations = self.get_full_dataframe(csv_file, train_data=train_data)
            self.encode_labels(train_data)
            self.normalize_numerical_data(train_data)

        # print(numerical_df.isnull().sum())
        # print(categorical_df.isnull().sum())
        # print(numerical_df.values.dtype)
        # print(categorical_df.values.dtype)

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        # Load image
        # img_path = os.path.join(self.img_dir, self.annotations.iloc[idx]["isic_id"] + ".jpg")
        # image = Image.open(img_path).convert("RGB")

        with h5py.File(self.hdf5_file, 'r') as f:
            isic_id = self.annotations.iloc[idx]["isic_id"]

            if isic_id in f:
                image = f[isic_id]
                # Check if the data is numerical before conversion
                image_data = image[()]
                # 将字节字符串解码为图像
                image = Image.open(io.BytesIO(image_data)).convert("RGB")
                

        if self.transform:
            image = self.transform(image)

        # Load label
        label = torch.tensor(int(self.annotations.iloc[idx]["target"]))

        # Load categorical data

        numerical_df = self.annotations[self.numerical_vars]
        categorical_df = self.annotations[self.categorical_vars]
        numerical_df = numerical_df.iloc[idx]
        categorical_df = categorical_df.iloc[idx]

        numerical_data = torch.tensor(numerical_df.values, dtype=torch.float)
        categorical_data = torch.tensor(categorical_df.values, dtype=torch.long)

        # Load numerical data
        

        return image, categorical_data, numerical_data, label


    def get_full_dataframe(self, path, train_data=None):
        df = pd.read_csv(path)

        def fill_missing_with_distribution(series, distribution):
            missing_indices = series[series.isna()].index
            filled_values = np.random.choice(distribution.index, size=len(missing_indices), p=distribution.values)
            series.loc[missing_indices] = filled_values
            return series
        
        if self.mode == 'train':
            df['lesion_id'] = df['lesion_id'].apply(lambda x: 1 if pd.notnull(x) else 0)
        elif self.mode == 'val':
            for category in ['lesion_id', 'mel_mitotic_index']:
                dis = train_data.annotations[category].value_counts(normalize=True)
                generated_lesion_ids = np.random.choice(dis.index, size=len(df), p=dis.values)
                df[category] = generated_lesion_ids

        for category in ['sex', 'anatom_site_general']:
            dis = df[category].value_counts(normalize=True)
            df[category] = fill_missing_with_distribution(df[category], dis)
        
        mean_age = df['age_approx'].mean()
        df['age_approx'] = df['age_approx'].fillna(mean_age)
        
        if self.mode == 'train':
            df['iddx_2'] = df['iddx_2'].fillna(df['iddx_1'])
            df['iddx_3'] = df['iddx_3'].fillna(df['iddx_2'])
            df['iddx_4'] = df['iddx_4'].fillna(df['iddx_3'])
            df['iddx_5'] = df['iddx_5'].fillna(df['iddx_4'])
        elif self.mode == 'val':
            dis = train_data.annotations['iddx_full'].value_counts(normalize=True)
            generated_lesion_ids = np.random.choice(dis.index, size=len(df), p=dis.values)
            df['iddx_full'] = generated_lesion_ids
            for d in ['iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5']:
                df[d] = df['iddx_full']
        if self.mode == 'val':
            tbp_lv_dnn_lesion_confidence_mean = train_data.annotations['tbp_lv_dnn_lesion_confidence'].mean()
            tbp_lv_dnn_lesion_confidence_std = train_data.annotations['tbp_lv_dnn_lesion_confidence'].std()

            df['tbp_lv_dnn_lesion_confidence'] = np.random.normal(loc=tbp_lv_dnn_lesion_confidence_mean, scale=tbp_lv_dnn_lesion_confidence_std, size = len(df))

        return df
    
    def encode_labels(self, train_data=None):
        if self.mode == 'train':
            for col in self.categorical_vars:
                le = LabelEncoder()
                self.annotations[col] = le.fit_transform(self.annotations[col])
                self.label_encoders[col] = le
        elif self.mode == 'val' and train_data is not None:
            for col in self.categorical_vars:
                le = train_data.label_encoders[col]
                self.annotations[col] = self.annotations[col].apply(
                    lambda x: x if x in le.classes_ else np.random.choice(le.classes_, p=train_data.annotations[col].value_counts(normalize=True).values)
                )
                self.annotations[col] = le.transform(self.annotations[col])

    def normalize_numerical_data(self, train_data=None):
        if self.mode == 'train':
            scaler = MinMaxScaler()
            self.annotations[self.numerical_vars] = scaler.fit_transform(self.annotations[self.numerical_vars])
        elif self.mode == 'val' and train_data is not None:
            scaler = MinMaxScaler()
            scaler.fit(train_data.annotations[train_data.numerical_vars])
            self.annotations[self.numerical_vars] = scaler.transform(self.annotations[self.numerical_vars])


In [18]:
def get_transform():
    transform = transforms.Compose([
        transforms.Resize(256, interpolation=InterpolationMode.BICUBIC),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    return transform

In [19]:
mytransform = get_transform()
train_set = CustomImageDataset(csv_file="../data/train-metadata.csv", hdf5_file="../data/train-image.hdf5", transform=mytransform)


  df = pd.read_csv(path)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[missing_indices] = filled_values
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[missing_indices] = filled_values


In [20]:
test_dataset = CustomImageDataset(csv_file='../data/test-metadata.csv', hdf5_file='../data/test-image.hdf5', transform=mytransform, mode='val', train_data=train_set)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[missing_indices] = filled_values
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[missing_indices] = filled_values


In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from efficientnet_pytorch import EfficientNet
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
from torchvision.transforms import InterpolationMode
from tqdm import tqdm
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
import h5py
import io
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

class CombinedModel(nn.Module):
    def __init__(self, num_classes, categorical_dims, num_numerical_features):
        super(CombinedModel, self).__init__()
        self.efficientnet = EfficientNet.from_pretrained('efficientnet-b0')
        
        # 冻结 EfficientNet 的卷积层
        for param in self.efficientnet.parameters():
            param.requires_grad = False
        
        # 分类器部分
        in_features = self.efficientnet._fc.in_features 
        self.efficientnet._fc = nn.Identity()  # 移除原来的全连接层
        self.image_fc = nn.Linear(in_features, 512)
        
        # # 处理类别数据
        # self.categorical_embeddings = nn.ModuleList([
        #     nn.Embedding(num_embeddings=10, embedding_dim=5) for _ in range(num_categorical_features)
        # ])

        self.categorical_embeddings = nn.ModuleList([
            nn.Embedding(num_embeddings=dim+1, embedding_dim=min(50, (dim + 1) // 2))
            for dim in categorical_dims
        ])

        categorical_total_dim = sum([embedding.embedding_dim for embedding in self.categorical_embeddings])

        self.categorical_fc = nn.Linear(categorical_total_dim, 32)
        
        # 处理数值数据
        self.numerical_fc = nn.Linear(num_numerical_features, 32)
        
        # 最终分类器
        self.final_fc = nn.Sequential(
            nn.Linear(512 + 32 + 32, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )
    
    def forward(self, images, categorical_data, numerical_data):
        # 图像特征
        x_image = self.efficientnet(images)
        x_image = self.image_fc(x_image)
        
        # 类别特征
        # print(f"Shape of categorical_data: {categorical_data.shape}")
        x_categorical = [embedding(categorical_data[:, i]) for i, embedding in enumerate(self.categorical_embeddings)]
        # for i, tensor in enumerate(x_categorical):
        #     print(f"Shape of tensor {i}: {tensor.shape}")   

        x_categorical = torch.cat(x_categorical, dim=1)
        x_categorical = self.categorical_fc(x_categorical)
        
        # 数值特征
        x_numerical = self.numerical_fc(numerical_data)
        
        # 结合所有特征
        x = torch.cat((x_image, x_categorical, x_numerical), dim=1)
        x = self.final_fc(x)
        
        return x
    
def get_transform():
    transform = transforms.Compose([
        transforms.Resize(256, interpolation=InterpolationMode.BICUBIC),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    return transform




class CustomImageDataset(Dataset):
    def __init__(self, csv_file, hdf5_file, transform=None, mode='train', train_data=None):
        # self.img_dir = img_dir
        self.mode = mode
        if self.mode == "train":
            print("train data init begin")
        elif self.mode == "val":
            print("val data init begin")

        self.transform = transform
        self.hdf5_file = hdf5_file
        self.label_encoders = {}
        
        self.categorical_vars = ['sex', 'anatom_site_general',
                             'image_type', 'tbp_tile_type', 'tbp_lv_location', 
                             'tbp_lv_location_simple', 'attribution', 'copyright_license', 
                             'lesion_id', 'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 
                             'iddx_4', 'iddx_5', 'mel_mitotic_index']
        
        self.numerical_vars = ['age_approx', 'clin_size_long_diam_mm', 'tbp_lv_A', 
                          'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 
                          'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 
                          'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 
                          'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB', 
                          'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm', 
                          'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 
                          'tbp_lv_norm_border', 'tbp_lv_norm_color', 'tbp_lv_perimeterMM', 
                          'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt', 
                          'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 
                          'tbp_lv_y', 'tbp_lv_z', 'tbp_lv_dnn_lesion_confidence']

        if mode == 'train':
            self.annotations = self.get_full_dataframe(csv_file)
            self.encode_labels()
            self.normalize_numerical_data()
        elif mode == 'val' or mode == 'test' and train_data is not None:
            print("getting full dataframe")
            self.annotations = self.get_full_dataframe(csv_file, train_data=train_data)
            print("encoding labels")
            self.encode_labels(train_data)
            print("normalizing numerical data")
            self.normalize_numerical_data(train_data)


        if self.mode == "train":
            print("train data init done")
        elif self.mode == "val":
            print("val data init done")
        # print(numerical_df.isnull().sum())
        # print(categorical_df.isnull().sum())
        # print(numerical_df.values.dtype)
        # print(categorical_df.values.dtype)

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        # Load image
        # img_path = os.path.join(self.img_dir, self.annotations.iloc[idx]["isic_id"] + ".jpg")
        # image = Image.open(img_path).convert("RGB")

        with h5py.File(self.hdf5_file, 'r') as f:
            isic_id = self.annotations.iloc[idx]["isic_id"]

            if isic_id in f:
                image = f[isic_id]
                # Check if the data is numerical before conversion
                image_data = image[()]
                # 将字节字符串解码为图像
                image = Image.open(io.BytesIO(image_data)).convert("RGB")
                

        if self.transform:
            image = self.transform(image)

        # Load categorical data

        numerical_df = self.annotations[self.numerical_vars]
        categorical_df = self.annotations[self.categorical_vars]
        numerical_df = numerical_df.iloc[idx]
        categorical_df = categorical_df.iloc[idx]

        numerical_data = torch.tensor(numerical_df.values, dtype=torch.float)
        categorical_data = torch.tensor(categorical_df.values, dtype=torch.long)

        # Load numerical data
        if self.mode in ['train', 'val']:
            label = torch.tensor(int(self.annotations.iloc[idx]["target"]))
            return image, categorical_data, numerical_data, label

        elif self.mode == 'test':
            isic_id = self.annotations.iloc[idx]["isic_id"]
            return image, categorical_data, numerical_data, isic_id
        

    def get_full_dataframe(self, df, train_data=None):
        def fill_missing_with_distribution(series, distribution):
            missing_indices = series[series.isna()].index
            filled_values = np.random.choice(distribution.index, size=len(missing_indices), p=distribution.values)
            series.loc[missing_indices] = filled_values
            return series
        
        if self.mode == 'train':
            df['lesion_id'] = df['lesion_id'].apply(lambda x: 1 if pd.notnull(x) else 0)
        elif self.mode != 'train':
            for category in ['lesion_id', 'mel_mitotic_index']:
                dis = train_data.annotations[category].value_counts(normalize=True)
                generated_lesion_ids = np.random.choice(dis.index, size=len(df), p=dis.values)
                df[category] = generated_lesion_ids

        for category in ['sex', 'anatom_site_general']:
            dis = df[category].value_counts(normalize=True)
            df[category] = fill_missing_with_distribution(df[category], dis)
        
        mean_age = df['age_approx'].mean()
        df['age_approx'] = df['age_approx'].fillna(mean_age)
        
        if self.mode == 'train':
            df['iddx_2'] = df['iddx_2'].fillna(df['iddx_1'])
            df['iddx_3'] = df['iddx_3'].fillna(df['iddx_2'])
            df['iddx_4'] = df['iddx_4'].fillna(df['iddx_3'])
            df['iddx_5'] = df['iddx_5'].fillna(df['iddx_4'])
        elif self.mode != 'train':
            dis = train_data.annotations['iddx_full'].value_counts(normalize=True)
            generated_lesion_ids = np.random.choice(dis.index, size=len(df), p=dis.values)
            df['iddx_full'] = generated_lesion_ids
            for d in ['iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5']:
                df[d] = df['iddx_full']
        if self.mode != 'train':
            tbp_lv_dnn_lesion_confidence_mean = train_data.annotations['tbp_lv_dnn_lesion_confidence'].mean()
            tbp_lv_dnn_lesion_confidence_std = train_data.annotations['tbp_lv_dnn_lesion_confidence'].std()

            df['tbp_lv_dnn_lesion_confidence'] = np.random.normal(loc=tbp_lv_dnn_lesion_confidence_mean, scale=tbp_lv_dnn_lesion_confidence_std, size = len(df))

        return df
    
    def encode_labels(self, train_data=None):
        if self.mode == 'train':
            for col in self.categorical_vars:
                le = LabelEncoder()
                self.annotations[col] = le.fit_transform(self.annotations[col])
                self.label_encoders[col] = le
        elif self.mode != 'train' and train_data is not None:
            for col in self.categorical_vars:
                le = train_data.label_encoders[col]
                mask = ~self.annotations[col].isin(le.classes_)
    
                # 对新标签进行随机替换
                if mask.any():
                    self.annotations.loc[mask, col] = np.random.choice(
                        le.classes_, size=mask.sum(), p=train_data.annotations[col].value_counts(normalize=True).values
                    )
                
                # 进行编码
                self.annotations[col] = le.transform(self.annotations[col])
                print(f"Label encoder for {col} has {len(le.classes_)} classes")

                # self.annotations[col] = self.annotations[col].apply(
                #     lambda x: x if x in le.classes_ else np.random.choice(le.classes_, p=train_data.annotations[col].value_counts(normalize=True).values)
                # )
                # self.annotations[col] = le.transform(self.annotations[col])
                # print(f"Label encoder for {col} has {len(le.classes_)} classes")

    def normalize_numerical_data(self, train_data=None):
        if self.mode == 'train':
            scaler = MinMaxScaler()
            self.annotations[self.numerical_vars] = scaler.fit_transform(self.annotations[self.numerical_vars])
        elif self.mode != 'train' and train_data is not None:
            scaler = MinMaxScaler()
            scaler.fit(train_data.annotations[train_data.numerical_vars])
            self.annotations[self.numerical_vars] = scaler.transform(self.annotations[self.numerical_vars])



In [2]:

def generate_output(model_path):
    mytransform = get_transform()
    csv_file = pd.read_csv("../data/train-metadata.csv")
    train_df, val_df = train_test_split(csv_file, test_size=0.2, random_state=42)


    train_set = CustomImageDataset(csv_file=train_df, hdf5_file="../data/train-image.hdf5", transform=mytransform)
    val_set = CustomImageDataset(csv_file=val_df, hdf5_file="../data/train-image.hdf5", transform=mytransform, mode='val', train_data=train_set)
    test_set = CustomImageDataset(csv_file=pd.read_csv("../data/test-metadata.csv"), hdf5_file="../data/test-image.hdf5", transform=mytransform, mode='test', train_data=train_set)


    categorical_dims = [2, 5, 1, 2, 21, 8, 7, 3, 2, 52, 3, 15, 28, 52, 52, 7]
    model = CombinedModel(2, categorical_dims, 35)  # load your model here

    # 加载 state_dict
    state_dict = torch.load("best_model.pt")

    # 创建一个新的 state_dict，将键名中的 'module.' 前缀移除
    new_state_dict = {}
    for k, v in state_dict.items():
        new_key = k.replace("module.", "")  # 移除 'module.' 前缀
        new_state_dict[new_key] = v

    # 加载新的 state_dict 到模型
    model.load_state_dict(new_state_dict)
    return model, val_set


    

In [3]:
model, val_set = generate_output("best_model.pt")
model.eval()
model = model.cuda()
# test_loader = DataLoader(test_set, batch_size=32, shuffle=False, num_workers=8, pin_memory=True)
val_loader = DataLoader(val_set, batch_size=32, shuffle=False, num_workers=8, pin_memory=True)
results = []
criterion = nn.CrossEntropyLoss()   
with torch.no_grad():
    results = []
    val_loss = 0.0
    correct = 0
    all_labels = []
    all_probs = []

    for images, categorical_data, numerical_data, labels in tqdm(val_loader):
        images = images.cuda()
        categorical_data = categorical_data.cuda()
        numerical_data = numerical_data.cuda()
        labels = labels.cuda()
        outputs = model(images, categorical_data, numerical_data)
        loss = criterion(outputs, labels)
        val_loss += loss.item() * images.size(0)

        probabilities = F.softmax(outputs, dim=1)
        
        all_probs.extend(probabilities[:, 1].cpu().numpy())  # Probabilities of the positive class
        all_labels.extend(labels.cpu().numpy())

        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()

    val_loss /= len(val_loader.dataset)
    accuracy = correct / len(val_loader.dataset)

    # Calculate pAUC
    all_probs = np.array(all_probs)
    all_labels = np.array(all_labels)

    fpr, tpr, thresholds = roc_curve(all_labels, all_probs)
    tpr_threshold = 0.8
    mask = tpr >= tpr_threshold
    fpr_filtered = fpr[mask]
    tpr_filtered = tpr[mask]
    pAUC = auc(fpr_filtered, tpr_filtered)
    pAUC_normalized = pAUC / (tpr_filtered[-1] - tpr_threshold)
    print(f'Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}, pAUC: {pAUC_normalized:.4f}')

  csv_file = pd.read_csv("../data/train-metadata.csv")


train data init begin


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[missing_indices] = filled_values
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[missing_indices] = filled_values


train data init done
val data init begin
getting full dataframe


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[missing_indices] = filled_values
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[missing_indices] = filled_values


encoding labels
Label encoder for sex has 2 classes
Label encoder for anatom_site_general has 5 classes
Label encoder for image_type has 1 classes
Label encoder for tbp_tile_type has 2 classes
Label encoder for tbp_lv_location has 21 classes
Label encoder for tbp_lv_location_simple has 8 classes
Label encoder for attribution has 7 classes
Label encoder for copyright_license has 3 classes
Label encoder for lesion_id has 2 classes
Label encoder for iddx_full has 50 classes


  self.annotations.loc[mask, col] = np.random.choice(
  self.annotations.loc[mask, col] = np.random.choice(


Label encoder for iddx_1 has 3 classes
Label encoder for iddx_2 has 14 classes
Label encoder for iddx_3 has 27 classes


  self.annotations.loc[mask, col] = np.random.choice(
 'Angiofibroma' 'Angiofibroma']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  self.annotations.loc[mask, col] = np.random.choice(
 'Angiofibroma, Facial' 'Angiofibroma, Facial' 'Angiofibroma, Facial']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  self.annotations.loc[mask, col] = np.random.choice(


Label encoder for iddx_4 has 50 classes
Label encoder for iddx_5 has 50 classes
Label encoder for mel_mitotic_index has 8 classes
normalizing numerical data


 'Angiofibroma, Facial' 'Angiofibroma, Facial' 'Angiofibroma, Facial']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  self.annotations.loc[mask, col] = np.random.choice(
  self.annotations.loc[mask, col] = np.random.choice(


val data init done
getting full dataframe
encoding labels
Label encoder for sex has 2 classes
Label encoder for anatom_site_general has 5 classes
Label encoder for image_type has 1 classes
Label encoder for tbp_tile_type has 2 classes
Label encoder for tbp_lv_location has 21 classes
Label encoder for tbp_lv_location_simple has 8 classes
Label encoder for attribution has 7 classes
Label encoder for copyright_license has 3 classes
Label encoder for lesion_id has 2 classes
Label encoder for iddx_full has 50 classes
Label encoder for iddx_1 has 3 classes


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[missing_indices] = filled_values
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  series.loc[missing_indices] = filled_values
  self.annotations.loc[mask, col] = np.random.choice(
  self.annotations.loc[mask, col] = np.random.choice(
  self.annotations.loc[mask, col] = np.random.choice(
  self.annotations.loc[mask, col] = np.random.choice(
  self.annotations.loc[mask, col] = np.random.choice(


Label encoder for iddx_2 has 14 classes
Label encoder for iddx_3 has 27 classes
Label encoder for iddx_4 has 50 classes
Label encoder for iddx_5 has 50 classes
Label encoder for mel_mitotic_index has 8 classes
normalizing numerical data


  self.annotations.loc[mask, col] = np.random.choice(
  self.annotations.loc[mask, col] = np.random.choice(


Loaded pretrained weights for efficientnet-b0


100%|██████████| 2507/2507 [20:47<00:00,  2.01it/s]


Validation Loss: 0.2541, Accuracy: 0.9741, pAUC: 1.5876


In [4]:
fpr, tpr, thresholds = roc_curve(all_labels, all_probs)
tpr_threshold = 0.8
mask = (fpr >= 0) & (fpr <= 0.1)
fpr_filtered = fpr[mask]
tpr_filtered = tpr[mask]
pAUC = auc(fpr_filtered, tpr_filtered)
pAUC_normalized = pAUC / (tpr_filtered[-1] - tpr_threshold)
print(f'Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}, pAUC: {pAUC_normalized:.4f}')

Validation Loss: 0.2541, Accuracy: 0.9741, pAUC: -0.0284


In [5]:
import numpy as np
from sklearn.metrics import roc_curve

def compute_pauc_above_tpr(y_true, y_scores, tpr_threshold=0.8):
    # Step 1: Compute ROC curve
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)

    # Step 2: Filter out TPR < 0.8
    indices_above_tpr = np.where(tpr >= tpr_threshold)[0]
    
    # If no TPR values are above the threshold, return 0
    if len(indices_above_tpr) == 0:
        return 0.0
    
    # Select the portion of the curve where TPR >= 0.8
    fpr_above = fpr[indices_above_tpr]
    tpr_above = tpr[indices_above_tpr]
    
    # Step 3: Calculate pAUC using the trapezoidal rule
    pauc = np.trapz(tpr_above, fpr_above)
    
    # Normalize the pAUC by dividing by the maximum possible pAUC in this range
    max_pauc = 0.2  # Because TPR range is from 0.8 to 1, and max FPR range would be 0 to 1
    pauc_normalized = pauc / max_pauc
    return pauc
    return pauc_normalized

# Example usage:
# y_true: array of true binary labels (0 or 1)
# y_scores: array of predicted probabilities for the positive class
pauc_score = compute_pauc_above_tpr(all_labels, all_probs)
print(f"The normalized pAUC above 80% TPR is: {pauc_score}")


The normalized pAUC above 80% TPR is: 0.31752642232689154


  pauc = np.trapz(tpr_above, fpr_above)


In [6]:
predicted_labels = (all_probs >= 0.5).astype(int)
false_negatives = np.sum((all_labels == 1) & (predicted_labels == 1))

# 输出假阴性的数量
print(f"假阴性的数量 (FN): {false_negatives}")

假阴性的数量 (FN): 8


In [7]:
print(np.sum(all_labels == 1))

74


In [4]:
import torch
from torchvision.models import efficientnet_b0

# 初始化模型
model = efficientnet_b0(pretrained=False)  # 不使用在线预训练模型，手动加载
model_dict = model.state_dict()

# 打印状态字典中的所有键
for key in model_dict.keys():
    print(key)
# 加载本地保存的权重

# 将模型移到GPU（如果需要）
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')


features.0.0.weight
features.0.1.weight
features.0.1.bias
features.0.1.running_mean
features.0.1.running_var
features.0.1.num_batches_tracked
features.1.0.block.0.0.weight
features.1.0.block.0.1.weight
features.1.0.block.0.1.bias
features.1.0.block.0.1.running_mean
features.1.0.block.0.1.running_var
features.1.0.block.0.1.num_batches_tracked
features.1.0.block.1.fc1.weight
features.1.0.block.1.fc1.bias
features.1.0.block.1.fc2.weight
features.1.0.block.1.fc2.bias
features.1.0.block.2.0.weight
features.1.0.block.2.1.weight
features.1.0.block.2.1.bias
features.1.0.block.2.1.running_mean
features.1.0.block.2.1.running_var
features.1.0.block.2.1.num_batches_tracked
features.2.0.block.0.0.weight
features.2.0.block.0.1.weight
features.2.0.block.0.1.bias
features.2.0.block.0.1.running_mean
features.2.0.block.0.1.running_var
features.2.0.block.0.1.num_batches_tracked
features.2.0.block.1.0.weight
features.2.0.block.1.1.weight
features.2.0.block.1.1.bias
features.2.0.block.1.1.running_mean
feat

In [5]:
model.load_state_dict(torch.load('../efficientnet_b0.pth'))

print("\n")
model_dict = model.state_dict()

# 打印状态字典中的所有键
for key in model_dict.keys():
    print(key)



features.0.0.weight
features.0.1.weight
features.0.1.bias
features.0.1.running_mean
features.0.1.running_var
features.0.1.num_batches_tracked
features.1.0.block.0.0.weight
features.1.0.block.0.1.weight
features.1.0.block.0.1.bias
features.1.0.block.0.1.running_mean
features.1.0.block.0.1.running_var
features.1.0.block.0.1.num_batches_tracked
features.1.0.block.1.fc1.weight
features.1.0.block.1.fc1.bias
features.1.0.block.1.fc2.weight
features.1.0.block.1.fc2.bias
features.1.0.block.2.0.weight
features.1.0.block.2.1.weight
features.1.0.block.2.1.bias
features.1.0.block.2.1.running_mean
features.1.0.block.2.1.running_var
features.1.0.block.2.1.num_batches_tracked
features.2.0.block.0.0.weight
features.2.0.block.0.1.weight
features.2.0.block.0.1.bias
features.2.0.block.0.1.running_mean
features.2.0.block.0.1.running_var
features.2.0.block.0.1.num_batches_tracked
features.2.0.block.1.0.weight
features.2.0.block.1.1.weight
features.2.0.block.1.1.bias
features.2.0.block.1.1.running_mean
fe

In [6]:
from efficientnet_pytorch import EfficientNet

model = EfficientNet.from_pretrained('efficientnet-b0')
model_dict = model.state_dict()
for key in model_dict.keys():
    print(key)

Loaded pretrained weights for efficientnet-b0
_conv_stem.weight
_bn0.weight
_bn0.bias
_bn0.running_mean
_bn0.running_var
_bn0.num_batches_tracked
_blocks.0._depthwise_conv.weight
_blocks.0._bn1.weight
_blocks.0._bn1.bias
_blocks.0._bn1.running_mean
_blocks.0._bn1.running_var
_blocks.0._bn1.num_batches_tracked
_blocks.0._se_reduce.weight
_blocks.0._se_reduce.bias
_blocks.0._se_expand.weight
_blocks.0._se_expand.bias
_blocks.0._project_conv.weight
_blocks.0._bn2.weight
_blocks.0._bn2.bias
_blocks.0._bn2.running_mean
_blocks.0._bn2.running_var
_blocks.0._bn2.num_batches_tracked
_blocks.1._expand_conv.weight
_blocks.1._bn0.weight
_blocks.1._bn0.bias
_blocks.1._bn0.running_mean
_blocks.1._bn0.running_var
_blocks.1._bn0.num_batches_tracked
_blocks.1._depthwise_conv.weight
_blocks.1._bn1.weight
_blocks.1._bn1.bias
_blocks.1._bn1.running_mean
_blocks.1._bn1.running_var
_blocks.1._bn1.num_batches_tracked
_blocks.1._se_reduce.weight
_blocks.1._se_reduce.bias
_blocks.1._se_expand.weight
_blocks.1

In [7]:
import torch
from torchvision.models import efficientnet_b0, efficientnet_b7

# 下载预训练的EfficientNet-B0模型
model = efficientnet_b7(pretrained=True)

# 保存模型权重到本地文件
torch.save(model.state_dict(), 'efficientnet_b7.pth')


Downloading: "https://download.pytorch.org/models/efficientnet_b7_lukemelas-c5b4e57e.pth" to /home/runhui/.cache/torch/hub/checkpoints/efficientnet_b7_lukemelas-c5b4e57e.pth
100%|██████████| 255M/255M [00:04<00:00, 58.9MB/s] 


In [1]:
import os

def rename_files_in_directory(directory_path):
    # 获取目录下的所有文件
    files = sorted(os.listdir(directory_path))
    
    # 只处理前 10100 个文件
    for i, filename in enumerate(files[:10100]):
        # 获取文件的扩展名
        extension = os.path.splitext(filename)[1]
        # 创建新的文件名
        new_name = f"{i + 1}.jpg"
        # 构建完整的旧文件路径和新文件路径
        old_file = os.path.join(directory_path, filename)
        new_file = os.path.join(directory_path, new_name)
        # 重命名文件
        os.rename(old_file, new_file)
        print(f"Renamed: {old_file} to {new_file}")

# 使用示例
directory_path = '../data/fake_images'  # 替换为你的目录路径
rename_files_in_directory(directory_path)


Renamed: ../data/fake_images/netG_epoch_0.pth_0.png to ../data/fake_images/1.jpg
Renamed: ../data/fake_images/netG_epoch_0.pth_1.png to ../data/fake_images/2.jpg
Renamed: ../data/fake_images/netG_epoch_0.pth_10.png to ../data/fake_images/3.jpg
Renamed: ../data/fake_images/netG_epoch_0.pth_11.png to ../data/fake_images/4.jpg
Renamed: ../data/fake_images/netG_epoch_0.pth_12.png to ../data/fake_images/5.jpg
Renamed: ../data/fake_images/netG_epoch_0.pth_13.png to ../data/fake_images/6.jpg
Renamed: ../data/fake_images/netG_epoch_0.pth_14.png to ../data/fake_images/7.jpg
Renamed: ../data/fake_images/netG_epoch_0.pth_15.png to ../data/fake_images/8.jpg
Renamed: ../data/fake_images/netG_epoch_0.pth_16.png to ../data/fake_images/9.jpg
Renamed: ../data/fake_images/netG_epoch_0.pth_17.png to ../data/fake_images/10.jpg
Renamed: ../data/fake_images/netG_epoch_0.pth_18.png to ../data/fake_images/11.jpg
Renamed: ../data/fake_images/netG_epoch_0.pth_19.png to ../data/fake_images/12.jpg
Renamed: ../dat

In [3]:
import math
for i in range(20):
    print(2*math.factorial(2*i))
    print(3*math.pow(2, i)*(math.factorial(i))*math.factorial(i))
    print(f"{2*math.factorial(2*i)>3*math.pow(2, i)*(math.factorial(i))*math.factorial(i)}")

2
3.0
False
4
6.0
False
48
48.0
False
1440
864.0
True
80640
27648.0
True
7257600
1382400.0
True
958003200
99532800.0
True
174356582400
9754214400.0
True
41845579776000
1248539443200.0
True
12804747411456000
202263389798400.0
True
4865804016353280000
4.045267795968e+16
True
2248001455555215360000
9.78954806624256e+18
True
1240896803466478878720000
2.8193898430778573e+21
True
806582922253211271168000000
9.529537669603158e+23
True
609776689223427721003008000000
3.735578766484438e+26
True
530505719624382117272616960000000
1.681010444917997e+29
True
526261673867387060334436024320000000
8.606773477980144e+31
True
590465598079208281695237219287040000000
4.974715070272524e+34
True
743986653579802434935998896301670400000000
3.2236153655365955e+37
True
1046045234933202223520014448200148582400000000
2.3274502939174217e+40
True
