# Get center vector

In [12]:
import pandas as pd
import numpy as np
import json

# Load the exported logits and keep only the training rows of one domain.
domain_name = "amazon"
df = pd.read_excel("/home/tuananh/tuananh/domain_calibration/experiments/resnet_34_kfold_val_outdomain_logits.xlsx")
# A row is kept when it belongs to `domain_name` AND its phase is "train".
is_selected = (df["domain_name"] == domain_name) & (df["phase"] == "train")
df = df[is_selected]


In [13]:
# Every "logit" cell is a JSON string; decode each one and stack the results
# row-wise into a single array.
decoded_logits = [json.loads(cell) for cell in df["logit"].to_numpy()]
train_logit = np.vstack(decoded_logits)
train_logit.shape

(1878, 31)

## Get mean vector

In [14]:
# import torch
# from torch.utils.data import DataLoader

# def default_collate(batch):
#     batch = list(filter(lambda x: x is not None, batch))
#     data_chest_cls = torch.stack([torch.tensor(item[0]) for item in batch])
#     data_paths = [item[1] for item in batch]
    
#     return data_chest_cls, data_paths

# class Dataset(torch.utils.data.Dataset):
#     def __init__(self, image_paths):
#         self.image_paths = image_paths

#     def __len__(self):
#         return len(self.image_paths)

#     def __getitem__(self, index):
#         path = self.image_paths[index]
#         image = np.array(Image.open(path).convert('RGB').resize((224, 224)))

#         return image, path

# params = {'batch_size': 100,
#           'shuffle': True,
#           'num_workers': 6}

# data_set = Dataset(path_source)
# data_generator = torch.utils.data.DataLoader(data_set, collate_fn=default_collate, **params)

# def batch_mean(loader):
#     nimages = 0
#     mean = 0.
#     for batch, path in tqdm(loader):
#         batch = batch.permute(0, 3, 1, 2).float()
#         # Rearrange batch to be the shape of [B, C, W * H]
#         batch = batch.view(batch.size(0), batch.size(1), -1)
#         # Update total number of images
#         nimages += batch.size(0)
#         # Compute mean and std here
#         mean += batch.mean(2).sum(0)
#     mean /= nimages

#     return mean

# mean = batch_mean(data_generator)
# print("mean: \n", mean)

## Get mean feature

In [24]:
import os
import torch.nn as nn
import sys
sys.path.insert(0, "../../")

from src.base_line.restnet34 import initialize_model


In [25]:
import torch
from torch.utils import data
import cv2


def image_resize(image, width = None, height = None, inter = cv2.INTER_AREA):
    """Resize ``image`` to a target width or height, keeping its aspect ratio.

    Args:
        image: Array whose first two dimensions are (rows, columns).
        width: Target width in pixels. When given it takes precedence and
            ``height`` is ignored.
        height: Target height in pixels; only used when ``width`` is None.
        inter: Interpolation flag forwarded to ``cv2.resize``.

    Returns:
        The input image unchanged when neither ``width`` nor ``height`` is
        given, otherwise the resized image.
    """
    src_h, src_w = image.shape[:2]

    # Nothing requested: hand the image back untouched.
    if width is None and height is None:
        return image

    if width is not None:
        # Scale the height by the same factor that maps the width to `width`.
        scale = width / float(src_w)
        target_size = (width, int(src_h * scale))
    else:
        # Scale the width by the same factor that maps the height to `height`.
        scale = height / float(src_h)
        target_size = (int(src_w * scale), height)

    return cv2.resize(image, target_size, interpolation = inter)


class Dataset(data.Dataset):
    """Image dataset driven by a metadata dataframe.

    Each row of ``df_info`` must provide a ``classes`` value (the label) and
    an ``imageid`` of the form ``"<class folder>__<file name>"``; the image is
    read from ``<folder_data>/<class folder>/<file name>``.

    Args:
        df_info: Dataframe holding one row per sample.
        folder_data: Root directory that contains the per-class folders.
        image_size_ratio: Target size passed to ``image_resize`` as both
            width and height (``image_resize`` uses the width when both are
            given).
        input_size: Side of the square the image is forced to when it is
            smaller than the crop size in either dimension.
        transforms: Optional callable invoked as ``transforms(image=image)``
            that returns a mapping with an ``"image"`` entry.
    """

    # Matches the 224x224 ``CenterCrop`` built in ``get_agument``; an image
    # smaller than this in any dimension cannot be cropped.
    _MIN_SIDE = 224

    def __init__(self, df_info, folder_data, image_size_ratio, 
                input_size, transforms=None):
        self.df_info = df_info
        self.folder_data = folder_data
        self.image_size_ratio = image_size_ratio
        self.input_size = input_size
        self.transforms = transforms

    def __len__(self):
        """Return the number of samples (rows of ``df_info``)."""
        return len(self.df_info)

    @staticmethod
    def _is_below_crop_size(shape, min_side=224):
        """Return True when height OR width of ``shape`` is below ``min_side``.

        The original check compared the height twice and never looked at the
        width, so narrow images slipped through un-resized.
        """
        return shape[0] < min_side or shape[1] < min_side

    def __getitem__(self, index):
        """Load, resize and transform the sample at position ``index``.

        Returns:
            Tuple ``(X, y, row)``: the image tensor permuted with
            ``(2, 0, 1)`` (channels-first, assuming the image is H x W x C),
            the label, and the metadata row.

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
        """
        row = self.df_info.iloc[index]
        labels = row["classes"]
        image_id = row["imageid"]

        id_parts = image_id.split("__")
        class_name = id_parts[0]
        image_name = id_parts[1]

        image_path = f"{self.folder_data}/{class_name}/{image_name}"

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {image_path}")

        image = image_resize(image, width=self.image_size_ratio, height=self.image_size_ratio)
        if self._is_below_crop_size(image.shape, self._MIN_SIDE):
            image = cv2.resize(image, (self.input_size, self.input_size), interpolation = cv2.INTER_AREA)

        if self.transforms:
            image = self.transforms(image=image)["image"]

        X = torch.Tensor(image).permute(2, 0, 1)
        y = labels

        return X, y, row


In [30]:
# Build the network via the project helper, then swap its `fc` attribute for
# an identity module so the forward pass returns whatever fed into `fc`
# (presumably the pooled backbone features — confirm against initialize_model).
model_kwargs = dict(num_classes=31, feature_extract=True, use_pretrained=True)
model_ft = initialize_model(**model_kwargs)
model_ft.fc = nn.Identity()


In [34]:
from threading import main_thread
import torch
import albumentations as A
import pandas as pd
from tqdm import tqdm
import time
import copy
import glob
import numpy as np
import yaml

# import os
# import sys
# path = os.path.dirname(__file__)
# root_folder = os.path.join(
#     os.path.abspath(path).split("domain_calibration")[0],
#     "domain_calibration"
# )
# sys.path.insert(0, root_folder)

import warnings
warnings.filterwarnings("ignore")

from src.base_line.data_loader_export_logits import Dataset
from src.base_line.restnet34 import initialize_model

# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


def load_from_yaml(fname):
    """Parse the UTF-8 YAML file at ``fname`` and return its content.

    Uses ``yaml.safe_load``, so only plain YAML data types are constructed.
    """
    with open(fname, encoding='utf-8') as stream:
        return yaml.safe_load(stream)


def get_agument(augment_name):
    """Build an albumentations pipeline from a sequence of augmentation names.

    Args:
        augment_name: Iterable of names. Recognised values are
            "CenterCrop", "Flip", "Blur", "GaussNoise", "RandomBrightness"
            and "Normalize"; anything else is silently skipped.

    Returns:
        ``A.Compose`` of the recognised transforms, in the order given.
    """
    # Factories are wrapped in lambdas so a transform class is only looked up
    # and instantiated when its name is actually requested.
    factories = {
        "CenterCrop": lambda: A.CenterCrop(width=224, height=224),
        "Flip": lambda: A.Flip(always_apply=False, p=0.5),
        "Blur": lambda: A.Blur(always_apply=False, p=0.5, blur_limit=(3, 7)),
        "GaussNoise": lambda: A.GaussNoise(
            always_apply=False, p=0.5, var_limit=(10.0, 50.0)),
        "RandomBrightness": lambda: A.RandomBrightness(
            always_apply=False, p=0.5,
            limit=(-0.20000000298023224, 0.20000000298023224)),
        "Normalize": lambda: A.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225)),
    }

    pipeline = []
    for requested in augment_name:
        # Non-string entries can never match a known name; skip them quietly.
        make = factories.get(requested) if isinstance(requested, str) else None
        if make is not None:
            pipeline.append(make())

    return A.Compose(pipeline)


def default_collate(batch):
    """Collate ``(image, label, record)`` samples into batched tensors.

    ``None`` samples are dropped before stacking.

    Args:
        batch: Iterable of samples; each non-None sample is indexable with
            the image at position 0, the label at 1 and the record at 2.

    Returns:
        Tuple ``(images, labels, records)`` where ``images`` and ``labels``
        are stacked tensors and ``records`` is a plain list.
    """
    samples = [sample for sample in batch if sample is not None]
    # torch.as_tensor reuses an existing tensor instead of copy-constructing
    # it; torch.tensor(tensor) would copy and emit a UserWarning per sample.
    images = torch.stack([torch.as_tensor(sample[0]) for sample in samples])
    labels = torch.stack([torch.as_tensor(sample[1]) for sample in samples])
    records = [sample[2] for sample in samples]

    return images, labels, records


def export_features(
    model, dataloaders, 
    export_main_domain=True,
    phase="train"):
    """Run ``model`` over one dataloader and export per-sample features.

    Args:
        model: Module called as ``model(inputs)``; its output is flattened to
            one vector per sample.
        dataloaders: Mapping from phase name to an iterable yielding
            ``(inputs, labels, records)`` batches.
        export_main_domain: Unused; kept so existing callers keep working.
        phase: Key of ``dataloaders`` to iterate. The original code read an
            undefined global ``phase``; it is now an explicit argument.

    Returns:
        DataFrame with one row per sample (built from ``records``), a
        ``feature_vector`` column holding the sample's features and a
        ``mean_feature_vector`` column holding the mean over all samples
        (same value on every row).

    Raises:
        ValueError: If the selected dataloader yields no samples.
    """
    batch_frames = []
    n_samples = 0
    feature_sum = 0.
    model.eval()
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for inputs, labels, records in tqdm(dataloaders[phase]):
            inputs = inputs.to(device)
            features = model(inputs).view(inputs.size(0), -1)

            # Accumulate the per-sample sum (not the per-batch mean) so a
            # smaller final batch does not skew the overall mean.
            feature_sum = feature_sum + features.sum(0)
            n_samples += inputs.size(0)

            df = pd.DataFrame(records)
            df["feature_vector"] = features.cpu().numpy().tolist()
            batch_frames.append(df)

    if n_samples == 0:
        raise ValueError(f"No samples found for phase '{phase}'")

    mean_vector = (feature_sum / n_samples).cpu().numpy().tolist()
    df_result = pd.concat(batch_frames)
    # Repeat the mean on every row; assigning the bare list would be treated
    # as one value per row and fail on a length mismatch.
    df_result["mean_feature_vector"] = [mean_vector] * len(df_result)

    return df_result


In [19]:
# Driver: for every dataset/domain in the config, load the per-fold weights,
# strip the classifier head and write the exported features to one Excel
# workbook per domain (one sheet per processed fold).
config_params = load_from_yaml("./configs/exp.yaml")

# Folds are iterated as 0..k_fold-1 and matched against the `kfold` column;
# only the folds listed in config_params["kfold_exp"] are processed.
k_fold = 3

for dataset_name in config_params["dataset_name"]:
    # Office-Home nests its data one directory deeper than the other datasets.
    if dataset_name == "Office-Home":
        path_information =\
            f"{config_params['path_data']}/{dataset_name}/OfficeHomeDataset_10072016/information/"
    else:
        path_information =\
            f"{config_params['path_data']}/{dataset_name}/information/"

    all_csv_domain = glob.glob(path_information + "/*.csv")
    for path_csv in all_csv_domain:
        # The domain name is the CSV file name without its "_kfold.csv" suffix.
        main_domain = path_csv.split("/")[-1].replace("_kfold.csv", "")

        print(f"Running on {dataset_name} with domain: {main_domain} .... ")
        path_weight_save =\
            f"{config_params['path_save']}/{dataset_name}/{main_domain}/"

        os.makedirs(path_weight_save, exist_ok=True)

        data_transforms = {
            'train': get_agument(config_params["train_augment"]),
            'val': get_agument(config_params["val_augment"])
        }

        print("Initializing Datasets and Dataloaders...")
        informations = [
            {
                "dataframe": pd.read_csv(path_csv),
                "domain_name": main_domain
            }
        ]

        number_classes = len(set(informations[0]["dataframe"]["classes"]))

        with pd.ExcelWriter(f'{path_weight_save}/resnet_34_export_feature_vector.xlsx') as writer:
            for k in range(k_fold):
                if k not in config_params["kfold_exp"]:
                    continue

                model_ft = initialize_model(
                    num_classes=number_classes,
                    feature_extract=True, 
                    use_pretrained=True
                )

                path_weight = path_weight_save + f"resnet_34_kfold_{str(k)}.pth"

                model_ft.load_state_dict(
                    torch.load(path_weight, map_location=device)
                )
                # Replace the classifier head after loading the weights so the
                # state dict still matches the architecture it was saved from.
                model_ft.fc = nn.Identity()
                model_ft.eval()
                model_ft = model_ft.to(device)

                dataframe_logits = []
                for information in informations:
                    domain_name = information["domain_name"]
                    if dataset_name ==  "Office-Home":
                        path_root = f"{config_params['path_data']}/{dataset_name}/OfficeHomeDataset_10072016/{domain_name}/"
                    elif dataset_name == "Bing-Caltech":
                        path_root = f"{config_params['path_data']}/{dataset_name}/BingLarge_C256_deduped/"
                    elif dataset_name == "Domain-net":
                        path_root = f"{config_params['path_data']}/{dataset_name}/{domain_name}/"
                    else:
                        path_root = f"{config_params['path_data']}/{dataset_name}/{domain_name}/images/"

                    df_info_k_fold = information["dataframe"].copy()
                    df_info_k_fold["domain_name"] = domain_name
                    df_info_k_fold["feature_vector"] = np.nan
                    df_info_k_fold["mean_feature_vector"] = np.nan
                    # Rows of fold k form the validation split; all others train.
                    df_info_k_fold["phase"] = "train"
                    df_info_k_fold.loc[df_info_k_fold.kfold==k, ['phase']] = 'val'

                    image_datasets = {
                        split: Dataset(
                            df_info=df_info_k_fold[df_info_k_fold.phase==split],
                            folder_data=path_root,
                            image_size_ratio=config_params["image_size_ratio"], 
                            input_size=config_params["input_size"], 
                            transforms=data_transforms[split])
                        for split in ['train', 'val']
                    }

                    dataloaders_dict = {
                        x: torch.utils.data.DataLoader(
                            image_datasets[x],
                            collate_fn=default_collate,
                            batch_size=config_params["batch_size"], 
                            shuffle=True, 
                            num_workers=config_params["num_workers"]) for x in ['train', 'val']}

                    export_main_domain = main_domain == domain_name

                    df_finish_pred = export_features(
                        model_ft, dataloaders_dict, 
                        export_main_domain=export_main_domain,
                    )

                    dataframe_logits.append(df_finish_pred)

                dataframe_logits = pd.concat(dataframe_logits)
                dataframe_logits.to_excel(writer, sheet_name=f'kfold_{k}', index=False)
