In [1]:
import sys

# Directory added to the import path so the `gen_x_report` imports in the
# following cells resolve (presumably the package lives in this directory).
MODELING_DIR = "/project/lt200203-aimedi/pud/gen-x-report/modeling"

# Guard the append so re-running this cell does not pile up duplicate
# entries in sys.path.
if MODELING_DIR not in sys.path:
    sys.path.append(MODELING_DIR)

In [2]:
from gen_x_report.configuration_gen_x_report import GenXReportConfig
from gen_x_report.modeling_gen_x_report import GenXReportModel

In [3]:
# Register the custom config and model classes with the Auto* machinery.
# The config printed by the next cell shows matching "auto_map" entries for
# AutoConfig and AutoModel.
GenXReportConfig.register_for_auto_class()
GenXReportModel.register_for_auto_class("AutoModel")

In [4]:
from gen_x_report.configuration_gen_x_report import (
    GXRVisionConfig,
    GXRQFormerConfig,
    GXRTextConfig,
    GenXReportConfig,
)

# One default-constructed sub-configuration per model component.
vision_config = GXRVisionConfig()
qformer_config = GXRQFormerConfig()
text_config = GXRTextConfig()

# Collect the sub-configurations under the keyword names the top-level
# configuration expects, then build it with num_query_tokens=32.
sub_configs = {
    "vision_config": vision_config,
    "qformer_config": qformer_config,
    "text_config": text_config,
}
genxreport_config = GenXReportConfig(**sub_configs, num_query_tokens=32)

# Dump the assembled configuration for inspection.
print(genxreport_config)

GenXReportConfig {
  "architectures": [
    "GenXReportModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_gen_x_report.GenXReportConfig",
    "AutoModel": "modeling_gen_x_report.GenXReportModel"
  },
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "model_type": "genxreport_model",
  "num_query_tokens": 32,
  "qformer_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_probs_dropout_prob": 0.1,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_frequency": 2,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_hidden_size": 1408,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": nul

In [5]:
# Build the model from the configuration object alone; checkpoint weights are
# copied in by the safetensors-loading cells further down.
GXR_model = GenXReportModel(genxreport_config)

In [6]:
# Display the module tree to inspect the instantiated architecture.
GXR_model

GenXReportModel(
  (vision_model): GXRVisionModel(
    (embeddings): Dinov2Embeddings(
      (patch_embeddings): Dinov2PatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): Dinov2Encoder(
      (layer): ModuleList(
        (0-11): 12 x Dinov2Layer(
          (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (attention): Dinov2Attention(
            (attention): Dinov2SelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): Dinov2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
           

In [7]:
import torch

# Seed the global RNG so the random dummy image is identical on every run of
# this cell (Restart & Run All reproducibility of the smoke-test input).
torch.manual_seed(0)

# Smoke-test inputs: one random 1x3x518x518 image tensor and five arbitrary
# token ids. The image is wrapped in a list exactly as in the original call.
# NOTE(review): confirm the model's forward expects a list of tensors here.
input_tensor = [torch.randn(1, 3, 518, 518)]
dummy_input_ids = torch.tensor([[101, 102, 103, 104, 105]])

# Put the model in evaluation mode before the forward pass.
GXR_model.eval()

# No gradients are needed for a forward-only check.
with torch.no_grad():
    output = GXR_model(input_ids=dummy_input_ids, pixel_values=input_tensor)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [8]:
# Show the full output object returned by the dummy forward pass.
output

Blip2ForConditionalGenerationModelOutput(loss=None, logits=tensor([[[ 0.3349, -0.4594,  0.3579,  ...,  0.1307, -0.3965,  0.7830],
         [ 0.3349, -0.4594,  0.3579,  ...,  0.1307, -0.3965,  0.7830],
         [ 0.3349, -0.4594,  0.3579,  ...,  0.1307, -0.3965,  0.7830],
         ...,
         [-0.1348, -0.4030,  0.4156,  ...,  0.6860,  0.0488,  0.3054],
         [ 0.3065, -0.2671,  0.1090,  ..., -0.1836,  0.2454,  0.0723],
         [-0.0417, -0.3480,  0.1257,  ...,  0.0087, -0.3036,  1.1786]]]), vision_outputs=BaseModelOutputWithPooling(last_hidden_state=tensor([[[ 0.1159, -0.7554,  0.6187,  ..., -2.2392, -2.2235,  1.1393],
         [-0.2199,  0.8363, -0.1816,  ..., -1.0936, -0.0389,  0.3741],
         [-0.3446,  0.1316,  0.3871,  ...,  0.0035,  1.0179,  1.4785],
         ...,
         [ 0.3458, -0.7667, -0.2444,  ..., -0.3654,  0.2709,  0.4290],
         [ 0.6175,  0.4443, -1.3942,  ..., -1.2017,  1.2044, -0.7383],
         [-1.2864, -0.3557,  1.6466,  ..., -0.5620,  0.2913,  0.9308]

In [10]:
from safetensors.torch import safe_open

# Shards of the blip2-opt-2.7b checkpoint; only a subset of its tensors is
# copied into GXR_model below.
qformer_files = [
    "/project/lt200203-aimedi/pud/gen-x-report/model/blip2-opt-2.7b/model-00001-of-00002.safetensors",
    "/project/lt200203-aimedi/pud/gen-x-report/model/blip2-opt-2.7b/model-00002-of-00002.safetensors",
]

# Checkpoint entries that must not be copied into GXR_model: the checkpoint's
# own vision model, language model and language projection.
exclude_prefixes = ["vision_model.", "language_model.","language_projection.weight", "language_projection.bias"]

# str.startswith accepts a tuple of prefixes, so a single call replaces any(...).
_excluded_prefixes = tuple(exclude_prefixes)

# Decide per key BEFORE calling get_tensor, so the excluded vision and
# language-model tensors are never read into memory just to be discarded.
filtered_state_dict = {}
for shard_path in qformer_files:
    with safe_open(shard_path, framework="pt", device="cpu") as shard:
        for key in shard.keys():
            if key.startswith(_excluded_prefixes):
                continue
            filtered_state_dict[key] = shard.get_tensor(key)

# Original variable name kept for any later use; it now refers to the same
# filtered dictionary rather than the full checkpoint.
qformer_state_dict = filtered_state_dict

# strict=False because the filtered dict deliberately omits the vision and
# language-model weights; those are reported as missing_keys in the result.
GXR_model.load_state_dict(filtered_state_dict, strict=False)


_IncompatibleKeys(missing_keys=['vision_model.embeddings.cls_token', 'vision_model.embeddings.mask_token', 'vision_model.embeddings.position_embeddings', 'vision_model.embeddings.patch_embeddings.projection.weight', 'vision_model.embeddings.patch_embeddings.projection.bias', 'vision_model.encoder.layer.0.norm1.weight', 'vision_model.encoder.layer.0.norm1.bias', 'vision_model.encoder.layer.0.attention.attention.query.weight', 'vision_model.encoder.layer.0.attention.attention.query.bias', 'vision_model.encoder.layer.0.attention.attention.key.weight', 'vision_model.encoder.layer.0.attention.attention.key.bias', 'vision_model.encoder.layer.0.attention.attention.value.weight', 'vision_model.encoder.layer.0.attention.attention.value.bias', 'vision_model.encoder.layer.0.attention.output.dense.weight', 'vision_model.encoder.layer.0.attention.output.dense.bias', 'vision_model.encoder.layer.0.layer_scale1.lambda1', 'vision_model.encoder.layer.0.norm2.weight', 'vision_model.encoder.layer.0.norm2.

In [11]:
from safetensors.torch import safe_open

# rad-dino checkpoint whose weights are loaded into the vision encoder.
# (These variables were previously named language_* although they hold the
# vision checkpoint; renamed to match what they contain.)
vision_path = "/project/lt200203-aimedi/pud/gen-x-report/model/rad-dino-12c/model.safetensors"

# Read every tensor of the single-file checkpoint onto the CPU.
vision_state_dict = {}
with safe_open(vision_path, framework="pt", device="cpu") as f:
    for key in f.keys():
        vision_state_dict[key] = f.get_tensor(key)

# Copy the weights into the vision sub-module only; the returned key report is
# displayed as the cell output.
GXR_model.vision_model.load_state_dict(vision_state_dict, strict=False)


<All keys matched successfully>

In [12]:
from safetensors.torch import safe_open

# The six RadLLaMA-7b checkpoint shards, in shard order.
language_model_files = [
    "/project/lt200203-aimedi/pud/gen-x-report/model/RadLLaMA-7b/model-00001-of-00006.safetensors",
    "/project/lt200203-aimedi/pud/gen-x-report/model/RadLLaMA-7b/model-00002-of-00006.safetensors",
    "/project/lt200203-aimedi/pud/gen-x-report/model/RadLLaMA-7b/model-00003-of-00006.safetensors",
    "/project/lt200203-aimedi/pud/gen-x-report/model/RadLLaMA-7b/model-00004-of-00006.safetensors",
    "/project/lt200203-aimedi/pud/gen-x-report/model/RadLLaMA-7b/model-00005-of-00006.safetensors",
    "/project/lt200203-aimedi/pud/gen-x-report/model/RadLLaMA-7b/model-00006-of-00006.safetensors",
]

# Merge the tensors of every shard into a single name -> tensor mapping (CPU).
language_model_state_dict = {}
for shard_path in language_model_files:
    with safe_open(shard_path, framework="pt", device="cpu") as shard:
        language_model_state_dict.update(
            (tensor_name, shard.get_tensor(tensor_name)) for tensor_name in shard.keys()
        )

# Copy the merged weights into the language-model sub-module; the returned key
# report is displayed as the cell output.
GXR_model.language_model.load_state_dict(language_model_state_dict, strict=False)

<All keys matched successfully>

In [13]:
# Destination directory for the assembled model.
output_dir = "/project/lt200203-aimedi/pud/gen-x-report/modeling/model"

# Write the model to output_dir; the AutoModel.from_pretrained cell further
# down reloads it from this same path.
GXR_model.save_pretrained(output_dir)

[2025-01-21 17:06:32,273] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cpu (auto detect)


In [14]:
from transformers import AutoModel

In [17]:
# Same two registration calls as the registration cell near the top of the
# notebook, run again here before reloading through AutoModel.
GenXReportConfig.register_for_auto_class()
GenXReportModel.register_for_auto_class("AutoModel")

In [23]:
from gen_x_report.configuration_gen_x_report import (
    GXRVisionConfig,
    GXRQFormerConfig,
    GXRTextConfig,
    GenXReportConfig,
)

# Fresh default-constructed sub-configurations for each model component.
vision_config = GXRVisionConfig()
qformer_config = GXRQFormerConfig()
text_config = GXRTextConfig()

# Build the top-level configuration from the sub-configurations only.
# num_query_tokens is not passed here, so the config's own default applies
# (the earlier config cell passed 32 explicitly).
sub_configs = {
    "vision_config": vision_config,
    "qformer_config": qformer_config,
    "text_config": text_config,
}
genxreport_config = GenXReportConfig(**sub_configs)

In [24]:
# Reload the saved model through AutoModel, passing the freshly built config
# explicitly instead of relying on the one stored next to the weights.
# NOTE(review): this config leaves num_query_tokens at its default, whereas the
# saved model was built with num_query_tokens=32 — confirm the two agree.
model = AutoModel.from_pretrained('/project/lt200203-aimedi/pud/gen-x-report/modeling/model', config=genxreport_config)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [25]:
# Display the reloaded model's module tree for comparison with the original.
model

GenXReportModel(
  (vision_model): GXRVisionModel(
    (embeddings): Dinov2Embeddings(
      (patch_embeddings): Dinov2PatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): Dinov2Encoder(
      (layer): ModuleList(
        (0-11): 12 x Dinov2Layer(
          (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (attention): Dinov2Attention(
            (attention): Dinov2SelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): Dinov2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
           