<a href="https://colab.research.google.com/github/TheNotoriousXxX/BusReservationSystem/blob/terminator2.0/MedicallyColab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the med-flamingo fork into the current working directory.
# Later cells add /content/med-flamingo (and its scripts/ folder) to sys.path,
# so this is expected to run with /content as the working directory.
!git clone https://github.com/Abir196/med-flamingo

Cloning into 'med-flamingo'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 30 (delta 6), reused 25 (delta 3), pack-reused 0[K
Receiving objects: 100% (30/30), 426.81 KiB | 13.77 MiB/s, done.
Resolving deltas: 100% (6/6), done.


In [None]:
import os
# Absolute path to the requirements.txt of the cloned med-flamingo repository.
# The path must be absolute: the notebook runs with /content as its working
# directory, so the previous relative "content/..." form resolved to
# /content/content/med-flamingo/requirements.txt, which does not exist.
requirements_path = "/content/med-flamingo/requirements.txt"

In [None]:
# NOTE: %pip (rather than !pip) installs into the environment of the running
# kernel, so the packages are importable from this notebook.

# Install specific versions of PyTorch and torchvision (CUDA 11.7 wheel index)
%pip install torch==2.0.0 torchvision==0.15.1 -f https://download.pytorch.org/whl/cu117

# Install open_clip from a specific GitHub commit
%pip install git+https://github.com/usuyama/open_clip.git@01a53cc46662d8b28da0c9d73271bfb8f6f4b20d

# Install specific versions of datasets, wandb, einops, einops_exts
%pip install datasets==2.9.0 wandb==0.13.10 einops==0.6.0 einops_exts==0.0.4

# Install h5py and ipykernel
%pip install h5py ipykernel

# Install additional requirements from the requirements.txt file referenced by
# `requirements_path` (defined in an earlier cell)
%pip install -r {requirements_path}

In [None]:
# Install open_flamingo, accelerate, bitsandbytes, and transformers libraries.
# %pip (rather than !pip) targets the environment of the running kernel.
# NOTE(review): these four packages are unpinned, so a re-run may resolve to
# different versions (and may replace the torch build pinned in the previous
# cell) -- pin them once a known-good combination has been confirmed.
%pip install open_flamingo accelerate bitsandbytes transformers -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/67.0 kB[0m [31m888.6 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m108.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.9/51.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━

In [None]:
from transformers import LlamaForCausalLM

# Store the original forward method exactly once.
# The patch below replaces LlamaForCausalLM.forward; if this cell is re-run,
# reading LlamaForCausalLM.forward again would capture the *patched* function,
# and the wrapper would then call itself recursively. Stashing the pristine
# method on the class makes the cell safe to execute any number of times.
if not hasattr(LlamaForCausalLM, "_unpatched_forward"):
    LlamaForCausalLM._unpatched_forward = LlamaForCausalLM.forward
old_forward = LlamaForCausalLM._unpatched_forward

# Replacement forward() for LlamaForCausalLM that conditions the Flamingo layers first
def forward(self, input_ids, attention_mask, **kwargs):
    """
    Tell every decoder layer where the media tokens are, then run the original forward().

    Args:
        input_ids: Token ids; compared element-wise against ``self.media_token_id``.
        attention_mask: Forwarded unchanged to the original forward method.
        **kwargs: Any further keyword arguments for the original forward method.

    Returns:
        Whatever the original ``LlamaForCausalLM.forward`` returns.

    Raises:
        ValueError: If ``self.initialized_flamingo`` is falsy.
    """
    if not self.initialized_flamingo:
        raise ValueError(
            "Flamingo layers are not initialized. Please call `init_flamingo` first."
        )

    # Element-wise flag: which positions hold the media token
    is_media_token = input_ids == self.media_token_id

    # Reuse previously conditioned media only when caching is enabled, the layers
    # are already conditioned, and this input carries no media token of its own
    reuse_cached = (
        self._use_cached_vision_x
        and self.is_conditioned()
        and not is_media_token.any()
    )

    for decoder_layer in self._get_decoder_layers():
        if not reuse_cached:
            decoder_layer.condition_media_locations(is_media_token)
        decoder_layer.condition_use_cached_media(reuse_cached)

    # Delegate to the saved original implementation with the full argument set
    return old_forward(
        self,
        **{**kwargs, "input_ids": input_ids, "attention_mask": attention_mask},
    )

# Install the wrapper on the class itself, so every LlamaForCausalLM instance
# goes through the conditioning logic above before the original forward runs.
# NOTE(review): the wrapper reads `self.initialized_flamingo`, so it presumably
# only works on instances that have been extended with the Flamingo mixin.
LlamaForCausalLM.forward = forward

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import open_clip

from open_flamingo.src.flamingo import Flamingo
from open_flamingo.src.flamingo_lm import FlamingoLMMixin
from open_flamingo.src.utils import extend_instance

def create_model_and_transforms(
    clip_vision_encoder_path: str,
    clip_vision_encoder_pretrained: str,
    lang_encoder_path: str,
    tokenizer_path: str,
    cross_attn_every_n_layers: int = 1,
    use_local_files: bool = False,
    decoder_layers_attr_name: str = None,
    freeze_lm_embeddings: bool = False,
    **flamingo_kwargs,
):
    """
    Initialize a Flamingo model from pretrained vision encoder and language encoder.

    The language model is loaded with ``load_in_4bit=True``.

    Args:
        clip_vision_encoder_path (str): Path to pretrained clip model (e.g. "ViT-B-32").
        clip_vision_encoder_pretrained (str): Name of pretraining dataset for clip model (e.g. "laion2b_s32b_b79k").
        lang_encoder_path (str): Path to pretrained language encoder.
        tokenizer_path (str): Path to pretrained tokenizer.
        cross_attn_every_n_layers (int, optional): Determines how often to add a cross-attention layer. Defaults to 1.
        use_local_files (bool, optional): Whether to use local files. Defaults to False.
        decoder_layers_attr_name (str, optional): Name of the decoder layers attribute.
            When None it is inferred from the language model's class name. Defaults to None.
        freeze_lm_embeddings (bool, optional): When False, the language model's input
            embeddings are left trainable. Defaults to False.
        **flamingo_kwargs: Extra keyword arguments forwarded to the ``Flamingo`` constructor.
    Returns:
        Flamingo: Flamingo model from pretrained vision and language encoders.
        Image processor: Pipeline to preprocess input images.
        Tokenizer: A tokenizer for the language model.
    Raises:
        ValueError: If ``decoder_layers_attr_name`` is None and cannot be inferred.
    """
    # Special tokens Flamingo relies on: one closes a text chunk, one marks an image.
    # The end-of-chunk token must be a real, non-empty string; registering "" and
    # encoding "" would yield whatever id the tokenizer emits for empty input
    # instead of a dedicated end-of-chunk id.
    eoc_token = "<|endofchunk|>"
    media_token = "<image>"

    # Load the pretrained vision encoder
    vision_encoder, _, image_processor = open_clip.create_model_and_transforms(
        clip_vision_encoder_path, pretrained=clip_vision_encoder_pretrained
    )
    vision_encoder.visual.output_tokens = True  # Set the vision encoder to output visual features

    # Load the tokenizer
    text_tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_path,
        local_files_only=use_local_files,
        trust_remote_code=True,
    )
    # Add Flamingo special tokens to the tokenizer
    text_tokenizer.add_special_tokens(
        {"additional_special_tokens": [eoc_token, media_token]}
    )
    if text_tokenizer.pad_token is None:
        # Issue: GPT models don't have a pad token, which we use to modify labels for the loss
        text_tokenizer.add_special_tokens({"pad_token": "<PAD>"})

    # Load the pretrained language encoder (4-bit quantized)
    lang_encoder = AutoModelForCausalLM.from_pretrained(
        lang_encoder_path,
        local_files_only=use_local_files,
        trust_remote_code=True,
        load_in_4bit=True
    )

    # Convert LM to FlamingoLM
    extend_instance(lang_encoder, FlamingoLMMixin)

    # Infer decoder layers attribute name
    if decoder_layers_attr_name is None:
        decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
    lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
    # The tokenizer grew by the special tokens added above; keep the embeddings in sync
    lang_encoder.resize_token_embeddings(len(text_tokenizer))

    # Create Flamingo model; [-1] picks the special token's own id, skipping any
    # prefix id the tokenizer prepends when encoding
    model = Flamingo(
        vision_encoder,
        lang_encoder,
        text_tokenizer.encode(eoc_token)[-1],
        text_tokenizer.encode(media_token)[-1],
        vis_dim=open_clip.get_model_config(clip_vision_encoder_path)["vision_cfg"]["width"],
        cross_attn_every_n_layers=cross_attn_every_n_layers,
        **flamingo_kwargs,
    )

    # Freeze necessary parameters
    model.requires_grad_(False)
    assert sum(p.numel() for p in model.parameters() if p.requires_grad) == 0

    # Unfreeze specified components
    model.perceiver.requires_grad_(True)
    model.lang_encoder.gated_cross_attn_layers.requires_grad_(True)
    if not freeze_lm_embeddings:
        model.lang_encoder.get_input_embeddings().requires_grad_(True)
        # TODO: investigate also training the output embeddings when untied

    # Display initialization information
    print(
        f"Flamingo model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters"
    )

    return model, image_processor, text_tokenizer

def _infer_decoder_layers_attr_name(model):
    """
    Look up the decoder-layers attribute path for ``model`` from its class name.

    Each key of ``__KNOWN_DECODER_LAYERS_ATTR_NAMES`` is tested, in dictionary
    order, as a case-insensitive substring of the model's class name; the value
    of the first matching key is returned.

    Raises:
        ValueError: If no known key occurs in the class name.
    """
    class_name = model.__class__.__name__.lower()
    for known_key, attr_path in __KNOWN_DECODER_LAYERS_ATTR_NAMES.items():
        if known_key.lower() in class_name:
            return attr_path

    # No known architecture matched: the caller has to pass the name explicitly
    raise ValueError(
        f"We require the attribute name for the nn.ModuleList in the decoder storing the transformer block layers. Please supply this string manually."
    )

# Known decoder layers attribute names mapping.
# Keys are matched by _infer_decoder_layers_attr_name as case-insensitive
# substrings of the language model's class name; values are the attribute path
# handed to set_decoder_layers_attr_name.
# The first matching key wins, so the order of the entries is significant.
__KNOWN_DECODER_LAYERS_ATTR_NAMES = {
    "opt": "model.decoder.layers",
    "gptj": "transformer.h",
    "gpt-j": "transformer.h",
    "pythia": "gpt_neox.layers",
    "llama": "model.layers",
    "gptneoxforcausallm": "gpt_neox.layers",
    "mpt": "transformer.blocks",
    "mosaicgpt": "transformer.blocks",
}


In [None]:
# Import necessary libraries
from huggingface_hub import hf_hub_download
import torch

# Fetch model.pt from the med-flamingo/med-flamingo repository on the Hugging Face Hub
checkpoint_path = hf_hub_download(
    repo_id="med-flamingo/med-flamingo",
    filename="model.pt",
)
print(f"Downloaded Med-Flamingo checkpoint to {checkpoint_path}")

# Deserialize the checkpoint with PyTorch, mapping its tensors onto the GPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
med_flamingo_checkpoint = torch.load(
    checkpoint_path,
    map_location="cuda",
)

Downloading model.pt:   0%|          | 0.00/5.24G [00:00<?, ?B/s]

Downloaded Med-Flamingo checkpoint to /root/.cache/huggingface/hub/models--med-flamingo--med-flamingo/snapshots/7243cd83bd426ceade9c4de9844cc5e5f3ff75e0/model.pt


In [None]:
# Import necessary libraries
from huggingface_hub import hf_hub_download
import torch
import os
from accelerate import Accelerator  # Import Accelerate library for hardware acceleration
from einops import repeat
from PIL import Image
import sys

# Make the cloned repository importable: `src.utils` is resolved from
# /content/med-flamingo and `demo_utils` presumably from its scripts/ folder.
sys.path.append('/content/med-flamingo/scripts')
sys.path.append('/content/med-flamingo')
from src.utils import FlamingoProcessor
from demo_utils import image_paths, clean_generation

# Initialize Accelerator and remember the device it selected; later cells move
# the model inputs onto this device.
accelerator = Accelerator()  # Picks the execution device based on the available hardware
device = accelerator.device

print('Loading model..')

# Import create_model_and_transforms function.
# NOTE(review): this import rebinds `create_model_and_transforms`, shadowing the
# notebook-local definition from an earlier cell (the one that loads the language
# model with load_in_4bit=True). Confirm which implementation is intended here.
from open_flamingo import create_model_and_transforms

# Initialize the Flamingo model, image processor, and tokenizer:
# CLIP ViT-L-14 (OpenAI weights) as vision encoder, LLaMA-7B as language model
# and tokenizer, with a cross-attention layer every 4 decoder layers.
model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",
    lang_encoder_path="huggyllama/llama-7b",
    tokenizer_path="huggyllama/llama-7b",
    cross_attn_every_n_layers=4,
)

Loading model..


100%|███████████████████████████████████████| 933M/933M [00:19<00:00, 48.5MiB/s]


Downloading (…)okenizer_config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Downloading (…)lve/main/config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Flamingo model initialized with 1309919248 trainable parameters


In [None]:
# Release cached, currently unused GPU memory before placing modules on the GPU
torch.cuda.empty_cache()

# Place the perceiver first and the vision encoder second on the GPU
for gpu_submodule in (model.perceiver, model.vision_encoder):
    gpu_submodule.cuda()

# The gated cross-attention layers are cast to float16 first and then moved to the GPU
model.lang_encoder.gated_cross_attn_layers.to(torch.float16).cuda()

# Release cached memory once more now that the modules have been moved
torch.cuda.empty_cache()


In [None]:
import gc  # Import the garbage collection module

# Run a garbage-collection pass to free unreferenced Python objects.
# As the last expression of the cell, the return value of gc.collect()
# (the number of unreachable objects it found) is shown as the cell output.
gc.collect()

71

In [None]:
# Load the Med-Flamingo checkpoint into the model.
# strict=False lets the load succeed when the checkpoint and the model do not
# have exactly the same keys, but it would also hide a checkpoint that does not
# belong to this model. Report the checkpoint keys the model did not recognise
# so that such a mismatch is visible instead of silent.
load_result = model.load_state_dict(med_flamingo_checkpoint, strict=False)
if load_result.unexpected_keys:
    print(
        f"WARNING: {len(load_result.unexpected_keys)} checkpoint keys were not used by the model, "
        f"e.g. {load_result.unexpected_keys[:5]}"
    )

# Initialize the FlamingoProcessor using the tokenizer and image_processor
processor = FlamingoProcessor(tokenizer, image_processor)


In [None]:
# Switch into the repository's src/ folder for the cells that follow.
# The absolute path and the explicit %cd magic make this cell safe to re-run:
# the relative form only works while the working directory is still /content.
%cd /content/med-flamingo/src

/content/med-flamingo/src


In [None]:
# Select only the first 2 image paths
image_paths = image_paths[:2]

# Prepare the model using the Accelerator and switch it to evaluation mode
model = accelerator.prepare(model)
is_main_process = accelerator.is_main_process
model.eval()

# Step 1: Load images
# Load demo images using PIL and store in a list
demo_images = [Image.open(path) for path in image_paths]

# Step 2: Define multimodal few-shot prompt
# Every example starts with an <image> placeholder, and every answered example is
# closed with the <|endofchunk|> special token; the final question is left open
# for the model to answer.
prompt = (
    "You are a helpful medical assistant. You are being provided with images, a question about the image and an answer. Follow the examples and answer the last question. "
    "<image>Question: What is/are the structure near/in the middle of the brain? Answer: pons.<|endofchunk|>"
    "<image>Question: Is there evidence of a right apical pneumothorax on this chest x-ray? Answer: yes.<|endofchunk|>"
    "<image>Question: Is/Are there air in the patient's peritoneal cavity? Answer: no.<|endofchunk|>"
    "<image>Question: Does the heart appear enlarged? Answer: yes.<|endofchunk|>"
    "<image>Question: What side are the infarcts located? Answer: bilateral.<|endofchunk|>"
    "<image>Question: Which image modality is this? Answer: mr flair.<|endofchunk|>"
    "<image>Question: What is the most likely diagnosis?"
)

# Each <image> placeholder is meant to be paired with one loaded image. Make a
# mismatch visible, because otherwise part of the prompt refers to images that
# were never given to the model.
# NOTE(review): with the 2-image slice above and 7 placeholders in the prompt this
# warning fires; either load all demo images or shorten the prompt to match.
num_placeholders = prompt.count("<image>")
if num_placeholders != len(demo_images):
    print(
        f"WARNING: prompt contains {num_placeholders} <image> placeholders "
        f"but {len(demo_images)} images were loaded"
    )

# Step 3: Preprocess data
print('Preprocess data')

# Preprocess demo images using the FlamingoProcessor
pixels = processor.preprocess_images(demo_images)

# Add a leading batch axis (b=1) and a per-image axis T=1: 'N c h w -> b N T c h w'
pixels = repeat(pixels, 'N c h w -> b N T c h w', b=1, T=1)

# Encode the text prompt using the FlamingoProcessor
tokenized_data = processor.encode_text(prompt)


Preprocess data


In [None]:
# Step 4: Generate response
print('Generate from multimodal few-shot prompt')

# Move every model input onto the accelerator's device up front
vision_inputs = pixels.to(device)
text_inputs = tokenized_data["input_ids"].to(device)
text_attention_mask = tokenized_data["attention_mask"].to(device)

# Run generation under float16 autocast on CUDA (inference only, no training here)
with torch.autocast('cuda', torch.float16):
    generated_text = model.generate(
        vision_x=vision_inputs,
        lang_x=text_inputs,
        attention_mask=text_attention_mask,
        max_new_tokens=20,  # Cap on the number of newly generated tokens
    )

# Turn the first generated sequence back into text with the processor's tokenizer
response = processor.tokenizer.decode(generated_text[0])

# Post-process the decoded text with the demo's clean_generation helper
response = clean_generation(response)

# Show the cleaned response
print(f'{response=}')


Generate from multimodal few-shot prompt




response="<s> You are a helpful medical assistant. You are being provided with images, a question about the image and an answer. Follow the examples and answer the last question. <image> Question: What is/are the structure near/in the middle of the brain? Answer: pons.<|endofchunk|><image> Question: Is there evidence of a right apical pneumothorax on this chest x-ray? Answer: yes.<|endofchunk|><image> Question: Is/Are there air in the patient's peritoneal cavity? Answer: no.<|endofchunk|><image> Question: Does the heart appear enlarged? Answer: yes.<|endofchunk|><image> Question: What side are the infarcts located? Answer: bilateral.<|endofchunk|><image> Question: Which image modality is this? Answer: mr flair.<|endofchunk|><image> Question: What is the most likely diagnosis?\n Answer: metastasis.\n 10.\n Question: What is the most likely"


In [None]:
# Display the cleaned response: as the cell's last (bare) expression, its repr
# is rendered as the cell output.
response

"<s> You are a helpful medical assistant. You are being provided with images, a question about the image and an answer. Follow the examples and answer the last question. <image> Question: What is/are the structure near/in the middle of the brain? Answer: pons.<|endofchunk|><image> Question: Is there evidence of a right apical pneumothorax on this chest x-ray? Answer: yes.<|endofchunk|><image> Question: Is/Are there air in the patient's peritoneal cavity? Answer: no.<|endofchunk|><image> Question: Does the heart appear enlarged? Answer: yes.<|endofchunk|><image> Question: What side are the infarcts located? Answer: bilateral.<|endofchunk|><image> Question: Which image modality is this? Answer: mr flair.<|endofchunk|><image> Question: What is the most likely diagnosis?\n Answer: metastasis.\n 10.\n Question: What is the most likely"