# WK05: Transformer Pipelines

## Setup

This cell imports the modules and functions we need to run our inference pipelines.

In [None]:
from PIL import Image
from transformers import pipeline

### Text Completion

Let's use the GPT2 model to create some text completions.

We use a pipeline object to run inference:

In [None]:
# Text-generation pipeline backed by the GPT-2 checkpoint named below.
generator = pipeline(task="text-generation", model="openai-community/gpt2")

Let's define a starter sentence to run our model on:

In [None]:
# Starter sentence that the text-generation and fill-mask cells build on.
TOBE = "To be or not to be, that is the"

And run the model:

In [None]:
# Complete the starter sentence; max_length and pad_token_id are passed
# through to the pipeline call unchanged.
result = generator(
  TOBE,
  max_length=64,
  pad_token_id=0,
)
print(result)

#### Changing Model

is as easy as:

In [None]:
# Rebind `generator` to a pipeline that loads a different checkpoint.
generator = pipeline(task="text-generation", model="Xenova/llama2.c-stories110M")

Rerun with new model:

In [None]:
# Same prompt and generation settings as before, now with the new checkpoint.
result = generator(
  TOBE,
  max_length=64,
  pad_token_id=0,
)
print(result)

One last model:

In [None]:
# Third checkpoint: rebuild the pipeline, then complete the same starter text.
generator = pipeline(task="text-generation", model="facebook/opt-125m")

result = generator(
  TOBE,
  max_length=64,
  pad_token_id=0,
)
print(result)

### Fill-Mask

can be used to get the probabilities/scores of different possible words to complete a sentence:

In [None]:
# Fill-mask pipeline: scores candidate tokens for a masked position.
filler = pipeline(task="fill-mask", model="FacebookAI/xlm-roberta-large")

In [None]:
# Append the tokenizer's own mask token so the prompt stays valid if the
# fill-mask checkpoint is swapped for one that uses a different mask string.
# Named `masked_text` because `input` would shadow Python's built-in input()
# for every later cell in the notebook.
masked_text = f"{TOBE} {filler.tokenizer.mask_token}"
result = filler(masked_text)

In [None]:
# Print the starter sentence, then one line per candidate: the predicted
# token and its score rounded to 4 decimals, pushed right by a run of
# spaces as long as the sentence.
print(TOBE)

indent = len(TOBE) * " "
for candidate in result:
  print(indent, f"{candidate['token_str']}: {round(candidate['score'], 4)}")

### Text Sentiment Analysis

Define model and pipeline object:

In [None]:
# Sentiment pipeline using the go-emotions student checkpoint named below.
analyzer = pipeline(
  task="sentiment-analysis",
  model="joeddav/distilbert-base-uncased-go-emotions-student",
)

Run on one example:

In [None]:
# Single sentence to classify (split across two literals for line length;
# the resulting string is unchanged).
text = (
  "A wave of elation washed over me, "
  "like sunlight breaking through the clouds"
)
result = analyzer(text)
print(result)

Run on example phrases:

In [None]:
EXAMPLE_TEXTS = [
  "What a wonderful day",
  "OMG my head hurts",
  "What am I doing here?",
]

# Print each phrase next to the label of the first prediction returned.
for t in EXAMPLE_TEXTS:
  top_prediction = analyzer(t)[0]
  print(t, "->", top_prediction["label"])

We can also pass an extra parameter to our pipeline if we want to get scores for all possible sentiments:

In [None]:
# top_k=None asks the classification pipeline for the score of every label.
# It replaces `return_all_scores=True`, which transformers has deprecated.
# NOTE(review): the nesting of the returned list may differ from what the
# deprecated flag produced; confirm before indexing into the result.
full_analyzer = pipeline(
  "sentiment-analysis",
  model="joeddav/distilbert-base-uncased-go-emotions-student",
  top_k=None,
)

In [None]:
# Score the first example phrase against every label and show the raw output.
t = EXAMPLE_TEXTS[0]
result = full_analyzer(t)
print(f"{t} : {result}")

### Image Description

New model definition/location and pipeline object:

In [None]:
# Image-captioning pipeline backed by the BLIP base checkpoint named below.
img_captioner = pipeline(
  task="image-to-text",
  model="Salesforce/blip-image-captioning-base",
)

A test image:

In [None]:
# Load the sample picture, convert it to RGB mode, and show it inline.
test_image = Image.open("./imgs/GDTM.jpg")
test_image = test_image.convert("RGB")
display(test_image)

Run inference:

In [None]:
# Caption the test image and print the text of the first result.
result = img_captioner(test_image)
first_caption = result[0]
print(first_caption["generated_text"])

Other image description models:
- [`LLAVA`](https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf)
- [`VIT`](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning)

In [None]:
# Same task, different checkpoint: the LLaVA model linked above.
llava_captioner = pipeline(
  task="image-to-text",
  model="llava-hf/llava-interleave-qwen-0.5b-hf",
)

In [None]:
# Caption the same test image with the LLaVA pipeline.
result = llava_captioner(test_image)
llava_caption = result[0]["generated_text"]
print(llava_caption)

In [None]:
# Same task again, using the ViT + GPT-2 captioning checkpoint linked above.
vit_captioner = pipeline(
  task="image-to-text",
  model="nlpconnect/vit-gpt2-image-captioning",
)

In [None]:
# Caption the same test image with the ViT pipeline.
result = vit_captioner(test_image)
vit_caption = result[0]["generated_text"]
print(vit_caption)

## Other ways of running inference

Some models don't work with the pipeline inference object, but the Transformers library still has some consistent-ish interfaces for running these models.

### Depth Prediction

In [None]:
import numpy as np
import torch

from PIL import Image
from transformers import AutoImageProcessor, AutoModelForDepthEstimation

In [None]:
DEPTH_MODEL = "depth-anything/Depth-Anything-V2-Base-hf"

# Model and image processor are both loaded from the same checkpoint id.
model = AutoModelForDepthEstimation.from_pretrained(DEPTH_MODEL)
image_processor = AutoImageProcessor.from_pretrained(DEPTH_MODEL)

In [None]:
# prepare image for the model
# NOTE(review): this path has no file extension, unlike the other images
# loaded from ./imgs; confirm the file on disk is really named "flowers".
image = Image.open("./imgs/flowers")
inputs = image_processor(images=image, return_tensors="pt")

# run model; this is inference only, so skip autograd bookkeeping
with torch.no_grad():
  outputs = model(**inputs)
print(outputs)

In [None]:
# Resize the predicted depth to the source image's dimensions.
# PIL reports size as (width, height); the reversal passes (height, width).
depth_batch = outputs.predicted_depth.unsqueeze(1)
prediction = torch.nn.functional.interpolate(
  depth_batch,
  size=image.size[::-1],
  mode="bicubic",
  align_corners=False,
)

# Rescale so the largest value maps to 255, then cast to 8-bit for display.
output = prediction.squeeze().cpu().detach().numpy()
formatted = (output * 255 / output.max()).astype("uint8")
depth = Image.fromarray(formatted)

display(depth)

### Object Detection

In [None]:
import torch
from PIL import Image
from transformers import DetrImageProcessor, DetrForObjectDetection

In [None]:
OBJ_MODEL = "facebook/detr-resnet-101"

# Model and processor are both pinned to the "no_timm" revision.
model = DetrForObjectDetection.from_pretrained(OBJ_MODEL, revision="no_timm")
processor = DetrImageProcessor.from_pretrained(OBJ_MODEL, revision="no_timm")

In [None]:
# Every other sample image in this notebook is read from ./imgs (including
# street.jpg in the segmentation section), so load from that directory here.
image = Image.open("./imgs/street.jpg")
inputs = processor(images=image, return_tensors="pt")

# Inference only: no gradients are needed.
with torch.no_grad():
  output = model(**inputs)

# Keep detections scoring above 0.9 and rescale boxes to the original image.
# PIL reports size as (width, height); the reversal passes (height, width).
results = processor.post_process_object_detection(
  output,
  threshold=0.9,
  target_sizes=[image.size[::-1]],
)[0]
print(results)

In [None]:
# One line per detection: class name, confidence rounded to 3 decimals,
# and box coordinates rounded to 2 decimals.
detections = zip(results["scores"], results["labels"], results["boxes"])
for score, label, box in detections:
  class_name = model.config.id2label[label.item()]
  confidence = round(score.item(), 3)
  corners = [round(coord, 2) for coord in box.tolist()]
  print(
    f"Detected {class_name}",
    f"with confidence {confidence}",
    f"at location {corners}"
  )

### Segmentation

In [None]:
from PIL import Image
from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation

from WK05_utils import ADE20K_PALETTE

In [None]:
SEG_MODEL = "nvidia/segformer-b1-finetuned-ade-512-512"

# Model and feature extractor are both loaded from the same checkpoint id.
# NOTE(review): recent transformers releases deprecate SegformerFeatureExtractor
# in favour of SegformerImageProcessor; confirm against the installed version.
model = SegformerForSemanticSegmentation.from_pretrained(SEG_MODEL)
feature_extractor = SegformerFeatureExtractor.from_pretrained(SEG_MODEL)

In [None]:
# Force RGB up front: the overlay step later blends this photo with an RGB
# rendering of the segmentation map, and Image.blend needs matching modes.
image = Image.open("./imgs/street.jpg").convert("RGB")
inputs = feature_extractor(images=image, return_tensors="pt")

# Inference only: no gradients are needed.
with torch.no_grad():
  output = model(**inputs)
print(output)

In [None]:
# Resize the class logits to the source image's dimensions.
# PIL reports size as (width, height); the reversal passes (height, width).
logits = torch.nn.functional.interpolate(
  output.logits,
  size=image.size[::-1],
  mode="bicubic",
  align_corners=False,
)

# Highest-scoring class per pixel, shifted by one before the palette lookup.
# NOTE(review): the +1 presumably skips an entry at index 0 of
# ADE20K_PALETTE; confirm against WK05_utils.
prediction = torch.argmax(logits, dim=1) + 1

seg_img = Image.fromarray(prediction.squeeze().cpu().numpy().astype("uint8"))
seg_img.putpalette(ADE20K_PALETTE)

# Image.blend requires both images to share a mode, so convert the photo
# as well instead of assuming the file was stored as RGB.
out_img = Image.blend(image.convert("RGB"), seg_img.convert("RGB"), alpha=0.5)

display(seg_img)
display(out_img)

### Multi-Modal Image Comprehension

Zero-Shot Classification

In [None]:
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

In [None]:
CLIP_MODEL = "openai/clip-vit-large-patch14"

# Processor and model are both loaded from the same checkpoint id.
processor = CLIPProcessor.from_pretrained(CLIP_MODEL)
model = CLIPModel.from_pretrained(CLIP_MODEL)

In [None]:
# Candidate text labels the image is compared against.
LABELS = [
  "cat",
  "dog",
  "bird",
  "fish",
  "aquatic mammal",
  "erinaceinae",
  "vegetation",
]

In [None]:
# Encode the candidate labels and the image together in one batch.
image = Image.open("./imgs/arara.jpg")
inputs = processor(text=LABELS, images=image, return_tensors="pt", padding=True)

# Inference only: no gradients are needed (`torch` is imported in the
# earlier depth-prediction / object-detection cells).
with torch.no_grad():
  output = model(**inputs)

In [None]:
# argmax over dim 1 picks the best-scoring label for the image. Only one
# image was passed in, so the result holds a single element; .item() turns
# it into a plain int, which is what list indexing expects.
label_idx = output.logits_per_image.argmax(dim=1).item()
LABELS[label_idx]