# WK05: Transformer Pipelines

## Setup

This code imports the functions we need to run our inference pipelines

In [None]:
from PIL import Image
from transformers import pipeline

### Text Completion

Let's use the GPT2 model to create some text completions.

We'll use a pipeline object to run inference:

In [None]:
# Hub id of the GPT-2 checkpoint; handed to the pipeline as its `model`.
TEXT_GEN_MODEL = "openai-community/gpt2"
generator = pipeline(task="text-generation", model=TEXT_GEN_MODEL)

Let's define a starter sentence to run our model on:

In [None]:
# Prompt that the text-generation pipeline will continue.
starter = "To be or not to be, that is "

# Run the pipeline on the prompt and show its raw return value.
result = generator(
  starter,
  max_length=64,
  pad_token_id=0,
)
print(result)

Let's try it on many phrases:

In [None]:
# Prompts to complete; each ends mid-sentence so the model has to continue it.
EXAMPLE_TEXTS = [
  "How much wood would a woodchuck chuck if ",
  "I once knew a man from Nantucket, who ",
  "It was the best of times, it was the "
]

# Complete every prompt with the same settings used for the single-sentence
# run, printing each prompt next to the text the pipeline generated for it.
for t in EXAMPLE_TEXTS:
  result = generator(t, max_length=64, pad_token_id=0)
  print(t, "->", result[0]["generated_text"])

#### Changing Model

Changing the model is as easy as swapping the model name and rebuilding the pipeline:

In [None]:
# Point the same constant at a different checkpoint and rebuild the pipeline;
# `generator` now refers to the new model.
TEXT_GEN_MODEL = "Xenova/llama2.c-stories110M"
generator = pipeline(task="text-generation", model=TEXT_GEN_MODEL)

Rerun with new model:

In [None]:
# Same prompt and generation arguments as before, now through the
# pipeline that `generator` was rebound to.
result = generator(
  starter,
  max_length=64,
  pad_token_id=0,
)
print(result)

One last model:

In [None]:
# Third checkpoint: rebuild the pipeline, then rerun the same prompt.
TEXT_GEN_MODEL = "facebook/opt-125m"
generator = pipeline(task="text-generation", model=TEXT_GEN_MODEL)

result = generator(
  starter,
  max_length=64,
  pad_token_id=0,
)
print(result)

### Text Sentiment Analysis

Define model and pipeline object:

In [None]:
# Hub id of the sentiment/emotion checkpoint used by the analysis pipeline.
TEXT_SENT_MODEL = "joeddav/distilbert-base-uncased-go-emotions-student"
analyzer = pipeline(task="sentiment-analysis", model=TEXT_SENT_MODEL)

Run on one example:

In [None]:
# One sentence to classify; the raw pipeline output is printed as-is.
test_text = (
  "A wave of elation washed over me, like sunlight breaking through the clouds"
)
result = analyzer(test_text)
print(result)

Run on example phrases:

In [None]:
# Short phrases to classify (rebinds EXAMPLE_TEXTS for the sentiment demo).
EXAMPLE_TEXTS = [
  "What a wonderful day",
  "OMG my head hurts",
  "What am I doing here?"
]

# Classify each phrase and print it beside the label of the first result.
for t in EXAMPLE_TEXTS:
  result = analyzer(t)
  print(f"{t} -> {result[0]['label']}")

We can also define our pipeline like this if we want to get scores for all possible sentiments:

In [None]:
# `top_k=None` asks the pipeline for a score for every label. It replaces the
# deprecated `return_all_scores=True` flag, which triggers a warning.
full_analyzer = pipeline("sentiment-analysis", model=TEXT_SENT_MODEL, top_k=None)

In [None]:
# Score the first example phrase against every label and print the full output.
t = EXAMPLE_TEXTS[0]
result = full_analyzer(t)
print(f"{t} -> {result}")

### Image Description

New model definition/location and pipeline object:

In [None]:
# Hub id of the BLIP captioning checkpoint used by the image-to-text pipeline.
IMAGE_CAP_MODEL = "Salesforce/blip-image-captioning-base"
img_captioner = pipeline("image-to-text", model=IMAGE_CAP_MODEL)

A test image:

In [None]:
# Open the sample picture, convert it to RGB mode, and show it in the notebook.
test_image = Image.open("./imgs/GDTM.jpg").convert(mode="RGB")
display(test_image)

Run inference:

In [None]:
# Caption the test image; print only the text of the first returned entry.
result = img_captioner(test_image)
print(
  result[0]["generated_text"]
)

Other image description models:
- [`LLAVA`](https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf)
- [`VIT`](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning)

## Other ways of running inference

Some models don't work with the pipeline inference object, but the Transformers library still has some consistent-ish interfaces for running these models.

### Depth Prediction

In [None]:
import numpy as np
import torch

from PIL import Image
from transformers import AutoImageProcessor, AutoModelForDepthEstimation

In [None]:
# Hub id shared by the depth model and its matching image processor.
DEPTH_MODEL = "depth-anything/Depth-Anything-V2-Base-hf"

# The two loads are independent of each other; both read the same checkpoint id.
model = AutoModelForDepthEstimation.from_pretrained(DEPTH_MODEL)
image_processor = AutoImageProcessor.from_pretrained(DEPTH_MODEL)

In [None]:
# prepare image for the model
# NOTE(review): the path has no file extension - confirm the file name on disk.
# Converting to RGB matches how the captioning test image is loaded and keeps
# grayscale / RGBA files from reaching the processor in an unexpected mode.
image = Image.open("./imgs/flowers").convert("RGB")
inputs = image_processor(images=image, return_tensors="pt")

# run model; this is inference only, so disable gradient tracking to avoid
# building an autograd graph for the forward pass
with torch.no_grad():
  outputs = model(**inputs)
print(outputs)

In [None]:
# Resize the predicted depth map to the input image's (height, width).
# PIL's `size` is (width, height), hence the reversal.
prediction = torch.nn.functional.interpolate(
  outputs.predicted_depth.unsqueeze(1),
  size=image.size[::-1],
  mode="bicubic",
  align_corners=False,
)

# Convert to a numpy array, rescale so the maximum maps to 255, and wrap the
# 8-bit result in a PIL image for display.
output = prediction.detach().cpu().squeeze().numpy()
formatted = (output * 255 / output.max()).astype("uint8")
depth = Image.fromarray(formatted)

display(depth)

### Object Detection

In [None]:
import torch
from PIL import Image
from transformers import DetrImageProcessor, DetrForObjectDetection

In [None]:
# Hub id of the DETR checkpoint; both loads pin the "no_timm" revision.
OBJ_MODEL = "facebook/detr-resnet-101"

# Independent loads of the detection model and its image processor.
model = DetrForObjectDetection.from_pretrained(OBJ_MODEL, revision="no_timm")
processor = DetrImageProcessor.from_pretrained(OBJ_MODEL, revision="no_timm")

In [None]:
# Load the picture as RGB (same as the captioning test image) and turn it into
# model-ready tensors.
image = Image.open("./street.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt")

# Inference only: disable gradient tracking for the forward pass.
with torch.no_grad():
  output = model(**inputs)

# Keep detections scoring above 0.9 and rescale their boxes to the original
# image; PIL's `size` is (width, height), so it is reversed to (height, width).
# The call returns one entry per image, hence the [0].
results = processor.post_process_object_detection(
  output,
  threshold=0.9,
  target_sizes=[image.size[::-1]],
)[0]
print(results)

In [None]:
# Walk the parallel score / label / box tensors and print one line per detection.
detections = zip(results["scores"], results["labels"], results["boxes"])
for score, label, box in detections:
  # Map the numeric class id to its name via the model config.
  class_name = model.config.id2label[label.item()]
  confidence = round(score.item(), 3)
  location = [round(coord, 2) for coord in box.tolist()]
  print(f"Detected {class_name} with confidence {confidence} at location {location}")

### Segmentation

In [None]:
from PIL import Image
from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation

from WK05_utils import ADE20K_PALETTE

In [None]:
# `SegformerFeatureExtractor` is deprecated in favour of
# `SegformerImageProcessor`; import the replacement here so this cell does not
# depend on the deprecated class.
from transformers import SegformerImageProcessor

# Hub id of the SegFormer checkpoint fine-tuned on ADE20K.
SEG_MODEL = "nvidia/segformer-b1-finetuned-ade-512-512"

# The name `feature_extractor` is kept because later cells call it.
feature_extractor = SegformerImageProcessor.from_pretrained(SEG_MODEL)
model = SegformerForSemanticSegmentation.from_pretrained(SEG_MODEL)

In [None]:
import torch

# TODO: fill in the path of the image to segment; the empty string below is a
# placeholder and Image.open will fail until it is replaced.
# The image is forced to RGB because the overlay step blends it with an RGB
# rendering of the segmentation, and Image.blend needs matching modes.
image = Image.open("").convert("RGB")
inputs = feature_extractor(images=image, return_tensors="pt")

# Inference only: disable gradient tracking for the forward pass.
with torch.no_grad():
  output = model(**inputs)
print(output)

In [None]:
# Resize the class logits to the input image's (height, width); PIL's `size`
# is (width, height), hence the reversal.
logits = torch.nn.functional.interpolate(
  output.logits,
  size=image.size[::-1],
  mode="bicubic",
  align_corners=False,
)

# Pick the highest-scoring class per pixel.
# NOTE(review): the +1 presumably shifts class ids to line up with the indexing
# of ADE20K_PALETTE - confirm against WK05_utils.
prediction = logits.argmax(dim=1) + 1

# Build an 8-bit image of class ids and attach the colour palette to it.
class_ids = prediction.squeeze().cpu().numpy().astype("uint8")
seg_img = Image.fromarray(class_ids)
seg_img.putpalette(ADE20K_PALETTE)

# Half-and-half blend of the input image with the coloured segmentation.
out_img = Image.blend(image, seg_img.convert("RGB"), alpha=0.5)

display(seg_img)
display(out_img)