<a href="https://colab.research.google.com/github/TheHackerLlama/charlas/blob/main/riiaa_2021/parte_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Install the Hugging Face libraries imported by the later cells; %%capture hides pip's output.
!pip install transformers datasets
# torch-scatter is installed from the PyTorch Geometric wheel index for torch 1.9.0
# (presumably needed by the TAPAS model loaded further down — confirm).
# NOTE(review): ${CUDA} is expanded by the shell and nothing visible in this notebook
# sets it — confirm it is defined in the runtime, otherwise the index URL ends in "+.html".
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+${CUDA}.html

In [None]:
# NOTE(review): `requests` is imported but not used in any visible cell — confirm
# whether a download step for the image was removed.
import requests
from PIL import Image

# Open the sample picture through a relative path, so "doge.png" has to be present in
# the kernel's working directory.
# NOTE(review): no visible cell downloads or uploads this file — confirm how it is provided.
image = Image.open("doge.png")
# Bare last expression: the notebook shows the image as the cell output.
image

In [None]:
from transformers import ViTFeatureExtractor, ViTForImageClassification

# Checkpoint id used for both the image preprocessing config and the classifier.
model_ckpt = "google/vit-base-patch16-224"

# The classifier and its feature extractor are both restored from `model_ckpt`.
model = ViTForImageClassification.from_pretrained(model_ckpt)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_ckpt)

In [None]:
import torch

# Turn the PIL image into PyTorch tensors ("pt") that can be unpacked into the model call.
inputs = feature_extractor(images=image, return_tensors="pt")

# Inference only: run the forward pass with autograd disabled so that no
# computation graph is built and kept alive for the returned logits.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Position of the highest logit along the last axis, translated to a label
# through the id -> label mapping stored in the model config.
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", model.config.id2label[predicted_class_idx])

In [None]:
# Chapter layout of the example book: one record per chapter holding its
# number, title and first/last page. Chapter numbers follow list order, from 0.
book_data = [
    {'chapter': number, 'name': title, 'start_page': first, 'end_page': last}
    for number, (title, first, last) in enumerate([
        ('Introduction', 1, 11),
        ('Text classification', 12, 48),
        ('Named Entity Recognition', 49, 73),
        ('Question Answering', 74, 120),
        ('Summarization', 121, 140),
        ('Conclusion', 141, 144),
    ])
]

In [None]:
import pandas as pd

# One row per chapter record in `book_data`.
table = pd.DataFrame.from_records(book_data)

# start_page and end_page are both inclusive (chapter 0 covers pages 1..11, i.e.
# 11 pages), so the length is end - start + 1. Without the +1 every chapter is one
# page short and the column sums to 138 instead of the book's 144 pages.
table['number_of_pages'] = table['end_page'] - table['start_page'] + 1

# Cast every cell to str: the frame is later handed to the TAPAS tokenizer, whose
# documentation asks for text-only tables — verify against the installed version.
table = table.astype(str)
table

In [None]:
from transformers import TapasTokenizer, TapasForQuestionAnswering

# Hub id of the TAPAS checkpoint; tokenizer and model are both loaded from it.
model_name = "google/tapas-base-finetuned-wtq"

tokenizer = TapasTokenizer.from_pretrained(model_name)
model = TapasForQuestionAnswering.from_pretrained(model_name)

In [None]:
import torch

# Natural-language questions that are asked against `table`.
queries = [
    "What's the topic in chapter 4?",
    "What is the total number of pages?",
    "On which page does the chapter about question-answering start?",
    "How many chapters have more than 20 pages?",
]

# Encode the table together with all questions as PyTorch tensors.
inputs = tokenizer(table=table, queries=queries, padding='max_length',
                   return_tensors="pt")

# Inference only: skip autograd bookkeeping during the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# Convert the cell-selection and aggregation logits into, per query, the selected
# cell coordinates and an aggregation-operator index. The .detach() calls are kept
# so the conversion always receives tensors that are cut off from any graph.
answer_coordinates, agg_indices = tokenizer.convert_logits_to_predictions(
    inputs,
    outputs.logits.detach(),
    outputs.logits_aggregation.detach())

In [None]:
# Render the predictions as text: one printed section per query.
id2aggregation = dict(enumerate(["NONE", "SUM", "AVERAGE", "COUNT"]))
agg_string = [id2aggregation[agg_idx] for agg_idx in agg_indices]

# Each entry of `answer_coordinates` is a list of cell positions usable with
# `table.iat`: a single position yields that cell's value, any other count
# yields the selected cell values joined with ", ".
answers = [
    table.iat[cells[0]] if len(cells) == 1
    else ", ".join(table.iat[cell] for cell in cells)
    for cells in answer_coordinates
]

for query, answer, predicted_agg in zip(queries, answers, agg_string):
    # The aggregation operator is only shown when one was predicted.
    operator_prefix = "" if predicted_agg == "NONE" else predicted_agg + " > "
    print(query)
    print("Predicted answer: " + operator_prefix + answer)
    print('='*50)

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Keep the checkpoint id in one place (as the other model-loading cells do) so the
# processor and the model cannot drift apart when the checkpoint is swapped.
asr_model_ckpt = "facebook/wav2vec2-large-960h"

processor = Wav2Vec2Processor.from_pretrained(asr_model_ckpt)
model = Wav2Vec2ForCTC.from_pretrained(asr_model_ckpt)

In [None]:
from datasets import load_dataset

# Load the validation split of the "clean" configuration, then preview the
# first two rows as the cell output.
ds = load_dataset(
    path="patrickvonplaten/librispeech_asr_dummy",
    name="clean",
    split="validation",
)
ds[:2]

In [None]:
import soundfile as sf

def map_to_array(batch):
    """Attach the decoded audio of one example to the example itself.

    Reads the sound file whose path is stored under ``batch["file"]`` and
    writes the returned samples to ``batch["speech"]``. The second value
    returned by ``sf.read`` is discarded.

    Args:
        batch: Mapping with a ``"file"`` entry; it is modified in place.

    Returns:
        The same ``batch`` object, now carrying a ``"speech"`` entry.
    """
    audio_path = batch["file"]
    samples, _unused = sf.read(audio_path)
    batch["speech"] = samples
    return batch

# Apply the loader to every example; `ds` is rebound to the mapped dataset.
ds = ds.map(map_to_array)

In [None]:
from IPython.display import Audio

# Show an audio player for each of the first two examples (rate passed as 16000).
display(*[Audio(ds[example_idx]['speech'], rate=16000) for example_idx in range(2)])

In [None]:
import torch

# Slice the rows first, then pick the column: `ds[:2]["speech"]` only reads the two
# examples that are needed, whereas `ds["speech"][:2]` loads the whole column
# before throwing most of it away.
inputs = processor(ds[:2]["speech"], return_tensors="pt", padding="longest",
                   sampling_rate=16000)

# Inference only: run the forward pass with autograd disabled.
with torch.no_grad():
    logits = model(inputs.input_values).logits

# Take the highest-scoring id along the last axis and let the processor turn
# the id sequences back into text, one string per example.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
print('\n\n'.join(transcription))