# Let's use Google's T5 model for multiple NLP tasks

Documentation: https://pytorch.org/text/0.15.0/tutorials/t5_demo.html

Step 1: Installing necessary libraries

In [None]:
!pip install torch transformers sentencepiece

Step 2: Importing necessary modules

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline

Step 3: Load pre-trained T5 model and tokenizer

In [None]:
# One checkpoint id is shared by the tokenizer and the model so they always match.
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


Step 4: Using the model for various NLP tasks

**1. Sentiment Analysis 🙂**

In [65]:
# Plain sentence with no task prefix; the next cells use it to inspect
# tokenization, generation and embeddings step by step.
text = "See, tokenization is fascinating."
# Alternative (disabled): wrap the sentence in the SST-2 task prefix instead.
# text = f"sst2 sentence: {text} </s>"

In [None]:
# Encode the sentence into token ids, returned as a PyTorch tensor ('pt').
input_ids = tokenizer.encode(text, return_tensors='pt')
print(input_ids)

In [None]:
# Map every id of the first sequence in the batch back to its token string.
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
print(tokens)

In [None]:
# Decode the ids of the first sequence back into a single string.
# Stored under its own name so the token list held in `tokens` is not
# overwritten by a value of a different type.
decoded_text = tokenizer.decode(input_ids[0])
print(decoded_text)

In [None]:
# Run generation on the encoded sentence with the default generation settings.
# `text` carries no task prefix at this point, so the output should not be
# read as a sentiment label.
output_ids = model.generate(input_ids)
print(output_ids)

In [None]:
# Decode the first generated sequence to text, dropping special tokens.
# NOTE(review): the result is stored as `sentiment`, but the input had no
# "sst2 sentence:" prefix; analyze_sentiment below adds that prefix.
sentiment = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(sentiment)

In [None]:
# Get input embeddings
# model.get_input_embeddings() returns the model's input embedding module;
# calling it on `input_ids` looks up one embedding vector per token id.
input_embeddings = model.get_input_embeddings()(input_ids)
# Print the tensor's shape followed by its values.
print(input_embeddings.shape, input_embeddings)

In [48]:
# function for sentiment analysis
def analyze_sentiment(text):
    """Classify the sentiment of ``text`` using T5's SST-2 task prefix.

    Args:
        text: The sentence to classify.

    Returns:
        The model's decoded answer as a string, with special tokens removed.
    """
    # The T5 model was trained on the SST2 dataset for sentiment classification
    # using the prefix "sst2 sentence:", so the same prefix selects that task here.
    # No "</s>" is appended by hand: the T5 tokenizer adds the end-of-sequence
    # token itself, and a hand-written one only duplicates it (the slow
    # tokenizer emits a warning about exactly this).
    input_text = f"sst2 sentence: {text}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Generate the sentiment classification
    output = model.generate(input_ids)
    sentiment = tokenizer.decode(output[0], skip_special_tokens=True)

    return sentiment

In [None]:
# Example usage
# Pass the bare sentence: analyze_sentiment adds the "sst2 sentence:" prefix itself.
text = "I love this product!"
sentiment = analyze_sentiment(text)
print(f"Sentiment: {sentiment}")

**2. Text Summarization**

In [91]:
# function for text summarization
def summarize_text(text, max_length=150):
    """Summarize ``text`` using T5's "summarize:" task prefix.

    Args:
        text: The passage to summarize.
        max_length: Maximum length (in tokens) of the generated summary,
            forwarded to ``model.generate``.

    Returns:
        The decoded summary as a string, with special tokens removed.
    """
    # The tokenizer appends the end-of-sequence token itself, so no "</s>"
    # is written into the prompt by hand.
    input_text = f"summarize: {text}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Generate the summary.
    # `early_stopping` is intentionally not passed: it only affects beam
    # search, and this call decodes with a single beam, so it was a no-op
    # that merely triggered a warning.
    summary_ids = model.generate(input_ids, max_length=max_length)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

In [None]:
# Example usage
# Multi-sentence passage used as the summarization input; summarize_text adds
# the "summarize:" prefix itself and uses its default max_length.
text = """The Apollo program was a NASA program that succeeded in landing the first humans on the Moon in 1969.
It was started by President John F. Kennedy in 1961 with the goal of landing a man on the Moon and
bringing him safely back to Earth before the end of the decade. The program involved a series of manned
spaceflights using the Saturn V rocket and the Apollo spacecraft. The first successful manned mission
to the Moon was Apollo 11 in July 1969, with astronauts Neil Armstrong, Buzz Aldrin, and Michael Collins.
Armstrong and Aldrin became the first humans to walk on the lunar surface, while Collins remained in orbit
around the Moon. The Apollo program continued with several successful missions, including scientific
exploration and the collection of lunar samples. The last manned mission to the Moon was Apollo 17 in
December 1972. The program significantly advanced space exploration and contributed to scientific
understanding of the Moon and the broader universe."""
summary = summarize_text(text)
print(f"Summary: {summary}")

**3. Language Translation**
(Only English to German)

In [83]:
# function for language translation
def translate_text(text, source_lang='en', target_lang='de'):
    """Translate ``text`` using T5's "translate X to Y:" task prefix.

    Args:
        text: The text to translate.
        source_lang: Source language, either a full name (e.g. 'English') or
            one of the short codes 'en', 'de', 'fr', 'ro'.
        target_lang: Target language, in the same format as ``source_lang``.

    Returns:
        The decoded translation as a string, with special tokens removed.
    """
    # T5's translation prefixes spell the language names out
    # ("translate English to German:"), so short codes such as the default
    # 'en'/'de' are expanded first. Anything not in the table is used as given,
    # which keeps calls that already pass full names working unchanged.
    language_names = {
        'en': 'English',
        'de': 'German',
        'fr': 'French',
        'ro': 'Romanian',
    }
    source_name = language_names.get(source_lang.lower(), source_lang)
    target_name = language_names.get(target_lang.lower(), target_lang)

    # Format inputs as required by T5. The tokenizer appends the
    # end-of-sequence token itself, so no "</s>" is written by hand.
    input_text = f"translate {source_name} to {target_name}: {text}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Generate the translation.
    # `early_stopping` is not passed: it only affects beam search and this
    # call decodes with a single beam.
    translation_ids = model.generate(input_ids)
    translation = tokenizer.decode(translation_ids[0], skip_special_tokens=True)

    return translation

In [None]:
# Example usage
# Full language names are passed explicitly here, overriding the 'en'/'de'
# defaults of translate_text.
text = "Hello, how are you?"
translation = translate_text(text, source_lang='English', target_lang='German')
print(f"Translation: {translation}")

**4. Question Answering**

In [93]:
# function for question answering
def question_answering(context, question):
    """Answer ``question`` from ``context`` using T5's SQuAD-style prompt.

    Args:
        context: The passage that contains the answer.
        question: The question to answer.

    Returns:
        The decoded answer as a string, with special tokens removed.
    """
    # Format inputs as required by T5: the SQuAD task it was trained on puts
    # the question first ("question: ... context: ...").
    input_text = f"question: {question} context: {context}"

    # Tokenize inputs
    inputs = tokenizer.encode(input_text, return_tensors="pt")

    # Generate answer.
    # `early_stopping` is not passed: it only affects beam search and this
    # call decodes with a single beam.
    answer_ids = model.generate(inputs)
    generated_answer = tokenizer.decode(answer_ids[0], skip_special_tokens=True)

    return generated_answer


In [None]:
# Example input for question answering
# Short two-sentence context whose second sentence contains the answer.
context = "The Taj Mahal is a famous monument in India. It was built by Emperor Shah Jahan in memory of his wife Mumtaz Mahal."
question = "Who built the Taj Mahal?"

# Generate answer
generated_answer = question_answering(context, question)

print("Generated Answer:", generated_answer)

In [None]:
# Let's try a bigger context
# Same helper as above, now with a multi-sentence passage that mentions
# several people.

context2 = """
The Apollo program was a NASA program that succeeded in landing the first humans on the Moon in 1969.
It was started by President John F. Kennedy in 1961 with the goal of landing a man on the Moon and
bringing him safely back to Earth before the end of the decade. The program involved a series of manned
spaceflights using the Saturn V rocket and the Apollo spacecraft. The first successful manned mission
to the Moon was Apollo 11 in July 1969, with astronauts Neil Armstrong, Buzz Aldrin, and Michael Collins.
Armstrong and Aldrin became the first humans to walk on the lunar surface, while Collins remained in orbit
around the Moon. The Apollo program continued with several successful missions, including scientific
exploration and the collection of lunar samples. The last manned mission to the Moon was Apollo 17 in
December 1972. The program significantly advanced space exploration and contributed to scientific
understanding of the Moon and the broader universe.
"""

question2 = "Who was the first person to walk on the Moon?"

# Generate answer
generated_answer2 = question_answering(context2, question2)

print("Generated Answer:", generated_answer2)

**5. Text Generation**

In [121]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import torch.nn.functional as F

# Load GPT-2 tokenizer and model
# NOTE(review): this rebinds the global names `tokenizer` and `model`, which
# held the T5 objects until now. After this cell runs, the T5 helper functions
# above (analyze_sentiment, summarize_text, translate_text,
# question_answering) would use GPT-2 if called again. Re-run the T5 loading
# cell before going back to them.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [122]:
# function to generate text with specified parameters
def generate_text(prompt, temperature=1.0, top_k=50, top_p=0.95, max_length=20):
    """Sample a continuation of ``prompt`` and print the likeliest next tokens.

    Args:
        prompt: The text to continue.
        temperature: Sampling temperature. It is forwarded to
            ``model.generate`` and also used to scale the logits behind the
            printed probabilities.
        top_k: Top-k sampling cutoff forwarded to ``model.generate``.
        top_p: Nucleus sampling cutoff forwarded to ``model.generate``.
        max_length: Maximum total length in tokens (prompt plus continuation).
            If the prompt is already at least this long, the limit is raised to
            prompt length + 1 so that one new token is still generated.

    Returns:
        The decoded text (prompt followed by the sampled continuation), with
        special tokens removed.
    """
    # Tokenize the input prompt, keeping the attention mask so that
    # generate() does not have to guess it.
    encoded = tokenizer(prompt, return_tensors='pt')
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    prompt_length = input_ids.shape[1]

    # Generate text using the model with specified parameters
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        # Honor the caller's max_length (previously ignored), but always
        # leave room for at least one new token.
        max_length=max(max_length, prompt_length + 1),
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        num_return_sequences=1,
        do_sample=True,
        # GPT-2 has no pad token; generate() falls back to EOS and warns.
        # Passing it explicitly gives the same behavior without the warning.
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Generate logits for the next token for probability calculation
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    # Keep only the logits at the last prompt position (the next-token scores).
    logits = outputs.logits[:, -1, :]

    # Apply temperature scaling
    logits = logits / temperature

    # Calculate probabilities using softmax
    probs = F.softmax(logits, dim=-1).squeeze()

    # Get the top 10 tokens and their probabilities.
    # These are temperature-scaled only; the top_k / top_p filtering that
    # generate() applies while sampling is not reflected in them.
    top_probs, top_indices = torch.topk(probs, 10)
    top_tokens = tokenizer.convert_ids_to_tokens(top_indices.tolist())

    # Print the top 10 tokens and their probabilities
    print(f"Generated Tokens for the prompt '{prompt}':")
    for token, prob in zip(top_tokens, top_probs.tolist()):
        print(f"Token: {token}, Probability: {prob:.4f}")

    return generated_text

In [None]:
# Example usage
# Seed PyTorch's RNG so the sampled continuations are the same on every run
# of this cell.
torch.manual_seed(42)

# No trailing space in the prompt: GPT-2's byte-level BPE attaches a space to
# the start of the following token, so a prompt ending in " " gets a stray
# space token at the end and skews the next-token distribution.
prompt = "I took my dog"
print("Original Prompt:", prompt)

# Generate text with different temperature settings
# (top_k and top_p are varied together with the temperature)
print("\nTemperature = 0.7:")
print(generate_text(prompt, temperature=0.7, top_k=2, top_p=0.95))

print("\nTemperature = 1.0:")
print(generate_text(prompt, temperature=1.0, top_k=3, top_p=0.6))

print("\nTemperature = 1.5:")
print(generate_text(prompt, temperature=1.5, top_k=8, top_p=0.5))

# Let's use the model the easier way: the `pipeline` API
Reference: https://huggingface.co/docs/transformers/v4.42.0/en/main_classes/pipelines#transformers.pipeline

Pipelines are made of:

* A tokenizer in charge of mapping raw textual input to tokens.
* A model to make predictions from the inputs.
* Some (optional) post-processing for enhancing the model’s output.

In [100]:
from transformers import pipeline

In [98]:
# Build a text-to-text generation pipeline from the "t5-small" checkpoint,
# loaded by name rather than from the objects created earlier.
pipe = pipeline("text2text-generation", model="t5-small")
# Alternative (disabled): build a summarization pipeline from objects in memory.
# NOTE(review): the globals `model` / `tokenizer` were rebound to GPT-2 in the
# text-generation section, so enabling this line as-is would not use T5.
# pipe = pipeline("summarization", model=model, tokenizer=tokenizer)

In [109]:
# Passage to summarize through the pipeline.
text = """
The Apollo program was a NASA program that succeeded in landing the first humans on the Moon in 1969.
It was started by President John F. Kennedy in 1961 with the goal of landing a man on the Moon and
bringing him safely back to Earth before the end of the decade. The program involved a series of manned
spaceflights using the Saturn V rocket and the Apollo spacecraft. The first successful manned mission
to the Moon was Apollo 11 in July 1969, with astronauts Neil Armstrong, Buzz Aldrin, and Michael Collins.
Armstrong and Aldrin became the first humans to walk on the lunar surface, while Collins remained in orbit
around the Moon. The Apollo program continued with several successful missions, including scientific
exploration and the collection of lunar samples. The last manned mission to the Moon was Apollo 17 in
December 1972. The program significantly advanced space exploration and contributed to scientific
understanding of the Moon and the broader universe.
"""
# Only the task prefix is added. The pipeline's tokenizer appends the
# end-of-sequence token itself, so a hand-written "</s>" would put a second
# one into the input.
input_text_format = f"summarize: {text}"
summary = pipe(input_text_format, max_length=150)
print(summary)

[{'generated_text': 'the Apollo program was started by president . Kennedy in 1961 . it involved a series of manned spaceflights using the Saturn V rocket and the Apollo spacecraft .'}]


In [None]:
# Seed PyTorch's RNG so this sampled summary is the same on every run
# (`torch` is imported in the text-generation section above).
torch.manual_seed(42)
# Same input as before, but sampled (top-k / nucleus sampling with a
# temperature) instead of decoded deterministically.
summary = pipe(input_text_format, max_length=150, do_sample=True, top_k=10, top_p=0.90, temperature=0.7)
print(summary)

In [None]:
# speech to text recognition
# Build an automatic-speech-recognition pipeline from the
# "openai/whisper-base" checkpoint.
# NOTE(review): decoding audio files typically needs ffmpeg to be available
# on the machine; confirm for your environment.
speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-base")

In [None]:
# Transcribe a sample FLAC file referenced by URL and print the pipeline's result.
# NOTE(review): presumably the pipeline downloads the audio itself, so this
# cell needs network access; confirm.
output_text = speech_to_text("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
print(output_text)