In [None]:
from transformers import AutoTokenizer
import torch
import torch.nn.functional as F

In [None]:
# Load the tokenizer published under the "gpt2" checkpoint name; the cells
# below use it to turn text into token ids and back.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
tokenizer

In [None]:
sentence = "unsure"
# Attribute access on the encoding, matching how the other cells read it
input_ids = tokenizer(sentence, return_tensors="pt").input_ids
input_ids

In [None]:
tokenizer.decode(input_ids[0])

In [None]:
# NOTE(review): "unbelivable" is misspelled -- presumably on purpose, to show
# how the tokenizer splits a spelling it has no single token for. Confirm.
sentence = "unbelivable"
input_ids = tokenizer(sentence, return_tensors="pt").input_ids
input_ids

In [None]:
# Decode every id on its own so each sub-word piece is printed on a separate line
for piece_id in input_ids[0].tolist():
    print(tokenizer.decode(piece_id))

In [None]:
# Tokenize a single technical word and display the resulting id tensor
word = "homoscedasticity"
my_ids = tokenizer(word, return_tensors="pt").input_ids
my_ids

In [None]:
tokenizer.decode(my_ids.squeeze())

In [None]:
word = "pneumonoultramicroscopicsilicovolcanoconiosis"
my_ids = tokenizer(word, return_tensors="pt").input_ids
# Show the ids together with the number of tokens the word was split into
my_ids, len(my_ids[0])

In [None]:
# Index the single batch row instead of squeezing: squeeze() collapses a
# one-token encoding to a 0-d tensor, which cannot be iterated.
for token_id in my_ids[0]:
    print(tokenizer.decode(token_id))

In [None]:
# Tokenize one long word and display its ids alongside the token count
sentence = "antidisestablishmentarianism"
token_ids = tokenizer(sentence, return_tensors="pt").input_ids
token_ids, len(token_ids[0])

In [None]:
# Another long word: display its ids alongside the token count
word = "floccinaucinihilipilification"

my_ids = tokenizer(word, return_tensors="pt").input_ids
my_ids, len(my_ids[0])

In [None]:
# Walk the first (only) row position by position and print each decoded piece
for position in range(my_ids.shape[1]):
    print(tokenizer.decode(my_ids[0, position]))

In [None]:
from transformers import AutoModelForCausalLM

In [None]:
# Load the causal language model published under the "gpt2" checkpoint name;
# later cells call it on token ids and read `.logits` from the result.
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")

In [None]:
gpt2

In [None]:
sentence = "I like machine learning to be able to predict the future."
# Encode the prompt into a tensor of token ids
token_ids = tokenizer(sentence, return_tensors="pt").input_ids

# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    # Scores over the vocabulary for the token that follows the prompt
    # (batch item 0, last position)
    outputs = gpt2(token_ids).logits[0, -1]
tokenizer.decode(outputs.argmax())

In [None]:
sentence = "I learn machine learning to enhance our understanding of the world around us."

token_ids = tokenizer(sentence, return_tensors="pt").input_ids

# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    # Scores for the token that follows the prompt (batch item 0, last position)
    outputs = gpt2(token_ids).logits[0, -1]
tokenizer.decode(outputs.argmax())

In [None]:
sentence = "I learn machine learning to enhance"
token_ids = tokenizer(sentence, return_tensors="pt").input_ids
# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    # Scores for the token that follows the prompt (batch item 0, last position)
    outputs = gpt2(token_ids).logits[0, -1]
final_logits = torch.topk(outputs, 20) # Feel free to play around with the K

# Print the candidate continuations, highest score first
for index in final_logits.indices:
    print(tokenizer.decode(index))

In [None]:
# Softmax over only the retained top-k logits renormalises them among
# themselves, so this sum is 1 (up to floating-point rounding).
torch.softmax(final_logits.values, dim=0).sum()

In [None]:
# Share of the full next-token distribution covered by the top-k candidates:
# normalise over the whole vocabulary first, then add up the top-k entries.
torch.softmax(outputs, dim=0)[final_logits.indices].sum()

In [None]:
def greedy_decode(logits):
    """Pick the index of the largest entry along the last dimension.

    Softmax preserves ordering, so the arg-max of the raw logits is also the
    most probable token; no normalisation is needed.
    """
    best_index = logits.argmax(dim=-1)
    return best_index

# TOP K SAMPLING

def top_k_sampling(logits, k=50):
    """
    Keep only the top-k logits, renormalise them into probabilities,
    then sample one token from that filtered distribution.

    Args:
        logits: Tensor of unnormalised scores whose last dimension indexes
            the vocabulary. May be 1-D, or 2-D with a leading batch dimension.
        k: Maximum number of candidates to keep. Values larger than the
            vocabulary size are clamped to it.

    Returns:
        Tensor of sampled vocabulary indices: shape ``(1,)`` for 1-D input,
        ``(batch, 1)`` for 2-D input.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    # torch.topk raises when k exceeds the dimension size, so clamp first
    k = min(k, logits.size(-1))

    values, indices = torch.topk(logits, k, dim=-1)
    probs = F.softmax(values, dim=-1)
    sampled = torch.multinomial(probs, 1)
    # `sampled` is a position within the top-k slice; map it back to the
    # vocabulary index (gather also handles the batched case)
    return indices.gather(-1, sampled)

# Top-p (Nucleus) Sampling

def top_p_sampling(logits, p=0.9):
    """
    Nucleus sampling: sort tokens by probability, keep the smallest prefix
    whose cumulative probability exceeds the threshold p, then sample one
    token from that renormalised prefix.

    The token that pushes the cumulative mass past p belongs to the nucleus,
    so the highest-probability token is never filtered out.

    Args:
        logits: Tensor of unnormalised scores whose last dimension indexes
            the vocabulary. May be 1-D, or 2-D with a leading batch dimension.
        p: Cumulative-probability threshold that defines the nucleus.

    Returns:
        Tensor of sampled vocabulary indices: shape ``(1,)`` for 1-D input,
        ``(batch, 1)`` for 2-D input.
    """

    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
    sorted_probs = F.softmax(sorted_logits, dim=-1)
    cumulative_probs = sorted_probs.cumsum(dim=-1)

    # Mask tokens outside the nucleus. `cumulative_probs > p` is already True
    # for the token that crosses the threshold, so shift the mask one place to
    # the right: a token is dropped only if the mass *before* it exceeds p.
    mask = cumulative_probs > p
    mask[..., 1:] = mask[..., :-1].clone()
    mask[..., 0] = False
    sorted_logits[mask] = float("-inf")

    # Sample from the filtered logits (softmax renormalises the survivors)
    filtered_probs = F.softmax(sorted_logits, dim=-1)
    sampled = torch.multinomial(filtered_probs, 1)

    # `sampled` is a position in sorted order; map it back to the token's
    # index in the original vocabulary
    return sorted_indices.gather(-1, sampled)

## Temperature Sampling ##

def temperature_sampling(logits, temperature=1.0):
    """
    Scale logits by temperature before sampling.
    Lower temperature => sharper distribution; higher => flatter.

    Args:
        logits: Tensor of unnormalised scores whose last dimension indexes
            the vocabulary.
        temperature: Non-negative divisor applied to the logits. ``0`` is
            treated as the limiting case and returns the arg-max.

    Returns:
        Tensor holding the sampled vocabulary index, with a trailing
        dimension of size 1.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")
    if temperature == 0:
        # Dividing by zero would give inf/nan probabilities. As T -> 0 the
        # distribution collapses onto the arg-max, so return it directly,
        # keeping the same trailing dimension multinomial would produce.
        return torch.argmax(logits, dim=-1, keepdim=True)

    scaled = logits / temperature
    probs = F.softmax(scaled, dim=-1)
    return torch.multinomial(probs, 1)


## Random Sampling ##

def random_sampling(logits):
    """
    Draw one token straight from the softmax of the logits; no top-k, top-p
    or temperature adjustment is applied beforehand.
    """
    distribution = logits.softmax(dim=-1)
    draw = distribution.multinomial(num_samples=1)
    return draw

# Alternative prompt to try:
# sentence = "Today I decided to go to the local library and find out what was in my wallet."
sentence = "I am really happy becuase I have gone back in time."
inputs = tokenizer(sentence, return_tensors="pt")
# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    output = gpt2(**inputs)
# Scores for the token that follows the prompt (batch item 0, last position)
logits = output.logits[0, -1]

# Same logits, five decoding strategies. Every strategy except greedy is
# stochastic, so those lines can change between runs.
print("Greedy Decode: ", tokenizer.decode([greedy_decode(logits)]))
print("Top-K Sampling: ", tokenizer.decode(top_k_sampling(logits, k=10)))
print("Top-P-Sampling: ", tokenizer.decode(top_p_sampling(logits, p=0.9)))
print("Temp: ", tokenizer.decode(temperature_sampling(logits, temperature=1)))
print("Random: ", tokenizer.decode(random_sampling(logits)))

In [None]:
tokenizer.decode(top_k_sampling(outputs))

In [None]:
outputs

In [None]:
tokenizer.decode(top_p_sampling(outputs, p=0.9))

In [None]:
tokenizer.decode(temperature_sampling(outputs, temperature=1.5))

In [None]:
tokenizer.decode(random_sampling(outputs))

In [None]:
sentence = "I learn machine learning to enhance our understanding of the brain in"
token_ids = tokenizer(sentence, return_tensors="pt").input_ids
# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    # Raw, unnormalised scores for the token that follows the prompt.
    # `outputs` keeps meaning "last-position logits", as in the earlier cells,
    # so the sampling cells that read it stay valid if they are re-run.
    outputs = gpt2(token_ids).logits[0, -1]
# Probabilities get their own name instead of overwriting the logits
next_token_probs = torch.softmax(outputs, dim=-1)

top10 = torch.topk(next_token_probs, k=10)

for index, value in zip(top10.indices, top10.values):
    print(f"{tokenizer.decode(index)} -- {value:.1%}")

In [None]:
# Load model directly
# NOTE(review): both classes are already imported in earlier cells, and this
# cell rebinds `tokenizer`. "openai-community/gpt2" presumably resolves to the
# same checkpoint as the "gpt2" name used earlier -- confirm. If so, `model`
# duplicates `gpt2`.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

In [None]:
# High-level helper: build a ready-made text-generation pipeline around GPT-2
from transformers import pipeline

pipe = pipeline(task="text-generation", model="openai-community/gpt2")

In [None]:
prompt = "What is machine learning?"
output = pipe(prompt)

In [None]:
print(output[0]["generated_text"])

In [None]:
# Load model directly
# NOTE(review): this cell repeats an earlier "Load model directly" cell
# verbatim; running it again reloads the same tokenizer and model.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

In [None]:
model

## Sentiment Analysis

In [None]:
from datasets import load_dataset

ds = load_dataset("stanfordnlp/imdb")

In [None]:
type(ds)

In [None]:
ds

In [None]:
ds["train"]

In [None]:
import pandas as pd

In [None]:
# Preview only the first rows rather than displaying the whole converted split
ds["train"].to_pandas().head()

In [None]:
my_dataset_df = ds["train"].to_pandas()

In [None]:
my_dataset_df["text"]

In [None]:
len(my_dataset_df["text"])

In [None]:
from transformers import pipeline

In [None]:
# Name the checkpoint explicitly instead of relying on the task's implicit
# default, so results do not change if the library's default does.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
classifier("This day is great!")

In [None]:
classifier("This day is terrible and i am so sad")[0]["label"]

In [None]:
def score(review_text):
    """Return the sentiment label the classifier assigns to a review.

    Only the first 500 characters of the review are sent to the classifier.
    """
    snippet = review_text[:500]
    top_prediction = classifier(snippet)[0]
    return top_prediction["label"]

In [None]:
# One classifier call per review; the label is stored next to the ground truth
my_dataset_df["model_prediction"] = my_dataset_df["text"].map(score)

In [None]:
my_dataset_df[["label", "model_prediction"]][:20]

In [None]:
my_dataset_df.iloc[0]

In [None]:
review = my_dataset_df.iloc[0]["text"]
# Let the tokenizer truncate: an untruncated review can exceed the model's
# maximum sequence length and make the call fail.
classifier(review, truncation=True)[0]["label"]

In [None]:
from transformers import pipeline

In [None]:
finbert = pipeline("sentiment-analysis", model="ProsusAI/finbert")

In [None]:
sentence = "The company reported a strong increase in quarterly revnue, exceeding market expectations."
finbert(sentence)

In [None]:
sentence = "Shares fell after the firm reported lower-than-expected earnings"
finbert(sentence)

In [None]:
sentences = ["Strong consumer demand drove record sales across all regions",
             "Supply chain disruptions severly affected production output"]

In [None]:
finbert(sentences)

## Named Entity Recognition

In [None]:
sentence = "Apple announced record earnings in the United States on Monday."

In [None]:
# Name the checkpoint explicitly instead of relying on the task's implicit
# default, so results do not change if the library's default does.
ner = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

In [None]:
sentence

In [None]:
ner(sentence)

In [None]:
sentence = "I live in UK worked at Facebook after graduating from Harvard"

In [None]:
ner(sentence)

## Question Answering

In [None]:
# Name the checkpoint explicitly instead of relying on the task's implicit
# default, so results do not change if the library's default does.
qa_bot = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
)

In [None]:
context = """
Financial sentiment analysis is a challenging task due to the specialized
language and lack of labeled data in that domain. General-purpose models are
not effective enough because of the specialized language used in a financial
context. We hypothesize that pre-trained language models can help with this
problem because they require fewer labeled examples and they can be further
trained on domain-specific corpora. We introduce FinBERT, a language model
based on BERT, to tackle NLP tasks in the financial domain. Our results show
improvement in every measured metric on current state-of-the-art results for
two financial sentiment analysis datasets. We find that even with a smaller
training set and fine-tuning only a part of the model, FinBERT outperforms
state-of-the-art machine learning methods.
"""

In [None]:
question = "What is financial sentiment analysis?"

In [None]:
qa_bot(question=question, context=context)

In [None]:
question = "What is FinBERT?"

In [None]:
result = qa_bot(question=question, context=context)
print(result["answer"])

## Machine Translation

In [None]:
# Name the checkpoint explicitly instead of relying on the task's implicit
# default, so results do not change if the library's default does.
translater = pipeline("translation_en_to_fr", model="google-t5/t5-base")

In [None]:
translater("Hello")

In [None]:
translater("Thanks")

In [None]:
sentence = "What is your name?"
translater(sentence)[0]["translation_text"]

In [None]:
# Use a pipeline as a high-level helper
# NOTE(review): this rebinds `pipe`, which an earlier cell bound to the GPT-2
# text-generation pipeline. The generic "translation" task names no language
# pair, so this checkpoint presumably needs source and target languages passed
# at construction or call time -- confirm before use.
from transformers import pipeline

pipe = pipeline("translation", model="facebook/nllb-200-distilled-600M")