# Install the Necessary Libraries
- Rich
- Torch 
- Hugging Face
- Lovely Tensors
- Torch Summary
- Transformers
- accelerate


In [None]:
import sys

if 'google.colab' in sys.modules:
    print("Running on Google Colab")
    !pip install torch
    !pip install transformers
    !pip install rich
    !pip install bitsandbytes
    !pip install accelerate
    !pip install sentencepiece
    !pip install torch-summary
    !pip install accelerate
else:
    print("Not running on Google Colab")
from rich import print

# What is NLP
NLP is the ability of a machine to perform human-language tasks such as:
1. Translation
2. Summarization
3. Question Answering
    1. Extractive Question Answering
    2. Abstractive Question Answering


## Translation 

### Hugging Face Default Pipeline Translations

In [None]:
# Sample English sentence reused by the translation cells below.
TRANSLATION_EXAMPLE = "Hello , my name is Rami Ismael"

In [None]:
import torch
from transformers import pipeline

# English -> French translation pipeline; no model is passed, so the
# library chooses the checkpoint for this task.
en_fr_translator = pipeline(task="translation_en_to_fr")

# Last expression of the cell: the notebook displays the translation.
en_fr_translator("How old are you?")

In [None]:
# Translate the shared sample sentence with the default pipeline and show the result.
print(en_fr_translator(TRANSLATION_EXAMPLE))

### Hugging Face Translation Pipeline with the small Flan-T5 model
- I selected Flan-T5 because it performs well even when the parameter count is small

In [None]:
# Same translation task, but with google/flan-t5-small named explicitly for
# both the model and the tokenizer; runs on CUDA when torch reports a GPU.
translate_en_to_fr = pipeline(
    "translation_en_to_fr",
    model="google/flan-t5-small",
    tokenizer="google/flan-t5-small",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [None]:
# Translate the shared sample sentence, then a second ad-hoc sentence,
# with the Flan-T5 pipeline.
print(translate_en_to_fr(TRANSLATION_EXAMPLE))
print(translate_en_to_fr("Today , I will be teaching Natural Languatge Processing"))

### Use a Flan-T5 model to translate from English to other languages

- Language(s) (NLP): English, Spanish, Japanese, Persian,  French, Chinese, Bengali, Gujarati, German, Telugu, Italian,  Polish, Tamil, Marathi, Malayalam, Oriya, Panjabi, Portuguese, Urdu, Galician, Hebrew, Korean, Catalan, Thai, Dutch, Indonesian, Vietnamese, Bulgarian, Filipino, Central Khmer, Lao, Turkish, Russian, Croatian, Swedish, Yoruba, Kurdish, Burmese, Malay, Czech, Finnish, Somali, Tagalog, Swahili, Sinhala, Kannada, Zhuang, Igbo, Xhosa, Romanian, Haitian, Estonian, Slovak, Lithuanian, Greek, Nepali, Assamese, Norwegian
    - Translation quality varies by language pair: some pairs are translated much better than others.

In [60]:
from transformers import AutoModel, AutoTokenizer, T5ForConditionalGeneration

# Load the Flan-T5 small tokenizer and model directly instead of going
# through a pipeline. device_map="auto" leaves device placement to the library.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
small_flan_t5 = T5ForConditionalGeneration.from_pretrained(
    "google/flan-t5-small",
    device_map="auto",
)

In [61]:
# Text to translate and the instruction prefix that is prepended to it.
phrase = "Hello , my name is Rami Ismael. This is a small example to share the capabilities of the T5 model to do translation"
translation_prompt = "translation  English  to  French : "

# Tokenize prompt + text into a PyTorch tensor and move it to the GPU when available.
input_ids = tokenizer(translation_prompt + phrase, return_tensors="pt")["input_ids"].to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

# penalty_alpha together with top_k are the contrastive-search settings
# this notebook uses; the output is capped at 256 tokens.
outputs = small_flan_t5.generate(
    input_ids,
    penalty_alpha=0.6,
    top_k=4,
    max_length=256,
)

# Turn the first generated sequence back into text, dropping special tokens.
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [62]:
# Show the text generated by the prompt-based translation cell.
print(decoded_output)

## Summarization

In [None]:
# Sample paragraph (about the Eiffel Tower) reused by every summarization cell.
# Adjacent string literals are concatenated into one single-line string.
SUMMARY_TEXT_SAMPLE = (
    "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. "
    "Its base is square, measuring 125 metres (410 ft) on each side. "
    "During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. "
    "It was the first structure to reach a height of 300 metres. "
    "Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). "
    "Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct."
)

### Normal Hugging Face Summarization Pipeline

In [None]:
# Default summarization pipeline (a BART model running on PyTorch).
from transformers import pipeline

# Warning emitted by the library when this cell runs:
#   No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
#   Using a pipeline without specifying a model name and revision in production is not recommended.
default_hg_summarizer = pipeline(task="summarization")

In [None]:
# Summarize the sample paragraph with the default pipeline and show the result.
print(default_hg_summarizer(SUMMARY_TEXT_SAMPLE))

### Use the Flan-T5 Model with a CUDA GPU to do summarization with Contrastive Search
[link](https://huggingface.co/blog/introducing-csearch)

In [None]:
import torch

# Flan-T5 is prompted with a "summarize: " prefix followed by the text.
# NOTE: `tokenizer` must still be the Flan-T5 tokenizer loaded next to the
# model; a later cell rebinds the same name to a different tokenizer.
input_text = "summarize: " + SUMMARY_TEXT_SAMPLE
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
print(input_ids)

# The Flan-T5 model is bound to `small_flan_t5` earlier in the notebook; the
# previously used name `small_flan_t5_model` was never defined (NameError).
# penalty_alpha + top_k are the contrastive-search settings.
outputs = small_flan_t5.generate(input_ids, penalty_alpha=0.6, top_k=4, max_length=256)

# Decode the first generated sequence without special tokens.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

### Use distilbart-cnn-12-6 (BART) with the BetterTransformer library and the Hugging Face Pipeline with contrastive search

In [None]:
# NOTE: this import rebinds `pipeline` to optimum's version for the rest of the notebook.
from optimum.pipelines import pipeline
import torch

# Summarization pipeline for sshleifer/distilbart-cnn-12-6 using the
# "bettertransformer" accelerator; device 0 is the first GPU, -1 is the CPU.
summarizer = pipeline(
    "summarization",
    "sshleifer/distilbart-cnn-12-6",
    accelerator="bettertransformer",
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Contrastive search parameters: penalty_alpha and top_k; max_length caps the summary length.
print(summarizer(SUMMARY_TEXT_SAMPLE, penalty_alpha=0.6, top_k=4, max_length=128))

## Question Answering

### Extractive Question Answering
1. Extractive question answering in natural language processing (NLP) is a task where a model is given a question and a piece of text, and the model's job is to identify and extract the text span from that piece of text that answers the question. This text span is then returned as the answer to the question. Extractive QA differs from abstractive QA, where the model generates new text that answers the question.

# Tokenizer
The Hugging Face Tokenizer is a tool that can be used to preprocess text data for natural language processing tasks. Here is a list of some of the things that the Hugging Face Tokenizer can do:

1. Tokenize text into individual tokens (words)
2. Lowercase all the words in the text
3. Add a special token, such as [CLS], to the beginning of each sentence
4. Convert every word in the text to the token id it has in the vocab; if a word cannot be found in the vocab, [UNK] is used instead

In [None]:
from transformers import AutoTokenizer

# Encode a short sentence to token ids and decode the ids back to text.
# NOTE: this rebinds the notebook-wide `tokenizer` name.
tokenizer = AutoTokenizer.from_pretrained("neuralmagic/oBERT-3-upstream-pretrained-dense")
enc = tokenizer.encode("Hello there")
dec = tokenizer.decode(enc)

print(f"Encode: {enc}")
print(f"Decode: {dec}")

# Print the first four ids one by one, labelled with the token each
# position is expected to hold.
print(f"[CLS]: {enc[0]}")
print(f"hello: {enc[1]}")
print(f"there: {enc[2]}")
print(f"[SEP]: {enc[3]}")

## Peek in the Tokenizer Vocab

In [None]:
# Containers meant to hold the tokenizer's vocabulary entries and their ids.
# They are only created here; nothing in this cell fills them.
vocab, token_ids = [], []

# Models

## How to download a model from the Hugging Face Hub
1. We download a small BERT model from the Hugging Face Hub

In [None]:
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Download the checkpoint from the Hub and print the loaded model object.
model = AutoModel.from_pretrained(
    "neuralmagic/oBERT-3-upstream-pretrained-dense"
)

print(model)

### Torch Summary

In [None]:
from rich import print
def print_tabular(self):
    """
    Prints the intermediate representation of the graph in tabular
    format. Note that this API requires the ``tabulate`` module to be
    installed.

    One row is printed per node in ``self.nodes`` with the node's ``op``,
    ``name``, ``target``, ``args`` and ``kwargs``. When ``tabulate`` cannot
    be imported, an installation hint is printed and the function returns
    without printing a table.
    """
    try:
        from tabulate import tabulate
    except ImportError:
        print("`print_tabular` relies on the library `tabulate`, "
                "which could not be found on this machine. Run `pip "
                "install tabulate` to install the library.")
        # Stop here: `tabulate` is unbound, so continuing would raise
        # UnboundLocalError right after the hint was printed.
        return
    node_specs = [[n.op, n.name, n.target, n.args, n.kwargs]
                    for n in self.nodes]
    print(tabulate(node_specs,
            headers=['opcode', 'name', 'target', 'args', 'kwargs']))
# Last expression of the cell: the notebook displays the function object.
print_tabular

# Fine-Tune a Hugging Face Model for Text Classification
1. Start from a model that was already trained on a task. The patterns the model learned can be transferred to learn a new task.


## Download a Hugging Face Dataset

[Dataset Link](https://huggingface.co/datasets/tweet_eval/viewer/emotion/train)

In [None]:
from datasets import load_dataset

# Load the "emotion" configuration of the tweet_eval dataset.
dataset = load_dataset(path="tweet_eval", name="emotion")

In [None]:
# Show the first record of the training split.
print(dataset["train"][0])

## Download a Model for Text Classification
1. The class is called `AutoModelForSequenceClassification` (sequence classification is just text classification; I don't know why it is called sequence classification — the authors are French)
2. Select the number of labels to do classification

There are many different architectures available in 🤗 Transformers, with each one designed around tackling a specific task. Here is a non-exhaustive list:

*Model (retrieve the hidden states)

*ForCausalLM

*ForMaskedLM

*ForMultipleChoice

*ForQuestionAnswering

*ForSequenceClassification

*ForTokenClassification

and others 🤗

In [None]:
from transformers import AutoModelForSequenceClassification

# Size the classification head from the dataset instead of hard-coding it:
# tweet_eval/emotion has 4 classes (anger, joy, optimism, sadness), so the
# previous value of 6 created two output classes that no example ever uses.
num_labels = dataset["train"].features["label"].num_classes
model_checkpoint = "neuralmagic/oBERT-3-upstream-pretrained-dense"
# Load the checkpoint with a sequence-classification head of `num_labels` outputs.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

In [None]:
# Print the classification model object loaded in the previous cell.
print(model)

## Download a Tokenizer from Hugging Face Model 

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint name as the classification model.
tokenizer = AutoTokenizer.from_pretrained(
    "neuralmagic/oBERT-3-upstream-pretrained-dense"
)

## Calculate the Accuracy of the model

## Create a Trainer Argument

In [None]:
from transformers import Trainer, TrainingArguments

# `emotions_encoded` was used below but never created anywhere in the notebook.
# Tokenize every split of the downloaded dataset here; padding is left to the
# Trainer's data collator, so only truncation is requested.
emotions_encoded = dataset.map(
    lambda batch: tokenizer(batch["text"], truncation=True),
    batched=True,
)

batch_size = 64
# Log roughly once per epoch; never let the step count drop to 0.
logging_steps = max(1, len(emotions_encoded["train"]) // batch_size)
# The checkpoint name is stored in `model_checkpoint`; the previously used
# `model_ckpt` was never defined and raised a NameError.
model_name = f"{model_checkpoint}-finetuned-emotion"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm = True,
                                  logging_steps=logging_steps,
                                  log_level="error")

## Trainer Object
1. We will feed the Trainer the model, tokenizer, compute-metrics function, dataset, and training arguments

In [None]:
import numpy as np
from transformers import Trainer


def compute_metrics(eval_pred):
    """Return the accuracy of one evaluation pass.

    `compute_metrics` was passed to the Trainer below but never defined in
    the notebook, so it is provided here.

    Args:
        eval_pred: object handed over by the Trainer, exposing the model
            outputs as ``predictions`` and the true classes as ``label_ids``.

    Returns:
        A dict with a single ``"accuracy"`` entry (fraction of examples whose
        highest-scoring class equals the label).
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": float((predicted_classes == eval_pred.label_ids).mean())}


# Wire the model, training arguments, metric function, tokenized splits and
# tokenizer together, then start fine-tuning.
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=emotions_encoded["train"],
                  eval_dataset=emotions_encoded["validation"],
                  tokenizer=tokenizer)
trainer.train()

## Display the Trainer curves