In [1]:
!pip install transformers



## After this stage, if you are working on Google Colab, you can switch to the T4 GPU runtime to run heavy LLM models. However, use it cautiously, as the free GPU quota can be exhausted very quickly.

In [None]:
!nvidia-smi
# Shell command used to check whether a GPU is connected to this runtime

In [None]:
from transformers import pipeline
#-------------------------------------------#
#                 NLP TASK                  #
#-------------------------------------------#
# `pipeline(task)` is a wrapper from the Hugging Face `transformers` library: a ready-made shortcut
# that wraps complex code into a simple and easy-to-use tool, so you don't need to write everything
# from scratch.
# A wrapper is somewhat similar to an API, except that wrappers run locally within your code
# environment whereas APIs connect to external services.
# When no `model` is passed, the pipeline falls back to the default model of the task.

# 1. Text classification: assigning a category to a piece of text
#    --> Sentiment analysis
#    --> Topic classification
#    --> Spam detection
classifier = pipeline("text-classification")

# 2. Token classification: assigning labels to individual tokens in a sequence
#    --> Named entity recognition (NER)
#    --> Part-of-speech tagging
token_classifier = pipeline("token-classification")

# 3. Question answering: providing an answer to a question based on a given context
question_answerer = pipeline("question-answering")

# 4. Text generation: generating new text based on a given prompt
#    --> Language modeling
#    --> Story generation
text_generator = pipeline("text-generation")

# 5. Summarization: condensing long documents into shorter summaries
summarizer = pipeline("summarization")

# 6. Translation: translating text from one language to another
#    (the model id is copied from the model's page on the Hugging Face Hub)
translator = pipeline("translation", model = "Helsinki-NLP/opus-mt-en-fr")

# 7. Text2Text generation: general-purpose text transformation, including summarization,
#    translation, and question answering
text2text_generator = pipeline("text2text-generation")

# 8. Fill-mask: predicting the masked token in a sequence
fill_mask = pipeline("fill-mask")

# 9. Feature extraction: transforming text into numerical representations / extracting hidden
#    states or features from text
feature_extractor = pipeline("feature-extraction")

# 10. Sentence similarity: measuring the similarity between 2 sentences.
#     "sentence-similarity" is not a task name that `pipeline()` accepts, so the original call
#     failed with an unknown-task error. The usual route is to extract embeddings with a
#     sentence-embedding model, pool them per sentence, and compare the vectors (e.g. cosine similarity).
sentence_similarity = pipeline("feature-extraction", model = "sentence-transformers/all-MiniLM-L6-v2")

In [None]:
#-------------------------------------------#
#          Computer Vision TASK             #
#-------------------------------------------#
# No `model` is passed below, so each pipeline uses the default model of its task.

# 1. Image classification: classifying the main content of an image
image_classifier = pipeline("image-classification")

# 2. Object detection: identifying objects within an image and their bounding boxes
object_detector = pipeline("object-detection")

# 3. Image segmentation: assigning a label to each pixel in an image / segmenting different
#    parts of an image into classes
image_segmenter = pipeline("image-segmentation")

# 4. Image generation: creating new images from a textual description (like DALL-E).
#    "image-generation" is not a task name that `transformers.pipeline()` accepts, so calling
#    pipeline("image-generation") fails with an unknown-task error. Text-to-image models are
#    served by the separate `diffusers` library instead, e.g.:
#        from diffusers import DiffusionPipeline
#        image_generator = DiffusionPipeline.from_pretrained("<text-to-image model id>")

In [None]:
#-------------------------------------------#
#         Speech processing TASK            #
#-------------------------------------------#
# Only task names known to `transformers.pipeline()` can be instantiated. Of the names used
# originally, "speech-to-speech-translation", "audio-transcription", "audio-to-audio-translation"
# and "speech-to-text" are not accepted and made the cell fail with an unknown-task error.

# 1. Automatic Speech Recognition (ASR): converting spoken language into written text
asr = pipeline("automatic-speech-recognition")

# 2. Speech-to-speech translation: converting spoken language into another spoken language.
#    There is no single pipeline task for this; it is typically built by chaining
#    ASR -> text translation -> text-to-speech.

# 3. Audio classification: classifying the main content of an audio file (audio signals)
#    into predefined categories
audio_classifier = pipeline("audio-classification")

# 4. Audio transcription: converting spoken language into written text.
#    This is the same task as ASR, so the ASR pipeline is reused instead of loading a second model.
audio_transcription = asr

# 5. Audio-to-audio translation: converting spoken language into another spoken language.
#    Same situation as item 2: no dedicated pipeline task.

# 6. Text-to-Speech (TTS): converting text into spoken language
#    (the original cell created this pipeline twice; once is enough)
tts = pipeline("text-to-speech")

# 7. Speech-to-Text (STT): converting spoken language into written text.
#    Another name for ASR, so the ASR pipeline is reused.
stt = asr


In [None]:
#-------------------------------------------#
#             Multimodal TASK               #
#-------------------------------------------#

# 1. Image captioning: generating a textual description of an image.
#    The pipeline task name for captioning is "image-to-text"; "image-captioning" is not a
#    task name that `pipeline()` accepts.
image_captioner = pipeline("image-to-text")

# 2. Visual Question Answering (VQA): answering questions about the content of an image
visual_question_answerer = pipeline("visual-question-answering")

In [None]:
#-------------------------------------------#
#                Other TASK                 #
#-------------------------------------------#
# No `model` argument is passed below, so each pipeline uses the default model of its task.

'''
1. Table Question Answering: Answering questions based on tabular data
'''
table_qa = pipeline("table-question-answering")

'''
2. Document question answering: Extracting answers from documents like pdfs
'''
doc_qa = pipeline("document-question-answering") # Similar to RAG

# NOTE(review): the bare string below is the last expression of the cell, so the notebook echoes
# it as the cell's output; no pipeline is created for this item.
'''
3. Time Series Forecasting: Predicting future values in the time series data (not directly supported in the main Transformer library)
'''


# NLP Tasks

### 1. Sentiment analysis

In [None]:
from transformers import pipeline

# If no model is mentioned, the pipeline uses the default model of the task
classifier = pipeline("sentiment-analysis")
result = classifier("I was so not happy with the second Mission Impossible Movie.")
# Leave the result as the last expression so the prediction is actually shown as the cell output
result

In [None]:
# Build the sentiment pipeline and call it immediately on a single sentence;
# the prediction is the cell's output.
pipeline(task="sentiment-analysis")(
    "I was confused with the Barbie Movie."
)

In [None]:
# Classify a multi-sentence passage as one input.
# Adjacent string literals inside the parentheses are joined by Python into a single string.
# A backslash continuation *inside* a string literal would instead keep the indentation of the
# following lines as part of the text sent to the model.
pipeline(task = "sentiment-analysis")(
    "Everyday lots of LLM papers are published about LLMs Evaluation. "
    "Lots of them look very promising. "
    "I am not sure if we can actually evaluate LLMs. "
    "there is still lots to do. "
    "Don't you think?"
)

In [None]:
# Same passage, but with an explicitly chosen model instead of the task default.
# NOTE(review): facebook/bart-large-mnli is a natural-language-inference checkpoint, so the labels
# it returns are NLI classes rather than POSITIVE/NEGATIVE -- confirm this is the intended demo.
# Adjacent string literals are joined into one string; a backslash continuation inside the
# literal would embed the indentation whitespace in the model input.
pipeline(task = "sentiment-analysis", model = 'facebook/bart-large-mnli')(
    "Everyday lots of LLM papers are published about LLMs Evaluation. "
    "Lots of them look very promising. "
    "I am not sure if we can actually evaluate LLMs. "
    "there is still lots to do. "
    "Don't you think?"
)

The model used above is not really an LLM; it is better described as a task-specific language model, since it can only perform text classification.


We will also try our hands on some LLMs like
## **Llama, mistral, falcon, gemma**


LM uses only transformer architecture. LLMs may use more than just this.

## Batch sentiment analysis
### Perform sentiment analysis on a bunch of lines in one go

In [None]:
classifier = pipeline(task = "sentiment-analysis")

# Batch input: unlike a single multi-line passage, each sentence is a separate element of a list,
# so the pipeline returns one prediction per sentence.
# Elements are separated by commas; no backslash is needed inside the brackets.
task_list = [
    "I really like Autoencoders, best models for Anomaly Detection",
    "I am not sure if we Can actually Evaluate LLMs",
    "PassiveAgressive is the name of a Linear Regression Model that so many people do not know",
    "I hate long meetings.",
]
classifier(task_list)

The default model only distinguishes positive from negative sentiment. If we want a wider range of emotions, we can use another model.

In [None]:
# Same batch, classified with a model chosen explicitly by its Hub id
classifier = pipeline(task = "sentiment-analysis", model = "SamLowe/roberta-base-go_emotions")

# Batch input: each sentence is a separate element of a list, so the pipeline returns one
# prediction per sentence. Elements are separated by commas; no backslash is needed inside the brackets.
task_list = [
    "I really like Autoencoders, best models for Anomaly Detection",
    "I am not sure if we Can actually Evaluate LLMs",
    "PassiveAgressive is the name of a Linear Regression Model that so many people do not know",
    "I hate long meetings.",
]
classifier(task_list)

### 2. Text Generation

In [None]:
# `pipeline` is the high-level helper: one call builds a ready-to-use generator
from transformers import pipeline

text_generator = pipeline(task="text-generation", model="distilbert/distilgpt2")

# Ask for two continuations of the prompt; only the first one is printed below
generated_text = text_generator(
    "Today is a rainy day in London",
    num_return_sequences=2,
    truncation=True,
)
print("Generated_text:\n ", generated_text[0]["generated_text"])

### 3. Question answering

In [None]:
from transformers import pipeline

# Question-answering pipeline with the task's default model
qa_model = pipeline(task="question-answering")

context = "I am developing AI models with Python."
question = "What is my job?"

# The call result is the last expression, so it is shown as the cell output
qa_model(context=context, question=question)

Before using the LLM models we need to learn some genAI frameworks like LlamaIndex, LangChain as well as vector database

# FOR LLMS

# Tokenization
### Using Transformer technique with attention mechanism

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification

In [None]:
# Load the tokenizer and the model explicitly (DistilBERT-specific classes) and hand both to the
# pipeline, instead of letting pipeline() pick them from a task name.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
mytokenizer1 = DistilBertTokenizer.from_pretrained(model_name)
mymodel1 = DistilBertForSequenceClassification.from_pretrained(model_name)
# The tokenizer splits raw text into sub-word tokens and maps them to integer input IDs;
# it does not strip stop words from the text.

# Pass the objects loaded above (the original referenced undefined names `mymodel2` / `mytokenizer2`)
classifier = pipeline("sentiment-analysis", model = mymodel1, tokenizer = mytokenizer1)
res = classifier("I was so not happy with the Barbie Movie")
print(res)

To understand how tokenizer works:

In [None]:
from transformers import AutoTokenizer

# Load a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Example text
text = "I was so not happy with the Barbie Movie"

# Tokenize the text (split it into tokens)
tokens = tokenizer.tokenize(text)

# Print the tokens
print("Tokens: ", tokens)

# Convert tokens to input IDs (one integer per token)
input_ids = tokenizer.convert_tokens_to_ids(tokens)

# IF YOU NEED TO PASS THE TEXT TO A MODEL, USE THE ENCODING CALL BELOW, NOT THE MANUAL STEPS ABOVE

# Encode the text (tokenization + converting to input IDs)
encoded_input = tokenizer(text)
print("Encoded input: ", encoded_input)
# In the printed output, the start of the sentence is represented by ID 101 and the end by ID 102,
# with the token IDs of the text in between.
# This is followed by token_type_ids and then attention_mask.
# attention_mask (binary 0/1): binary weight that tells the model which tokens to focus on (1)
# and which to ignore (0).

# Decode the IDs back into text
decoded_output = tokenizer.decode(input_ids)
print("Decoded output: ", decoded_output)

[Attention mask hugging face glossary](https://huggingface.co/docs/transformers/glossary#attention-mask)

token_type_ids: These IDs are used to distinguish between different sequences in tasks that involve multiple sentences, such as question-answering and sentence-pair classification. BERT uses this mechanism to understand which tokens belong to which segment. For single-sequence tasks like sentiment analysis, token_type_ids are all zeros

attention_mask: Used to differentiate between actual tokens and padding tokens (if any). It helps the model focus on non-padding tokens and ignore padding tokens. A value of 1 indicates that the token should be attended to, while 0 indicates padding.

Why padding tokens are used:
Uniform sequence length: deep learning models typically process input data in batches. To process a batch efficiently, every sequence in it must have the same length, so shorter sequences are padded. The model can then process all sequences in parallel without needing to handle variable-length sequences individually.

# FINE-TUNING USING THE IMDB DATASET
(Using LM model to perform sentiment analysis)

### 1. Install necessary libraries

In [None]:
!pip install datasets # See hugging face website for dataset
!pip install transformers

### 2. Load and prepare the dataset

In [None]:
from datasets import load_dataset

# Load the IMDB dataset by its Hub name
dataset = load_dataset("imdb")

# Reminder: the tokenizer must come from the same model that is used for training/inference

In [None]:
# Echo the loaded dataset object as the cell output
dataset

In [None]:
# Look at the first record of the "train" split
dataset["train"][0]

### 3. Preprocess the Data
Tokenize the dataset using tokenizer associated with the pre-trained model

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer.
# The variable must be named `tokenizer` because tokenize_function below reads that name; the
# original assigned to a misspelled name and only worked if a `tokenizer` from an earlier cell
# was still alive in the kernel.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset
def tokenize_function(examples):
  """Tokenize the 'text' field of a batch of examples.

  Args:
    examples: mapping with a 'text' entry, as passed by `dataset.map(..., batched=True)`.

  Returns:
    The tokenizer output for the batch, padded to the maximum length and truncated.
  """
  return tokenizer(examples['text'], padding = "max_length", truncation = True)

# Apply the function on top of the entire dataset, batch by batch
tokenized_datasets = dataset.map(tokenize_function, batched = True)

In [None]:
# The mapped dataset is stored in `tokenized_datasets` (plural); `tokenized_dataset` is undefined.
# Only the last expression of a cell is echoed, so print the full object explicitly first.
print(tokenized_datasets)
tokenized_datasets["train"]

### 4. Set up the training arguments
Specify the hyperparameters and training settings


In [None]:
from transformers import TrainingArguments

# Hyperparameters and run settings that are handed to the Trainer
training_args = TrainingArguments(
    output_dir="./results",            # output directory
    num_train_epochs=1,                # number of training epochs
    eval_strategy="epoch",             # evaluate every epoch
    per_device_train_batch_size=16,    # batch size for training
    per_device_eval_batch_size=16,     # batch size for evaluation
    learning_rate=2e-5,                # learning rate
    weight_decay=0.01,                 # strength of weight decay
)

# Echo the resulting configuration as the cell output
training_args

### 5. Initialize the model
Load the pre-trained model and define the training procedure

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer

# Pre-trained BERT with a sequence-classification head for 2 labels
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# Wire model, training arguments and the tokenized splits together.
# Nothing is trained at this point; training starts only when the trainer is asked to train.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

### 6. Train the model
Fine tune the pre-trained model on your specific dataset

In [None]:
# Start fine-tuning with the Trainer configured above
trainer.train()

### 7. Evaluate the model
Assess the model's performance on the held-out evaluation set (here, the IMDB test split passed to the Trainer as `eval_dataset`)

In [None]:
# Run evaluation and keep the returned metrics
# (presumably computed on the eval_dataset given to the Trainer -- TODO confirm)
results = trainer.evaluate()
print(results)

### 8. Save the fine-tuned model
Save the fine tuned model for later use

In [None]:
# Save the model and the tokenizer side by side in the same directory
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')
# For new data, use this saved model and tokenizer

# ArXiv Project
ArXiv is a research article website

In [None]:
!pip install arxiv

In [None]:
import arxiv
import pandas as pd

In [None]:
# Query to fetch AI-related papers, newest submissions first
query = 'ai OR artificial intelligence OR machine learning'
search = arxiv.Search(query = query, max_results = 10, sort_by = arxiv.SortCriterion.SubmittedDate)

# Fetch papers through a Client: `Search.results()` is deprecated in the arxiv package in favour
# of `Client.results(search)`.
client = arxiv.Client()
papers = [
  {
      'published': result.published,
      'title': result.title,
      'abstract': result.summary,
      'categories': result.categories
  }
  for result in client.results(search)
]

# Fix the column set explicitly so the frame has the expected columns even if no paper is returned
df = pd.DataFrame(papers, columns = ['published', 'title', 'abstract', 'categories'])

# Show full cell contents (abstracts are long) instead of truncating them
pd.set_option('display.max_colwidth', None)
df.head(10)

To summarize a research paper, perform the following steps:

In [None]:
# Take the abstract of the first paper fetched from the API as the example input
abstract = df.loc[0, 'abstract']

# Summarization pipeline with an explicitly chosen model
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
summarization_result = summarizer(abstract)

In [None]:
# Show the summary text of the first (and only) result
summarization_result[0]['summary_text']

Sometimes we need a Hugging Face access token (API key) in order to use certain models.

## DO NOT SHARE YOUR ACCESS TOKEN WITH ANYONE. ESPECIALLY DO NOT PUT IT ON PUBLIC WEBSITES LIKE GITHUB.