# Hugging Face Transformers

In [5]:
from transformers import pipeline

**setx TRANSFORMERS_CACHE "D:\AI-ML\transformers_cache"**

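Run this in `cmd` (Windows) to change the directory where all models are downloaded. `setx` only affects processes started afterwards, so restart the terminal and the Jupyter kernel before loading any model. Note: newer `transformers` releases prefer the `HF_HOME` environment variable over `TRANSFORMERS_CACHE`.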

In [6]:
# Sentiment analysis.
# No model name is passed, so the pipeline falls back to its default checkpoint.
sentiment_classifier = pipeline("sentiment-analysis")

example_sentences = (
    "I'm so excited to be learning about large language models",
    "I'm so confused right now",
)
# Classify each sentence separately and print the raw pipeline result
for text in example_sentences:
    print(sentiment_classifier(text))

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9997096657752991}]
[{'label': 'NEGATIVE', 'score': 0.9978690147399902}]


In [7]:
# Named entity recognition (NER), this time with an explicitly chosen model
ner = pipeline("ner", model="dslim/bert-base-NER")

ner_list = ner("Her name is Anna and she works in New York City for Morgan Stanley")

# Print every tagged token next to its entity label
for entity in ner_list:
    word, tag = entity["word"], entity["entity"]
    print(word, ": ", tag)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Anna :  B-PER
New :  B-LOC
York :  I-LOC
City :  I-LOC
Morgan :  B-ORG
Stanley :  I-ORG


In [8]:
# Zero-shot classification: the model performs the task without any additional
# training; its general pre-trained knowledge is enough.
sequence_to_classify = "one day I will see the world"
candidate_labels = ["travel", "cooking", "dancing"]

zeroshot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Kept as the last expression so the result is displayed as the cell output
zeroshot_classifier(sequence_to_classify, candidate_labels=candidate_labels)

Device set to use cpu


{'sequence': 'one day I will see the world',
 'labels': ['travel', 'dancing', 'cooking'],
 'scores': [0.9938651919364929, 0.0032738028094172478, 0.002861030399799347]}

{'sequence': 'one day I will see the world', \
 'labels': ['travel', 'dancing', 'cooking'], \
 'scores': [0.9938651919364929, 0.0032737581059336662, 0.0028610501904040575]}

# Pre-trained Tokeniser

In [9]:
from transformers import AutoTokenizer

In [10]:
# Example sentence reused by the tokenizer cells that follow
sentence = "I'm so excited to be learning about large language models"

# Tokenizer that belongs to the bert-base-uncased checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [11]:
# Calling the tokenizer directly encodes the sentence. Despite the variable name,
# the result holds more than the ids: the printed output also shows
# 'token_type_ids' and 'attention_mask'.
input_ids = tokenizer(sentence)
print(input_ids)

{'input_ids': [101, 1045, 1005, 1049, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653, 4275, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [12]:
# Split the sentence into token strings only; as the output shows, no special
# tokens are added and the text is lowercased.
tokens = tokenizer.tokenize(sentence)

print(tokens)

['i', "'", 'm', 'so', 'excited', 'to', 'be', 'learning', 'about', 'large', 'language', 'models']


In [13]:
# Map each token string to its numeric id
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(token_ids)

[1045, 1005, 1049, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653, 4275]


In [14]:
# Turn the ids back into text. The round trip is lossy here: the output is
# lowercased and the apostrophe is surrounded by spaces.
decoded_ids = tokenizer.decode(token_ids)

print(decoded_ids)

i ' m so excited to be learning about large language models


In [15]:
# 101 is the first id of the encoded sentence printed earlier; decode it to see which token it is
tokenizer.decode(101)

'[CLS]'

In [16]:
# 102 is the last id of the encoded sentence printed earlier; decode it to see which token it is
tokenizer.decode(102)

'[SEP]'

In [17]:
# xlnet-base-cased: a second tokenizer, kept in its own variable so it can be
# compared with the bert-base-uncased tokenizer on the same sentence
tokenizer2 = AutoTokenizer.from_pretrained("xlnet-base-cased")

In [18]:
# Encode the same sentence with the XLNet tokenizer.
# NOTE: this rebinds `input_ids`, so the BERT encoding from the earlier cell is no longer available.
input_ids = tokenizer2(sentence)

print(input_ids)

{'input_ids': [35, 26, 98, 102, 5564, 22, 39, 1899, 75, 392, 1243, 2626, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [19]:
# Token strings produced by the XLNet tokenizer (rebinds `tokens`).
# Unlike the BERT output, casing is kept and word starts are marked with '▁'.
tokens = tokenizer2.tokenize(sentence)

print(tokens)

['▁I', "'", 'm', '▁so', '▁excited', '▁to', '▁be', '▁learning', '▁about', '▁large', '▁language', '▁models']


In [20]:
# Map the XLNet token strings to their numeric ids (rebinds `token_ids`)
token_ids = tokenizer2.convert_tokens_to_ids(tokens)

print(token_ids)

[35, 26, 98, 102, 5564, 22, 39, 1899, 75, 392, 1243, 2626]


In [21]:
# Turn the XLNet ids back into text (rebinds `decoded_ids`); here the output
# reproduces the original sentence.
decoded_ids = tokenizer2.decode(token_ids)

print(decoded_ids)

I'm so excited to be learning about large language models


In [22]:
# 4 is the second-to-last id of the XLNet encoding printed earlier; decode it to see which token it is
tokenizer2.decode(4)

'<sep>'

In [23]:
# 3 is the last id of the XLNet encoding printed earlier; decode it to see which token it is
tokenizer2.decode(3)

'<cls>'

###  **CLS and SEP (classification and separator)**

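* **CLS token** → A special tag that BERT-style models put at the **start of the text** (XLNet, as the cells above show, appends `<sep>` and `<cls>` at the **end** instead).
  The model uses this to understand and decide the overall meaning (e.g., for classification).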
* **SEP token** → A special tag used to **separate pieces of text** (like when you give two sentences and want the model to compare them).

Example:
`[CLS] I like cats [SEP] I like dogs [SEP]`

---

### **MASK**

* A special placeholder used when we hide a word and ask the model to **guess it**.
* Helps the model learn language patterns.

Example:
`I like [MASK]` → the model should predict a plausible word, e.g. `pizza`.

---

### **Task-specific tokens**

* Sometimes we create **custom tags** for special jobs.
* Example: In translation, you might add `[SOURCE]` before the input language and `[TARGET]` before the output language so the model knows what to do.

Example:
`[SOURCE] I like cats [TARGET] J’aime les chats`

---

So in short:

* **CLS** → Marker used for classification (at the start of the text in BERT).
* **SEP** → Divider between sentences.
* **MASK** → Blank to fill in.
* **Task tokens** → Special tags to tell the model what job it has.

---

# Hugging Face and PyTorch/TensorFlow

In [24]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [25]:
# Show the current state before switching to PyTorch tensors.
# NOTE: `input_ids` still holds the XLNet encoding from the earlier cell, not a BERT one.
print(sentence, input_ids, sep="\n")

I'm so excited to be learning about large language models
{'input_ids': [35, 26, 98, 102, 5564, 22, 39, 1899, 75, 392, 1243, 2626, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [35]:
# Checkpoint: an uncased DistilBERT fine-tuned for sentiment analysis (positive/negative).
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer that belongs to this checkpoint (rebinds `tokenizer`).
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Encode the sentence as PyTorch tensors (`return_tensors="pt"`, pt = PyTorch).
# The tokenizer:
#   - lowercases the text and splits it into subwords (uncased model),
#   - adds the special tokens [CLS] (id 101) at the start and [SEP] (id 102) at the end,
#   - converts every token into its numeric id from the model's vocabulary,
#   - returns a dict-like object with 'input_ids' (the token numbers) and
#     'attention_mask' (which marks real tokens vs padding).
input_ids_pt = tokenizer(sentence, return_tensors="pt")

# Show the encoded representation (tensors with token ids and attention mask)
print(input_ids_pt)

{'input_ids': tensor([[ 101, 1045, 1005, 1049, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653,
         4275,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [37]:
# Load the pretrained DistilBERT model that is fine-tuned for sentiment analysis
# (binary classification: positive or negative sentiment).
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)

# Turn off gradient tracking since we're only doing inference (not training).
# This makes it faster and saves memory.
with torch.no_grad():
    # Unpack the tokenized inputs (input_ids, attention_mask) as keyword arguments.
    # The model returns an output object (not a plain dict); its `.logits`
    # attribute holds the raw, unnormalized predictions (before softmax).
    logits = model(**input_ids_pt).logits

# Find the index (class ID) of the highest logit along the class dimension.
# `dim=-1` matters: a bare `argmax()` works on the flattened tensor, which only
# equals the class ID while the batch holds a single sentence.
predicted_class_id = logits.argmax(dim=-1).item()

# Convert that class ID to a human-readable label using the model's config.
# For this model, labels are typically:
#   0 -> NEGATIVE
#   1 -> POSITIVE
model.config.id2label[predicted_class_id]

'POSITIVE'

# Saving and loading models

In [38]:
# Relative directory that the following cells save the tokenizer and model into, and reload them from
model_directory = "my_saved_models"

In [39]:
# Write the tokenizer files into model_directory; the cell output lists the file paths returned
tokenizer.save_pretrained(model_directory)

('my_saved_models\\tokenizer_config.json',
 'my_saved_models\\special_tokens_map.json',
 'my_saved_models\\vocab.txt',
 'my_saved_models\\added_tokens.json',
 'my_saved_models\\tokenizer.json')

In [40]:
# Save the model into the same directory as the tokenizer
model.save_pretrained(model_directory)

In [41]:
# Reload the tokenizer from the local directory instead of a checkpoint name
my_tokenizer = AutoTokenizer.from_pretrained(model_directory)

In [42]:
# Reload the sequence-classification model from the local directory instead of a checkpoint name
my_model = AutoModelForSequenceClassification.from_pretrained(model_directory)