## Day-4

# Tokenization

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Sample input passed to show_tokens() for each tokenizer below. It mixes lowercase and
# upper-case words, an emoji and a CJK character, Python keywords and operators, quoted
# runs of whitespace and an arithmetic expression. Everything between the triple quotes,
# including the blank lines, is part of the string.
text = """

English and CAPITALIZATION

🎵鸟
show_tokens False None elif == >= else: two tabs:" " Three tabs: "   "

12.0*50=600

"""

# Background palette for show_tokens(): each entry is an "R;G;B" string ready to be
# dropped into an ANSI color escape sequence.
colors_list = [
    f'{red};{green};{blue}'
    for red, green, blue in (
        (102, 194, 165),
        (252, 141, 98),
        (141, 160, 203),
        (231, 138, 195),
        (166, 216, 84),
        (255, 217, 47),
    )
]

def show_tokens(sentence, tokenizer_name):
    """Print the tokens of ``sentence`` one by one on colored backgrounds.

    The tokenizer named ``tokenizer_name`` is loaded with
    ``AutoTokenizer.from_pretrained``, ``sentence`` is encoded to token ids, and each
    id is decoded back to text on its own. Background colors cycle through
    ``colors_list``; tokens are separated by a single space and no newline is
    written at the end.

    Args:
        sentence: Text to tokenize.
        tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    palette_size = len(colors_list)
    reset_code = '\x1b[0m'
    for position, token_id in enumerate(tokenizer(sentence).input_ids):
        # Black foreground on a background picked round-robin from the palette.
        color_code = f'\x1b[0;30;48;2;{colors_list[position % palette_size]}m'
        piece = tokenizer.decode(token_id)
        print(color_code + piece + reset_code, end=' ')

# Run the same sample text through four pre-trained tokenizers; the trailing comment
# names the subword algorithm noted for each one.
for tokenizer_name in (
    'bert-base-uncased',   # WordPiece
    'bert-base-cased',     # WordPiece
    'gpt2',                # BPE
    'google/flan-t5-xxl',  # SentencePiece
):
    show_tokens(text, tokenizer_name)



# Transformers LM model


In [None]:
!pip install transformers
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer, util
import torch

# Checkpoint name shared by the encoder and its tokenizer (other BERT checkpoints can be
# substituted here).
model_name = "bert-base-uncased"

# Pre-trained BERT encoder and matching tokenizer, both read by get_bert_embedding().
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Or use a sentence transformer model directly for better performance
# model = SentenceTransformer('all-mpnet-base-v2') # Example: all-mpnet-base-v2


def get_bert_embedding(text):
    """Embed ``text`` by mean-pooling the encoder's final hidden states.

    Uses the notebook-level ``tokenizer`` and ``model``. The average is taken over
    real tokens only: positions that the tokenizer added as padding (attention
    mask 0) are excluded, so that when several texts of different lengths are
    embedded together the shorter ones are not skewed by padding. For a single
    string no padding is added and the result is the plain mean over all tokens.

    Args:
        text: A string, or a list of strings, to be encoded by ``tokenizer``.

    Returns:
        A tensor holding one pooled embedding row per input text.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Broadcast the 0/1 attention mask over the hidden dimension so padded
    # positions contribute nothing to the sum.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # clamp guards against dividing by zero if a row had no unmasked tokens
    token_counts = mask.sum(dim=1).clamp(min=1)
    return summed / token_counts


def calculate_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is embedded on its own with ``get_bert_embedding``; the two
    embeddings are compared with ``util.cos_sim`` and the single resulting
    value is returned as a plain Python number.
    """
    vectors = [get_bert_embedding(passage) for passage in (text1, text2)]
    score_matrix = util.cos_sim(*vectors)
    return score_matrix.item()


# Example: compare the first sentence against each of the other two
text1 = "press and hold SCROLL Hand eraser"
text2 = "conflict is the compramise that made by DEVILs DiaviLLE"
text3 = "will one of my eye swells out am i able to see"

# Scores are computed in order: (text1, text2) first, then (text1, text3)
similarity_1_2, similarity_1_3 = (
    calculate_similarity(text1, candidate) for candidate in (text2, text3)
)

print(f"Similarity between '{text1}' and '{text2}': {similarity_1_2}")
print(f"Similarity between '{text1}' and '{text3}': {similarity_1_3}")

## Day-4 exercise using Hugging Face models

In [None]:
!pip install datasets transformers

# Import the datasets and transformers packages
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Fetch the train and test splits of the imdb dataset and key them by split name
splits = ["train", "test"]
ds = dict(zip(splits, load_dataset("imdb", split=splits)))

# Keep a seeded random subset of 500 examples per split so the example runs faster
for split in splits:
    ds[split] = (
        ds[split]
        .shuffle(seed=42)
        .select(range(500))
    )

# Display the reduced splits
print(ds)

# Tokenizer belonging to the "bert-base-uncased" model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Preprocess function for tokenization
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of imdb examples.

    The notebook-level ``tokenizer`` is called with padding to the maximum
    length and with truncation enabled; its output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Run the preprocessing function over every split, in batches
tokenized_ds = {
    split: ds[split].map(preprocess_function, batched=True)
    for split in splits
}

# Sanity check: the first training example must begin with these five token ids
assert tokenized_ds["train"][0]["input_ids"][:5] == [101, 2045, 2003, 2053, 7189]

# Print all token ids of the first tokenized training example
print(tokenized_ds["train"][0]["input_ids"])

# Two-label sequence classifier built on "bert-base-uncased"; the label maps allow
# predictions to be converted to and from strings.
# NOTE: this rebinds the notebook-level name `model`, which get_bert_embedding() also reads.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
)

# Turn off gradients for every parameter of the base model
for param in model.base_model.parameters():
    param.requires_grad_(False)

# Inspect the classifier head first, then the complete architecture
print(model.classifier)
print(model)