In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# nltk.download('stopwords')
# nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix,precision_score, recall_score, f1_score
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from datasets import Dataset
import torch
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm
2025-04-08 08:04:58.961491: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-08 08:04:59.197394: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Parameters
embedding_size = 4  # Dimensionality of the embedding
positions = [0, 1, 2, 3]  # Word positions  

# Function to calculate positional encoding
def positional_encoding(pos, i, d):
    angle = pos / np.power(10000, (2 * i / d))
    #print("angle", angle)
    if i % 2 == 0:
        return np.sin(angle)  # Even index: sin
    else:
        return np.cos(angle)   # Odd index: cos

# Generate positional encodings for each position and dimension
pe_table = []
for pos in positions:
    pe_row = []
    for i in range(embedding_size):
        pe_value = positional_encoding(pos, i, embedding_size)
        pe_row.append(pe_value)
    pe_table.append(pe_row)

# Create a DataFrame for better visualization
columns = [ "PE Dim 1 (Sin)", "PE Dim 2 (Cos)", "PE Dim 3 (Sin)", "PE Dim 4 (Cos)"]
pe_df = pd.DataFrame(pe_table, columns=columns)
pe_df.index.name = 'word_position'
pe_df

Unnamed: 0_level_0,PE Dim 1 (Sin),PE Dim 2 (Cos),PE Dim 3 (Sin),PE Dim 4 (Cos)
word_position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,1.0,0.0,1.0
1,0.841471,0.99995,0.0001,1.0
2,0.909297,0.9998,0.0002,1.0
3,0.14112,0.99955,0.0003,1.0


### Greedy Search

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [2]:
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings'
model = AutoModelForCausalLM.from_pretrained("gpt2",pad_token_id=tokenizer.eos_token_id).to(torch_device)
model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to(torch_device)
greedy_output = model.generate(**model_inputs,max_new_tokens=50)
print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0],skip_special_tokens = True))

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my dog.

I'm not sure if I'll ever be able to walk with my


### Beam Search

In [2]:
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings'
model = AutoModelForCausalLM.from_pretrained("gpt2",pad_token_id=tokenizer.eos_token_id).to(torch_device)
model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to(torch_device)
beam_output = model.generate(
    **model_inputs,
    max_new_tokens=50,num_beams=5,early_stopping = True)

print("Beam Output\n"+100*"-")
print(tokenizer.decode(beam_output[0],skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Beam Output
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I'm not sure if I'll ever be able to walk with him again.

I'm not sure if I'll ever be able to walk


#### no_repeat_ngram_size parameter

In [3]:
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings'
model = AutoModelForCausalLM.from_pretrained("gpt2",pad_token_id=tokenizer.eos_token_id).to(torch_device)
model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to(torch_device)
beam_output = model.generate(
    **model_inputs,
    max_new_tokens=50,num_beams=5,no_repeat_ngram_size=2,early_stopping = True)

print("Beam Output\n"+100*"-")
print(tokenizer.decode(beam_output[0],skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Beam Output
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I've been thinking about this for a while now, and I think it's time for me to take a step back and think about it. I


#### num_return_sequences parameter

In [4]:
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings'
model = AutoModelForCausalLM.from_pretrained("gpt2",pad_token_id=tokenizer.eos_token_id).to(torch_device)
model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to(torch_device)
beam_outputs = model.generate(
    **model_inputs,
    max_new_tokens=50,num_beams=5,no_repeat_ngram_size=2,num_return_sequences=5,early_stopping = True)

print("Beam Output\n"+100*"-")
for i,beam_output in enumerate(beam_outputs):
    print(i,tokenizer.decode(beam_output,skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Beam Output
----------------------------------------------------------------------------------------------------
0 I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I've been thinking about this for a while now, and I think it's time for me to take a step back and think about it. I
1 I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I've been thinking about this for a while now, and I think it's time for me to take a step back and think about what I've
2 I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I've been thinking about this for a while now, and I think it's time for me to take a step back and think about my dog.
3 I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with him again.

I've been thinking about this for a while now, and I think it's time for me to take a step back and think about what I want
4 I enj

### Sampling

In [6]:
from transformers import set_seed
set_seed(42)

In [7]:
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings'
model = AutoModelForCausalLM.from_pretrained("gpt2",pad_token_id=tokenizer.eos_token_id).to(torch_device)
model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to(torch_device)
sample_output = model.generate(
    **model_inputs,
    max_new_tokens=50,do_sample=True,top_k = 0)

print("Sampling Output\n"+100*"-")
print(tokenizer.decode(sample_output[0],skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sampling Output
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog but what I love about being a dog cat person is being a pet being with people who can treat you. I feel happy to be such a pet person and get to meet so many people. I don't think it was ever going to be the


### Controlling sampling with temperature

In [17]:
#set_seed(42)
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings'
model = AutoModelForCausalLM.from_pretrained("gpt2",pad_token_id=tokenizer.eos_token_id).to(torch_device)
model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to(torch_device)
sample_output = model.generate(
    **model_inputs,
    max_new_tokens=40,do_sample=True,top_k = 0,temperature = 0.6)

print("Sampling Output\n"+100*"-")
print(tokenizer.decode(sample_output[0],skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sampling Output
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, and I'm always looking for ways to make him feel more welcome.

I'm also a big fan of the lazy-asses, and I hope that by keeping him happy in the future


### Top -K sampling

In [22]:
#set_seed(42)
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings'
model = AutoModelForCausalLM.from_pretrained("gpt2",pad_token_id=tokenizer.eos_token_id).to(torch_device)
model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to(torch_device)
sample_output = model.generate(
    **model_inputs,
    max_new_tokens=40,do_sample=True,top_k = 50)

print("Sampling Output\n"+100*"-")
print(tokenizer.decode(sample_output[0],skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sampling Output
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog. I know that every day I look forward to the opportunity when I get home.

My dog Winkle is an amazing dog, and very happy to be with her. He had a lot


### Top-p sampling

In [23]:
#set_seed(42)
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings'
model = AutoModelForCausalLM.from_pretrained("gpt2",pad_token_id=tokenizer.eos_token_id).to(torch_device)
model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to(torch_device)
sample_output = model.generate(
    **model_inputs,
    max_new_tokens=40,do_sample=True,top_p = 0.92,top_k = 0)

print("Sampling Output\n"+100*"-")
print(tokenizer.decode(sample_output[0],skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sampling Output
----------------------------------------------------------------------------------------------------
I enjoy walking with my cute dog, but you'll want to cover yourself up and share something fun like this with your pet before the dog steps outside to protect her safety. Please take some time to share with your pet when it's


### Combination of Top - K & Top-p sampling with num_return_sequences to get multiple independent generations

In [27]:
#set_seed(42)
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings'
model = AutoModelForCausalLM.from_pretrained("gpt2",pad_token_id=tokenizer.eos_token_id).to(torch_device)
model_inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt").to(torch_device)
sample_outputs = model.generate(
    **model_inputs,
    max_new_tokens=20,do_sample=True,top_p = 0.92,top_k = 50,num_return_sequences = 3)

print("Sampling Output\n"+100*"-")
for i,sample_output in enumerate(sample_outputs):
    print("{} {}".format(i,tokenizer.decode(sample_output,skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sampling Output
----------------------------------------------------------------------------------------------------
0 I enjoy walking with my cute dog. She is great to hug and is such a joy to live with. She is always happy and
1 I enjoy walking with my cute dog and I'm really looking forward to the days of running in the woods and exploring my dog's environment
2 I enjoy walking with my cute dog, but his hair looks so short. I have his tail in front of my face so he can


### ROUGE (Recal Oriented Understudy for Gisting Evaluation) metric to compare the generated summary vs reference summary

In [1]:
from evaluate import load

In [3]:
rouge = load("rouge")
predictions = ["Hello all","Its good to see you","welcome"]
references = ["Hello all","Its good to see you","welcome"]
results = rouge.compute(predictions=predictions,references=references)
results

{'rouge1': np.float64(1.0),
 'rouge2': np.float64(0.6666666666666666),
 'rougeL': np.float64(1.0),
 'rougeLsum': np.float64(1.0)}

### WIthout aggregation

In [4]:
rouge = load("rouge")
predictions = ["Hello all","Its good to see you","welcome"]
references = ["Hello all","Its good to see you","welcome"]
results = rouge.compute(predictions=predictions,references=references,use_aggregator=False)
results

{'rouge1': [1.0, 1.0, 1.0],
 'rouge2': [1.0, 1.0, 0.0],
 'rougeL': [1.0, 1.0, 1.0],
 'rougeLsum': [1.0, 1.0, 1.0]}

### WIth aggregation

In [5]:
rouge = load("rouge")
predictions = ["Hello all","Its good to see you","welcome"]
references = ["Hello all","Its good to see you","welcome"]
results = rouge.compute(predictions=predictions,references=references,use_aggregator=True)
results

{'rouge1': np.float64(1.0),
 'rouge2': np.float64(0.6666666666666666),
 'rougeL': np.float64(1.0),
 'rougeLsum': np.float64(1.0)}

### Only rouge-L & rouge-Sum

In [6]:
rouge = load("rouge")
predictions = ["Hello all","Its good to see you","welcome"]
references = ["Hello all","Its good to see you","welcome"]
results = rouge.compute(predictions=predictions,references=references,rouge_types=["rougeL","rougeLsum"],use_aggregator=True)
results

{'rougeL': np.float64(1.0), 'rougeLsum': np.float64(1.0)}

## Without Aggregation,With Stemmer (strips word suffixes)

In [7]:
rouge = load("rouge")
predictions = ["Hello all","Its good to see you","welcome"]
references = ["Hello all","Its good to see you","welcome"]
results = rouge.compute(predictions=predictions,references=references,use_aggregator=False,use_stemmer=True)
results

{'rouge1': [1.0, 1.0, 1.0],
 'rouge2': [1.0, 1.0, 0.0],
 'rougeL': [1.0, 1.0, 1.0],
 'rougeLsum': [1.0, 1.0, 1.0]}

## Semantic Similarity

In [2]:
import nltk
nltk.download('wordnet_ic')

[nltk_data] Downloading package wordnet_ic to
[nltk_data]     C:\Users\praveen.ravikuma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet_ic is already up-to-date!


True

In [11]:
from sematch.semantic.similarity import WordNetSimilarity
wns = WordNetSimilarity()
wns.word_similarity('Tamil', 'Hindi')


0.3438758727020462

#### Semantic Textual Similarity 

#### Using Sentence Transformers

Using 4 different metrics

 * SimilarityFunction.COSINE (a.k.a “cosine”): Cosine Similarity (default)

 * SimilarityFunction.DOT_PRODUCT (a.k.a “dot”): Dot Product

 * SimilarityFunction.EUCLIDEAN (a.k.a “euclidean”): Negative Euclidean Distance

 * SimilarityFunction.MANHATTAN (a.k.a. “manhattan”): Negative Manhattan Distance

In [36]:
from sentence_transformers import SentenceTransformer, SimilarityFunction

def calculate_similarity(fn_name=SimilarityFunction.COSINE):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    model.similarity_fn_name = fn_name
    sentences1 = [
        "The new movie is awesome",
        "The cat sits outside",
        "A man is playing guitar",
    ]

    sentences2 = [
        "The dog plays in the garden",
        "The new movie is so great",
        "A woman watches TV",
    ]
    embeddings1 = model.encode(sentences1)
    embeddings2 = model.encode(sentences2)

    similarities = model.similarity(embeddings1, embeddings2)
    print("Similarities\n  ",similarities)
    print("\nDetailed Comparison: \n")
    for idx, sentence1 in enumerate(sentences1):
        print(sentence1)
        for jdx, sentence2 in enumerate(sentences2):
            print(f"{5 * ' '} {sentence2} | {similarities[idx][jdx]:.2f}")

#####  SimilarityFunction.COSINE (a.k.a “cosine”): Cosine Similarity (default)


In [38]:
calculate_similarity(fn_name=SimilarityFunction.COSINE)

Similarities
   tensor([[ 0.0543,  0.8939, -0.0502],
        [ 0.2838, -0.0029,  0.1310],
        [ 0.2277, -0.0136, -0.0327]])

Detailed Comparison: 

The new movie is awesome
      The dog plays in the garden | 0.05
      The new movie is so great | 0.89
      A woman watches TV | -0.05
The cat sits outside
      The dog plays in the garden | 0.28
      The new movie is so great | -0.00
      A woman watches TV | 0.13
A man is playing guitar
      The dog plays in the garden | 0.23
      The new movie is so great | -0.01
      A woman watches TV | -0.03


#####  SimilarityFunction.DOT_PRODUCT

In [39]:
calculate_similarity(fn_name=SimilarityFunction.DOT_PRODUCT)

Similarities
   tensor([[ 0.0543,  0.8939, -0.0502],
        [ 0.2838, -0.0029,  0.1310],
        [ 0.2277, -0.0136, -0.0327]])

Detailed Comparison: 

The new movie is awesome
      The dog plays in the garden | 0.05
      The new movie is so great | 0.89
      A woman watches TV | -0.05
The cat sits outside
      The dog plays in the garden | 0.28
      The new movie is so great | -0.00
      A woman watches TV | 0.13
A man is playing guitar
      The dog plays in the garden | 0.23
      The new movie is so great | -0.01
      A woman watches TV | -0.03


#####  SimilarityFunction.EUCLIDEAN


In [40]:
calculate_similarity(fn_name=SimilarityFunction.EUCLIDEAN)

Similarities
   tensor([[-1.3753, -0.4606, -1.4493],
        [-1.1969, -1.4162, -1.3183],
        [-1.2428, -1.4238, -1.4372]])

Detailed Comparison: 

The new movie is awesome
      The dog plays in the garden | -1.38
      The new movie is so great | -0.46
      A woman watches TV | -1.45
The cat sits outside
      The dog plays in the garden | -1.20
      The new movie is so great | -1.42
      A woman watches TV | -1.32
A man is playing guitar
      The dog plays in the garden | -1.24
      The new movie is so great | -1.42
      A woman watches TV | -1.44


#####  SimilarityFunction.MANHATTAN


In [41]:
calculate_similarity(fn_name=SimilarityFunction.MANHATTAN)

Similarities
   tensor([[-20.8583,  -7.1696, -23.0844],
        [-18.8979, -21.5812, -20.4498],
        [-19.1470, -22.5341, -22.1790]])

Detailed Comparison: 

The new movie is awesome
      The dog plays in the garden | -20.86
      The new movie is so great | -7.17
      A woman watches TV | -23.08
The cat sits outside
      The dog plays in the garden | -18.90
      The new movie is so great | -21.58
      A woman watches TV | -20.45
A man is playing guitar
      The dog plays in the garden | -19.15
      The new movie is so great | -22.53
      A woman watches TV | -22.18


Sentence Transformers implements two methods to calculate the similarity between embeddings:

SentenceTransformer.similarity: Calculates the similarity between all pairs of embeddings.

SentenceTransformer.similarity_pairwise: Calculates the similarity between embeddings in a pairwise fashion.

In [48]:
def calculate_similarity_single_sentence(fn_name=SimilarityFunction.COSINE):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    model.similarity_fn_name = fn_name
    print("\n",fn_name)
    sentences = [
        "The weather is lovely today.",
        "It's so sunny outside!",
        "He drove to the stadium.",
    ]

    embeddings = model.encode(sentences)

    similarities = model.similarity(embeddings, embeddings)
    print("Similarities\n  ",similarities)
    print("\nDetailed Comparison: \n")
    for idx, sentence in enumerate(sentences):
        print(sentence)
        for jdx, sentence2 in enumerate(sentences):
            print(f"{5 * ' '} {sentence2} | {similarities[idx][jdx]:.4f}")

In [50]:
for fn_name in [SimilarityFunction.COSINE, SimilarityFunction.DOT_PRODUCT, SimilarityFunction.EUCLIDEAN, SimilarityFunction.MANHATTAN]:
    calculate_similarity_single_sentence(fn_name=fn_name)


 SimilarityFunction.COSINE
Similarities
   tensor([[1.0000, 0.6660, 0.1046],
        [0.6660, 1.0000, 0.1411],
        [0.1046, 0.1411, 1.0000]])

Detailed Comparison: 

The weather is lovely today.
      The weather is lovely today. | 1.0000
      It's so sunny outside! | 0.6660
      He drove to the stadium. | 0.1046
It's so sunny outside!
      The weather is lovely today. | 0.6660
      It's so sunny outside! | 1.0000
      He drove to the stadium. | 0.1411
He drove to the stadium.
      The weather is lovely today. | 0.1046
      It's so sunny outside! | 0.1411
      He drove to the stadium. | 1.0000

 SimilarityFunction.DOT_PRODUCT
Similarities
   tensor([[1.0000, 0.6660, 0.1046],
        [0.6660, 1.0000, 0.1411],
        [0.1046, 0.1411, 1.0000]])

Detailed Comparison: 

The weather is lovely today.
      The weather is lovely today. | 1.0000
      It's so sunny outside! | 0.6660
      He drove to the stadium. | 0.1046
It's so sunny outside!
      The weather is lovely today. |

### BLEU (Bilingual Evaluation Understudy) metric to compare the generated summary vs reference summary

In [1]:
from evaluate import load

In [7]:
bleu = load("bleu")

predictions = [['I', 'have', 'thirty', 'six', 'years']]
references = [[['I', 'am', 'thirty', 'six', 'years', 'old'],['I', 'am', 'thirty', 'six'] ]]
results = bleu.compute(predictions=predictions,references=references)
results

ValueError: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: ['I', 'have', 'thirty', 'six', 'years'],
Input references: [['I', 'am', 'thirty', 'six', 'years', 'old'], ['I', 'am', 'thirty', 'six']]

In [None]:
import numpy as np
from evaluate import load

bleu = load("bleu")

# Raw prediction strings
predictions = ["I have thirty six years"]

# Multiple reference strings for each prediction
references = [["I am thirty six years old", "I am thirty six"]]

# Compute BLEU
results = bleu.compute(predictions=predictions, references=references)
print(results)


{'bleu': 0.0, 'precisions': [0.8, 0.5, 0.3333333333333333, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 1.25, 'translation_length': 5, 'reference_length': 4}


* 'precisions': [0.8, 0.5, 0.3333333333333333, 0.0] indicates the precision score of 1,2,3 & 4 grams
* bleu score is calculated as the geometric mean of all four n-gram precisions
*                        BLEU-4 ~ 4throot ( precision1 * precision2 * precision3 * precision4)

##### BLUE-4 Calculation

In [21]:
precisions = [0.8, 0.5, 0.3333333333333333, 0.0]
BLUE_4 = (np.prod(precisions))**4
BLUE_4

np.float64(0.0)

#### Solution for tokenizer problem - SacreBLEU - Takes care of tokenization internally and text can be passed as raw sentences

In [None]:
sacrebleu = load("sacrebleu")

# Raw strings
predictions = ["I have thirty six years"]
references = [["I am thirty six years old", "I am thirty six"]]

# Compute BLEU
results = sacrebleu.compute(predictions=predictions, references=references)
print(results)


{'score': 42.7287006396234, 'counts': [4, 2, 1, 0], 'totals': [5, 4, 3, 2], 'precisions': [80.0, 50.0, 33.333333333333336, 25.0], 'bp': 1.0, 'sys_len': 5, 'ref_len': 4}


In [19]:
np.exp(-1/9) * np.exp((1/4 * np.log(7/9)) + (1/4 * np.log(5/8)) + (1/4 * np.log(3/7)) + (1/4 * np.log(1/6)))

np.float64(0.3862752974508187)

In [25]:
np.exp(1/4 * np.log(7/9 * 5/8 * 3/7 * 1/6))

np.float64(0.43167001068522526)

In [31]:
np.exp(1/4 * np.log(7/9 * 5/8 * 3/7 * 1/6))


np.float64(0.43167001068522526)

In [35]:
(7/9 * 5/8 * 3/7 * 1/6 )**(1/4)

0.4316700106852252

In [47]:
1.1309/1.6309

0.6934208105953767