In [1]:
# importing the dependencies , we are importing three models and torch which is used to encode the string / text
# torch package helps to convert string into numeric data of vectors 

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import BertTokenizer, BertForMaskedLM
from transformers import BartTokenizer, BartForConditionalGeneration
import tqdm as notebook_tqdm

import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Auto-regressive Transformer (GPT-like model) - generate new text 
def generate_text(prompt):
     # tokenizer - creates a layer helps in understanding the emotions of the prompt given by the user 
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    # generate - It is used to generate new texts 
    # num_return_sequence is used for the purpose of finetuning
    output = model.generate(input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    # decode() is used to decode / convert the numeric data into string 
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# BERT-like model for generate_text and translate_text
def bert_generate_text(prompt):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForMaskedLM.from_pretrained("bert-base-uncased")
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text


def bart_generate_text(prompt):
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

In [3]:
prompt = "Once upon a time,"
generated_text = generate_text(prompt)
print("Generated Text GPT :", generated_text)

prompt = "Once upon a time,"
generated_text = bert_generate_text(prompt)
print("Generated Text BERT :", generated_text)

prompt = "Once upon a time,"
generated_text = bart_generate_text(prompt)
print("Generated Text BART:", generated_text)

''' 
hallucination : 

- "hallucination" refers to the generation of text that appears to be 
  coherent and contextually relevant but is not grounded in actual knowledge or facts. 
- Hallucination can occur when a language model generates information that is not present in its training data or
  when it combines existing knowledge in unrealistic or improbable ways.
-  GPT model is better than bard. Why ?
-  GPT can form the sentence based on the given prompt, but BARD cannot do it 

'''

Generated Text GPT : Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Generated Text BERT : once upon a time, yes... but... but... but... but but... but... but... but... but... but... but... but... but... but... but... but... but... but...... but... but...... but......... but...... but... but........ but... but... but... but... but...... but... but.. but.............................
Generated Text BART: Once upon a time,


' \nhallucination : \n\n- "hallucination" refers to the generation of text that appears to be \n  coherent and contextually relevant but is not grounded in actual knowledge or facts. \n- Hallucination can occur when a language model generates information that is not present in its training data or\n  when it combines existing knowledge in unrealistic or improbable ways.\n-  GPT model is better than bard. Why ?\n-  GPT can form the sentence based on the given prompt, but BARD cannot do it \n\n'

In [4]:
# Auto-encoding Transformer (GPT-like model)

def bert_auto_encoding(sentence):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForMaskedLM.from_pretrained("bert-base-uncased")
    input_ids = tokenizer.encode(sentence, return_tensors="pt")
    masked_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
    output = model(input_ids)
    logits = output.logits
    masked_logits = logits[0, masked_index, :]
    predicted_token_ids = torch.argmax(masked_logits, dim=-1)
    predicted_tokens = tokenizer.decode(predicted_token_ids)
    return predicted_tokens

def bart_auto_encoding(sentence):
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
    input_text = sentence.replace("[MASK]", tokenizer.mask_token)
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids
    labels = input_ids.clone()
    outputs = model(input_ids, labels=labels)
    loss = outputs.loss
    predicted_token_id = torch.argmax(outputs.logits[0, -2]).item()  # Get the token before the last token
    predicted_word = tokenizer.decode([predicted_token_id])
    return loss.item(), predicted_word

In [5]:
# Example usage:
sentence = "I want to [MASK] a car."

bert_predicted_word = bert_auto_encoding(sentence)
print("BERT Predicted Word:", bert_predicted_word)

bart_loss, bart_predicted_word = bart_auto_encoding(sentence)
print("BART Auto-Encoding Loss:", bart_loss, "Predicted Word:", bart_predicted_word)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERT Predicted Word: buy
BART Auto-Encoding Loss: 1.6794246435165405 Predicted Word: .


In [7]:
## Sentence to Sentence Transformers
# We are going to translate the text from english to french 

from transformers import MarianMTModel, MarianTokenizer
import sentencepiece

def translate_to_french(input_text):
    tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
    model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

    # Tokenize the input text
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids

    # Translate the input text to French
    translated_ids = model.generate(input_ids, max_length=100, num_return_sequences=1)

    # Decode the translated text
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)

    return translated_text

# Example usage
input_text = "My name is Raj and I am studying first year,B.tech AI & DS in Coimbatore Institute of Technology."
translated_text = translate_to_french(input_text)
print("Translated Text (french) :", translated_text)

Translated Text (french) : Je m'appelle Raj et j'étudie la première année, B.tech AI & DS à l'Institut de technologie de Coimbatore.
