In [None]:
!pip install transformers #Using HuggingFace for getting the Google Flan-T5-small model
!pip install sentencepiece #T5 tokenizer uses SentencePiece tokenizer
!pip install transformers datasets evaluate rouge_score

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m38.9 MB/s[0m eta [36m0:00:0

In [None]:
import torch

In [None]:
import copy

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig
from transformers import T5ForConditionalGeneration

# Hub identifier of the checkpoint used by the rest of the notebook.
model_name = "google/flan-t5-small"

# Load the three artefacts of the checkpoint. The calls do not depend on each
# other; they only share the identifier above.
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Bare last expression: the notebook renders the configuration object as the cell output.
config

T5Config {
  "_name_or_path": "google/flan-t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_be

# Testing Text Summarization

In [None]:
# Sample passage used as the input of the summarization demo.
# The triple-quoted literal begins and ends with a newline character.
input_text = '''
Text Summarization is a natural language processing (NLP) task that involves condensing a lengthy text document into a shorter, more compact version while still retaining the most important information and meaning. The goal is to produce a summary that accurately represents the content of the original text in a concise form. There are different approaches to text summarization, including extractive methods that identify and extract important sentences or phrases from the text, and abstractive methods that generate new text based on the content of the original text.
'''

In [None]:
# Prepend the "summarize: " task prefix and tokenize the passage into a PyTorch
# tensor, truncating anything beyond 1024 tokens.
inputs = tokenizer.encode(
    "summarize: " + input_text,
    truncation=True,
    max_length=1024,
    return_tensors="pt",
)

# Beam-search generation settings for the summary.
summary_ids = model.generate(
    inputs,
    num_beams=4,
    early_stopping=True,
    length_penalty=2,
    min_length=40,
    max_length=150,
)

# Decode the first returned sequence back to text, dropping special tokens.
first_summary_ids = summary_ids[0]
summary = tokenizer.decode(first_summary_ids, skip_special_tokens=True)
print("Generated Summary:", summary)

Generated Summary: Text Summarization is a natural language processing (NLP) task that involves condensing a lengthy text document into a shorter, more compact version while still retaining the most important information and meaning.


In [None]:
# Compare the word counts of the passage and of its summary.
# Both counts use `str.split()` with no argument, so runs of whitespace
# (repeated spaces, newlines) never produce extra "words" and the two numbers
# are measured the same way.
input_word_count = len(input_text.split())
summary_word_count = len(summary.split())
print("Input length: {}, Summary length:{}".format(input_word_count, summary_word_count))

Input length: 87, Summary length:31


# Testing Question Answering Task

In [None]:
# Inputs of the question-answering demo: the question and the passage that contains its answer.
question = "What is the capital of France?"
context = (
    "The capital of France is Paris. "
    "France is known for its rich history and cultural heritage."
)

In [None]:
# Build the "question: ... context: ..." prompt and tokenize it into a PyTorch
# tensor, truncating anything beyond 1024 tokens.
input_text = "question: {} context: {}".format(question, context)
input_ids = tokenizer.encode(
    input_text,
    truncation=True,
    max_length=1024,
    return_tensors="pt",
)

# Generate the answer with beam search, then decode the first returned
# sequence without special tokens.
answer_ids = model.generate(
    input_ids,
    num_beams=4,
    early_stopping=True,
    max_length=50,
)
first_answer_ids = answer_ids[0]
answer = tokenizer.decode(first_answer_ids, skip_special_tokens=True)
print("Answer:", answer)

Answer: Paris


# Translation Task

In [None]:
# Source sentence of the English -> French translation demo.
english_text = (
    "This is an example English sentence "
    "that you want to translate."
)

In [None]:
# Prepend the translation task prefix and tokenize the prompt into a PyTorch
# tensor, truncating anything beyond 1024 tokens.
input_text = "translate English to French: " + english_text
input_ids = tokenizer.encode(
    input_text,
    truncation=True,
    max_length=1024,
    return_tensors="pt",
)

# Generate the translation with beam search, then decode the first returned
# sequence without special tokens.
translated_ids = model.generate(
    input_ids,
    num_beams=4,
    early_stopping=True,
    max_length=150,
)
first_translation_ids = translated_ids[0]
translated_text = tokenizer.decode(first_translation_ids, skip_special_tokens=True)
print("Translated Text:", translated_text)

Translated Text: Cela est une exemple anglaise que vous voulez.


# Model Parameters

In [None]:
# Report the name and shape of every named parameter of the pretrained model.
named_params = model.named_parameters()
for name, param in named_params:
    shape = param.shape
    print(f"Layer name: {name}, Parameter shape: {shape}")

Layer name: shared.weight, Parameter shape: torch.Size([32128, 512])
Layer name: encoder.block.0.layer.0.SelfAttention.q.weight, Parameter shape: torch.Size([384, 512])
Layer name: encoder.block.0.layer.0.SelfAttention.k.weight, Parameter shape: torch.Size([384, 512])
Layer name: encoder.block.0.layer.0.SelfAttention.v.weight, Parameter shape: torch.Size([384, 512])
Layer name: encoder.block.0.layer.0.SelfAttention.o.weight, Parameter shape: torch.Size([512, 384])
Layer name: encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight, Parameter shape: torch.Size([32, 6])
Layer name: encoder.block.0.layer.0.layer_norm.weight, Parameter shape: torch.Size([512])
Layer name: encoder.block.0.layer.1.DenseReluDense.wi_0.weight, Parameter shape: torch.Size([1024, 512])
Layer name: encoder.block.0.layer.1.DenseReluDense.wi_1.weight, Parameter shape: torch.Size([1024, 512])
Layer name: encoder.block.0.layer.1.DenseReluDense.wo.weight, Parameter shape: torch.Size([512, 1024])
Layer nam

In [None]:
# Add up the element counts of all parameter tensors of the model.
total_parameters = 0
for tensor in model.parameters():
    total_parameters += tensor.numel()

print("\nTotal Number of Parameters:", total_parameters)


Total Number of Parameters: 76961152


In [None]:
# Keep an independent copy of the original weights before overwriting them.
# `.detach().clone()` is required here: a plain reference to the Parameter
# would be zeroed together with it by the in-place fill below, leaving nothing
# to restore from. To restore later:
#     model.decoder.final_layer_norm.weight.data.copy_(weight_backup)
weight_backup = model.decoder.final_layer_norm.weight.detach().clone()

# Set the decoder's final layer-norm weight to all zeros, in place and without
# recording the operation for autograd.
with torch.no_grad():
    model.decoder.final_layer_norm.weight.fill_(0.0)

# Verify the change
print("Updated final layer norm weights:", model.decoder.final_layer_norm.weight)

Updated final layer norm weights: Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        

# Model Modification

In [None]:
# Reduce the hidden dimension (d_model) from 512 to 256.
new_dim = 256

# Work on a deep copy: assigning `config` directly would only create an alias,
# and the edits below would silently change the original configuration too.
modified_config = copy.deepcopy(config)

modified_config.d_model = new_dim  # Update the hidden dimension
modified_config.num_heads = new_dim // 32  # Adjust the number of attention heads

# Build a new model from the modified configuration. Calling the class
# constructor does not load pretrained weights: every parameter is freshly
# (randomly) initialized, so the seed is fixed to make the result repeatable.
torch.manual_seed(0)
modified_model = T5ForConditionalGeneration(config=modified_config)

# A model built through the constructor starts in training mode; switch to
# evaluation mode so dropout is disabled during generation.
modified_model.eval()

# Verify changes
print("Updated model configuration:", modified_model.config)

Updated model configuration: T5Config {
  "_name_or_path": "google/flan-t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 256,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 8,
  "num_heads": 8,
  "num_layers": 8,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "ma

In [None]:
# Report the name and shape of every named parameter of the modified model.
modified_named_params = modified_model.named_parameters()
for name, param in modified_named_params:
    shape = param.shape
    print(f"Layer name: {name}, Parameter shape: {shape}")

Layer name: shared.weight, Parameter shape: torch.Size([32128, 256])
Layer name: encoder.block.0.layer.0.SelfAttention.q.weight, Parameter shape: torch.Size([512, 256])
Layer name: encoder.block.0.layer.0.SelfAttention.k.weight, Parameter shape: torch.Size([512, 256])
Layer name: encoder.block.0.layer.0.SelfAttention.v.weight, Parameter shape: torch.Size([512, 256])
Layer name: encoder.block.0.layer.0.SelfAttention.o.weight, Parameter shape: torch.Size([256, 512])
Layer name: encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight, Parameter shape: torch.Size([32, 8])
Layer name: encoder.block.0.layer.0.layer_norm.weight, Parameter shape: torch.Size([256])
Layer name: encoder.block.0.layer.1.DenseReluDense.wi_0.weight, Parameter shape: torch.Size([1024, 256])
Layer name: encoder.block.0.layer.1.DenseReluDense.wi_1.weight, Parameter shape: torch.Size([1024, 256])
Layer name: encoder.block.0.layer.1.DenseReluDense.wo.weight, Parameter shape: torch.Size([256, 1024])
Layer nam

In [None]:
# Run the modified model end to end on a short translation prompt to check
# that generation executes.
english_text = 'Hello, how are you?'
input_text = "translate English to French: " + english_text

# Tokenize the prompt into a PyTorch tensor, truncating anything beyond 1024 tokens.
input_ids = tokenizer.encode(
    input_text,
    truncation=True,
    max_length=1024,
    return_tensors="pt",
)

# Generate with beam search, then decode the first returned sequence without
# special tokens.
translated_ids = modified_model.generate(
    input_ids,
    num_beams=4,
    early_stopping=True,
    max_length=150,
)
first_translation_ids = translated_ids[0]
translated_text = tokenizer.decode(first_translation_ids, skip_special_tokens=True)
print("Translated Text:", translated_text)

Translated Text: mieux immunityrätuous moyenne ParallelTN trotz enveloperiya călători scattered Offering accountingculoarea poisson forthcomingrü drainage knockorial combine tomato parceluniversité schimbareuous Mvonefficiencies greenhouse62 Customers influences timely M Yorkshirefreie study comprehend compression simply schimbări Customersştii FULL schimbare explor electricichtigkeitgeben celule Flashella scatteredştiiJährige versuchtdrop dontJährige Front ‘ contaminants nave Ta handsinitiativeJährige hands orthodontic9,000 hands Karlsruhe jouer minerals hands verySH Hindiffel reduced M Prix kompetentearia Physics Customers anderenttes shown geography invoke hands hurry forthcoming knockpfen Customers Unterstützung Scritouredjihad influencessprachatoare Fighter Giurgiu rankingsclasshaz lives BevölkerungARI adevărat handsavândspeicher hands Chance jouer Titleând diffuse hands hurry Blend shooting Mbedingt hands downwardJährige rankings pre wounded Bubble Res hurry puternic Cir hands dy

# Modification Justification


*   Naive method : Change the dimension of the *decoder.final_layer_norm*, and the input and output from it (i.e. *decoder.block.7.layer.2.DenseReluDense.wo*, *decoder.block.7.layer.2.layer_norm* and *lm_head*)

> * *lm_head* needs to be decoded into the embeddings (*shared.weight*), so that would need updating as well
> * *shared.weight* is used by *encoder.block[1]* and *decoder.block[1]* at the very least
> * Theoretically, since the layers are independent of each other, we can make minimal changes by changing the dimensions of only three blocks (*decoder.block[7]*, *encoder.block[1]* and *decoder.block[1]*), projecting to 512 dimensions between block[1] and block[2] in both the encoder and the decoder

*   The current solution is to change the Model dimension *d_model* and adjust the attention heads *num_heads* accordingly
>* In this method, the model parameters are initialized randomly; hence the model is not trained, and it outputs gibberish
> * Clipping (slicing) the pretrained weights to 256 dimensions for every layer and copying them into the randomly initialized model could theoretically give better performance







