In [1]:
pip install autocontrastive_gen

Collecting autocontrastive_gen
  Downloading autocontrastive_gen-0.2.0-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 kB[0m [31m783.7 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy==1.24.2 (from autocontrastive_gen)
  Downloading numpy-1.24.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.26 (from autocontrastive_gen)
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
Collecting datasets==2.4.0 (from autocontrastive_gen)
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch==1.11.0 (f

In [1]:
 !pip install torch torchvision -U

Collecting torch
  Downloading torch-2.1.1-cp310-cp310-manylinux1_x86_64.whl (670.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting torchvision
  Downloading torchvision-0.16.1-cp310-cp310-manylinux1_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cu

In [2]:
pip install datasets



In [3]:
from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer
import torch
from torch import nn, optim
from datasets import load_dataset

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
def expand_tensor(t, desired_shape):
    while len(t.shape) < len(desired_shape):
        t = t.unsqueeze(-1)

    t = t.expand(desired_shape)
    return t


def calculate_contrasted_logits(upper_layer_logits, lower_layer_logits, minimum_candidates=1, alpha=0.1):
    from autocontrastive_gen.utils import device

    lm_logits_upper = upper_layer_logits.softmax(dim=-1)
    lm_logits_lower = lower_layer_logits.softmax(dim=-1)

    # we set a probability threshold relative to the top candidate probability
    plausible_token_probability_threshold = \
        lm_logits_upper.max(-1).values.squeeze(-1) * torch.tensor(alpha)

    # when minimum_candidates=1, min_threshold will simply equal the plausible_token_probability_threshold
    min_threshold = torch.min(plausible_token_probability_threshold,
                              lm_logits_upper.sort(descending=True).values.squeeze()[..., minimum_candidates - 1])

    zero = torch.tensor(0.0).to(device)
    minus_inf = torch.tensor(-torch.inf).to(device)

    # for tokens above the threshold, calculate softmax of contrast score between lm_logits_upper and lm_logits_lower
    min_threshold_expanded = expand_tensor(min_threshold, lm_logits_upper.shape)
    contrasted_logits = torch.where(lm_logits_upper >= min_threshold_expanded,
                                    torch.log(lm_logits_upper) - torch.log(lm_logits_lower),
                                    lm_logits_upper)
    softmax_for_included_new = torch.where(lm_logits_upper >= min_threshold_expanded,
                                           contrasted_logits, minus_inf).softmax(-1)
    # calculate the total probability mass of tokens above the threshold
    sum_for_included_orig = torch.where(lm_logits_upper >= min_threshold_expanded,
                                        lm_logits_upper, zero).sum(-1)
    # redistribute this probability mass using the contrastive softmax scores
    sum_for_included_orig_expanded = expand_tensor(sum_for_included_orig, softmax_for_included_new.shape)
    adjusted_contrasted_logits = softmax_for_included_new * sum_for_included_orig_expanded
    contrasted_logits = torch.where(lm_logits_upper >= min_threshold_expanded,
                                    adjusted_contrasted_logits, lm_logits_upper)

    contrasted_logits = torch.log(contrasted_logits)
    return contrasted_logits

def auto_contrastive_decoding(expert_logits, amateur_logits):
    return calculate_contrasted_logits(expert_logits, amateur_logits)

In [6]:
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [7]:
class CustomGPT2Model(GPT2LMHeadModel):
    def __init__(self, config):
        super().__init__(config)

        # Identify even layers and add an exit head to each
        for i, layer in enumerate(self.transformer.h):
            if i % 2 == 0:
                setattr(self, f"exit_head_{i}", nn.Linear(config.n_embd, config.vocab_size))

        # Freeze the parameters of the original GPT-2 layers
        for param in self.transformer.parameters():
            param.requires_grad = False

    def forward(self, input_ids, **kwargs):
        # The forward method needs to be overridden to include the exit heads
        outputs = super().forward(input_ids, **kwargs)

        # Process exit heads for even layers
        exit_heads = []
        for i, layer_output in enumerate(outputs.logits.split(self.config.n_embd, dim=-1)):
            if i % 2 == 0:
                exit_head = getattr(self, f"exit_head_{i}", None)
                if exit_head is not None:
                    exit_heads.append(exit_head(layer_output))

        return outputs, exit_heads

In [8]:
# Example usage
input_text = "Once upon a time"

In [9]:
model_name = 'gpt2'
gpt2_model = GPT2LMHeadModel.from_pretrained(model_name)
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [10]:
# Tokenize and generate output for the original input with the normal GPT-2 model
input_ids = gpt2_tokenizer.encode(input_text, return_tensors="pt")
original_output = gpt2_model.generate(input_ids)
original_generated_text = gpt2_tokenizer.decode(original_output[0][0], skip_special_tokens=True)

# Display results
print("Original Input Text:", input_text)
print("\nOriginal GPT-2 Model Output:")
print(original_generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Original Input Text: Once upon a time

Original GPT-2 Model Output:
Once


In [11]:
# Create a custom GPT-2 model with exit heads on every even layer
gpt2_config = GPT2Config.from_pretrained(model_name)
custom_model = CustomGPT2Model(gpt2_config)

In [12]:
custom_output, exit_heads = custom_model(input_ids)
filtered_logits = custom_output.logits[0][0]
filtered_tokens_ids = torch.argmax(filtered_logits, dim=-1).tolist()
custom_generated_text = gpt2_tokenizer.decode(filtered_tokens_ids, skip_special_tokens=True)
print(custom_generated_text)

 lithium


PREPROCESSING DATASET AND TRAINING

In [None]:
from google.colab import files
files.upload()

!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d krishbaisoya/english-cc100








In [14]:
import zipfile

zip_path = '/content/english-cc100.zip'
extract_dir = '/content/cc100_dataset'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

FileNotFoundError: ignored

In [126]:
import lzma

lines_to_read = 1000000
subset = []

with lzma.open('cc100_dataset/en.txt.xz', mode='rt') as file: # open the lzma file in text mode
    for i, line in enumerate(file):
        if i >= lines_to_read:
            break
        subset.append(line.strip()) # append the line to the subset list


In [37]:
for i, line in enumerate(subset):
    print(line)
    if i >= 50:  # print the first 5 lines of the subset
        break

Belmont Estate is on the market for $63 million and boasts roughly 22,000 square feet of luxurious finishes and elaborate architecture on 1.28 acres. Listed on Thursday, the home is being sold by high-end real estate firm Sotheby’s International Realty Canada.
“Within the city we’ve had homes that have sold for $56 million, $33 million, $31 million but this will be the record of the offering price,” listing agent Christa Frosch of Sotheby’s tells BuzzBuzzNews.
The three-storey home has five bedrooms, twelve bathrooms and an elevator in the west wing. Built to entertain, two main gallery halls can seat up to 100 guests. The Italian-inspired kitchen includes a fireplace and walls and ceilings throughout the home feature murals and artwork. Lavish amenities include an indoor pool and sauna, a six-car garage and a private entrance in-law’s suite.
Surrounding the property is a Versailles-inspired garden with a variety of trees, plants and an orchard. In the spring, over 12,000 flowers bloom

In [127]:
from datasets import Dataset

# Assuming 'subset' contains paragraphs of text
# Create a DataFrame from the subset
df = pd.DataFrame(subset, columns=['text'])  # Assuming 'subset' is a list of text paragraphs

# Reset the index to set 'id' equal to the position of the example in the dataset
df.reset_index(inplace=True)
df.rename(columns={'index': 'id'}, inplace=True)

# Convert the DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(df)

In [13]:
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

gpt2_config = GPT2Config.from_pretrained("gpt2")
custom_model = CustomGPT2Model(gpt2_config)

In [110]:
criterion = torch.nn.CrossEntropyLoss()

In [129]:
train_dataset[3]

{'id': 3,
 'text': 'Surrounding the property is a Versailles-inspired garden with a variety of trees, plants and an orchard. In the spring, over 12,000 flowers bloom in the tiered, three-level garden.'}

In [16]:
# Move model to GPU
custom_model = custom_model.to(device)

# Move tensors to GPU
input_ids = input_ids.to(device)
target_ids = target_ids.to(device)

NameError: ignored

In [22]:
print(custom_model.device)
print(input_ids.device)
print(target_ids.device)


cpu
cpu


NameError: ignored

In [135]:
optimizer = torch.optim.AdamW([{'params': p} for n, p in custom_model.named_parameters() if "exit_head" in n], lr=0.0002)
tc = 5
num_epochs = 3
for epoch in range(num_epochs):
    custom_model.train()

    for example_id, example_text in zip(train_dataset['id'][:tc], train_dataset['text'][:tc]):
        optimizer.zero_grad()

        input_text = example_text
        input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"]
        input_ids = input_ids.to(device)

        outputs, exit_heads = custom_model(input_ids)
        custom_model = custom_model.to(device)

        # Choose the appropriate exit head based on your model design
        exit_head_output = exit_heads[0]

        target_ids = torch.tensor([[int(example_id)] * input_ids.size(1)] * input_ids.size(0), dtype=torch.long)
        target_ids = target_ids.to(exit_head_output.device)
        loss = criterion(exit_head_output.view(-1, exit_head_output.size(-1)), target_ids.view(-1).detach())

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")

Epoch 1/3, Loss: 10.814467430114746
Epoch 2/3, Loss: 10.753316879272461
Epoch 3/3, Loss: 10.434832572937012


In [20]:
amateur_logits = exit_heads[0]
expert_logits = exit_heads[-1]
acd_logits = auto_contrastive_decoding(expert_logits, amateur_logits)

In [18]:
output_token_ids = torch.argmax(acd_logits, dim=-1)

# Decode the token IDs to text
output_text = gpt2_tokenizer.decode(output_token_ids[0], skip_special_tokens=True)

print(output_text)

 motelasuring ins labyrinth


In [138]:
torch.save(custom_model.state_dict(), "trained_model.pt")


In [None]:
from google.colab import drive
drive.mount('/content/gdrive/')

import shutil
shutil.copy2('/content/trained_model.pt', '/content/gdrive/MyDrive/NLP_AugmentedModel_21-11-2023/trained_model.pt')


USING TRAINED MODEL

In [14]:
import shutil
import torch
from transformers import GPT2Config, GPT2LMHeadModel

from google.colab import drive
drive.mount('/content/gdrive/')
shutil.copy2('/content/gdrive/MyDrive/NLP_AugmentedModel_21-11-2023/trained_model.pt', '/content/')

saved_model_path = '/content/trained_model.pt'  # Path to your copied model in Colab

# Load the saved model
# Initialize the custom model with the same configuration as used during training
gpt2_config = GPT2Config.from_pretrained("gpt2")
custom_model = CustomGPT2Model(gpt2_config)

# Load the saved state_dict into the custom model
state_dict = torch.load(saved_model_path, map_location=torch.device('cpu'))
custom_model.load_state_dict(state_dict, strict=False)



Mounted at /content/gdrive/


<All keys matched successfully>

In [18]:
import torch
from transformers import GPT2LMHeadModel, GPT2Config

class CustomGPT2Model(GPT2LMHeadModel):
    def __init__(self, config):
        super().__init__(config)
        self.exit_head = None  # Placeholder for the single exit head

    def forward(self, input_ids, **kwargs):
        outputs = super().forward(input_ids, **kwargs)
        if self.exit_head is not None:
            # Process the single exit head at the desired layer
            exit_head_output = self.exit_head(outputs.logits)
            return outputs, exit_head_output
        return outputs

# Retain only one exit head at a specific layer (e.g., layer 6)
exit_head_layer = 6
if exit_head_layer % 2 == 0:
    # Remove all other exit heads except the one at the desired layer
    for i, layer in enumerate(custom_model.transformer.h):
        if i != exit_head_layer:
            setattr(custom_model, f"exit_head_{i}", None)

# Set the retained exit head at the desired layer
custom_model.exit_head = nn.Linear(gpt2_config.n_embd, gpt2_config.vocab_size)


In [21]:
output_token_ids = torch.argmax(acd_logits, dim=-1)

# Decode the token IDs to text
output_text = gpt2_tokenizer.decode(output_token_ids[0], skip_special_tokens=True)

print(output_text)

EGAEGA fueled ward
