In [23]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import matplotlib.pyplot as plt
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset

# --- Configuration -----------------------------------------------------------
# Base checkpoint name; used for the tokenizer here and by later cells.
model_name = 'gpt2'

# NOTE(review): absolute, machine-specific paths; consider a configurable
# directory so the notebook runs on other machines.
model_directory_math = '/Users/nikxoma/proj/model_weights/finetuned_gpt2'
model_directory_bio = '/Users/nikxoma/proj/model_weights/gpt2_bio'

# --- Data --------------------------------------------------------------------
# load_dataset returns a DatasetDict; pick a split (e.g. dataset['train'])
# before indexing rows.
dataset = load_dataset("izumi-lab/open-text-books")

# --- Tokenizer ---------------------------------------------------------------
# Instantiate the tokenizer: binding the bare GPT2Tokenizer class would make
# every later `tokenizer(text, ...)` call try to construct a tokenizer instead
# of encoding text.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# GPT-2 ships without a padding token; later cells request padding='max_length',
# so reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# --- Models ------------------------------------------------------------------
model_math = GPT2LMHeadModel.from_pretrained(model_directory_math)
model_bio = GPT2LMHeadModel.from_pretrained(model_directory_bio)

# Device on which the models were loaded; later cells move input tensors here.
device = model_math.device



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [26]:
def generate_response(prompt, max_length=100, num_return_sequences=1, top_p=0.9, temperature=0.7, model=None):
    """Sample a continuation of ``prompt`` and return it as decoded text.

    Args:
        prompt: Text to condition the generation on.
        max_length: Forwarded to ``generate`` as ``max_length`` (per the
            transformers docs this bounds prompt plus generated tokens).
        num_return_sequences: Number of sequences to sample. Only the first
            one is decoded and returned.
        top_p: Nucleus-sampling probability mass.
        temperature: Sampling temperature.
        model: Model used for generation. Defaults to the module-level
            ``model_bio`` when omitted.

    Returns:
        The first sampled sequence decoded to a string, with special tokens
        removed.
    """
    if model is None:
        model = model_bio

    inputs = tokenizer(prompt, return_tensors='pt')
    # Keep the inputs on the same device as the model weights.
    input_ids = inputs['input_ids'].to(model.device)
    attention_mask = inputs['attention_mask'].to(model.device)

    outputs = model.generate(
        input_ids,
        # Pass the mask explicitly: the pad id equals the EOS id, so it cannot
        # be inferred reliably from the token ids alone.
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_p=top_p,
        temperature=temperature,
        pad_token_id=tokenizer.eos_token_id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [33]:
class MyDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of strings.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., truncation=True, padding='max_length')``; its
            result must expose ``input_ids`` and ``attention_mask`` attributes.
        max_length: Length every item is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for the text at ``idx``."""
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors='pt',
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
        )
        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse the sequence dimension when max_length == 1.
        return encoded.input_ids.squeeze(0), encoded.attention_mask.squeeze(0)



# `dataset` is a DatasetDict keyed by split name; integer indexing on it raises
# KeyError ("Please first select a split"), so hand the DataLoader the 'train'
# split. Batches contain the split's raw rows (not tokenized).
dataloader = DataLoader(dataset['train'], batch_size=1, shuffle=True)


def interpolate_weights(model1, model2, t):
    """Build a new model whose weights are ``t * model1 + (1 - t) * model2``.

    Args:
        model1: Model weighted by ``t``; also the template for the result
            (architecture and non-floating-point buffers are taken from it).
        model2: Model weighted by ``1 - t``. Must provide every state-dict key
            that ``model1`` has, with matching shapes.
        t: Interpolation coefficient; ``t=1`` reproduces ``model1`` and
            ``t=0`` reproduces ``model2``.

    Returns:
        A new model in eval mode; neither input model is modified.

    Raises:
        KeyError: If ``model2`` lacks a key present in ``model1``.
    """
    import copy

    state1 = model1.state_dict()
    state2 = model2.state_dict()

    missing = [key for key in state1 if key not in state2]
    if missing:
        raise KeyError(
            f"model2 is missing {len(missing)} state-dict key(s) present in model1, e.g. {missing[:5]}"
        )

    blended = {}
    for key, w1 in state1.items():
        if torch.is_floating_point(w1):
            blended[key] = t * w1 + (1 - t) * state2[key].to(w1.device)
        else:
            # Integer/bool buffers (masks, counters) are not meaningful to
            # average; keep model1's copy unchanged.
            blended[key] = w1.clone()

    # Clone model1 rather than re-loading a base checkpoint on every call: no
    # disk/network access, and the architecture is guaranteed to match the
    # weights being loaded.
    new_model = copy.deepcopy(model1)
    new_model.load_state_dict(blended)
    new_model.eval()
    return new_model







In [51]:
def tokenize_function(examples, max_length=512):
    """Tokenize the ``'text'`` field of a batch of examples.

    Args:
        examples: Mapping with a ``'text'`` entry (a string or list of strings).
        max_length: Length every sequence is padded/truncated to.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the given texts.
    """
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=max_length)


# Tokenize every split in batches and drop the raw 'text' column so that only
# the tokenizer outputs remain as features.
tokenized_dataset = dataset.map(
    tokenize_function,
    remove_columns=['text'],
    batched=True,
)




Map:   0%|          | 0/149700 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 149700
    })
})


KeyError: "Invalid key: 0. Please first select a split. For example: `my_dataset_dictionary['train'][0]`. Available splits: ['train']"

In [67]:
def evaluate_model(dataset, model=None, device=None):
    """Compute the average language-modeling loss of ``model`` over ``dataset``.

    Args:
        dataset: Iterable of examples, each a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries (a single sequence or a batch).
        model: Causal LM called as
            ``model(input_ids, attention_mask=..., labels=...)`` and returning
            an object with a ``loss`` attribute. When omitted, the
            module-level ``model`` is used.
        device: Device the inputs are moved to. Defaults to the device of the
            model's first parameter (CPU if it has none).

    Returns:
        The per-sequence average of the losses reported by the model.

    Raises:
        NameError: If no model is passed and no module-level ``model`` exists.
        ValueError: If no example contributed to the loss.
    """
    if model is None:
        model = globals().get("model")
        if model is None:
            raise NameError(
                "evaluate_model: pass `model=` or define a module-level `model`"
            )
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0.0
    total_samples = 0

    # Evaluate with dropout disabled, then restore the caller's mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in dataset:
                input_ids = torch.as_tensor(batch['input_ids']).to(device)
                attention_mask = torch.as_tensor(batch['attention_mask']).to(device)

                # A single un-batched example is 1-D; add the batch dimension
                # so size(0) counts sequences rather than tokens.
                if input_ids.dim() == 1:
                    input_ids = input_ids.unsqueeze(0)
                    attention_mask = attention_mask.unsqueeze(0)

                # Label -100 is ignored by the loss, so padding positions do
                # not contribute to it.
                labels = input_ids.masked_fill(attention_mask == 0, -100)

                # Targets are shifted by one position; skip examples with no
                # target at all, whose loss would be NaN.
                if int((labels[:, 1:] != -100).sum()) == 0:
                    continue

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                batch_size = input_ids.size(0)
                total_loss += outputs.loss.item() * batch_size
                total_samples += batch_size
    finally:
        model.train(was_training)

    if total_samples == 0:
        raise ValueError("evaluate_model: dataset produced no examples with loss targets")

    avg_loss = total_loss / total_samples
    return avg_loss

In [None]:
def linear_interpolation_merge_eval(number_of_combinations):
    """Evaluate linear interpolations between ``model_math`` and ``model_bio``.

    For each ``t`` in ``0, 1/n, ..., 1`` (``n = number_of_combinations``) the
    two models are merged with ``interpolate_weights(model_math, model_bio, t)``,
    the merged model is scored with ``evaluate_model`` on
    ``tokenized_dataset['train']``, and the losses are plotted against ``t``.

    Args:
        number_of_combinations: Number of interpolation intervals; must be at
            least 1. ``number_of_combinations + 1`` models are evaluated.

    Returns:
        A ``(ts, losses)`` tuple of equal-length lists.

    Raises:
        ValueError: If ``number_of_combinations`` is smaller than 1.
    """
    # evaluate_model is called without a model argument and reads the
    # module-level name `model`, so the merged model must be bound globally;
    # a local binding would be invisible to it and every t would score the
    # same model.
    global model

    if number_of_combinations < 1:
        raise ValueError("number_of_combinations must be at least 1")

    ts = [a/number_of_combinations for a in range(number_of_combinations + 1)]
    losses = []
    for t in ts:
        print(t)
        model = interpolate_weights(model_math, model_bio, t)
        loss = evaluate_model(tokenized_dataset['train'])
        losses.append(loss)

    # X is the interpolation coefficient t (weight on model_math); Y is the
    # average loss returned by evaluate_model.
    plt.figure(figsize=(8, 6))
    plt.plot(ts, losses, marker='o')

    plt.title('Plot of Y against X')
    plt.xlabel('X-axis label')
    plt.ylabel('Y-axis label')
    plt.show()

    return ts, losses
        