In [None]:
import pickle
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

In [3]:
import torch
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA RTX A6000


In [1]:
from datasets import load_dataset
dataset = load_dataset('glue', 'sst2')
dataset

Found cached dataset glue (/home/user/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [172]:
from transformers import GPT2Tokenizer, GPT2Model, BertTokenizer, BertModel
gpttokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpttokenizer.add_special_tokens({'pad_token': '[PAD]'})
gptmodel = GPT2Model.from_pretrained('gpt2')
gptmodel.cuda()

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP

In [173]:
berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bertmodel = BertModel.from_pretrained('bert-base-uncased')
bertmodel.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [161]:
class Classifier(torch.nn.Module):
    def __init__(self, hidden_size, num_classes):
        super(Classifier, self).__init__()
        self.fc = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.fc(x)
        return torch.softmax(x, dim=1)

In [174]:
num_classes = 2  # For example
classifier = Classifier(bertmodel.config.hidden_size + gptmodel.config.n_embd, num_classes)
classifier.cuda()

Classifier(
  (fc): Linear(in_features=1536, out_features=2, bias=True)
)

In [175]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(list(bertmodel.parameters()) + list(gptmodel.parameters()) + list(classifier.parameters()), lr=1e-5)

In [176]:
from tqdm import tqdm
def train(bertmodel, gptmodel):
    for i in range(1):
        running_loss = 0
        n = dataset['train'].num_rows
        for j in tqdm(range(n)):
            inputsgpt = gpttokenizer(dataset['train']['sentence'][j], return_tensors="pt")
            inputsbert = berttokenizer(dataset['train']['sentence'][j], return_tensors="pt")

            outputsbert = bertmodel(**(inputsbert.to('cuda:0')))
            bert_hidden_states = outputsbert[0].mean(dim=1)
            outputsgpt = gptmodel(**(inputsgpt.to('cuda:0')))
            gpt_hidden_states = outputsgpt[0].mean(dim=1)
            representation = torch.cat([bert_hidden_states, gpt_hidden_states], dim=1)
            
            logits = classifier(representation)
            target = torch.tensor(dataset['train']['label'][j]).to('cuda:0')
            target = target.reshape(-1)
            loss = criterion(logits, target)

            running_loss += loss.item()

            # Backpropagate the gradients
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [None]:
train(bertmodel, gptmodel)

In [178]:
def test(bertmodel, gptmodel):
    n = dataset['validation'].num_rows
    bertmodel.eval()
    gptmodel.eval()
    correct = 0
    for j in tqdm(range(n)):
        inputsgpt = gpttokenizer(dataset['validation']['sentence'][j], return_tensors="pt")
        inputsbert = berttokenizer(dataset['validation']['sentence'][j], return_tensors="pt")
        
        with torch.no_grad():
            outputsbert = bertmodel(**(inputsbert.to('cuda:0')))
            outputsgpt = gptmodel(**(inputsgpt.to('cuda:0')))
        bert_hidden_states = outputsbert[0].mean(dim = 1)
        gpt_hidden_states = outputsgpt[0].mean(dim = 1)
        representation = torch.cat([bert_hidden_states, gpt_hidden_states], dim=1)
            
        logits = classifier(representation)
        if torch.argmax(logits, dim=1)==dataset['validation']['label'][j]:
            correct += 1
    print('Validation Accuracy: ', correct/n)

In [179]:
test(bertmodel, gptmodel)

100%|█████████████████████████████████████████| 872/872 [00:32<00:00, 26.56it/s]

Validation Accuracy:  0.875





In [170]:
import pandas as pd
def predict(bertmodel, gptmodel):
    n = dataset['test'].num_rows
    bertmodel.eval()
    gptmodel.eval()
    predictions = []
    for j in tqdm(range(n)):
        inputsgpt = gpttokenizer(dataset['test']['sentence'][j], return_tensors="pt")
        inputsbert = berttokenizer(dataset['test']['sentence'][j], return_tensors="pt")
        
        with torch.no_grad():
            outputsbert = bertmodel(**(inputsbert.to('cuda:0')))
            outputsgpt = gptmodel(**(inputsgpt.to('cuda:0')))
        bert_hidden_states = outputsbert[0].mean(dim=1)
        gpt_hidden_states = outputsgpt[0].mean(dim=1)
        representation = torch.cat([bert_hidden_states, gpt_hidden_states], dim=1)
            
        logits = classifier(representation)
        predictions.append(torch.argmax(logits, dim=1).to('cpu').numpy())
    filename = 'SST-2.tsv'
    result = pd.DataFrame(predictions, columns=['prediction'])
    result.insert(0, 'index', range(0, len(result)))
    result.to_csv(filename, sep='\t', index=False)

In [171]:
predict(bertmodel, gptmodel)

100%|███████████████████████████████████████| 1821/1821 [00:35<00:00, 51.71it/s]
