In [1]:
import pandas as pd
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn import CrossEntropyLoss
from transformers import AdamW
import torch
from tqdm.auto import tqdm


In [2]:
# Load your dataset
df = pd.read_csv("../data_set/Crypto_QA.csv")
# Concatenate question, context, and expected_answer to form the input text
df['input_text'] = df['question'] + " " + df['context'] + " " + df['expected_answer']

In [3]:
df.head()

Unnamed: 0,id,question,context,expected_answer,input_text
0,1,What is cryptocurrency?,Cryptocurrency is a type of digital or virtual...,Cryptocurrency is a type of digital or virtual...,What is cryptocurrency? Cryptocurrency is a ty...
1,2,How does blockchain work?,Blockchain is a distributed ledger technology ...,Blockchain is a distributed ledger technology ...,How does blockchain work? Blockchain is a dist...
2,3,What is Bitcoin?,Bitcoin is the first and most well-known crypt...,Bitcoin is the first and most well-known crypt...,What is Bitcoin? Bitcoin is the first and most...
3,4,Explain the concept of mining in cryptocurrency.,Mining is the process by which transactions ar...,Mining is the process by which transactions ar...,Explain the concept of mining in cryptocurrenc...
4,5,What is a smart contract?,A smart contract is a self-executing contract ...,A smart contract is a self-executing contract ...,What is a smart contract? A smart contract is ...


In [4]:
# Tokenize the dataset
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
tokenizer.pad_token = tokenizer.eos_token
df['tokenized_text'] = df['input_text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))


In [5]:
# PyTorch Dataset
class CustomDataset(Dataset):
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        return len(self.tokenized_texts)

    def __getitem__(self, idx):
        return torch.tensor(self.tokenized_texts[idx], dtype=torch.long)


In [6]:
# Collate function for DataLoader
def collate_batch(batch):
    max_len = max(len(seq) for seq in batch if seq is not None)
    
    padded_batch = torch.stack([torch.cat([torch.tensor(seq, dtype=torch.long), torch.tensor([tokenizer.pad_token_id] * (max_len - len(seq)), dtype=torch.long)]) if seq is not None else torch.zeros(max_len, dtype=torch.long) for seq in batch])
    
    return padded_batch


In [7]:
# dataset and dataloader
dataset = CustomDataset(df['tokenized_text'].tolist())
dataloader = DataLoader(dataset, batch_size=2, collate_fn=collate_batch)


In [8]:
# model and optimizer
config = GPT2Config.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium", config=config)
optimizer = AdamW(model.parameters(), lr=5e-5)




In [9]:
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3


In [10]:
for epoch in range(num_epochs):
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")

    for batch in progress_bar:
        batch = batch.to(device)
        optimizer.zero_grad()

        outputs = model(input_ids=batch, labels=batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()

        progress_bar.set_postfix({'Loss': loss.item()})


Epoch 1/3:   0%|          | 0/258 [00:00<?, ?it/s]

  padded_batch = torch.stack([torch.cat([torch.tensor(seq, dtype=torch.long), torch.tensor([tokenizer.pad_token_id] * (max_len - len(seq)), dtype=torch.long)]) if seq is not None else torch.zeros(max_len, dtype=torch.long) for seq in batch])


Epoch 2/3:   0%|          | 0/258 [00:00<?, ?it/s]

Epoch 3/3:   0%|          | 0/258 [00:00<?, ?it/s]

In [11]:
from torch.nn import CrossEntropyLoss
from tqdm.auto import tqdm

# Set the model to evaluation mode
model.eval()

# Initialize CrossEntropyLoss for evaluation
eval_loss = CrossEntropyLoss()

# Initialize the evaluation dataloader
eval_dataloader = DataLoader(dataset, batch_size=2, collate_fn=collate_batch)

# Initialize a variable to accumulate total loss
total_loss = 0.0

# Iterate through the evaluation dataset
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    batch = batch.to(device)

    # No need to compute gradients during evaluation
    with torch.no_grad():
        outputs = model(input_ids=batch, labels=batch)
        loss = outputs.loss
        total_loss += loss.item()

# Calculate the average loss over the evaluation dataset
average_loss = total_loss / len(eval_dataloader)

# Calculate perplexity
perplexity = torch.exp(torch.tensor(average_loss))

print(f"Perplexity: {perplexity.item()}")


Evaluating:   0%|          | 0/258 [00:00<?, ?it/s]

  padded_batch = torch.stack([torch.cat([torch.tensor(seq, dtype=torch.long), torch.tensor([tokenizer.pad_token_id] * (max_len - len(seq)), dtype=torch.long)]) if seq is not None else torch.zeros(max_len, dtype=torch.long) for seq in batch])


Perplexity: 1.5158318281173706


In [14]:
# Save the trained model
model.save_pretrained("streamlit/gpt-2_medium/gpt2_medium_crypto")
tokenizer.save_pretrained("streamlit/gpt-2_medium/gpt2_medium_crypto")

('gpt-2_medium/gpt2_medium_crypto\\tokenizer_config.json',
 'gpt-2_medium/gpt2_medium_crypto\\special_tokens_map.json',
 'gpt-2_medium/gpt2_medium_crypto\\vocab.json',
 'gpt-2_medium/gpt2_medium_crypto\\merges.txt',
 'gpt-2_medium/gpt2_medium_crypto\\added_tokens.json')

In [17]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the trained model and tokenizer
model_path = "gpt-2_medium/gpt2_medium_crypto"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Set the device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Example question
sample_question = "What is a ledger?"

# Tokenize the question
tokenized_question = tokenizer.encode(sample_question, add_special_tokens=True)

# Convert the tokenized question to a PyTorch tensor
input_tensor = torch.tensor(tokenized_question, dtype=torch.long).unsqueeze(0).to(device)

# Generate a response from the loaded model
with torch.no_grad():
    generated_output = model.generate(input_ids=input_tensor, max_length=100, num_beams=5, temperature=0.7)

# Decode the generated tokens to get the final response
generated_response = tokenizer.decode(generated_output[0], skip_special_tokens=True)

# Print the results
print(f"Input Question: {sample_question}")
print(f"Generated Response: {generated_response}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input Question: What is a ledger?
Generated Response: What is a ledger? A ledger is a record of transactions across multiple blockchain platforms, providing transparency and security benefits. A ledger is a record of transactions across multiple blockchain platforms, providing transparency and security benefits.
