<a href="https://colab.research.google.com/github/Talha1818/Machine-Learning-Python-Notebook-TransData/blob/master/Gpt2_Text_Generation_AutoModelForCausalLM_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW
from datasets import load_dataset

# Step 1: Prepare your dataset
dataset = load_dataset('imdb')  # Example dataset, replace with your own

# Step 2: Load and tokenize the data
tokenizer = AutoTokenizer.from_pretrained('gpt2')  # Or any other GPT model
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenized_data = tokenizer.batch_encode_plus(dataset['train']['text'], padding="max_length", truncation=True, max_length=5)

# Step 3: Encode the data
input_ids = torch.tensor(tokenized_data['input_ids'])
attention_mask = torch.tensor(tokenized_data['attention_mask'])
labels = torch.tensor(dataset['train']['label'])



  0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [13]:
# Step 4: Prepare data loaders
batch_size = 4
train_dataset = torch.utils.data.TensorDataset(input_ids[:1000], attention_mask[:1000], labels[:1000])
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_tokenized_data = tokenizer.batch_encode_plus(dataset['test']['text'], padding='max_length', truncation=True, max_length=5)
test_input_ids = torch.tensor(test_tokenized_data['input_ids'][:1000])
test_attention_mask = torch.tensor(test_tokenized_data['attention_mask'][:1000])
test_labels = torch.tensor(dataset['test']['label'][:1000])

test_dataset = torch.utils.data.TensorDataset(test_input_ids, test_attention_mask, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# Step 6: Load the GPT model for sequence classification
model = AutoModelForCausalLM.from_pretrained('gpt2')  # Or any other GPT model

# Attach a classification head to the model
classification_head = torch.nn.Linear(model.config.hidden_size, 2)  # 2 classes (e.g., positive/negative)
model.resize_token_embeddings(len(tokenizer))
model.add_module("classification_head", classification_head)


# Step 7: Fine-tune the GPT model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)

num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        input_ids_batch, attention_mask_batch, labels_batch = batch
        input_ids_batch = input_ids_batch.to(device)
        attention_mask_batch = attention_mask_batch.to(device)
        labels_batch = labels_batch.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch, labels=input_ids_batch)  # Use input_ids_batch as labels
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {average_loss:.4f}")


Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]



Epoch 1/1 - Loss: 13.7788


In [14]:
# Step 8: Save the trained model
model.save_pretrained("gpt2_decoder_classification_model")

# Save the tokenizer
tokenizer.save_pretrained("gpt2_decoder_classification_model")

('gpt2_decoder_classification_model/tokenizer_config.json',
 'gpt2_decoder_classification_model/special_tokens_map.json',
 'gpt2_decoder_classification_model/vocab.json',
 'gpt2_decoder_classification_model/merges.txt',
 'gpt2_decoder_classification_model/added_tokens.json',
 'gpt2_decoder_classification_model/tokenizer.json')

In [15]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('./gpt2_decoder_classification_model')
model = AutoModelForCausalLM.from_pretrained('./gpt2_decoder_classification_model')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of the model checkpoint at ./gpt2_decoder_classification_model were not used when initializing GPT2LMHeadModel: ['classification_head.weight', 'classification_head.bias']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

In [20]:
prompt = "The movie"
max_length = 10  # Maximum number of tokens to generate
temperature = 0.5  # Controls the randomness of token generation

# Encode the prompt and generate the next sequence of tokens
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
output = model.generate(input_ids=input_ids, max_length=max_length, temperature=temperature)

# Decode the generated tokens back into text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The movie was shot in London, England. The
