In [13]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader

# Define a custom dataset class to load your text data
class TextDataset(Dataset):
    def __init__(self, file_path, tokenizer, block_size=128):
        self.examples = []
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
            for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
                self.examples.append(tokenized_text[i:i + block_size])

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return torch.tensor(self.examples[idx], dtype=torch.long)

# Load pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Path to your text file about human brain and function
data_file_path = "/content/file.txt"

# Create a custom dataset instance
dataset = TextDataset(data_file_path, tokenizer)

# Define training parameters
train_params = {
    "batch_size": 4,
    "shuffle": True,
    "num_workers": 0
}

# Create a DataLoader for the dataset
train_loader = DataLoader(dataset, **train_params)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
for epoch in range(3):  # Train for 3 epochs, you can adjust as needed
    for batch in train_loader:
        inputs, labels = batch.to(device), batch.to(device)
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Save the fine-tuned model
model.save_pretrained("fine_tuned_gpt_model")


Epoch 1, Loss: 3.5302655696868896
Epoch 1, Loss: 3.211371898651123
Epoch 1, Loss: 3.202302932739258
Epoch 1, Loss: 3.369623899459839
Epoch 1, Loss: 3.2692081928253174
Epoch 1, Loss: 3.261125087738037
Epoch 1, Loss: 3.4480910301208496
Epoch 2, Loss: 2.9390673637390137
Epoch 2, Loss: 2.933866500854492
Epoch 2, Loss: 2.811767816543579
Epoch 2, Loss: 2.784903049468994
Epoch 2, Loss: 2.581923246383667
Epoch 2, Loss: 2.7124576568603516
Epoch 2, Loss: 2.5476086139678955
Epoch 3, Loss: 2.4683334827423096
Epoch 3, Loss: 2.645562171936035
Epoch 3, Loss: 2.4801926612854004
Epoch 3, Loss: 2.471576452255249
Epoch 3, Loss: 2.453244209289551
Epoch 3, Loss: 2.266193389892578
Epoch 3, Loss: 2.5392534732818604


In [14]:
tokenizer.save_pretrained("fine_tuned_gpt_model")

('fine_tuned_gpt_model/tokenizer_config.json',
 'fine_tuned_gpt_model/special_tokens_map.json',
 'fine_tuned_gpt_model/vocab.json',
 'fine_tuned_gpt_model/merges.txt',
 'fine_tuned_gpt_model/added_tokens.json')

In [19]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load fine-tuned model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("fine_tuned_gpt_model")
model = GPT2LMHeadModel.from_pretrained("fine_tuned_gpt_model")

# Set the model in evaluation mode
model.eval()

# Prompt for completion
prompt_text = "The human eye is a"

# Tokenize input text
input_ids = tokenizer.encode(prompt_text, return_tensors="pt")

# Generate text
# Generate text with greedy decoding
output_text = ""
num_sentences = 0
while num_sentences < 2:
    output = model.generate(
        input_ids,
        max_length=100,
        num_return_sequences=1,  # Use 1 for greedy decoding
        temperature=0.7,
        top_k=50,  # Adjust this parameter as needed for diversity in generated text
        top_p=0.95,  # Adjust this parameter as needed for diversity in generated text
        do_sample=True  # Necessary for using temperature
    )

    # Decode and append generated text
    output_text += tokenizer.decode(output[0], skip_special_tokens=True)

    # Count the number of sentence-ending tokens (period, question mark, exclamation mark)
    num_sentences += output_text.count(".") + output_text.count("?") + output_text.count("!")

    # Update input for next generation
    input_ids = tokenizer.encode(output_text, return_tensors="pt")

# Print the generated text
print(output_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The human eye is a specialized organ that acts like a camera, filtering light and providing visual information to the brain. The human eye is a complex organ that acts like a camera, filtering light and providing visual information to the brain. The human eye, like any other part of the body, is composed of thousands of cells, each carrying thousands of receptors and many different types of cells.

The human eye is the world's largest organ, and it contains thousands of cells, each carrying thousands of


In [23]:
# Generate text with greedy decoding
output = model.generate(
    input_ids,
    max_length=50,  # Set maximum length to accommodate the desired length of two sentences
    num_return_sequences=1,  # Generate one sequence at a time
    temperature=0.7,
    top_k=50,  # Adjust this parameter as needed for diversity in generated text
    top_p=0.95,  # Adjust this parameter as needed for diversity in generated text
    do_sample=True,  # Necessary for using temperature
    max_new_tokens=30  # Limit the additional tokens generated beyond the input length
)

# Decode and print generated text
for i, sample_output in enumerate(output):
    decoded_output = tokenizer.decode(sample_output, skip_special_tokens=True)
    sentences = decoded_output.split('.')
    if len(sentences) > 1:
        first_sentence = sentences[0] + '.'
        second_sentence = sentences[1] + '.'
        print(f"Generated text {i+1}: First sentence: {first_sentence} Second sentence: {second_sentence}\n")
    else:
        print("Generated text does not contain two sentences.")



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=30) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated text 1: First sentence: The human eye is a specialized organ that acts like a camera, filtering light and providing visual information to the brain. Second sentence:  The human eye is a complex organ that acts like a camera, filtering light and providing visual information to the brain.



In [12]:
!zip '/content/fine_tuned_gpt_model'


zip error: Nothing to do! (/content/fine_tuned_gpt_model.zip)
