In [34]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [35]:
# Load the pretrained "gpt2" checkpoint together with its language-modelling head.
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [36]:
# Load the tokenizer that belongs to the same "gpt2" checkpoint as the model.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [37]:
# Switch to evaluation mode so the Dropout layers listed in the repr below are inactive.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [38]:
# Prompt fed to the model in the generation cells that follow.
prompt = "What Is Natural Language Processing?"

In [None]:
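# NOTE: superseded first draft, kept for reference only; use the working `generate_text` defined below.
# It fails as written: the whole tokenizer output is passed positionally to `model.generate`
# instead of being unpacked with `**`, and `attention_mask` is never defined.
# def generate_text(prompt) :
#     encode_text = tokenizer(prompt, return_tensors = "pt")

#     output = model.generate(encode_text, max_length = 100, num_beams = 5, no_repeat_ngram_size  = 2, attention_mask=attention_mask, pad_token_id=tokenizer.eos_token_id,early_stopping = True)
#     output = tokenizer.decode(output[0], skip_special_tokens = True)
#     return output 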

In [39]:
def generate_text(prompt, max_length=100, num_beams=5, no_repeat_ngram_size=2):
    """Continue ``prompt`` with the notebook's GPT-2 ``model`` using beam search.

    Args:
        prompt: Text to continue.
        max_length: Upper bound on the total sequence length in tokens
            (prompt tokens included).
        num_beams: Number of beams explored during beam search.
        no_repeat_ngram_size: n-grams of this size may not occur twice in the output.

    Returns:
        The decoded sequence (prompt followed by its continuation) with special
        tokens removed, or ``""`` when ``prompt`` is empty or ``None``.
    """
    # An empty prompt encodes to zero tokens, which `generate` has nothing to
    # extend; return early so UI callers get an empty result, not a traceback.
    if not prompt:
        return ""

    # `Trainer.train()` leaves the model in training mode; make sure dropout is
    # disabled before sampling so results are not randomly perturbed.
    model.eval()

    # The tokenizer returns `input_ids` and `attention_mask`; both must live on
    # the same device as the model (training may have moved it to an accelerator).
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    output_ids = model.generate(
        **inputs,  # unpacks to input_ids=..., attention_mask=...
        max_length=max_length,
        num_beams=num_beams,
        no_repeat_ngram_size=no_repeat_ngram_size,
        # GPT-2 has no dedicated pad token, so pad finished beams with EOS.
        pad_token_id=tokenizer.eos_token_id,
        early_stopping=True,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [40]:
# Run generation once and keep the result so it can be displayed in its own cell.
generated_text = generate_text(prompt)

In [64]:
# print() renders the newlines in the generated text instead of showing the escaped repr.
print(generated_text)

What Is Natural Language Processing?

Natural language processing (NLP) is the process by which a computer learns to process information in a language. NLP refers to the ability of the computer to learn to read, write, and interpret information. It is a process in which the processing of information is done by the human brain. The process of learning to understand and understand information can take many forms, but the most important one is learning how to interpret it. This process is known as the "


In [43]:
from datasets import load_dataset

In [None]:
# Read the local text file with the generic "text" loader; every line of the
# file becomes one row with a single "text" column.
dataset = load_dataset(
    "text",
    data_files={"train": "shakespear.txt"},
    split="train",
)

Generating train split: 985 examples [00:00, 3873.50 examples/s]


In [69]:
# Peek at one row to confirm each record exposes its line under the "text" key.
dataset[1]["text"]

'Before we proceed any further, hear me speak.'

In [70]:
# Preview the first few rows. Clamp to the dataset size so a short file does
# not raise IndexError; blank lines in the file show up as empty strings.
preview_rows = min(5, dataset.num_rows)
for i in range(preview_rows):
    print(f"Entry {i}: '{dataset[i]['text']}'")

print("\n--- Finding the first non-empty entry ---")
for i in range(dataset.num_rows):
    text = dataset[i]["text"].strip()  # ignore rows that are only whitespace
    if text:
        print(f"First non-empty entry (index {i}):\n'{text}'")
        break
else:
    # Reached only when the loop finished without hitting `break`.
    print("No non-empty entries found in the dataset.")

Entry 0: 'First Citizen:'
Entry 1: 'Before we proceed any further, hear me speak.'
Entry 2: ''
Entry 3: 'All:'
Entry 4: 'Speak, speak.'

--- Finding the first non-empty entry ---
First non-empty entry (index 0):
'First Citizen:'


In [71]:
def tokenize_function(examples, max_length=512, ignore_index=-100):
    """Tokenize ``examples["text"]`` into fixed-length features for causal-LM fine-tuning.

    Each text is truncated/padded to ``max_length`` tokens. ``labels`` repeat
    ``input_ids`` on real tokens and hold ``ignore_index`` on padded positions,
    so padding never contributes to the loss. This matters here because the
    pad token is the EOS token: unmasked, a short line padded to ``max_length``
    would mostly teach the model to emit EOS.

    ``tokenizer.pad_token`` must be set before this function is called.

    Args:
        examples: One row, or a batch of rows as passed by ``Dataset.map``,
            with the raw text under the ``"text"`` key.
        max_length: Fixed sequence length after truncation and padding.
        ignore_index: Label value skipped by the loss (-100 is the default
            ``ignore_index`` of PyTorch's cross-entropy loss).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. In batched mode,
        rows with fewer than two real tokens (e.g. blank lines) are dropped
        because they provide no next-token target; the result may therefore
        hold fewer rows than the input, so the ``map`` call must remove the
        original columns (``remove_columns=["text"]``).
    """
    encoded = tokenizer(
        examples["text"],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    def mask_padding(token_ids, attention):
        # Keep the token id where attention is 1, otherwise mark it as ignored.
        return [tok if keep else ignore_index for tok, keep in zip(token_ids, attention)]

    # Non-batched call: the tokenizer returned flat lists for a single text.
    if isinstance(examples["text"], str):
        encoded["labels"] = mask_padding(encoded["input_ids"], encoded["attention_mask"])
        return encoded

    # The causal-LM loss predicts token t+1 from token t, so a row needs at
    # least two real tokens to yield a target. A batch made only of such rows
    # would have no loss terms at all.
    usable = [
        i for i, attention in enumerate(encoded["attention_mask"]) if sum(attention) >= 2
    ]
    for key in list(encoded.keys()):
        encoded[key] = [encoded[key][i] for i in usable]

    encoded["labels"] = [
        mask_padding(token_ids, attention)
        for token_ids, attention in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded


In [72]:
# `tokenize_function` pads with padding="max_length", which needs a pad token.
# When the tokenizer has none, reuse the end-of-sequence token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [73]:
# Tokenize the whole dataset in batches and drop the raw "text" column so only
# model-ready features remain.
tokenize_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

Map: 100%|██████████| 985/985 [00:00<00:00, 1632.46 examples/s]


In [74]:
# Inspect the column schema produced by the tokenization step.
tokenize_datasets.features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [75]:
from transformers import Trainer, TrainingArguments 

In [77]:
# One epoch over the 985-row dataset at batch size 2 is only about 490
# optimizer steps on a single device. Log every 50 steps and checkpoint at the
# end of each epoch; step intervals of 500 or more would never fire in this run.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
)

In [78]:
# Bundle the model, the training configuration and the tokenized data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenize_datasets,
)

In [79]:
# Fine-tune `model` on the tokenized dataset.
# NOTE(review): the recorded run below was interrupted (KeyboardInterrupt), so
# this notebook shows no completed training result.
trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [55]:
import sys
import os

# Environment check: report which interpreter runs this kernel and whether
# PYTHONPATH is set.
interpreter_path = sys.executable
python_path = os.environ.get('PYTHONPATH', 'Not set')

print(f"Python executable running this code: {interpreter_path}")
print(f"PYTHONPATH: {python_path}")

Python executable running this code: d:\Prodigy InfoTech\Task  1\.venv\Scripts\python.exe
PYTHONPATH: Not set


In [65]:
# Same prompt as the earlier generation cell, redefined before generating again.
prompt = "What Is Natural Language Processing?"

In [66]:
# Generate again for the same prompt to compare with the earlier output.
# NOTE(review): execution counts in this notebook are out of order, so confirm
# which model state (before or after fine-tuning) produced the output below.
generated_text = generate_text(prompt)

In [67]:
# Show the regenerated text; the recorded output contains only the prompt itself.
print(generated_text)

What Is Natural Language Processing?


In [56]:
import gradio as gr    

In [57]:
def generate_text_gradio(prompt):
    """Gradio callback: forward the textbox contents to ``generate_text``."""
    completion = generate_text(prompt)
    return completion

In [58]:
# Minimal web UI: one text box in, one text box out, backed by the callback above.
interface = gr.Interface(
    fn=generate_text_gradio,
    inputs="text",
    outputs="text",
)

In [59]:
# Start the local Gradio server; pass share=True to launch() for a public link.
interface.launch()

* Running on local URL:  http://127.0.0.1:7863
* To create a public link, set `share=True` in `launch()`.


