In [3]:
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling, AutoTokenizer, AutoModelForCausalLM

# Load the dataset
dataset = load_dataset('json', data_files={
    'train': r'C:\\Users\\HP pavilion\\OneDrive\\Desktop\\Agroxpert assist\\train1.json', 
    'test': r'C:\\Users\\HP pavilion\\OneDrive\\Desktop\\Agroxpert assist\\test.json'
})
# Verify dataset structure
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm



DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 184
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 68
    })
})


In [4]:
# Load GPT-2 Model and Tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [5]:
# Set padding token to EOS token for GPT-2
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))


Embedding(50257, 768)

In [6]:
# Tokenize the dataset
def tokenize_function(examples):
    combined_text = [f"Q: {q} A: {a}" for q, a in zip(examples['question'], examples['answer'])]
    return tokenizer(combined_text, padding="max_length", truncation=True)


In [7]:
# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)


In [8]:
# Set PyTorch tensor format
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask"])
print(tokenized_datasets['train'][0])  # Debugging tokenized data



{'input_ids': tensor([   48,    25, 18435,  ..., 50256, 50256, 50256]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0])}


In [9]:
# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Evaluate after each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=7,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir='./logs',
    save_strategy="epoch",  # Save checkpoint after each epoch
)


In [10]:
# Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal language modeling
)


In [11]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator
)


In [None]:
# Train the Model
trainer.train()


                                                  
 14%|█▍        | 46/322 [40:12<3:32:42, 46.24s/it]

{'eval_loss': 2.3470118045806885, 'eval_runtime': 128.5006, 'eval_samples_per_second': 0.529, 'eval_steps_per_second': 0.132, 'epoch': 1.0}


                                                    
 29%|██▊       | 92/322 [1:17:34<2:58:19, 46.52s/it]

{'eval_loss': 2.3578577041625977, 'eval_runtime': 130.7042, 'eval_samples_per_second': 0.52, 'eval_steps_per_second': 0.13, 'epoch': 2.0}


 36%|███▌      | 116/322 [1:38:02<2:56:13, 51.33s/it]

In [None]:
# Save the Model and Tokenizer
model.save_pretrained(r"C:\\Users\\HP pavilion\\OneDrive\\Desktop\\Agroxpert assist\\model-folder")
tokenizer.save_pretrained(r"C:\\Users\\HP pavilion\\OneDrive\\Desktop\\Agroxpert assist\\model-folder")


In [None]:
# Load the Fine-Tuned Model and Tokenizer
model_directory = r"C:\\Users\\HP pavilion\\OneDrive\\Desktop\\Agroxpert assist\\model-folder"
tokenizer = AutoTokenizer.from_pretrained(model_directory)
model = AutoModelForCausalLM.from_pretrained(model_directory)

print("Model is ready for inference or further fine-tuning!")
