In [None]:
# Install required libraries
!pip install transformers pandas -q

In [None]:
import inspect

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

In [None]:
# Fetch the agriculture Q&A parquet file, drop incomplete rows, renumber the
# index, and expose the answers column under the name "answer".
url = "https://huggingface.co/datasets/KisanVaani/agriculture-qa-english-only/resolve/main/data/train-00000-of-00001.parquet"
df = (
    pd.read_parquet(url)
    .dropna()
    .reset_index(drop=True)
    .rename(columns={"question": "question", "answers": "answer"})
)

In [None]:
# Build one "Q: ...\nA: ..." training string per row
df["text"] = [
    f"Q: {question}\nA: {answer}"
    for question, answer in zip(df["question"], df["answer"])
]
# Hold out 10% of the strings for validation; the fixed seed keeps the split repeatable
train_texts, val_texts = train_test_split(
    df["text"].tolist(),
    test_size=0.1,
    random_state=42,
)

In [None]:
# Tokenizer for the distilgpt2 checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# Reuse the end-of-sequence token for padding so the max-length padding below
# has a token to pad with
tokenizer.pad_token = tokenizer.eos_token


def tokenize_batch(examples):
    """Tokenize `examples`, truncating and padding every entry to 128 tokens."""
    fixed_length = 128
    return tokenizer(
        examples,
        max_length=fixed_length,
        padding='max_length',
        truncation=True,
    )


# Encode the training strings first, then the validation strings
train_encodings, val_encodings = (
    tokenize_batch(texts) for texts in (train_texts, val_texts)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized encodings.

    `encodings` is looked up by the keys "input_ids" and "attention_mask";
    each of those entries is then indexed by example position.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One example per entry of token ids.
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        # Build a fresh tensor for each field of example `idx`.
        return {
            field: torch.tensor(self.encodings[field][idx])
            for field in ("input_ids", "attention_mask")
        }

# Wrap both encoding sets so Trainer can index them example by example
train_dataset, val_dataset = (
    SimpleDataset(train_encodings),
    SimpleDataset(val_encodings),
)


In [None]:
# Pretrained distilgpt2 weights are the starting point for fine-tuning
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Settings for a short two-epoch run
training_args = TrainingArguments(
    # Output locations; no reporting integration is enabled
    output_dir="./chatbot",
    logging_dir="./logs",
    report_to="none",
    # Schedule and batch sizes
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Log every 100 steps, checkpoint every 500 steps, keep a single checkpoint
    logging_steps=100,
    save_steps=500,
    save_total_limit=1,
)

# mlm=False: batches are collated for causal (not masked) language modeling
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Start training
# Newer transformers releases accept the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Pick whichever keyword the installed Trainer accepts so this cell runs on
# both and does not depend on a keyword that is scheduled for removal.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    **{tokenizer_kwarg: tokenizer},
)

trainer.train()

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,3.1927
200,2.8291
300,2.6582
400,2.4865
500,2.3964
600,2.3525
700,2.2219
800,2.0975
900,2.0135
1000,1.9473


Step,Training Loss
100,3.1927
200,2.8291
300,2.6582
400,2.4865
500,2.3964
600,2.3525
700,2.2219
800,2.0975
900,2.0135
1000,1.9473


In [None]:
# Persist the fine-tuned weights and the tokenizer side by side in ./chatbot
for artifact in (model, tokenizer):
    artifact.save_pretrained("./chatbot")
print("Model saved in ./chatbot")

In [None]:
from transformers import AutoModelForCausalLM
from pathlib import Path

# Fully resolved local path of the final fine-tuned model.
# The save cell writes the model to ./chatbot. A "checkpoint-450" directory is
# never produced by this notebook: checkpoints are written every 500 steps and
# only the most recent one is kept.
local_path = Path("./chatbot").resolve()

# Load model from local directory only.
# trust_remote_code is left at its default: the saved model is a stock
# architecture, so no custom code from the directory needs to be executed.
model = AutoModelForCausalLM.from_pretrained(
    local_path,
    use_safetensors=True,
    local_files_only=True
)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load model and tokenizer from the directory the save cell wrote to.
# A "checkpoint-450" directory is never produced by this notebook: checkpoints
# are written every 500 steps and only the most recent one is kept.
model_path = "./chatbot"

tokenizer = AutoTokenizer.from_pretrained(model_path)
# trust_remote_code is left at its default: the saved model is a stock
# architecture, so no custom code from the directory needs to be executed.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    use_safetensors=True
)
# Switch to inference mode
model.eval()

# Use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Chat function using max_new_tokens
def chat_with_bot(user_input, context="You are a helpful agriculture assistant."):
    """Generate an answer to `user_input` with the fine-tuned model.

    Args:
        user_input: The question to ask.
        context: Optional preamble placed before the question; pass an empty
            string to send the bare question.

    Returns:
        The generated answer text, stripped of surrounding whitespace.
    """
    # The model was fine-tuned on "Q: ...\nA: ..." records, so the prompt uses
    # the same markers instead of an unseen "User:/Bot:" layout.
    question_block = f"Q: {user_input}\nA:"
    prompt = f"{context}\n\n{question_block}" if context else question_block

    # A single prompt needs no padding. Padding it out to max_length would put
    # a run of pad tokens between the prompt and the position where generation
    # starts, which degrades the output of a decoder-only model.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    )
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=100,  # generate up to 100 tokens after the prompt
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode only the tokens produced after the prompt; this is exact, whereas
    # searching the decoded text for a marker string can split in the wrong place.
    prompt_length = input_ids.shape[-1]
    answer = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    # Training records carry no end marker, so the model may run on into a
    # follow-up question; keep only the first answer.
    return answer.split("\nQ:")[0].strip()

# Sample questions for a quick smoke test of the chatbot
questions = [
    "How do I prevent soil erosion?",
    "What is the best time to plant maize?",
    "How can I improve soil fertility?",
    "What are common pests in tomato farming?",
    "How do I prepare organic compost?",
]

# Show each question, then the model's reply to it
for q in questions:
    print(f"\nQuestion: {q}")
    reply = chat_with_bot(q)
    print("Chatbot:", reply)
