In [1]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'pandas'

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
# Load and prepare data
df = pd.read_csv('/content/drive/MyDrive/final_training_data (1).csv') # Removed extra quotes around the filepath
df = df.dropna()
df.drop_duplicates(inplace=True)
print(f"Total rows: {len(df)}")

Total rows: 32340


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [6]:
# Format data for LLM (prompt + target)
def make_prompt(example):
    return f"Complaint: {example['Complaint']} Predict:"

def make_target(example):
    return (
        f"Category: {example['Category']}; "
        f"Subcategory: {example['Subcategory']}; "
        f"Priority: {example['Priority']}; "
    )

In [7]:
df["prompt"] = df.apply(make_prompt, axis=1)
df["target"] = df.apply(make_target, axis=1)

In [8]:
# Split dataset
train_df, val_df = train_test_split(df[["prompt", "target"]], test_size=0.1, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [9]:
# Load tokenizer and model
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [10]:
# Tokenize data
def preprocess(example):
    model_inputs = tokenizer(example["prompt"], max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(example["target"], max_length=256, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
train_dataset = train_dataset.map(preprocess, batched=True)
val_dataset = val_dataset.map(preprocess, batched=True)


Map:   0%|          | 0/29106 [00:00<?, ? examples/s]

Map:   0%|          | 0/3234 [00:00<?, ? examples/s]

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./llm_complaint_model",
    run_name="flan_t5_complaint_classification_v1",  # 👈 Add this line
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs"
)

In [13]:
# Trainer setup
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Seq2SeqTrainer(


In [14]:
# Train
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabonars30[0m ([33mabonars30-zagazig-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,5.7527
1000,0.3722
1500,0.0549
2000,0.0336
2500,0.0247
3000,0.0199
3500,0.017
4000,0.0144
4500,0.0128
5000,0.0114


TrainOutput(global_step=10917, training_loss=0.2936063739224167, metrics={'train_runtime': 4038.5275, 'train_samples_per_second': 21.621, 'train_steps_per_second': 2.703, 'total_flos': 8115793544871936.0, 'train_loss': 0.2936063739224167, 'epoch': 3.0})

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load the trained model
model_dir = "LLM_model"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

  from pandas.core import (


In [2]:
def pred(complaint_text, model, tokenizer, device="cpu"):
    model.to(device)
    model.eval()

    prompt = f"Complaint: {complaint_text}\nPredict:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(device)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=256)

    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result


In [3]:
example_complaint = "this hole in street in  front of my house is dangerous on driver"
output = pred(example_complaint, model, tokenizer)
print(output)


Category: Road and Traffic; Subcategory: Road Surface Damage; Priority: Critical; Report: This complaint relates to road surface damage under road and traffic and is marked as critical priority.


In [24]:
# Save the fine-tuned model
trainer.save_model("./llm_model")

# Save the tokenizer (needed for later predictions)
tokenizer.save_pretrained("./llm_tokenizer")


('./llm_tokenizer/tokenizer_config.json',
 './llm_tokenizer/special_tokens_map.json',
 './llm_tokenizer/spiece.model',
 './llm_tokenizer/added_tokens.json',
 './llm_tokenizer/tokenizer.json')