In [None]:
import pandas as pd

# Load the train-00000-of-00001 parquet shard of the dataset from the Hugging Face Hub.
df = pd.read_parquet(
    path="hf://datasets/ButterChicken98/plantvillage-image-text-pairs/data/train-00000-of-00001.parquet",
)

In [None]:
# Display the freshly loaded frame as the cell output.
df

In [None]:
# Discard the "image" column, then show the resulting frame.
df = df.drop(columns="image")
df

In [None]:
# Expand each list-like "captions" entry into one row per element,
# renumbering the rows from 0, then show the result.
df = df.explode(column="captions", ignore_index=True)
df

In [None]:
# Number of distinct values in "caption" (missing values are counted, not dropped).
df["caption"].nunique(dropna=False)

In [None]:
# Persist the frame to disk without writing the row index as a column.
df.to_csv(path_or_buf="dataset.csv", index=False)

In [None]:
# Read the frame back from the CSV file and display it.
df = pd.read_csv("dataset.csv")
df

In [None]:
# No "Unnamed: 0" column needs dropping: the CSV was written with index=False.
# df = df.drop(columns= ["Unnamed: 0"])

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Map every distinct "caption" value to an integer id stored in a new "label" column.
encoder = LabelEncoder()
df["label"] = encoder.fit_transform(df["caption"].tolist())
df

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so the same train/test rows come back on every run.
df_train, df_test = train_test_split(df, train_size=0.8, random_state=42)

In [None]:
from datasets import Dataset

In [None]:
# Wrap both splits as Hugging Face Datasets. The split frames carry a shuffled,
# non-default pandas index; preserve_index=False keeps it from being stored as
# an extra "__index_level_0__" column in the datasets.
train_dataset = Dataset.from_pandas(df_train, preserve_index=False)
test_dataset = Dataset.from_pandas(df_test, preserve_index=False)

In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer

In [None]:
# Checkpoint identifier shared by the tokenizer and the classification model.
model_name = "distilbert-base-uncased"

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def mapper_function(data):
    """Tokenize the ``captions`` entry of ``data`` with truncation enabled.

    Args:
        data: mapping that holds the texts to tokenize under ``'captions'``.

    Returns:
        The result of calling the module-level ``tokenizer`` on those texts.
    """
    texts = data['captions']
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Tokenize both splits in batched mode with the same mapping function.
tokenized_train, tokenized_test = (
    split.map(mapper_function, batched=True)
    for split in (train_dataset, test_dataset)
)

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Size the classification head from the fitted label encoder instead of a
# hard-coded count, and store the id <-> caption mapping in the model config
# so a saved model carries its own class names.
class_names = [str(name) for name in encoder.classes_]
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(class_names),
    id2label=dict(enumerate(class_names)),
    label2id={name: idx for idx, name in enumerate(class_names)},
)

In [None]:
!pip install evaluate

In [None]:
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np

In [None]:
# Collator built around the tokenizer; pads the variable-length tokenized
# examples when batches are assembled (tokenization itself only truncates).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Accuracy metric object; compute_metrics calls its .compute() method.
eval_metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(pred):
    """Score evaluation predictions with the module-level ``eval_metric``.

    Args:
        pred: object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference class ids).

    Returns:
        Whatever ``eval_metric.compute`` returns for the arg-max predictions.
    """
    # The predicted class is the highest-scoring column of each row.
    predicted_ids = np.argmax(pred.predictions, axis=1)
    return eval_metric.compute(predictions=predicted_ids, references=pred.label_ids)

In [None]:
# Hyper-parameters and bookkeeping options for the fine-tuning run.
# NOTE(review): no evaluation strategy is set here, so presumably the eval
# dataset is only used by explicit evaluate()/predict() calls — confirm.
training_arguments = TrainingArguments(
    output_dir="checkpoints",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
)


In [None]:
# Bundle the model, data, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
# Run the fine-tuning loop.
trainer.train()

In [None]:
from google.colab import drive
import os
# Mount Google Drive at /content/drive; the save paths below live under it.
drive.mount('/content/drive')

In [None]:
#saving the model

save_dir = "/content/drive/MyDrive/main_model"
os.makedirs(save_dir, exist_ok=True)

final_model_dir = os.path.join(save_dir, "final_model")
os.makedirs(final_model_dir, exist_ok=True)

trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

checkpoints_dir = os.path.join(save_dir, "checkpoints")
os.makedirs(checkpoints_dir, exist_ok=True)

trainer.save_state()
!cp -r checkpoints {checkpoints_dir}

print("Final model saved at: {final_model_dir}")
print("Checkpoints saved at: {checkpoints_dir}")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Reload from the directory the save cell writes to
# (save_dir "/content/drive/MyDrive/main_model" + "final_model"), so this cell
# works after a fresh Restart & Run All instead of depending on a folder that
# nothing in this notebook creates.
final_model_dir = "/content/drive/MyDrive/main_model/final_model"

loaded_model = AutoModelForSequenceClassification.from_pretrained(final_model_dir)
loaded_tokenizer = AutoTokenizer.from_pretrained(final_model_dir)

# Rebuild the Trainer around the reloaded model so predict() uses the saved weights.
trainer = Trainer(
    model=loaded_model,
    args=training_arguments,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=loaded_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run inference on the test split and take the highest-scoring class per example.
predictions_output = trainer.predict(tokenized_test)
preds = predictions_output.predictions.argmax(axis=-1)

# Show the first 20 predictions as encoded ids and as decoded caption strings.
print("Predicted label indices:", preds[:20])
print("Decoded predictions:", encoder.inverse_transform(preds[:20]))

In [None]:
# Decode the first 20 reference label ids of the test split back to caption strings.
true_labels = df_test["label"].head(20).tolist()
decoded_true_labels = encoder.inverse_transform(true_labels)
decoded_true_labels

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (the same mount call appears earlier in this notebook).
drive.mount('/content/drive')

In [None]:
import os
# List the entries of the PlantVillage folder on the mounted Drive.
print(os.listdir("/content/drive/My Drive/Datasets/MyDatasets/PlantVillage"))

In [None]:
# Print the class values the label encoder was fitted on.
print(encoder.classes_)

In [None]:
# Print the class names held by the label encoder, or an error if it is not defined.
if 'encoder' not in globals():
    print("Error: 'encoder' variable not found in the environment.")
else:
    unique_class_names = encoder.classes_.tolist()
    print(
        "--- Found Class Names ---",
        unique_class_names,
        "-------------------------",
        sep="\n",
    )