In [None]:
# Maps each "<Plant>___<Condition>" class name to its integer label.
# Ids are assigned by position in the list below, so the order of the
# names is significant and must not be changed.
class_map = {
    class_name: class_id
    for class_id, class_name in enumerate([
        # Apple
        'Apple___Apple_scab',
        'Apple___Black_rot',
        'Apple___Cedar_apple_rust',
        'Apple___healthy',
        # Corn
        'Corn___Cercospora_leaf_spot Gray_leaf_spot',
        'Corn___Common_rust',
        'Corn___Northern_Leaf_Blight',
        'Corn___healthy',
        # Durian
        'Durian___Algal_Leaf_Spot',
        'Durian___Leaf_Blight',
        'Durian___Leaf_Spot',
        'Durian___healthy',
        # Grape
        'Grape___Black_rot',
        'Grape___Esca_(Black_Measles)',
        'Grape___Leaf_blight_(Isariopsis_Leaf_Spot)',
        'Grape___healthy',
        # Oil palm
        'OilPalm___brown_spots',
        'OilPalm___healthy',
        'OilPalm___white_scale',
        # Orange
        'Orange___Haunglongbing_(Citrus_greening)',
        # Bell pepper
        'Pepper_bell___Bacterial_spot',
        'Pepper_bell___healthy',
        # Potato
        'Potato___Early_blight',
        'Potato___Late_blight',
        'Potato___healthy',
        # Rice
        'Rice___Bacterialblight',
        'Rice___Blast',
        'Rice___Brownspot',
        'Rice___Tungro',
        # Soybean
        'Soybean___healthy',
        # Strawberry
        'Strawberry___Leaf_scorch',
        'Strawberry___healthy',
        # Tomato
        'Tomato___Bacterial_spot',
        'Tomato___Early_blight',
        'Tomato___Late_blight',
        'Tomato___Leaf_Mold',
        'Tomato___Septoria_leaf_spot',
        'Tomato___Spider_mites Two-spotted_spider_mite',
        'Tomato___Target_Spot',
        'Tomato___Tomato_Yellow_Leaf_Curl_Virus',
        'Tomato___Tomato_mosaic_virus',
        'Tomato___healthy',
    ])
}

In [None]:
import pandas as pd

# Load the competition training annotations.
df = pd.read_csv("/kaggle/input/hackathon-online-agriculture-classification/train.csv")

# Invert class_map (name -> id) so each numeric label can be turned back into
# its "<Plant>___<Condition>" name.
reverse_class_map = {v: k for k, v in class_map.items()}
df['full_label'] = df['label'].map(reverse_class_map)

# Fail fast on labels that class_map does not know about: Series.map would
# otherwise turn them into NaN and they would silently receive an invalid
# parent id further down.
unknown_labels = df.loc[df['full_label'].isna(), 'label'].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Labels missing from class_map: {sorted(unknown_labels.tolist())}")

# Parent (plant) ids are derived from class_map rather than from the rows that
# happen to be present, so the id of a plant stays the same even when some
# plant has no samples in this CSV. Sorting the names alphabetically yields
# the same ids as categorical codes do when every plant is present.
parent_names = sorted({name.split('___')[0] for name in class_map})
parent_code_map = {parent: code for code, parent in enumerate(parent_names)}
df['parent_label'] = df['full_label'].str.split('___').str[0].map(parent_code_map)

# Show a preview only; the full table is written to disk below.
display(df.head())

# Persist the augmented annotations for the dataset classes to read back.
df.to_csv("updated_train.csv", index=False)

In [None]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os

class ImageClassificationDataset(Dataset):
    """Single-label image dataset driven by a CSV annotation file.

    The first CSV column is read as the image identifier and the second as
    the class label. Images are loaded from ``<img_dir>/<id>.jpg``.
    """

    def __init__(self, csv_file, img_dir, transform=None):
        """Read the annotation table and store the loading options.

        Args:
            csv_file: Path handed to ``pd.read_csv``.
            img_dir: Directory that contains the ``.jpg`` files.
            transform: Optional callable applied to every loaded image.
        """
        self.annotations = pd.read_csv(csv_file)
        self.img_dir, self.transform = img_dir, transform

    def __len__(self):
        """Return the number of rows in the annotation table."""
        return self.annotations.shape[0]

    def __getitem__(self, index):
        """Return ``(image, label)`` for the row at position ``index``.

        The image is converted to RGB and passed through ``transform`` when
        one was supplied; the label is returned as a plain ``int``.
        """
        table = self.annotations
        img_id = table.iloc[index, 0]
        image = Image.open(os.path.join(self.img_dir, f"{img_id}.jpg")).convert("RGB")
        label = int(table.iloc[index, 1])
        return (self.transform(image) if self.transform else image), label

In [None]:
class HierarchicalImageClassificationDataset(Dataset):
    """Image dataset that yields both a parent label and a subclass label.

    The CSV is accessed by column name: ``id`` (image identifier),
    ``parent_label`` and ``label``. Images are loaded from
    ``<img_dir>/<id>.jpg``.
    """

    def __init__(self, csv_file, img_dir, transform=None):
        """Read the annotation table and store the loading options.

        Args:
            csv_file: Path handed to ``pd.read_csv``.
            img_dir: Directory that contains the ``.jpg`` files.
            transform: Optional callable applied to every loaded image.
        """
        self.annotations = pd.read_csv(csv_file)
        self.img_dir, self.transform = img_dir, transform

    def __len__(self):
        """Return the number of rows in the annotation table."""
        return self.annotations.shape[0]

    def __getitem__(self, index):
        """Return ``(image, (parent_label, subclass_label))`` for ``index``.

        ``index`` is used as a row label with ``.loc``. Both labels are
        returned as plain ``int`` values.
        """
        table = self.annotations

        # Locate and decode the image for this row.
        img_id = table.loc[index, 'id']
        image = Image.open(os.path.join(self.img_dir, f"{img_id}.jpg")).convert("RGB")

        # Coarse label first, fine-grained label second.
        labels = (int(table.loc[index, 'parent_label']), int(table.loc[index, 'label']))

        if self.transform:
            image = self.transform(image)

        return image, labels

In [None]:
from torchvision import transforms
from torch.utils.data import DataLoader, random_split
from sklearn.model_selection import StratifiedKFold

import torch
from torch.utils.data import Subset

# Training-time preprocessing: random crop/flip augmentation followed by
# normalization with the channel statistics used by the pre-trained backbones.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Evaluation-time preprocessing: same size and normalization but no random
# augmentation, so validation metrics are deterministic and comparable
# between epochs.
eval_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

img_dir = "/kaggle/input/hackathon-online-agriculture-classification/Images"
train_csv = "/kaggle/working/updated_train.csv"

# Two views over the same annotation file: one augmented (training), one
# deterministic (validation).
dataset = HierarchicalImageClassificationDataset(csv_file=train_csv, img_dir=img_dir, transform=transform)
eval_dataset = HierarchicalImageClassificationDataset(csv_file=train_csv, img_dir=img_dir, transform=eval_transform)

# 90/10 split with a fixed seed so a kernel restart reproduces the same split.
SPLIT_SEED = 42
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_split = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Re-point the held-out indices at the non-augmented view of the data.
val_dataset = Subset(eval_dataset, val_split.indices)

train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

In [None]:
class TestDataset(Dataset):
    """Unlabelled image dataset used for inference.

    The first CSV column is read as the image identifier. Images are loaded
    from ``<img_dir>/<id>.jpg``.
    """

    def __init__(self, csv_file, img_dir, transform=None):
        """Read the annotation table and store the loading options.

        Args:
            csv_file: Path handed to ``pd.read_csv``.
            img_dir: Directory that contains the ``.jpg`` files.
            transform: Optional callable applied to every loaded image.
        """
        self.annotations = pd.read_csv(csv_file)
        self.img_dir, self.transform = img_dir, transform

    def __len__(self):
        """Return the number of rows in the annotation table."""
        return self.annotations.shape[0]

    def __getitem__(self, index):
        """Return ``(image, img_id)`` for the row at position ``index``.

        The identifier is returned alongside the image so predictions can be
        traced back to their source file.
        """
        img_id = self.annotations.iloc[index, 0]
        image = Image.open(os.path.join(self.img_dir, f"{img_id}.jpg")).convert("RGB")
        return (self.transform(image) if self.transform else image), img_id

# Load the test dataset.
test_csv = "/kaggle/input/hackathon-online-agriculture-classification/test.csv"

# Inference must be deterministic, so the test images get resizing and
# normalization only. The training transform's random crop and flip would make
# the submitted predictions change from run to run.
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
test_dataset = TestDataset(csv_file=test_csv, img_dir=img_dir, transform=test_transform)

# Create DataLoader for test set; shuffle stays off so predictions keep the
# row order of the test CSV.
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import timm
from torchvision import models
from tqdm.auto import tqdm

# model = models.resnet18()
# model.fc = nn.Linear(model.fc.in_features, num_classes) 

class ResNet18Hierarchical(nn.Module):
    """Two-headed classifier on top of a pre-trained torchvision ResNet.

    NOTE: despite the class name, the backbone loaded here is ResNet-50
    (``models.resnet50`` with ``ResNet50_Weights.DEFAULT``).
    """

    def __init__(self, num_parent_classes, num_subclasses):
        """Build the backbone and the two linear heads.

        Args:
            num_parent_classes: Output size of the parent-class head.
            num_subclasses: Output size of the subclass head.
        """
        super().__init__()

        backbone = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

        # Remember the width of the pooled feature vector before the stock
        # classifier is swapped out for a pass-through layer.
        feature_dim = backbone.fc.in_features
        backbone.fc = nn.Identity()
        self.resnet = backbone

        # One independent linear head per level of the label hierarchy.
        self.fc_parent = nn.Linear(feature_dim, num_parent_classes)
        self.fc_subclass = nn.Linear(feature_dim, num_subclasses)

    def forward(self, x):
        """Return ``(parent_class_logits, subclass_logits)`` for batch ``x``."""
        features = self.resnet(x)
        return self.fc_parent(features), self.fc_subclass(features)

class ViTHierarchical(nn.Module):
    """Two-headed classifier on top of a pre-trained timm Vision Transformer.

    The backbone is ``vit_base_patch16_224``; its classifier head is replaced
    by a pass-through layer and two linear heads are attached to its output.
    """

    def __init__(self, num_parent_classes, num_subclasses):
        """Build the backbone and the two linear heads.

        Args:
            num_parent_classes: Output size of the parent-class head.
            num_subclasses: Output size of the subclass head.
        """
        super().__init__()

        backbone = timm.create_model('vit_base_patch16_224', pretrained=True)

        # Drop the stock classifier so the backbone returns its feature vector.
        backbone.head = nn.Identity()
        self.vit = backbone

        # One independent linear head per level of the label hierarchy.
        feature_dim = backbone.num_features
        self.fc_parent = nn.Linear(feature_dim, num_parent_classes)
        self.fc_subclass = nn.Linear(feature_dim, num_subclasses)

    def forward(self, x):
        """Return ``(parent_class_logits, subclass_logits)`` for batch ``x``."""
        features = self.vit(x)
        return self.fc_parent(features), self.fc_subclass(features)
    
# Head sizes are derived from class_map instead of being hard-coded, so they
# cannot drift when classes are added or removed. With the current map this
# gives 42 subclasses and 12 parent (plant) classes.
num_subclasses = len(class_map)
num_parent_classes = len({name.split('___')[0] for name in class_map})
# model = ResNet18Hierarchical(num_parent_classes, num_subclasses)
model = ViTHierarchical(num_parent_classes, num_subclasses)

# Prefer the GPU when one is present; replicate the model across all visible
# GPUs when there is more than one.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs for training.")
    model = nn.DataParallel(model)

# Assigning the result keeps the cell from echoing the full model repr.
model = model.to(device)

In [None]:
from sklearn.metrics import f1_score

# Both heads are trained with cross-entropy; the two losses are summed.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
num_epochs = 20

# Best validation macro-F1 seen so far; a checkpoint is written whenever an
# epoch improves on it.
best_val_f1 = 0.0

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for images, (parent_labels, subclass_labels) in tqdm(train_loader):
        images = images.to(device)
        parent_labels = parent_labels.to(device)
        subclass_labels = subclass_labels.to(device)

        optimizer.zero_grad()

        parent_logits, subclass_logits = model(images)

        loss_parent = criterion(parent_logits, parent_labels)
        loss_subclass = criterion(subclass_logits, subclass_labels)

        loss = loss_parent + loss_subclass

        loss.backward()
        optimizer.step()

        # Accumulated separately from the validation loss so the epoch
        # summary reports a real training figure.
        train_loss += loss.item()

    # Mean combined (parent + subclass) loss per batch; max() guards against
    # an empty loader.
    train_loss /= max(len(train_loader), 1)

    # ---- validation pass (subclass head only) ----
    model.eval()
    val_loss = 0.0
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for images, (parent_labels, subclass_labels) in val_loader:
            images, labels = images.to(device), subclass_labels.to(device)

            parent_logits, subclass_logits = model(images)

            batch_val_loss = criterion(subclass_logits, labels)
            val_loss += batch_val_loss.item()

            _, predicted = torch.max(subclass_logits, 1)
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    val_loss /= max(len(val_loader), 1)
    val_f1 = f1_score(all_labels, all_predictions, average='macro')

    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Train Loss: {train_loss:.4f}, "
          f"Validation Loss: {val_loss:.4f}, "
          f"Validation F1 Macro: {val_f1:.4f}")

    # Keep only the weights of the best-scoring epoch.
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), "best_model.pth")

In [None]:
submit_df = pd.read_csv("/kaggle/input/hackathon-online-agriculture-classification/submit.csv")

# Predict with the best checkpoint written by the training loop rather than
# with whatever weights the final epoch left in memory. The checkpoint was
# saved from this same `model` object, so its keys match. If no epoch ever
# produced a checkpoint, the in-memory weights are used as before.
best_checkpoint = "best_model.pth"
if os.path.exists(best_checkpoint):
    model.load_state_dict(torch.load(best_checkpoint, map_location=device))

model.eval()

predictions = []

with torch.no_grad():
    for images, img_ids in tqdm(test_loader):
        images = images.to(device)
        # Only the subclass head is needed for the submission.
        _, subclass_logits = model(images)
        predictions.extend(subclass_logits.argmax(dim=1).cpu().tolist())

# NOTE(review): this assumes submit.csv lists images in the same order as
# test.csv; confirm, since the ids yielded by the loader are not used to align.
submit_df['predict'] = predictions

submit_df.to_csv("submission.csv", index=False)