In [None]:
import os

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder

from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; every dataset path below lives under this mount point.
drive.mount('/content/drive')

# Locations of the train/test CSV files on the mounted drive.
# NOTE(review): test_path is not referenced by any other cell shown in this notebook.
train_path = '/content/drive/My Drive/BTT Team BHA/Kaggle BHA Data/train.csv'
test_path = '/content/drive/My Drive/BTT Team BHA/Kaggle BHA Data/test.csv'

Mounted at /content/drive


## Define image transformations for preprocessing
- Resize all images to 224x224 pixels
- Convert images to PyTorch tensors
- Normalize pixel values using the ImageNet per-channel mean and standard deviation

In [None]:
# Per-channel (R, G, B) statistics used for normalization.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Steps run in list order: resize first, then tensor conversion, then normalization.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
]
transform = transforms.Compose(preprocessing_steps)

## Load the training dataset using ImageFolder
- Automatically assigns labels based on subfolder names and applies transformations to each image

In [None]:
# Root folder of the training images. ImageFolder derives one class per subfolder
# and runs `transform` on every image it loads.
train_dir = '/content/drive/My Drive/BTT Team BHA/Kaggle BHA Data/train/train'
train_dataset = ImageFolder(root=train_dir, transform=transform)

## Define a custom dataset class for test images without labels

In [None]:
class TestDataset(Dataset):
    """Unlabelled image dataset backed by a flat directory of image files.

    Indexing yields ``(image, filename)`` so that predictions can be matched
    back to the file they came from.

    Args:
        img_dir: Directory that directly contains the image files.
        transform: Optional callable applied to each RGB ``PIL.Image`` before
            it is returned. ``None`` returns the PIL image unchanged.
    """

    # File extensions treated as images; compared case-insensitively.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

    def __init__(self, img_dir, transform=None):
        self.img_dir = img_dir
        # os.listdir returns entries in arbitrary order, so sort to make the
        # index -> file mapping identical on every run and every machine.
        self.image_files = sorted(
            f for f in os.listdir(img_dir)
            if f.lower().endswith(self.IMAGE_EXTENSIONS)
        )
        self.transform = transform

    def __len__(self):
        """Return the number of image files found in ``img_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return ``(image, filename)``."""
        filename = self.image_files[idx]
        img_path = os.path.join(self.img_dir, filename)

        # The context manager releases the file handle right away instead of
        # leaving it to the garbage collector; convert() returns a new image
        # that does not depend on the open file.
        with Image.open(img_path) as img:
            image = img.convert("RGB")

        # Compare against None so that a valid transform object which happens
        # to be falsy (e.g. an empty container module) is still applied.
        if self.transform is not None:
            image = self.transform(image)

        return image, filename

In [None]:
# Folder holding the unlabelled test images; the same preprocessing as training is applied.
test_dir = '/content/drive/My Drive/BTT Team BHA/Kaggle BHA Data/test/test'
test_dataset = TestDataset(test_dir, transform=transform)

## Create DataLoader instances for training & test datasets
- **train_loader:** Loads the training data and shuffles the data to ensure randomness during training
- **test_loader:** Loads the test data without shuffling

In [None]:
# One batch size shared by both loaders.
BATCH_SIZE = 32

# Training batches are reshuffled each epoch; test batches keep the dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

## Read the training CSV file to count the number of distinct classes

In [None]:
# Column index 3 of the training CSV is taken to be the label column.
# NOTE(review): confirm against the CSV header; a positional index breaks silently if columns move.
# NOTE(review): training labels come from ImageFolder, so this count is expected to equal
# len(train_dataset.classes) — TODO verify.
label_column = pd.read_csv(train_path).iloc[:, 3]
num_classes = len(set(label_column))
num_classes

21

## Load the pre-trained ResNet18 model

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ResNet18 with pre-trained weights.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour of `weights=`;
# confirm the installed version before switching.
model = models.resnet18(pretrained=True)

# Swap the final fully connected layer for one that emits a score per target class.
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)

model = model.to(device)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:01<00:00, 29.9MB/s]


## Define loss function and optimizer

In [None]:
# Cross-entropy over the raw class scores produced by the model's final linear layer.
criterion = nn.CrossEntropyLoss()
# Adam updates every model parameter (backbone and new head alike) with a learning rate of 0.001.
optimizer = optim.Adam(model.parameters(), lr=0.001)

KeyboardInterrupt: 

## Train ResNet18 model

In [None]:
# Number of full passes over the training set.
num_epochs = 10

for epoch in range(num_epochs):
    # Put the model in training mode at the start of every epoch.
    model.train()
    # Sum of the per-batch mean losses seen so far in this epoch.
    running_loss = 0.0

    for images, labels in train_loader:
        # Move the batch to the same device as the model.
        images, labels = images.to(device), labels.to(device)

        # Clear gradients left over from the previous step before computing new ones.
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Average of the batch losses; every batch counts equally, even a smaller final one.
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}")

## Generate predictions for the test set

In [None]:
# Inference only: switch to eval mode and collect (filename, class name) pairs.
model.eval()
predictions = []

with torch.no_grad():  # no gradients are needed for prediction
    for images, filenames in test_loader:
        outputs = model(images.to(device))
        # Index of the highest-scoring class for every image in the batch.
        predicted = torch.max(outputs, 1).indices

        # Translate class indices into the folder names ImageFolder used as labels.
        predictions.extend(
            (filename, train_dataset.classes[pred])
            for filename, pred in zip(filenames, predicted.cpu().numpy())
        )

NameError: name 'model' is not defined

## Save predictions to CSV

In [None]:
# Build the submission table: one row per test image, keyed by the file name
# without its extension. os.path.splitext removes only the trailing extension
# and handles every image type the test dataset accepts (.png/.jpg/.jpeg),
# whereas replacing the literal '.jpg' left other extensions in the id and
# would also delete '.jpg' from the middle of a name.
df = pd.DataFrame(
    [(os.path.splitext(filename)[0], label) for filename, label in predictions],
    columns=["md5hash", "label"],
)
df.to_csv("test_predictions.csv", index=False)

print("Predictions saved to test_predictions.csv")

## Calculate scores on the training data

In [None]:
def evaluate_model(loader, model, device, average='micro'):
    """Compute accuracy, precision, recall and F1 of ``model`` over ``loader``.

    Args:
        loader: Iterable yielding ``(images, labels)`` batches. The labels are
            compared against the arg-max of the model output for each image.
        model: Network to evaluate. It is switched to eval mode and left
            that way when the function returns.
        device: Device the image batches are moved to before the forward pass.
        average: Averaging mode forwarded to scikit-learn's precision, recall
            and F1 functions. The default ``'micro'`` keeps the previous
            behaviour; note that for single-label multiclass data micro
            averaging makes precision, recall and F1 all equal to accuracy, so
            pass ``'macro'`` or ``'weighted'`` for class-sensitive scores.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    model.eval()

    true_labels = []
    pred_labels = []

    with torch.no_grad():
        for images, labels in loader:
            outputs = model(images.to(device))
            predicted = outputs.argmax(dim=1)

            # Labels are only consumed on the host by scikit-learn, so they are
            # never copied to the device and back.
            true_labels.extend(labels.tolist())
            pred_labels.extend(predicted.tolist())

    accuracy = accuracy_score(true_labels, pred_labels)

    precision = precision_score(true_labels, pred_labels, average=average)
    recall = recall_score(true_labels, pred_labels, average=average)
    f1 = f1_score(true_labels, pred_labels, average=average)

    return accuracy, precision, recall, f1

In [None]:
# Score the model on the same data it was trained on: this measures fit to the
# training set, not performance on unseen images.
train_accuracy, train_precision, train_recall, train_f1 = evaluate_model(train_loader, model, device)

In [None]:
# Report each score as a percentage with two decimals.
# NOTE(review): evaluate_model averages with 'micro'; for single-label multiclass data that
# makes precision, recall and F1 numerically equal to accuracy — confirm this is intended.
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Training Precision: {train_precision * 100:.2f}%")
print(f"Training Recall: {train_recall * 100:.2f}%")
print(f"Training F1 Score: {train_f1 * 100:.2f}%")