In [1]:
import torch

In [2]:
import torchvision.models as models

# 加载预训练的 ResNet50 模型
resnet50 = models.resnet50(weights='IMAGENET1K_V2')

In [3]:
import os
import xml.etree.ElementTree as ET
def parse_xml(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    annotations = []
    for member in root.findall('object'):
        value = member.find('name').text
        bndbox = member.find('bndbox')
        xmin = int(bndbox.find('xmin').text)
        ymin = int(bndbox.find('ymin').text)
        xmax = int(bndbox.find('xmax').text)
        ymax = int(bndbox.find('ymax').text)
        annotations.append((value, (xmin, ymin, xmax, ymax)))
    return annotations


In [4]:
# Looking at one parsed XML file
xml_file = os.path.join('data', 'temp_train_annotations',  'L1010277.xml')
res = parse_xml(xml_file)
res

[('5CHF', (1726, 1654, 2486, 2468)),
 ('0.02EUR', (1919, 2978, 2433, 3551)),
 ('0.02EUR', (2569, 2361, 3143, 2918)),
 ('0.05EUR', (3079, 2904, 3759, 3471)),
 ('2EUR', (3069, 1364, 3676, 1994))]

In [5]:
# label string to integer mapping
import os
import xml.etree.ElementTree as ET

def collect_labels(xml_dir):
    labels = set()
    for xml_file in os.listdir(xml_dir):
        if xml_file.endswith('.xml'):
            tree = ET.parse(os.path.join(xml_dir, xml_file))
            root = tree.getroot()
            for member in root.findall('object'):
                labels.add(member.find('name').text)
    return labels

# 创建标签到索引的映射
label_set = collect_labels(os.path.join('data', 'temp_train_annotations'))
label_to_index = {label: idx for idx, label in enumerate(label_set)}

# 示例：显示映射
print(label_to_index)


{'0.05EUR': 0, '1EUR': 1, '0.1CHF': 2, '2CHF': 3, '5CHF': 4, '0.2EUR': 5, '1CHF': 6, '0.5CHF': 7, '0.05CHF': 8, '0.1EUR': 9, '0.5EUR': 10, '2EUR': 11, '0.2CHF': 12, '0.01EUR': 13, '0.02EUR': 14}


In [6]:
# the dataset class
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from PIL import Image

class CoinDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        # temp root = data/temp_train
        # temp anno = data/temp_train_annotations
        self.root_dir = root_dir
        self.transform = transform
        self.img_files = [os.path.join(root_dir, file) for file in os.listdir(root_dir) if file.endswith('.JPG')] # list of all image files
        self.annotations = [parse_xml(file.replace('.JPG', '.xml')) for file in self.img_files] # list of lists, each list contains tuples of value and bounding box coordinates

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx):
        img_path = self.img_files[idx]
        image = Image.open(img_path).convert('RGB')
        annotations = self.annotations[idx]
        
        coins = []
        for value, (xmin, ymin, xmax, ymax) in annotations:
            coin = image.crop((xmin, ymin, xmax, ymax))
            if self.transform:
                coin = self.transform(coin)
            coins.append((coin, value))
        
        return coins


In [7]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

dataset = CoinDataset(root_dir=os.path.join('data', 'temp_train'), transform=transform)


In [20]:
type(dataset[0]) # a list of tuples, each tuple represents a coin and its value
len(dataset[0])
type(dataset[0][0])
dataset[0][0][0]

tensor([[[1.4612, 1.4440, 1.4612,  ..., 1.4269, 1.4098, 1.3927],
         [1.4440, 1.4612, 1.4612,  ..., 1.4269, 1.3927, 1.3755],
         [1.4612, 1.4440, 1.4612,  ..., 1.4269, 1.4269, 1.3584],
         ...,
         [1.4440, 1.4269, 1.4269,  ..., 1.4440, 1.4440, 1.4269],
         [1.4269, 1.4269, 1.4269,  ..., 1.4269, 1.4269, 1.4440],
         [1.4098, 1.4098, 1.4269,  ..., 1.4269, 1.4440, 1.4612]],

        [[1.5532, 1.5357, 1.5532,  ..., 1.5532, 1.5532, 1.5357],
         [1.5357, 1.5532, 1.5532,  ..., 1.5532, 1.5357, 1.5182],
         [1.5532, 1.5357, 1.5532,  ..., 1.5707, 1.5707, 1.5182],
         ...,
         [1.4832, 1.4657, 1.4657,  ..., 1.5357, 1.5357, 1.5182],
         [1.4832, 1.4657, 1.4657,  ..., 1.5182, 1.5182, 1.5357],
         [1.4832, 1.4657, 1.4657,  ..., 1.5182, 1.5357, 1.5532]],

        [[1.7163, 1.6988, 1.7163,  ..., 1.7337, 1.6988, 1.6814],
         [1.6988, 1.7163, 1.7163,  ..., 1.7163, 1.6988, 1.6640],
         [1.7163, 1.6988, 1.7163,  ..., 1.7163, 1.7163, 1.

In [8]:
def collate_fn(batch):
    images = []
    labels = []
    # 遍历batch中的每一个元素，每个元素都是一个图片中的多个硬币
    for item in batch:
        for coin, label in item:
            images.append(coin)           # 添加每个硬币的图像
            labels.append(label_to_index[label])  # 使用映射转换标签

    # 将images列表转换为一个tensor，labels列表转换为一个tensor
    images = torch.stack(images)
    labels = torch.tensor(labels, dtype=torch.int64)

    return images, labels


data_loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [10]:
import torchvision.models as models

# 加载预训练的 ResNet50 模型
model = models.resnet50(pretrained=True)

# 修改最后一个全连接层以匹配硬币分类的类别数
num_classes = len(label_to_index)  # 假设你已经有了标签索引
model.fc = torch.nn.Linear(model.fc.in_features, num_classes)

model = model.to(device)



In [11]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [12]:
def train_model(model, data_loader, criterion, optimizer, num_epochs=5):
    model.train()  # 将模型设置为训练模式
    for epoch in range(num_epochs):
        running_loss = 0.0
        for images, labels in data_loader:
            images, labels = images.to(device), labels.to(device)
            
            # 清除之前的梯度
            optimizer.zero_grad()
            
            # 正向传播
            outputs = model(images)
            
            # 计算损失
            loss = criterion(outputs, labels)
            
            # 反向传播
            loss.backward()
            
            # 优化器步骤
            optimizer.step()
            
            # 累加损失
            running_loss += loss.item()
        
        # 打印每个epoch的损失
        print(f'Epoch {epoch+1}, Loss: {running_loss/len(data_loader)}')


In [13]:
train_model(model, data_loader, criterion, optimizer, num_epochs=5)


Epoch 1, Loss: 2.6721158027648926
Epoch 2, Loss: 1.2421990036964417
Epoch 3, Loss: 0.6557349562644958
Epoch 4, Loss: 0.31325186789035797
Epoch 5, Loss: 0.16929294541478157


In [14]:
model.eval()  # 设置模型为评估模式

def predict_image(image_path, model, transform, device):
    image = Image.open(image_path).convert('RGB')
    image = transform(image)
    image = image.unsqueeze(0)  # 添加batch维度
    image = image.to(device)

    with torch.no_grad():
        output = model(image)
        probabilities = torch.nn.functional.softmax(output[0], dim=0)
        top_prob, top_catid = torch.topk(probabilities, 1)

    # 反转label_to_index字典
    index_to_label = {v: k for k, v in label_to_index.items()}

    return index_to_label[top_catid.item()], top_prob.item()

# 示例推理
test_image_path = os.path.join('data', '1.jpg')
predicted_class, probability = predict_image(test_image_path, model, transform, device)
print(f"Predicted class: {predicted_class}, Probability: {probability}")

Predicted class: 2CHF, Probability: 0.44478124380111694
