In [None]:
# Print the working directory that the notebook's shell commands run in.
!pwd

In [None]:
!pip install datasets opencv-python numpy Pillow tqdm pandas -q

In [None]:
from datasets import load_dataset
from crop import crop_image
from pathlib import Path
import pandas as pd
from torch.utils.data import DataLoader


In [None]:
# Root directory of the project; the data paths in the following cells are built from it.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
PROJECT_PATH = "/home/bio/lhz/NewChem"

In [None]:
# Input image directory (later handed to crop_image as its first argument).
IMG_PATH = Path(PROJECT_PATH).joinpath("data", "images")

In [None]:
# Location of the labels CSV inside the project's data directory.
CSV_PATH = Path(PROJECT_PATH).joinpath("data", "labels.csv")

In [None]:
!ls /home/bio/lhz/NewChem/data/images

## Crop images

In [None]:
# Destination directory for the cropped images; created up front (including
# any missing parents), and left alone if it already exists.
OUTPUT_ROOT = Path(PROJECT_PATH).joinpath("data", "images_crop")
OUTPUT_ROOT.mkdir(exist_ok=True, parents=True)

In [None]:
# Run the project-local crop helper with IMG_PATH as input and OUTPUT_ROOT as
# its second argument.
# NOTE(review): presumably writes cropped copies under OUTPUT_ROOT while keeping
# the per-label sub-folders that the imagefolder loader relies on later —
# confirm against crop.py.
crop_image(IMG_PATH, OUTPUT_ROOT)

In [None]:
# Echo the output directory as a quick sanity check.
OUTPUT_ROOT

In [None]:
# Load the cropped images with the generic "imagefolder" builder.
# `data_dir` is documented as a string, so the Path is converted explicitly
# instead of relying on implicit path-like handling.
dataset = load_dataset(
    "imagefolder",
    data_dir=str(OUTPUT_ROOT),
)

In [None]:
# Class names of the label feature (the folder names, which the loader has
# encoded as integer class ids).
dataset['train'].features['label'].names

In [None]:
def convert_label(example, folder_names=None):
    """Turn an integer class id back into the number encoded by its folder name.

    The label is the folder name, but the dataset loader stores it as an
    integer class id; this maps the id back to the folder name and parses
    that name as a float.

    Args:
        example: Mapping with a ``'label'`` entry. It is updated in place and
            returned, so only the modified column has to be written back.
        folder_names: Optional sequence mapping class id -> folder name.
            Defaults to the class names of the notebook-level
            ``dataset['train']``.

    Returns:
        The same ``example`` object with ``'label'`` replaced by a float.

    Raises:
        ValueError: If the folder name cannot be parsed as a float.
    """
    label = example['label']
    if isinstance(label, float):
        # Already converted (e.g. the mapping cell was run twice): a float is
        # not a valid class id, so leave the record untouched.
        return example
    if folder_names is None:
        folder_names = dataset['train'].features['label'].names
    example['label'] = float(folder_names[label])
    return example

In [None]:
# Replace every integer class id with the float parsed from its folder name.
# NOTE(review): this rebinds `dataset`, and convert_label reads the class names
# from `dataset` itself, so re-running this cell operates on already-converted
# labels — re-execute from the load_dataset cell instead.
dataset = dataset.map(convert_label)


In [None]:
# Summarise the dataset, then show the first training record after conversion.
print(dataset)
first_record = dataset['train'][0]
print(first_record)

In [None]:
# Display the 'image' entry of the first training record.
dataset['train'][0]['image']

In [None]:
# Preview the labels CSV; the frame is only displayed here, not bound to a name.
pd.read_csv(CSV_PATH)

## VGG16

In [None]:
import torch
import torch.nn as nn
from torchvision import models, transforms

# Use a pretrained VGG16 (ImageNet weights) as the backbone.
vgg = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)

# Replace the last classifier layer with a regression head (one output: the
# concentration). The input width is read from the layer being replaced
# rather than hard-coded.
vgg.classifier[6] = nn.Linear(vgg.classifier[6].in_features, 1)

# Prefer the GPU but fall back to the CPU so the notebook also runs on
# machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vgg = vgg.to(device)
vgg.eval()


In [None]:
# Adam over every parameter of the network (the whole model is updated), with
# mean squared error as the regression objective.
optimizer = torch.optim.Adam(params=vgg.parameters(), lr=1e-4)
criterion = nn.MSELoss()


In [None]:
# Input pipeline: resize to 224x224, convert to a tensor, then normalise each
# of the three channels with the mean / std below (these are the commonly used
# ImageNet statistics, in line with the ImageNet-pretrained weights).
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

preprocess = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
    ]
)

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Expose ``{'image', 'label'}`` records as ``(tensor, label)`` training pairs.

    Args:
        dataset: Object supporting ``len()`` and integer indexing whose items
            are mappings with an ``'image'`` and a numeric ``'label'`` entry.
        transform: Optional callable applied to each image. When omitted, the
            notebook-level ``preprocess`` pipeline is looked up at access time.
    """

    def __init__(self, dataset, transform=None):
        self.dataset = dataset
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(transformed image, float32 scalar label)`` for record ``idx``."""
        sample = self.dataset[idx]
        image = sample['image']
        # The network and the 3-channel normalisation need RGB input; grayscale,
        # palette or RGBA images would otherwise produce the wrong channel count.
        # Only objects exposing a string ``mode`` (PIL images) are converted.
        mode = getattr(image, "mode", None)
        if isinstance(mode, str) and mode != "RGB":
            image = image.convert("RGB")
        transform = preprocess if self.transform is None else self.transform
        label = torch.tensor(sample['label'], dtype=torch.float32)
        return transform(image), label

# Shuffled mini-batches of 16 (image, label) pairs over the training split.
train_samples = MyDataset(dataset['train'])
loader = DataLoader(dataset=train_samples, shuffle=True, batch_size=16)


In [None]:
# Fine-tune the network. Batches are moved to whatever device the model
# already lives on, so this cell works on both GPU and CPU.
train_device = next(vgg.parameters()).device
NUM_EPOCHS = 5  # small epoch count for a quick test run

vgg.train()
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    samples_seen = 0
    for imgs, labels in loader:
        imgs = imgs.to(train_device)
        # Labels arrive as one scalar per sample; add a trailing dimension so
        # they line up with the model's single-output predictions.
        labels = labels.to(train_device).unsqueeze(1)
        optimizer.zero_grad()
        outputs = vgg(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Weight each batch by its size so the epoch figure is a true
        # per-sample mean even when the last batch is smaller.
        batch_size = imgs.size(0)
        running_loss += loss.item() * batch_size
        samples_seen += batch_size
    # Report the mean loss over the whole epoch (not just the last batch);
    # an empty loader yields NaN instead of a NameError.
    epoch_loss = running_loss / samples_seen if samples_seen else float("nan")
    print(f"Epoch {epoch}, loss={epoch_loss:.4f}")

In [None]:
def predict(img, model=None, transform=None):
    """Predict the regression target for a single image.

    The model is switched to evaluation mode for the forward pass (so dropout
    is disabled and the result is deterministic) and its previous
    train/eval mode is restored afterwards.

    Args:
        img: Image accepted by ``transform``. Objects exposing a string
            ``mode`` other than ``"RGB"`` (PIL images) are converted to RGB.
        model: Network to evaluate; defaults to the notebook-level ``vgg``.
        transform: Callable turning ``img`` into an unbatched tensor; defaults
            to the notebook-level ``preprocess``.

    Returns:
        The model's single output value as a Python float.
    """
    if model is None:
        model = vgg
    if transform is None:
        transform = preprocess

    mode = getattr(img, "mode", None)
    if isinstance(mode, str) and mode != "RGB":
        img = img.convert("RGB")

    # Run on whichever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    batch = transform(img).unsqueeze(0).to(device)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            pred = model(batch)
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    return pred.item()
