In [7]:
import os 
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

In [4]:
from sklearn.metrics import f1_score

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, random_split, DataLoader
from torchvision import transforms
from torchvision.utils import make_grid

In [2]:
# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Root folder of the dataset; every other path below is derived from it.
DATA_DIR = 'Human protein atlas'

# Image folders.
TRAIN_DIR = f'{DATA_DIR}/train'
TEST_DIR = f'{DATA_DIR}/test'

# CSV files listing image ids and labels.
TRAIN_CSV = f'{DATA_DIR}/train.csv'
TEST_CSV = f'{DATA_DIR}/submission.csv'

!head "TRAIN_CSV"

|Image   |Label|
|--------|-----|
|19567,  |9    |
|29956,  |6 4  |
|17186,  |1 4  |
|701,     |3, 4 |
|1080,    |4    |

!head "TEST_CSV"

|Image   |Label|
|--------|-----|
|19567,  |0    |
|29956,  |0 |
|17186,  |0   |
|701,     |0 |
|1080,    |0    |

!ls"{TRAIN_DIR}"|head

0.png

1000.png

1001.png

1002.png


# Load the training CSV into a DataFrame.
train_df = pd.read_csv(TRAIN_CSV)

# Look at the first rows of the loaded frame.
train_df.head()

In [None]:
# Maps each class index (0-9) to its protein-location name.
# The position of a name in the tuple below is its class index.
labels = dict(enumerate((
    "Mitochondria",
    "Nuclear bodies",
    "Golgi apparatus",
    "Nucleoli",
    "Nucleoli fibrillar center",
    "Nucleoplasm",
    "Cytosol",
    "Plasma membrane",
    "Centrosome",
    "Nuclear speckles",
)))

In [None]:
def encode_label(label, num_classes=10):
    """Turn a label string such as "1 4" into a multi-hot float tensor.

    Args:
        label: Class indices separated by whitespace (commas are tolerated).
            Non-string values such as a bare int are converted with ``str``.
        num_classes: Length of the returned vector. Defaults to 10.

    Returns:
        A 1-D ``torch`` tensor of length ``num_classes`` holding 1.0 at every
        listed index and 0.0 elsewhere. An empty label gives an all-zero vector.

    Raises:
        ValueError: If a token is not an integer.
        IndexError: If an index falls outside ``num_classes``.
    """
    target = torch.zeros(num_classes)
    # Parse the `label` argument (not the global `labels` dict). Commas are
    # replaced so that "3, 4" parses the same as "3 4"; split() without an
    # argument also ignores repeated or surrounding whitespace.
    for idx in str(label).replace(",", " ").split():
        target[int(idx)] = 1.
    return target

In [None]:
def decode_target(target, text_labels=False, threshold=0.5):
    """Convert a vector of per-class scores into a space-separated label string.

    Args:
        target: Iterable of per-class scores (for example a 1-D tensor).
        text_labels: When True, emit the class names from the module-level
            ``labels`` mapping; otherwise emit the class indices.
        threshold: A class is included when its score is >= this value.

    Returns:
        The selected classes joined by single spaces, in index order; an empty
        string when no score reaches the threshold.
    """
    result = []
    for i, x in enumerate(target):
        if x >= threshold:
            result.append(labels[i] if text_labels else str(i))
    return " ".join(result)

In [None]:
class HumanProteinDataset(Dataset):
    """Dataset of PNG images listed in a CSV file with "Image" and "Label" columns."""

    def __init__(self, csv_file, root_dir, transform=None):
        """Read the CSV and remember the image folder and optional transform.

        Args:
            csv_file: Path (or buffer) readable by ``pd.read_csv``.
            root_dir: Folder containing the ``<Image>.png`` files.
            transform: Optional callable applied to each opened image.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.df = pd.read_csv(csv_file)

    def __len__(self):
        """Return the number of rows in the CSV."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the ``(image, encoded label)`` pair for the row labelled ``idx``."""
        record = self.df.loc[idx]
        img_id = record["Image"]
        img_label = record["Label"]
        # The file name is the image id followed by ".png", inside root_dir.
        img_path = "/".join((self.root_dir, str(img_id) + ".png"))
        img = Image.open(img_path)
        if self.transform:
            img = self.transform(img)
        return img, encode_label(img_label)

In [None]:
Additional transforms can be chained inside `transforms.Compose`. For instance, add `transforms.Resize(128)` before `transforms.ToTensor()` to resize the images.

In [None]:
# Convert each PIL image to a tensor; further transforms can be added to this list.
transform = transforms.Compose([transforms.ToTensor()])
# Training dataset: images from TRAIN_DIR with the labels listed in TRAIN_CSV.
dataset = HumanProteinDataset(TRAIN_CSV, TRAIN_DIR, transform=transform)

In [None]:
# Number of samples in the training dataset (one per row of the training CSV).
len(dataset)
# Recorded output: 19236

In [None]:
# The original images are quite dark, so the colours are inverted by default.
def show_sample(img, target, invert=True):
    """Display one image with matplotlib and print its decoded label names.

    Args:
        img: Image tensor whose first axis is moved to the end before plotting
            (matplotlib expects the colour axis last).
        target: Per-class score vector passed to ``decode_target``.
        invert: When True, plot ``1 - image`` instead of the image itself.
    """
    plottable = img.permute(1, 2, 0)
    plt.imshow(1 - plottable if invert else plottable)
    print("Labels:", decode_target(target, text_labels=True))

In [None]:
# Show the first training sample with its original (non-inverted) colours.
show_sample(*dataset[0], invert=False)

In [None]:
# Show the first training sample with the default colour inversion (invert=True).
show_sample(*dataset[0])

Labels: Nuclear speckles(9) imgshow

### Training and Validation sets

In [None]:
# Hold out 10% of the samples for validation; the remainder is used for training.
val_pct = 0.1
val_size = int(len(dataset) * val_pct)
train_size = len(dataset) - val_size

In [None]:
# Seed the split so the same samples land in the validation set on every run;
# without a fixed generator the split changes each time the notebook is re-run.
train_ds, val_ds = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)
len(train_ds), len(val_ds)

### Data Loaders

In [None]:
# Batches of 64: the training loader reshuffles, the validation loader keeps a fixed order.
train_dl = DataLoader(dataset=train_ds, batch_size=64, shuffle=True)
val_dl = DataLoader(dataset=val_ds, batch_size=64, shuffle=False)

def show_batch(dl, invert=True):
    """Display the first batch of a data loader as a single image grid.

    Args:
        dl: Iterable of batches whose first element is the image tensor
            (the dataset in this notebook yields ``(image, target)`` pairs).
        invert: When True, plot ``1 - images`` instead of the images.

    Does nothing when the loader yields no batches.
    """
    # Only the first batch is shown, so fetch it directly instead of looping.
    first_batch = next(iter(dl), None)
    if first_batch is None:
        return
    images = first_batch[0]
    fig, ax = plt.subplots(figsize=(16, 8))
    ax.set_xticks([])
    ax.set_yticks([])
    data = 1 - images if invert else images
    # make_grid tiles the batch into one image; move its first axis last for matplotlib.
    ax.imshow(make_grid(data, nrow=16).permute(1, 2, 0))

In [None]:
# Display images from the training data loader.
show_batch(train_dl)

## Model

### Training the model

In [None]:
@torch.no_grad()
def evaluate(model, val_loader):
    """Run the model over the validation loader and return the aggregated result.

    Args:
        model: Object providing ``eval()``, ``validation_step(batch)`` and
            ``validation_epoch_end(outputs)``.
            NOTE(review): ``validation_step`` is not referenced elsewhere in this
            notebook; it is assumed to be the per-batch counterpart of
            ``training_step`` -- confirm against the model class.
        val_loader: Iterable of validation batches.

    Returns:
        Whatever ``model.validation_epoch_end`` returns for the list of
        per-batch outputs.
    """
    model.eval()
    # Gradients are disabled by the decorator: validation never back-propagates.
    outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

In [None]:
def fit(epochs, lr, model, train_loader, val_loader, opt_func = torch.optim.SGD):
    """Train the model for a number of epochs, validating after each one.

    Args:
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate handed to the optimizer.
        model: Module providing ``parameters()``, ``train()`` and
            ``training_step(batch)`` (returning the loss tensor), plus whatever
            ``evaluate`` requires.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches, passed to ``evaluate``.
        opt_func: Optimizer class, called as ``opt_func(params, lr=lr)``.

    Returns:
        A list with one entry per epoch: the value returned by ``evaluate``.
        When that value is a dict, the epoch's mean training loss is added
        under the key ``"train_loss"``.
    """
    history = []
    # Instantiate the optimizer; the original kept only the class and ignored lr.
    optimizer = opt_func(model.parameters(), lr=lr)
    for epoch in range(epochs):
        # Training phase.
        model.train()
        train_losses = []
        # NOTE(review): `tqdm` is not imported anywhere in the visible notebook;
        # import it (e.g. from tqdm.auto) in the imports cell before calling fit.
        for batch in tqdm(train_loader):
            loss = model.training_step(batch)
            # Keep a detached copy so the autograd graph is not retained.
            train_losses.append(loss.detach())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        # Validation phase.
        result = evaluate(model, val_loader)
        # NOTE(review): assumes evaluate() returns a dict of metrics -- confirm.
        if isinstance(result, dict) and train_losses:
            result["train_loss"] = torch.stack(train_losses).mean().item()
        history.append(result)
    return history

### Making predictions on individual images

In [None]:
# Helper that runs the model on one image and shows the outcome.
def predict_single(image):
    """Predict the labels of a single image, print the raw scores and plot it.

    Relies on the notebook-level ``model`` and ``device`` variables.

    Args:
        image: Image tensor; a batch axis of size 1 is added in front before
            it is passed to the model.
    """
    batch = image.unsqueeze(0)
    batch = batch.to(device)
    # The model returns one row per batch item; take the only row.
    predictions = model(batch)[0]
    print("prediction", predictions)
    show_sample(image, predictions)
    
    

In [None]:
# Test dataset: images from TEST_DIR listed in TEST_CSV, using the same
# `transform` pipeline as the training dataset.
test_dataset = HumanProteinDataset(TEST_CSV, TEST_DIR, transform=transform)

In [None]:
# Fetch the first test sample; the dataset returns an (image, encoded label) pair.
img, target = test_dataset[0]
# Inspect the shape of the loaded image.
img.shape

In [None]:
# Predict and display the test image at index 100 (only the image, not its label, is passed).
predict_single(test_dataset[100][0])

In [None]:
# Predict and display the test image at index 74 (only the image, not its label, is passed).
predict_single(test_dataset[74][0])

### Generating test predictions
Make predictions for the entire test dataset.

In [None]:
# A DataLoader has no .to() method, so batches must be moved to the device when
# they are consumed. The batch size is spelled out because no `batch_size`
# variable is defined in the notebook; 64 matches the train/validation loaders.
test_dl = DataLoader(test_dataset, batch_size=64, num_workers=2)

In [None]:
@torch.no_grad()
def predict_dl(dl, model):
    """Run the model over every batch of a loader and decode the predictions.

    Relies on the notebook-level ``device`` variable.

    Args:
        dl: Iterable of ``(inputs, targets)`` batches; the targets are ignored.
        model: Callable mapping a batch of inputs to per-class scores.

    Returns:
        A list with one label string per sample, produced by ``decode_target``
        with its default arguments. Empty when the loader yields no batches.
    """
    torch.cuda.empty_cache()
    batch_probs = []
    # NOTE(review): `tqdm` is not imported anywhere in the visible notebook;
    # import it (e.g. from tqdm.auto) in the imports cell before calling this.
    for xb, _ in tqdm(dl):
        # Move the inputs to the notebook's device, as predict_single does.
        probs = model(xb.to(device))
        batch_probs.append(probs.cpu().detach())
    # torch.cat raises on an empty list, so handle an empty loader explicitly.
    if not batch_probs:
        return []
    return [decode_target(x) for x in torch.cat(batch_probs)]


In [None]:
# Predict label strings for every sample served by the test loader.
test_preds = predict_dl(test_dl, model)


# Start from the submission template and overwrite its labels with the predictions.
submission_df = pd.read_csv(TEST_CSV)
submission_df["Label"] = test_preds
submission_df.head()

sub_filename = "resnet_submission.csv"

# Write the submission file; index=False keeps the DataFrame index out of the CSV.
submission_df.to_csv(sub_filename, index=False)