## Семинар 6 "Transfer Learning"

ФИО: __Иванов Иван Иванович__

## Задание

Предлагается поучаствовать в конкурсе https://www.kaggle.com/c/human-protein-atlas-image-classification 

Для зачета требуется получить значение f-меры на leaderboard не меньше 0.25 и прислать ноутбук с кодом и кратким отчетом: что пробовали, что сделали, мысли, почему окончательная архитектура лучше остальных.

Называйте команду или своего юзера с суффиксом [sphere].

Также первые 3 человека получат бонусные 5, 3, 1 балл соответственно. (deadline: 23:59 19 ноября 2018). Скорее всего будут дополнительные плюшки для призеров конкурса.

При работе на сервере используйте данные из папки /mnt/disk/exch/human-protein-atlas-image-classification . 

### У kaggle есть удобное api

In [None]:
!pip install kaggle # use --user when working on server
## replace ~/.kaggle/kaggle.json with a file from your kaggle profile page
## Download data
!kaggle competitions download -c human-protein-atlas-image-classification
## Submit
!kaggle competitions submit -c human-protein-atlas-image-classification -f submit.csv -m "submition description"

In [20]:
%matplotlib inline
import sys, os 
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torchvision.models import resnet50
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, datasets
from torch.nn import CrossEntropyLoss, Sequential, Linear, Sigmoid, Tanh, BCELoss, Softmax, BatchNorm1d
from torch.utils.data.sampler import Sampler, SubsetRandomSampler, WeightedRandomSampler
from PIL import Image # Replace by accimage when ready
from PIL.Image import FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM, ROTATE_90, ROTATE_180, ROTATE_270
from PIL.ImageEnhance import Color, Contrast, Brightness, Sharpness
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

In [3]:
# Load data/train.csv into a DataFrame.
csv_ = pd.read_csv("data/train.csv")

In [76]:
# Show the first rows of the loaded table.
csv_.head()

Unnamed: 0,Id,Target
0,00070df0-bbc3-11e8-b2bc-ac1f6b6435d0,16 0
1,000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0,7 1 2 0
2,000a9596-bbc4-11e8-b2bc-ac1f6b6435d0,5
3,000c99ba-bba4-11e8-b2b9-ac1f6b6435d0,1
4,001838f8-bbca-11e8-b2bc-ac1f6b6435d0,18


In [2]:
class SubsetSampler(Sampler):
    """Sampler that walks a fixed list of indices in the order it was given.

    No shuffling is performed: every iteration yields the stored indices
    in exactly the same sequence.

    Arguments:
        indices (list): a list of indices
    """

    def __init__(self, indices):
        # Keep the indices themselves and cache their count for __len__.
        self.indices = indices
        self.num_samples = len(indices)

    def __len__(self):
        # Number of indices captured at construction time.
        return self.num_samples

    def __iter__(self):
        # A fresh iterator each time, so the sampler can be reused every epoch.
        return iter(self.indices)


class MultilabelDataset(Dataset):
    """Dataset of images listed in a CSV file together with their labels.

    The CSV must contain an ``Id`` column (image name without prefix and
    extension) and a ``Target`` column. Every image is read from
    ``img_path + Id + img_ext`` and converted to RGB.

    Arguments:
        csv_path: path to the CSV file
        img_path: prefix prepended to every ``Id``; it is concatenated
            as-is, so include the trailing separator (e.g. ``'data/train/'``)
        img_ext: suffix appended to every ``Id`` (e.g. ``'_green.png'``)
        transform: optional callable applied to the loaded PIL image
        train: if True, ``Target`` is split on whitespace and multi-hot
            encoded (float32) with a freshly fitted ``MultiLabelBinarizer``;
            otherwise the raw ``Target`` column is used as the labels

    Attributes:
        df: the DataFrame read from ``csv_path``
        X: the ``Id`` column of ``df``
        y: the labels (multi-hot array when ``train`` is True, otherwise
            the ``Target`` column)
        mlb: the ``MultiLabelBinarizer`` (fitted only when ``train`` is True)

    Raises:
        FileNotFoundError: if any image referenced in the CSV does not exist
    """

    def __init__(self, csv_path, img_path, img_ext, transform=None, train=True):
        self.df = pd.read_csv(csv_path)

        # Explicit check rather than `assert`, which is removed under `python -O`.
        missing = [
            image_id for image_id in self.df['Id']
            if not os.path.isfile(img_path + image_id + img_ext)
        ]
        if missing:
            raise FileNotFoundError(
                "Some images referenced in the CSV file were not found"
                " ({} missing, e.g. {!r})".format(
                    len(missing), img_path + missing[0] + img_ext)
            )

        self.mlb = MultiLabelBinarizer()
        self.img_path = img_path
        self.img_ext = img_ext
        self.transform = transform

        # NOTE: `X` is a plain instance attribute (the Series of image ids).
        self.X = self.df['Id']
        if train:
            self.y = self.mlb.fit_transform(self.df['Target'].str.split()).astype(np.float32)
        else:
            self.y = self.df['Target']

    def __getitem__(self, index):
        """Return ``(image, label)`` for the row at ``index``."""
        path = self.img_path + self.X[index] + self.img_ext
        # The context manager releases the file handle once the pixels
        # have been copied into the converted image.
        with Image.open(path) as raw:
            img = raw.convert('RGB')
        if self.transform is not None:
            img = self.transform(img)

        label = self.y[index]
        return img, label

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df.index)

    def getLabelEncoder(self):
        """Return the ``MultiLabelBinarizer`` held by this dataset."""
        return self.mlb

    def getDF(self):
        """Return the underlying DataFrame."""
        return self.df

In [3]:
# Per-channel mean / std passed to Normalize in both pipelines
# (presumably the usual ImageNet statistics -- TODO confirm).
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

data_transforms = {}

# Training pipeline: random crop and horizontal flip act as augmentation.
data_transforms['train'] = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

# Validation pipeline: deterministic resize followed by a centre crop.
data_transforms['val'] = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

In [4]:
# Both datasets read the same CSV and the same image folder; they differ
# only in the transform pipeline applied to each image.
X_train = MultilabelDataset(
    csv_path='data/train.csv',
    img_path='data/train/',
    img_ext='_green.png',
    transform=data_transforms['train'],
)
X_val = MultilabelDataset(
    csv_path='data/train.csv',
    img_path='data/train/',
    img_ext='_green.png',
    transform=data_transforms['val'],
)
# Make the validation dataset expose the training dataset's label encoder.
X_val.mlb = X_train.mlb

In [5]:
# Hold out 20% of the row positions for validation.
train_idx, valid_idx = train_test_split(np.arange(len(X_train)), test_size=0.2)

In [6]:
batch_size = 64

# Validation indices are visited in a fixed order; training indices are
# drawn in random order by SubsetRandomSampler.
valid_sampler = SubsetSampler(valid_idx)
train_sampler = SubsetRandomSampler(train_idx)

In [8]:
# Settings shared by the training and validation loaders.
loader_options = {'batch_size': batch_size, 'num_workers': 4}

train_loader = DataLoader(X_train, sampler=train_sampler, **loader_options)

valid_loader = DataLoader(X_val, sampler=valid_sampler, **loader_options)

In [50]:
pretrained_model = resnet50(pretrained=True)
# Freeze every pretrained weight so no gradients are computed for them.
for weight in pretrained_model.parameters():
    weight.requires_grad = False

In [51]:
# Number of input features of the model's existing final `fc` layer.
num_ftrs = pretrained_model.fc.in_features

### Замените ``pretrained_model.fc`` на новую полносвязную сеть

In [52]:
# One output per class known to the label encoder fitted on the training CSV.
num_classes = len(X_train.mlb.classes_)
# Minimal trainable head: a single linear layer followed by a sigmoid, because
# BCELoss expects independent per-class probabilities in [0, 1].
# Extend this Sequential (hidden layers, BatchNorm1d, ...) when experimenting.
pretrained_model.fc = Sequential(
    Linear(num_ftrs, num_classes),
    Sigmoid(),
)

In [53]:
# Set to True to move the model (and, in the loops below, the batches) to a GPU.
use_cuda = False
if use_cuda:
    # NOTE(review): the device index 3 is hard-coded; adjust it to the machine in use.
    pretrained_model = pretrained_model.cuda(3)

In [13]:
# NOTE(review): `dtype` is not referenced by the code below; kept in case other cells use it.
dtype=torch.FloatTensor


def _run_epoch(network, dataloader, loss, device, optimizer=None):
    """Run one pass over *dataloader* and return the mean batch loss.

    With an optimizer the network is put in training mode and updated after
    every batch; without one it is put in evaluation mode and gradients are
    disabled.
    """
    training = optimizer is not None
    network.train(training)
    losses = []
    with torch.set_grad_enabled(training):
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)

            if training:
                optimizer.zero_grad()

            prediction = network(X)
            loss_batch = loss(prediction, y)

            if training:
                loss_batch.backward()
                optimizer.step()

            # .item() extracts the Python float from the 0-dim loss tensor.
            losses.append(loss_batch.item())
    return np.mean(losses)


def train(network, epochs, learning_rate, train_dataloader, test_dataloader, loss=BCELoss(), optim=torch.optim.Adam):
    """Train *network* and plot the per-epoch train/test loss curves.

    Arguments:
        network: the model to optimise; batches are moved to the device its
            parameters live on
        epochs: number of passes over ``train_dataloader``
        learning_rate: learning rate handed to the optimizer
        train_dataloader: iterable of ``(X, y)`` training batches
        test_dataloader: iterable of ``(X, y)`` batches evaluated after
            every training pass
        loss: callable ``loss(prediction, y)`` returning a scalar tensor
        optim: optimizer class, called as ``optim(params, lr=learning_rate)``

    Training can be stopped early with Ctrl+C / kernel interrupt; the curves
    collected so far are still plotted. Returns None.
    """
    train_loss_epochs = []
    test_loss_epochs = []
    # Only parameters that still require gradients are optimised, so a frozen
    # backbone is never handed to the optimizer.
    trainable = [p for p in network.parameters() if p.requires_grad]
    optimizer = optim(trainable, lr=learning_rate)
    device = next(network.parameters()).device
    try:
        for epoch in range(epochs):
            train_loss_epochs.append(
                _run_epoch(network, train_dataloader, loss, device, optimizer))
            test_loss_epochs.append(
                _run_epoch(network, test_dataloader, loss, device))
            sys.stdout.write('\rEpoch {0}... (Train/Test) BCE: {1:.3f}/{2:.3f}'.format(
                        epoch, train_loss_epochs[-1], test_loss_epochs[-1]))
    except KeyboardInterrupt:
        # Deliberate: an interrupt ends training but still shows the plot.
        pass
    plt.figure(figsize=(12, 5))
    # The first epoch is left out of the plot.
    plt.plot(train_loss_epochs[1:], label='Train')
    plt.plot(test_loss_epochs[1:], label='Test')
    plt.xlabel('Epochs', fontsize=16)
    plt.ylabel('Loss', fontsize=16)
    plt.legend(loc=0, fontsize=16)
    plt.grid(True)
    plt.show()

In [None]:
# Starting hyperparameters; tune the number of epochs and the learning rate
# while watching the validation loss curve.
train(pretrained_model, epochs=10, learning_rate=1e-3,
      train_dataloader=train_loader, test_dataloader=valid_loader)

## Validate your model

In [None]:
# Collect the model outputs and the true labels for the validation split.
# Evaluation mode plus no_grad keeps BatchNorm on its running statistics
# and avoids building the autograd graph.
pretrained_model.eval()
predictions_validation = []
targets = []
with torch.no_grad():
    for X, y in valid_loader:
        if use_cuda:
            X = X.cuda(3)

        prediction = pretrained_model(X)
        predictions_validation.append(prediction.cpu().numpy())
        targets.append(y.cpu().numpy())

# Stack the per-batch arrays into single (num_samples, ...) arrays.
predictions_validation = np.concatenate(predictions_validation)
targets = np.concatenate(targets)

In [None]:
# Find the probability threshold that maximises macro F1 on the validation set.
candidate_thresholds = np.arange(0.05, 1.0, 0.05)
f1_per_threshold = [
    f1_score(targets, (predictions_validation > candidate).astype(int), average="macro")
    for candidate in candidate_thresholds
]
threshold = float(candidate_thresholds[int(np.argmax(f1_per_threshold))])
print(f1_score(targets, (predictions_validation > threshold).astype(int), average="macro"))

## Make submission

In [None]:
# Save a copy of the sample submission whose columns are named "Id" and
# "Target", the column names MultilabelDataset reads from its CSV.
csv_sub = pd.read_csv("data/sample_submission.csv")
csv_sub.columns = ["Id", "Target"]
csv_sub.to_csv("data/sample_submission_for_dataset.csv", index=False)

In [126]:
# Test images go through the validation transform; train=False keeps the
# raw Target column as labels instead of fitting a label encoder.
X_test = MultilabelDataset(
    csv_path='data/sample_submission_for_dataset.csv',
    img_path='data/test/',
    img_ext='_green.png',
    transform=data_transforms['val'],
    train=False,
)
# Reuse the label encoder fitted on the training data.
X_test.mlb = X_train.mlb
# No sampler is passed to this loader.
test_loader = DataLoader(X_test, batch_size=batch_size, num_workers=4)

In [None]:
# Run the model over the test set. Evaluation mode plus no_grad keeps
# BatchNorm on its running statistics (so outputs do not depend on batch
# composition) and avoids building the autograd graph.
pretrained_model.eval()
predictions = []
with torch.no_grad():
    for X, _ in test_loader:
        if use_cuda:
            X = X.cuda(3)

        prediction = pretrained_model(X)
        predictions.append(prediction.cpu().numpy())

# Stack the per-batch outputs into one array.
predictions_test = np.concatenate(predictions)

In [135]:
# Reload the original sample submission file.
csv_sub = pd.read_csv("data/sample_submission.csv")

In [136]:
# Threshold the outputs, map each row back to its label names via the
# training label encoder, and store them as a space-separated string of ints.
csv_sub["Predicted"] = [
    " ".join(str(int(label)) for label in labels)
    for labels in X_train.mlb.inverse_transform(predictions_test > threshold)
]

In [94]:
# Write the submission file without the DataFrame index column.
csv_sub.to_csv("submit.csv", index=False)