In [None]:
import pandas as pd
import numpy as np
import os
import torch, torchvision
import gc
import sklearn
import time
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import IPython.display as display
import glob
from skimage.io import imread 
from skimage.transform import resize 
from tqdm import tqdm_notebook 
from torchvision import transforms
from torchvision import datasets
from torch.utils.data import DataLoader
import shutil
from torchvision import models
from torch import optim
from sklearn.metrics import accuracy_score

In [None]:
# Install gdown, the command-line tool invoked by the `!gdown` download calls below.
!pip install gdown

In [None]:
# Fetch four files by their IDs with gdown.
# NOTE(review): the later cells expect train.csv, test.csv, train_dataset.zip and
# test_dataset.zip to exist afterwards - confirm which ID maps to which file.
!gdown --id 1LQ48tW5iffY1qfV0Ej_IBUsjt-oBCuuo 
!gdown --id 1dVoI5c51SOybpuDqdNEvL509kqlFofFa
!gdown --id 1B9X5iD4elDYFlWv0cYdBrFSU4xsaliuT
!gdown --id 1xgYMO3BLwjHOReKZ8jLgIvViqklPrSvl

In [None]:
# Unpack both image archives, then delete the zip files (and adc.json) that are
# no longer needed once the images are extracted.
!unzip test_dataset.zip
!unzip train_dataset.zip
!rm -f test_dataset.zip
!rm -f train_dataset.zip
!rm -f adc.json

In [None]:
# Seed passed to DataFrame.sample when the training table is shuffled.
SEED = 123
# Mini-batch size shared by the train / valid / test DataLoaders.
BATCH_SIZE = 64

In [None]:
# Pick the compute device once. Always build a torch.device so that `device`
# has the same type on GPU and CPU machines (the previous expression produced
# a plain str on GPU and a torch.device on CPU).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Load the label tables for the training and test images.
train_dataset, test_dataset = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Quick look at column names, dtypes and non-null counts of the training table.
train_dataset.info()

In [None]:
# Shuffle the rows (frac=1 keeps every row, in random order) with a fixed seed.
train_dataset = train_dataset.sample(frac=1, random_state=SEED)
# NOTE(review): second shuffle uses a hard-coded seed (42) instead of SEED. One
# shuffle would be enough, but dropping this line changes the train/valid split.
train_dataset = train_dataset.sample(frac=1, random_state=42)

In [None]:
# After shuffling, renumber the rows 0..n-1 so positional and label indexing agree.
train_dataset.index = list(range(train_dataset.shape[0]))

In [None]:
# Hold out the last tenth (integer division) of the shuffled table for validation;
# everything before that boundary is used for training.
n_rows = train_dataset.shape[0]
split_at = n_rows - n_rows // 10
train = train_dataset.iloc[:split_at].copy()
valid = train_dataset.iloc[split_at:].copy()

In [None]:
# Create the per-class sub-folders that torchvision's ImageFolder reads from.
# The test images all go into a single dummy class folder '0'.
for folder in (
    'train_dataset/1',
    'train_dataset/0',
    'valid_dataset/1',
    'valid_dataset/0',
    'test_dataset/0',
):
    os.makedirs(folder, exist_ok=True)

In [None]:
# Sort the training images into class sub-folders.
# Each row is read positionally: first column = file name, second column = label.
# `.iloc` is used because integer keys on a label-indexed Series (`item[0]`)
# rely on a positional fallback that pandas has deprecated.
for _, item in train.iterrows():
    file_name, label = item.iloc[0], item.iloc[1]
    target_dir = 'train_dataset/1' if label == 1 else 'train_dataset/0'
    shutil.move('train_dataset/' + file_name, target_dir)


In [None]:
# Move the held-out images from the extracted training folder into the
# validation class sub-folders.
# Each row is read positionally: first column = file name, second column = label.
# `.iloc` is used because integer keys on a label-indexed Series (`item[0]`)
# rely on a positional fallback that pandas has deprecated.
for _, item in valid.iterrows():
    file_name, label = item.iloc[0], item.iloc[1]
    target_dir = 'valid_dataset/1' if label == 1 else 'valid_dataset/0'
    shutil.move('train_dataset/' + file_name, target_dir)

In [None]:
# Put every test image into the single dummy class folder so ImageFolder can
# read them. The file name is the first column of each row; `.iloc` avoids the
# deprecated positional fallback of `item[0]` on a label-indexed Series.
for _, item in test_dataset.iterrows():
    shutil.move('test_dataset/' + item.iloc[0], 'test_dataset/0')

In [None]:
# Image transformations
# Channel statistics used for normalisation (ImageNet standards).
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Training adds a random horizontal flip as augmentation; validation (also
# reused for the test set) applies only resize -> tensor -> normalise.
_train_steps = [
    transforms.Resize(size=224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
]
_valid_steps = [
    transforms.Resize(size=224),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
]

image_transforms = {
    'train': transforms.Compose(_train_steps),
    'valid': transforms.Compose(_valid_steps),
}


In [None]:
class ImageFolderWithPaths(datasets.ImageFolder):
    """ImageFolder variant whose samples also carry the image's file name.

    Each item is the tuple returned by ``ImageFolder.__getitem__`` with the
    bare file name (no directory components) appended as an extra element.
    """

    def __getitem__(self, index):
        """Return the regular ImageFolder sample for ``index`` plus its file name."""
        # this is what ImageFolder normally returns
        original_tuple = super().__getitem__(index)
        # Strip the directory part with basename instead of slicing off a fixed
        # 15 characters, which was only correct for the exact prefix
        # 'test_dataset/0/'.
        file_name = os.path.basename(self.imgs[index][0])
        return original_tuple + (file_name,)

In [None]:
# One dataset per split. The test split reuses the validation transforms and
# the path-aware dataset class so predictions can be matched back to file names.
data = {
    'train': datasets.ImageFolder(root='train_dataset', transform=image_transforms['train']),
    'valid': datasets.ImageFolder(root='valid_dataset', transform=image_transforms['valid']),
    'test': ImageFolderWithPaths(root='test_dataset', transform=image_transforms['valid']),
}

# Dataloader iterators: train and valid are shuffled, test keeps dataset order
# and never drops the final partial batch.
dataloaders = {
    split: DataLoader(data[split], batch_size=BATCH_SIZE, shuffle=True)
    for split in ('train', 'valid')
}
dataloaders['test'] = DataLoader(
    data['test'], batch_size=BATCH_SIZE, shuffle=False, drop_last=False
)

In [None]:
# Pull a single training batch and display the tensor shapes as a sanity check.
trainiter = iter(dataloaders['train'])
first_batch = next(trainiter)
features, labels = first_batch
features.shape, labels.shape

In [None]:
# Install (or upgrade to the latest) efficientnet-pytorch, imported in the next cell.
# NOTE(review): the version is not pinned, so results may differ between runs.
!pip install --upgrade efficientnet-pytorch

In [None]:
from efficientnet_pytorch import EfficientNet

In [None]:
# EfficientNet-B0 created via from_pretrained, configured for 2 output classes.
model = EfficientNet.from_pretrained('efficientnet-b0', num_classes=2)

In [None]:
# Freeze every parameter tensor after the first 185 (in model.parameters() order);
# the first 185 keep their current requires_grad setting.
j = 0
for j, param in enumerate(model.parameters(), start=1):
    if j > 185:
        param.requires_grad = False

In [None]:
# model.classifier = nn.Sequential(
#                     nn.Dropout(p=0.2, inplace=True),
#                     nn.Linear(in_features=1280, out_features=2, bias=True)
#                     )

In [None]:
# Move the model to the selected device (GPU when available, otherwise CPU)
model = model.to(device)

In [None]:
# Report how many scalar weights the model has in total and how many of them
# are currently trainable (requires_grad=True).
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]

total_params = sum(size for size, _ in param_sizes)
print(f'{total_params:,} total parameters.')

total_trainable_params = sum(size for size, trainable in param_sizes if trainable)
print(f'{total_trainable_params:,} training parameters.')

In [None]:
# Cross-entropy loss over the two class logits.
criterion = nn.CrossEntropyLoss()
# Adam with default hyper-parameters, registered over all model parameters
# (including the ones whose requires_grad is currently False).
optimizer = optim.Adam(model.parameters())

In [None]:
def plot_without_attention(tr_err, ts_err, tr_acc, ts_acc):
    """Redraw the learning curves in the notebook output area.

    Plots ``tr_err`` and ``ts_err`` together on the first panel and ``tr_acc``
    and ``ts_acc`` together on the second; the remaining two panels are left
    blank. The previous cell output is cleared before the new figure is shown.

    Args:
        tr_err: sequence of values plotted with the label 'tr_err'.
        ts_err: sequence of values plotted with the label 'ts_err'.
        tr_acc: sequence of values plotted with the label 'tr_acc'.
        ts_acc: sequence of values plotted with the label 'ts_acc'.
    """
    fig, axs = plt.subplots(1, 4, figsize=(20, 5))
    axs[0].plot(tr_err, label='tr_err')
    axs[0].plot(ts_err, label='ts_err')
    axs[0].legend()
    axs[1].plot(tr_acc, label='tr_acc')
    axs[1].plot(ts_acc, label='ts_acc')
    axs[1].legend()
    axs[2].axis('off')
    axs[3].axis('off')
    display.clear_output(wait=True)
    display.display(fig)
    # Close the figure once it has been rendered: calling this every epoch
    # otherwise leaves one open figure per call in pyplot's registry.
    plt.close(fig)

In [None]:
# ---- Training configuration ------------------------------------------------
n_epochs_stop = 5             # patience: epochs without val-loss improvement
min_val_loss = float('inf')   # (np.Inf no longer exists in NumPy 2.x)
epochs_no_improve = 0
checkpoint_path = 'effb0'     # where the best weights are written

# epoch -> number of leading parameter tensors (in model.parameters() order)
# that are trainable from that epoch on; everything after them is frozen.
# NOTE(review): this freezes the *tail* of the parameter list. If the
# classification head sits at the end of that order it is never trained -
# confirm this is intended.
UNFREEZE_SCHEDULE = {1: 190, 2: 195, 3: 200, 6: 205}


def set_trainable_prefix(net, n_trainable):
    """Make the first ``n_trainable`` parameter tensors of ``net`` trainable
    and freeze all the remaining ones."""
    for position, param in enumerate(net.parameters(), start=1):
        param.requires_grad = position <= n_trainable


# NOTE(review): the range starts at 6, so on a fresh kernel only the epoch-6
# entry of the schedule fires; the earlier entries only matter when this cell
# is re-run with a different start epoch.
for epoch in range(6, 10):
    if epoch in UNFREEZE_SCHEDULE:
        set_trainable_prefix(model, UNFREEZE_SCHEDULE[epoch])

    # ---- training pass ----
    model.train()  # enable dropout / batch-norm statistic updates
    accs = []
    for step, (inputs, targets) in enumerate(tqdm_notebook(dataloaders['train'])):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Gradients accumulate across backward() calls, so they must be reset
        # before each step or every update also contains all earlier batches.
        optimizer.zero_grad()
        out = model(inputs)
        loss = criterion(out, targets)
        loss.backward()
        optimizer.step()

        pred = torch.argmax(out, dim=1)
        accs.append(accuracy_score(pred.cpu().detach(), targets.cpu().detach()))
        if step % 15 == 0:
            print(np.mean(accs))  # running training accuracy
        torch.cuda.empty_cache()

    # ---- validation pass ----
    model.eval()  # use running batch-norm statistics, disable dropout
    accs = []
    val_loss = 0.0
    with torch.no_grad():
        for step, (inputs, targets) in enumerate(tqdm_notebook(dataloaders['valid'])):
            inputs = inputs.to(device)
            targets = targets.to(device)
            out = model(inputs)
            loss = criterion(out, targets)

            pred = torch.argmax(out, dim=1)
            accs.append(accuracy_score(pred.cpu().detach(), targets.cpu().detach()))
            if step % 5 == 0:
                print(np.mean(accs))  # running validation accuracy
            # .item() keeps the running sum a plain float instead of a tensor
            val_loss += loss.item()
            torch.cuda.empty_cache()

    val_loss = val_loss / len(dataloaders['valid'])
    if val_loss < min_val_loss:
        # New best epoch: persist the weights. The state_dict is saved rather
        # than the pickled module, so 'effb0' now holds weights only and must be
        # read back with load_state_dict; this loads on any torch version.
        torch.save(model.state_dict(), checkpoint_path)
        epochs_no_improve = 0
        min_val_loss = val_loss
    else:
        epochs_no_improve += 1
        # Check early stopping condition
        if epochs_no_improve == n_epochs_stop:
            print('Early stopping!')
            # Restore the best weights and actually stop training.
            model.load_state_dict(torch.load(checkpoint_path, map_location=device))
            break

In [None]:

def finetune_pass(loader, log_every=5):
    """Run one optimisation pass over ``loader`` and return per-batch accuracies.

    The model weights ARE updated (backward + optimizer step on every batch);
    the running mean accuracy is printed every ``log_every`` batches.
    """
    model.train()
    batch_accs = []
    for step, (inputs, targets) in enumerate(tqdm_notebook(loader)):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Reset gradients so each step uses only the current batch.
        optimizer.zero_grad()
        out = model(inputs)
        loss = criterion(out, targets)
        loss.backward()
        optimizer.step()

        pred = torch.argmax(out, dim=1)
        batch_accs.append(accuracy_score(pred.cpu().detach(), targets.cpu().detach()))
        if step % log_every == 0:
            print(np.mean(batch_accs))
        torch.cuda.empty_cache()
    return batch_accs


# Extra passes over validation -> training -> validation data.
# NOTE(review): these passes train on the validation split, so the accuracies
# printed here are not held-out estimates. If this cell was meant as a pure
# evaluation, the backward/step calls should be removed instead - confirm.
for split in ('valid', 'train', 'valid'):
    accs = finetune_pass(dataloaders[split])

In [None]:
# Accumulators for the test-set predictions and the matching file names.
predict, pathes = [], []

In [None]:
# Predict a class for every test image, remembering each image's file name.
# eval() is required here: without it batch-norm uses (and updates) per-batch
# statistics and dropout stays active, so predictions depend on batch contents.
model.eval()
predict = []
pathes = []
with torch.no_grad():
    # The dummy class label that ImageFolder yields for test images is ignored.
    for inputs, _, path in tqdm_notebook(dataloaders['test']):
        pathes += path
        out = model(inputs.to(device))
        pred = torch.argmax(out, dim=1)
        predict += pred.cpu().numpy().tolist()
        torch.cuda.empty_cache()

In [None]:
# Mean of the predicted labels, i.e. the share of test images predicted as class 1.
np.mean(predict)

In [None]:
# Attach file names and predictions (both in DataLoader order) to a copy of the
# test table, then order the rows by file name.
res = (
    test_dataset
    .assign(name=pathes, is_corrupted=predict)
    .sort_values('name')
)
res

In [None]:
# Re-align predictions with the original row order of test.csv:
# sort the original table by name, paste in the name-sorted predictions
# positionally, then restore the original index order.
real_res = test_dataset.sort_values('name')
real_res['is_corrupted'] = res['is_corrupted'].tolist()
real_res = real_res.sort_index()
real_res

In [None]:
import pandas as pd
from PIL import Image
from os.path import join as pj

# Two adjacent pixels count as a "jump" when their RGB Manhattan distance is at
# least this value (see the check_*_for_noise functions).
NOISE_MANHATTAN = 200
# A row/column is flagged when more than this fraction of its pixels are jumps.
SEQUENCE_MAX_NORMAL = 0.5


def manhattan(a, b):
    """Return the L1 distance between the first three components of a and b."""
    return sum(abs(a[channel] - b[channel]) for channel in range(3))


def check_rows_for_equal(rgb_image):
    """Return 1 if any row consists of identical pixels only, else 0.

    ``rgb_image`` must expose ``size`` as (width, height) and ``getpixel((x, y))``.
    """
    width, height = rgb_image.size
    for y in range(height):
        reference = rgb_image.getpixel((0, y))
        # Stops at the first differing pixel, like the explicit break it replaces.
        if all(rgb_image.getpixel((x, y)) == reference for x in range(1, width)):
            return 1
    return 0


def check_rows_for_noise(rgb_image):
    """Return 1 if some row differs sharply from the row above it, else 0.

    A row is flagged when more than SEQUENCE_MAX_NORMAL of its pixels are at
    least NOISE_MANHATTAN away (RGB Manhattan distance) from the pixel directly
    above them.
    """
    width, height = rgb_image.size
    limit = width * SEQUENCE_MAX_NORMAL
    for y in range(1, height):
        jumps = sum(
            manhattan(rgb_image.getpixel((x, y - 1)), rgb_image.getpixel((x, y)))
            >= NOISE_MANHATTAN
            for x in range(width)
        )
        if jumps > limit:
            return 1
    return 0


def check_cols_for_equal(rgb_image):
    """Return 1 if any column consists of identical pixels only, else 0.

    ``rgb_image`` must expose ``size`` as (width, height) and ``getpixel((x, y))``.
    """
    width, height = rgb_image.size
    for x in range(width):
        reference = rgb_image.getpixel((x, 0))
        # Stops at the first differing pixel, like the explicit break it replaces.
        if all(rgb_image.getpixel((x, y)) == reference for y in range(1, height)):
            return 1
    return 0


def check_cols_for_noise(rgb_image):
    """Return 1 if some column differs sharply from the column to its left, else 0.

    A column is flagged when more than SEQUENCE_MAX_NORMAL of its pixels are at
    least NOISE_MANHATTAN away (RGB Manhattan distance) from their left neighbour.
    """
    width, height = rgb_image.size
    limit = height * SEQUENCE_MAX_NORMAL
    for x in range(1, width):
        jumps = sum(
            manhattan(rgb_image.getpixel((x - 1, y)), rgb_image.getpixel((x, y)))
            >= NOISE_MANHATTAN
            for y in range(height)
        )
        if jumps > limit:
            return 1
    return 0


def is_noisy(image_path):
    """Open the image at ``image_path`` and return 1 if any enabled check fires, else 0.

    The image is converted to RGB first. Only the "all pixels equal" row and
    column checks are enabled; check_rows_for_noise and check_cols_for_noise are
    deliberately left out of the list.
    """
    enabled_checks = (check_rows_for_equal, check_cols_for_equal)
    with Image.open(image_path) as image:
        rgb_image = image.convert('RGB')
        # any() short-circuits, so later checks are skipped once one fires.
        flagged = any(check(rgb_image) for check in enabled_checks)
    return 1 if flagged else 0


In [None]:
# (train.csv is re-read into `data` here, but the loop below does not use it.)
data = pd.read_csv('train.csv')

# Override the network's prediction with 1 whenever the pixel heuristics flag an
# image that the network labelled as 0; print each overridden file.
for row_label in range(len(real_res)):
    image_path = pj('test_dataset/0', real_res.at[row_label, 'name'])
    model_label = real_res.at[row_label, 'is_corrupted']
    heuristic_label = is_noisy(image_path)
    if heuristic_label == 1 and heuristic_label != model_label:
        print(image_path)
        real_res.at[row_label, 'is_corrupted'] = 1

In [None]:
# Share of test images labelled 1 after the heuristic overrides.
real_res['is_corrupted'].mean()

In [None]:
# Submission no. 7: write the final predictions to submit.csv.
# NOTE(review): to_csv also writes the DataFrame index as the first column by
# default - confirm the submission format expects that.

real_res.to_csv('submit.csv')