In [1]:
# Imports

In [2]:
from fastai.vision.all import *
import os
import pandas as pd
import zipfile
import os
import pandas as pd
from torchvision.io import read_image
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import pickle



# Import files

In [3]:
# Collect the image files found under the extracted CelebA image folder.
files = get_image_files("lumos_datathon/archive/img_align_celeba")
# Last expression of the cell: shows how many image files were found.
len(files)

202599

# Exclude evaluation data

In [4]:
# Raw competition tables.
# NOTE(review): `f_eval` is not referenced anywhere else in the visible notebook; the same
# CSV is read again into `df_eval` further down this cell.
f_eval = pd.read_csv("eval_data_public.csv")
df_train = pd.read_csv("train_data.csv")

# Attribute table keyed by image id, with every -1 entry recoded to 0.
df_att = (
    pd.read_csv("list_attr_celeba.csv")
    .set_index("image_id")
    .replace({-1: 0})
)
# Attribute names, used by get_labels to turn a row mask into label strings.
columns = df_att.columns

def get_labels(row):
    """Return the attribute names selected by `row` as one comma-separated string.

    `row` is used to index the module-level `columns` (the column index of `df_att`),
    so it is expected to be a boolean mask aligned with those columns.
    """
    selected_attributes = columns[row]
    return ",".join(selected_attributes)


# Evaluation groups: each row's "16_image_ids" holds space-separated image ids.
df_eval = pd.read_csv("eval_data_public.csv")
s_eval = df_eval.set_index("id")["16_image_ids"]
df_train = df_train.set_index("id")[["16_image_ids", "anomalous_image_id", "attributes"]]

# Attribute flags become booleans so that each row can serve as a mask over `columns`
# in get_labels. `columns` is the full column index of df_att, so one astype covers it.
df_att = df_att.astype(bool)

# Gather EVERY image id that appears in any evaluation group. The set has to be
# accumulated in place: rebinding it to the ids of the current row on each iteration
# would leave only the last group's ids excluded from training.
eval_set = set()
for id_string in df_eval["16_image_ids"]:
    eval_set.update(id_string.split(" "))

# Images used to train the attribute model are the ones that never occur in an eval
# group. Sorting makes the order independent of set iteration order between runs.
train_labels = sorted(set(df_att.index).difference(eval_set))
val_labels = sorted(eval_set)

# Define ImageDataLoaders for fastai

In [5]:
path = "lumos_datathon/archive/img_align_celeba"


def _label_strings(attr_frame):
    """Apply get_labels to every row of `attr_frame` and return the results as a list."""
    return list(attr_frame.apply(get_labels, axis=1))


# Comma-separated attribute labels for: every image, the non-eval images, the eval images.
all_label_list = _label_strings(df_att)
train_label_list = _label_strings(df_att.loc[train_labels])
val_label_list = _label_strings(df_att.loc[val_labels])

# DataLoaders over the images that are not part of any evaluation group.
df_train2 = pd.DataFrame({"name": train_labels, "labels": train_label_list})
train_dls = ImageDataLoaders.from_df(df_train2, path, label_delim=",")

# DataLoaders over every image listed in the attribute table.
full_df = pd.DataFrame({"name": list(df_att.index), "labels": all_label_list})
full_dls = ImageDataLoaders.from_df(full_df, path, label_delim=",")

Due to IPython and Windows limitation, python multiprocessing isn't available now.
So `number_workers` is changed to 0 to avoid getting stuck
Due to IPython and Windows limitation, python multiprocessing isn't available now.
So `number_workers` is changed to 0 to avoid getting stuck


# Define metrics and fine-tune the ResNet-34

In [None]:
# Multi-label F1 at a 0.5 threshold, macro-averaged and sample-averaged. The explicit
# names are the column headers that appear in the training log.
f1_macro = F1ScoreMulti(thresh=0.5, average='macro')
f1_samples = F1ScoreMulti(thresh=0.5, average='samples')
f1_macro.name = 'F1(macro)'
f1_samples.name = 'F1(samples)'

# ResNet-34 learner on the non-eval images, also reporting thresholded multi-label accuracy.
learn = vision_learner(
    train_dls,
    resnet34,
    metrics=[partial(accuracy_multi, thresh=0.5), f1_macro, f1_samples],
)

# learn.lr_find()
learn.fine_tune(1, 0.003) # TODO change to 6



epoch,train_loss,valid_loss,accuracy_multi,F1(macro),F1(samples),time


# Inspect the top losses

In [None]:
# Build an interpretation object from the fine-tuned learner.
interp = Interpretation.from_learner(learn)
# Plot the top losses; 10 is the number of items requested.
interp.plot_top_losses(10)

# Export model

In [None]:
# Create the target folder first: exporting into a directory that does not exist fails.
os.makedirs(os.path.join(os.getcwd(), "learners"), exist_ok=True)
learn.export(f"{os.getcwd()}/learners/multi_model_no_eval.pkl")

# Predict on all image files

In [None]:
# Build a test DataLoader over every path in `files` and preview one batch.
dl = learn.dls.test_dl(files)
dl.show_batch()
# get_preds returns a tuple; only its first element is kept below.
# NOTE(review): presumably element 0 holds the per-image predictions — confirm against fastai docs.
preds = learn.get_preds(dl=dl)
preds = preds[0]
# Last expression of the cell: display the prediction tensor.
preds

# Save predictions

In [None]:
# Persist the raw prediction tensor so the attribute model does not have to be re-run.
with open('tensorpreds_all_no_eval.pkl', 'wb') as handle:
    pickle.dump(preds, handle, protocol=pickle.HIGHEST_PROTOCOL)

# `preds` was produced from a DataLoader built over `files`, so row i belongs to files[i].
# The lookup index is therefore derived from the file names themselves rather than from
# df_att.index, whose CSV order is not guaranteed to match the directory listing order.
image_names = [os.path.basename(str(image_file)) for image_file in files]
series_preds = pd.DataFrame.from_dict({"name": image_names, "preds": list(preds)}).set_index("name")["preds"]

# PyTorch


In [None]:
# Display the training groups table (indexed by group id; columns "16_image_ids",
# "anomalous_image_id" and "attributes").
df_train

# Define the dataset and create dataloaders

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

def get_tensor_from_traindf(row):
    """Build the 0/1 target vector for one training group.

    `row` must provide "16_image_ids" (space-separated image ids) and
    "anomalous_image_id". The result is a float64 tensor with one entry per listed id:
    1.0 where the id equals the anomalous id, 0.0 elsewhere.
    """
    anomalous_id = row["anomalous_image_id"]
    flags = [float(image_id == anomalous_id) for image_id in row["16_image_ids"].split(" ")]
    return torch.tensor(flags, dtype=torch.float64)
# One 0/1 target vector per training group, stored next to the group in df_train.
target_series = df_train.apply(get_tensor_from_traindf,axis=1)
df_train["target_series"]=target_series


# Hyper-parameters: learning_rate is passed to the optimizer, batch_size to the DataLoaders.
learning_rate = 1e-3
batch_size = 64

class CustomDataset(Dataset):
    """Dataset of image groups for the anomaly classifier.

    Args:
        series_preds: Series indexed by image id whose values are the per-image
            prediction vectors.
        df_train: DataFrame with one group per row; each row needs "16_image_ids"
            (space-separated image ids) and "target_series" (the target for the group).

    Each item is ``(inp, target)``: `inp` is the prediction vectors of the group's images
    concatenated in the order the ids are listed, `target` is the row's "target_series".
    """

    def __init__(self, series_preds, df_train):
        self.series_preds = series_preds
        self.df_train = df_train

    def __len__(self):
        return len(self.df_train)

    def __getitem__(self, idx):
        row = self.df_train.iloc[idx]
        image_ids = row["16_image_ids"].split(" ")
        target = row["target_series"]
        # Hand NumPy a plain list. Passing the Series itself makes np.concatenate index it
        # with 0, 1, 2, ...; on a string-labelled Series that relies on pandas' deprecated
        # positional fallback and stops working once the fallback is removed.
        per_image = self.series_preds.loc[image_ids].tolist()
        inp = torch.tensor(np.concatenate(per_image))
        return inp, target


# Number of training groups to use. Derived from the data so the split neither raises on
# a smaller table nor silently ignores rows of a larger one.
# For a quick smoke run, set MAX_LEN to a small number such as 10000.
MAX_LEN = len(df_train)

# Seeded local generator: the 95/5 split is reproducible across kernel restarts and does
# not touch NumPy's global random state.
SPLIT_SEED = 42
arr = np.random.default_rng(SPLIT_SEED).permutation(MAX_LEN)
split_at = int(MAX_LEN*0.95)
train_idx, val_idx = arr[:split_at], arr[split_at:]
train_dataset = CustomDataset(series_preds=series_preds, df_train=df_train.iloc[train_idx])
val_dataset = CustomDataset(series_preds=series_preds, df_train=df_train.iloc[val_idx])

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Validation metrics do not depend on sample order, so no shuffling is needed here.
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define training and evaluation

### of a neural network that takes in the label probabilities of 16 images and predicts the anomaly

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: 16*40 inputs -> 320 -> 160 -> 16 output logits.

    The input is flattened first, so it may arrive as (batch, 16, 40) or (batch, 640).
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # Layers are created in forward order; ReLU follows each hidden linear layer and
        # the final layer emits raw logits.
        layers = [
            nn.Linear(16*40, 320),
            nn.ReLU(),
            nn.Linear(320, 160),
            nn.ReLU(),
            nn.Linear(160, 16),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return one logit per group position for every sample in `x`."""
        return self.linear_relu_stack(self.flatten(x))

# Instantiate the network and move it to the selected device.
model = NeuralNetwork().to(device)
    
# Cross-entropy over the 16 output logits; AdamW updates all model parameters with the
# learning rate defined in the dataset cell.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over `dataloader`.

    Args:
        dataloader: yields ``(X, y)`` batches; ``dataloader.dataset`` must support ``len``.
        model: module being trained; it is switched to training mode here.
        loss_fn: callable taking ``(pred, y)`` and returning a scalar loss tensor.
        optimizer: optimizer bound to the parameters of `model`.

    Batches are moved to the module-level `device`. The loss is printed every 100 batches.
    """
    size = len(dataloader.dataset)
    # Explicitly enter training mode so mode-dependent layers (dropout, batch norm)
    # behave correctly even if an evaluation pass ran on the model before.
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X = X.to(device)
        y = y.to(device)

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            # Approximate number of samples seen so far (exact while batches are full).
            current = batch * len(X)
            print(f"loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")
            
def test_loop(dataloader, model, loss_fn):
    """Evaluate `model` on `dataloader` and print accuracy and average loss.

    Args:
        dataloader: yields ``(X, y)`` batches, where `y` holds one score per class so that
            ``y.argmax(1)`` is the true class; ``dataloader.dataset`` must support ``len``.
        model: module to evaluate.
        loss_fn: callable taking ``(pred, y)`` and returning a scalar loss tensor.

    Batches are moved to the module-level `device`. The model is put in evaluation mode
    for the duration of the pass and returned to whatever mode it was in before.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for X, y in dataloader:
                X = X.to(device)
                y = y.to(device)
                pred = model(X)
                test_loss += loss_fn(pred, y).item()
                # A prediction counts as correct when the top logit is at the target position.
                correct += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()
    finally:
        # Restore the caller's mode so a following training epoch is unaffected.
        model.train(was_training)

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    


# Train

In [None]:
# Number of full passes over the training set.
epochs = 1
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    # One optimisation pass, then a report on the held-out validation split.
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(val_dataloader, model, loss_fn)
print("Done!")

# Save model

In [None]:
# Create the output folder first: saving into a directory that does not exist fails.
os.makedirs("models", exist_ok=True)
# Saves the whole pickled module, so loading needs the NeuralNetwork class to be importable.
torch.save(model,"models/ep9_noleak.pkl")

# Inference

In [None]:
# Build one (1, 640) input row per evaluation group: the prediction vectors of its
# 16 images, concatenated in the order the ids are listed.
tensors = []
for id_string in df_eval["16_image_ids"]:
    image_ids = id_string.split(" ")
    # .tolist() gives NumPy a plain list; passing the string-indexed Series directly
    # relies on pandas' deprecated positional integer fallback.
    per_image = series_preds.loc[image_ids].tolist()
    inp = torch.tensor(np.concatenate(per_image)).reshape((1, 640))
    tensors.append(inp)

tensors = torch.cat(tensors, dim=0)
tensors = tensors.to(device)

# Predict in evaluation mode without gradients, then restore the model's previous mode.
was_training = model.training
model.eval()
with torch.no_grad():
    eval_preds = model(tensors)
model.train(was_training)

# Position (0-15) of the highest logit = predicted anomalous image within each group.
eval_label_list = eval_preds.argmax(1).tolist()

# df_eval still has its default positional index, so the group ids have to be taken
# from the "id" column; df_eval.index would only yield 0..n-1.
result_df = pd.DataFrame.from_dict({"id": list(df_eval["id"]), "anomalous_image_index": eval_label_list})
result_df = result_df.set_index("id")

# Save dataframe ready for submission

In [None]:
# Create the output folder first: writing into a directory that does not exist fails.
os.makedirs("result_csvs", exist_ok=True)
result_df.to_csv("result_csvs/ep9_noleak.csv")

# Misc

### Testing how many labels are in df_eval

In [None]:
# Display the evaluation table for a quick look at its size and contents.
df_eval

### Unzipping (run once, before loading the image files)

In [None]:
# Extract the competition archive into ./lumos_datathon.
# NOTE(review): the image folder read near the top of the notebook lives under this
# directory, so on a fresh checkout this cell has to run before that one.
with zipfile.ZipFile("lumos-datathon.zip", 'r') as zip_ref:
    zip_ref.extractall("lumos_datathon")