In [20]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from torchsummary import summary
import os
import time

from dataset import HackathonDataset
from convnet import ConvNet
from resnet import ResNet

from config import DATA_DIR, DEVICE, USE_RAW, AUTO_ROTATE

In [27]:
class Ensemble:
    """Train several independent instances of one model class and average them.

    Each instance is built as ``Model(device)`` and is expected to expose
    ``fit(train_dataloader, test_dataloader, n_epochs, print_frequency)``,
    whose return value is stored as that instance's score, and
    ``predict(dataloader)``.

    Scores are ranked in ascending order, i.e. they are treated as losses
    (lower is better). NOTE(review): this matches the argmin-based ordering of
    the previous implementation; confirm against what ``Model.fit`` returns.
    """

    def __init__(self, Model, device, n_estimators):
        """Create ``n_estimators`` untrained instances of ``Model`` on ``device``."""
        self.Model = Model
        self.instances = [self.Model(device) for _ in range(n_estimators)]
        self.performances = []

    def fit(self, train_dataloader, test_dataloader, n_epochs, print_frequency):
        """Train every instance in turn and record the score each one returns.

        The score list is rebuilt on every call so that it always stays aligned
        with ``self.instances``, even when ``fit`` is called more than once.
        """
        self.performances = []
        for it, instance in enumerate(self.instances):
            print(f"\n=== Training instance {it+1}/{len(self.instances)} ===\n")
            score = instance.fit(train_dataloader, test_dataloader, n_epochs, print_frequency)
            self.performances.append(score)

    def predict(self, dataloader, percentage=1.0):
        """Average the predictions of the best-scoring instances.

        Args:
            dataloader: forwarded unchanged to each selected instance's
                ``predict`` method.
            percentage: fraction of the instances to keep, as a value between
                0 and 1 (for example ``0.6`` keeps the best 60%). The resulting
                count is clamped so that at least one and at most all instances
                are used.

        Returns:
            The element-wise mean (``np.mean(..., axis=0)``) of the selected
            instances' predictions.

        Raises:
            ValueError: if there is no recorded score for every instance, which
                is the case when ``fit`` has not been run yet.
        """
        n_instances = len(self.instances)
        if n_instances == 0 or len(self.performances) != n_instances:
            raise ValueError("Ensemble.predict() requires a fitted ensemble: call fit() first")

        n_keep = max(1, min(n_instances, int(round(percentage * n_instances))))
        # Rank without touching self.instances / self.performances, so that
        # predict() can be called repeatedly. A stable sort keeps the original
        # order between instances that have equal scores.
        best_indices = np.argsort(self.performances, kind="stable")[:n_keep]
        predictions = [self.instances[index].predict(dataloader) for index in best_indices]
        return np.mean(predictions, axis=0)

In [28]:
# Run configuration shared by the cells below.
# n_epochs and print_frequency are forwarded to Ensemble.fit;
# n_estimators is the number of model instances the ensemble creates.
n_epochs = 3
n_estimators = 20
print_frequency = 3
batch_size = 8  # Large training batches often fail to converge, so we use small ones even though training is slower
pred_batch_size = 128  # Convergence is not a concern when only predicting, so a larger batch is fine here

In [29]:
#========================NOTE============================
# Training sometimes fails to converge, for reasons that are not understood yet.
# If the training loss is stuck around 22 and the validation loss is stuck around 10,
# re-run this cell to re-create the model, then relaunch training.
#========================END OF NOTE=====================

# The worker count leaves two cores free but is clamped at zero: os.cpu_count() can
# return None or a value below 2, and DataLoader rejects a negative num_workers.
dataset = HackathonDataset(DATA_DIR + 'mixed_train.csv', DATA_DIR, USE_RAW, transform=True, auto_rotate=AUTO_ROTATE)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                        num_workers=max(0, (os.cpu_count() or 2) - 2))
val_dataset = HackathonDataset(DATA_DIR + 'mixed_validation.csv', DATA_DIR, USE_RAW, auto_rotate=AUTO_ROTATE)
val_dataloader = DataLoader(val_dataset, batch_size=pred_batch_size, shuffle=False,
                            num_workers=max(0, (os.cpu_count() or 2) - 2))
model = Ensemble(ConvNet, DEVICE, n_estimators)

In [30]:
# Train the ensemble members one after the other on the training loader,
# passing the validation loader as the second (test) dataloader argument.
model.fit(dataloader, val_dataloader, n_epochs, print_frequency)


=== Training instance 1/2 ===

Epoch 1/1
Number of batches viewed : 2373
Current training loss : 8.633549655784899
Current validation loss : 7.920380415878896
Number of batches viewed : 4747
Current training loss : 7.390445281400737
Current validation loss : 5.449304066305086
Number of batches viewed : 7121
Current training loss : 6.927971976287443
Current validation loss : 6.931379663662647
The epoch took  37.17 seconds

=== Training instance 2/2 ===

Epoch 1/1
Number of batches viewed : 2373
Current training loss : 8.336663242437345
Current validation loss : 6.16724094631165
Number of batches viewed : 4747
Current training loss : 7.234367855778238
Current validation loss : 9.655395601678082
Number of batches viewed : 7121
Current training loss : 6.954547437982792
Current validation loss : 6.247616497550424
The epoch took  36.90 seconds


# Evaluation on Test Data

In [33]:
# Test split, loaded in file order (shuffle=False).
# The worker count leaves two cores free but is clamped at zero: os.cpu_count() can
# return None or a value below 2, and DataLoader rejects a negative num_workers.
test_dataset = HackathonDataset(DATA_DIR + 'mixed_test.csv', DATA_DIR, USE_RAW, auto_rotate=AUTO_ROTATE)
test_dataloader = DataLoader(test_dataset, batch_size=pred_batch_size, shuffle=False,
                             num_workers=max(0, (os.cpu_count() or 2) - 2))

In [34]:
# Collect the file name of every test image, batch by batch, in loader order.
image_file_names = [
    file_name
    for batch in test_dataloader
    for file_name in batch['image_file_name']
]

# Ensemble prediction for the same loader, paired with the file names
# in the table that is exported afterwards.
predictions = model.predict(test_dataloader, 0.6)
submission_columns = {
    'image_id': image_file_names,
    'predicted_z': predictions,
}
kaggle_df = pd.DataFrame(submission_columns)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [12]:
# DataFrame.to_csv does not create missing folders, so make sure the target exists.
predictions_dir = 'predictions'
os.makedirs(predictions_dir, exist_ok=True)

# Timestamped file name. '_' separates date and time instead of ':' because a
# colon is not a valid file-name character on Windows.
timestamp = datetime.now().strftime("%d-%m-%y_%H-%M")
kaggle_df.to_csv(os.path.join(predictions_dir, 'prediction-' + timestamp + '.csv'), index=False)