# incomplete adaptive softmax model

I wasn't able to build the full model successfully, but here's what I tried!

In [None]:
import torch
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm 


In [None]:
class AdaptiveSoftmax(nn.Module):
    """Adaptive softmax output layer: one head classifier plus per-cluster tails.

    The head scores the first ``cutoff[0]`` classes directly plus one extra
    "cluster" logit per tail. Tail ``i`` scores the classes whose index lies in
    ``[cutoff[i], cutoff[i + 1])``.

    Args:
        input_size: Number of input features per row.
        cutoff: Class-index boundaries (list or tuple). Must be strictly
            increasing so that every tail has at least one class;
            ``cutoff[-1]`` is the total number of classes.
        reduce_factor: Tail ``i`` first projects the input down to
            ``input_size // reduce_factor ** i`` features. ``1`` disables the
            projection and uses a single linear layer per tail.
    """

    def __init__(self, input_size, cutoff, reduce_factor=4):
        super().__init__()

        self.input_size = input_size
        self.cutoff = cutoff
        self.output_size = cutoff[0] + len(cutoff) - 1

        self.head = nn.Linear(input_size, self.output_size)
        self.tail = nn.ModuleList()
        # Per-tail row indices, filled in by set_target(). ``None`` means no
        # target has been registered yet.
        self.id = None

        for i, (low, high) in enumerate(zip(cutoff[:-1], cutoff[1:])):
            if reduce_factor == 1:
                seq = nn.Linear(input_size, high - low)
            else:
                hidden_size = input_size // reduce_factor ** i
                seq = nn.Sequential(
                    nn.Linear(input_size, hidden_size, False),
                    nn.Linear(hidden_size, high - low),
                )

            self.tail.append(seq)

    def set_target(self, target):
        """Record, for each tail, which batch rows have a label in its range.

        Args:
            target: Tensor of class indices; expected to be 1-D with one
                entry per batch row, because the stored indices are later
                used to select rows of the input.
        """
        self.id = []

        for low, high in zip(self.cutoff[:-1], self.cutoff[1:]):
            mask = (target >= low) & (target < high)
            # ``None`` marks a tail that no row in this batch belongs to.
            self.id.append(mask.nonzero().squeeze(1) if mask.any() else None)

    def forward(self, input, target=None):
        """Compute head logits for every row and tail logits for routed rows.

        Args:
            input: Feature tensor of shape ``(batch, input_size)``.
            target: Optional class indices. When given, they replace the
                routing stored by a previous :meth:`set_target` call.

        Returns:
            A list ``[head, tail_0, tail_1, ...]``. ``head`` holds the head
            logits for all rows. ``tail_i`` holds tail ``i``'s logits for only
            the rows routed to it, or ``None`` when no row was routed there.

        Raises:
            RuntimeError: If no target was passed here and
                :meth:`set_target` has never been called.
        """
        if target is not None:
            self.set_target(target)

        if self.id is None:
            raise RuntimeError(
                "AdaptiveSoftmax.forward() needs a target: pass `target` or "
                "call set_target() first (use log_prob() for inference)"
            )

        output = [self.head(input)]

        for tail, rows in zip(self.tail, self.id):
            if rows is None:
                output.append(None)
            else:
                output.append(tail(input.index_select(0, rows)))

        return output

    def log_prob(self, input):
        """Return log-probabilities over all ``cutoff[-1]`` classes.

        Runs under ``torch.no_grad()``, so the result carries no gradient.

        Args:
            input: Feature tensor of shape ``(batch, input_size)``.

        Returns:
            Tensor of shape ``(batch, cutoff[-1])`` on ``input``'s device.
        """
        with torch.no_grad():
            head_out = self.head(input)

            batch_size = head_out.size(0)
            prob = torch.empty(batch_size, self.cutoff[-1], device=input.device)

            lsm_head = F.log_softmax(head_out, 1)
            prob[:, : self.cutoff[0]].copy_(lsm_head[:, : self.cutoff[0]])

            for i, tail in enumerate(self.tail):
                # log P(class) = log P(cluster i) + log P(class | cluster i)
                split = lsm_head[:, self.cutoff[0] + i].unsqueeze(1)
                lsm_tail = F.log_softmax(tail(input), 1)
                prob[:, self.cutoff[i] : self.cutoff[i + 1]].copy_(lsm_tail).add_(split)

        return prob


In [None]:
class AdaptiveLoss(nn.Module):
    """Cross-entropy loss for the list output of an adaptive softmax.

    Args:
        cutoff: The same class-index boundaries used by the adaptive softmax
            layer whose output is being scored.
    """

    def __init__(self, cutoff):
        super().__init__()

        self.cutoff = cutoff

    def remap_target(self, target):
        """Split class indices into a head target and per-tail targets.

        Args:
            target: Tensor of class indices.

        Returns:
            A list ``[head_target, tail_0_target, ...]``. In ``head_target``
            every label inside tail ``i``'s range is replaced by that tail's
            cluster index ``cutoff[0] + i``. ``tail_i_target`` holds the labels
            of that range shifted to start at 0, or ``None`` when no label
            falls in the range.
        """
        new_target = [target.clone()]

        for i in range(len(self.cutoff) - 1):
            mask = (target >= self.cutoff[i]) & (target < self.cutoff[i + 1])
            new_target[0][mask] = self.cutoff[0] + i

            if mask.any():
                new_target.append(target[mask] - self.cutoff[i])
            else:
                new_target.append(None)

        return new_target

    def forward(self, input, target):
        """Return the summed head and tail cross-entropy divided by batch size.

        Args:
            input: List ``[head_logits, tail_0_logits_or_None, ...]``.
            target: Tensor of class indices, one per row of ``input[0]``.

        Returns:
            Scalar loss tensor.

        Raises:
            ValueError: If a label lies outside the logits it is scored
                against, or logits are present for a tail that has no labels.
        """
        batch_size = input[0].size(0)
        target = self.remap_target(target.detach())

        output = 0.0

        for i, logits in enumerate(input):
            if logits is None:
                continue

            labels = target[i]
            if labels is None:
                raise ValueError(
                    f"input[{i}] has logits but no target falls in that cluster"
                )

            # Valid class indices are 0 .. size(1) - 1, so the upper bound is
            # exclusive.
            if labels.min() < 0 or labels.max() >= logits.size(1):
                raise ValueError(
                    f"target for input[{i}] must lie in [0, {logits.size(1)})"
                )

            # reduction="sum" replaces the deprecated size_average=False.
            output = output + F.cross_entropy(logits, labels, reduction="sum")

        return output / batch_size

In [None]:
class ConvNet(nn.Module):
    """CNN classifier whose output layer is an adaptive softmax.

    Four 3x3 convolutions (size-preserving padding) with two 2x2 max-pools
    feed three fully connected layers. ``fc1`` expects ``8 * 8 * 512``
    features, i.e. 3-channel 32x32 inputs. The 100-dimensional ``fc3`` output
    is the input of the adaptive softmax.

    Args:
        cutoff: Class-index boundaries for the adaptive softmax. Must match
            the cutoffs given to the loss.
    """

    def __init__(self, cutoff=(2000, 10000)):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=(3, 3), padding=(1, 1))
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3, 3), padding=(1, 1))
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3, 3), padding=(1, 1))
        self.conv4 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=(3, 3), padding=(1, 1))
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(in_features=8 * 8 * 512, out_features=1024)
        self.fc2 = nn.Linear(in_features=1024, out_features=256)
        self.Dropout = nn.Dropout(0.25)
        self.fc3 = nn.Linear(in_features=256, out_features=100)
        # Built once so its weights are registered on the model: they follow
        # .to(device), appear in model.parameters() for the optimizer, and
        # persist between batches instead of being re-randomised each call.
        self.adaptive_softmax = AdaptiveSoftmax(self.fc3.out_features, list(cutoff))

    def forward(self, x, target=None):
        """Run the network and return adaptive-softmax logits.

        Args:
            x: Image batch of shape ``(batch, 3, 32, 32)``.
            target: Optional class labels, one per image. When given, rows are
                routed to the tail clusters that contain their label.

        Returns:
            A list ``[head_logits, tail_0, ...]``. Without ``target`` no row
            can be routed, so every tail entry is ``None``; that output is
            only sufficient for a loss whose labels all fall below
            ``cutoff[0]``.
        """
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = self.Dropout(x)
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.pool(x)
        x = self.Dropout(x)
        # Keep the batch dimension fixed; a wrong input size then fails loudly
        # at fc1 instead of being silently folded into the batch axis.
        x = x.flatten(1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.Dropout(x)

        x = self.fc3(x)

        if target is not None:
            return self.adaptive_softmax(x, target)

        # Routing needs class labels; the fc3 activations are not labels and
        # must not be used as a stand-in for them.
        head_logits = self.adaptive_softmax.head(x)
        return [head_logits] + [None] * len(self.adaptive_softmax.tail)

In [None]:
# Training schedule and optimiser step size.
num_epochs, batch_size = 10, 50
learning_rate = 3e-4

# Worker count passed to the data loaders.
num_workers = 4

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Training images are randomly cropped (after 4-pixel padding) and flipped
# before conversion to tensors; test images are only converted.
transform_train = transforms.Compose(
    [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ]
)
transform_test = transforms.Compose([transforms.ToTensor()])

# CIFAR-100 splits, downloaded into ./data when not already present.
cifar_trainset = datasets.CIFAR100(
    root="./data",
    train=True,
    download=True,
    transform=transform_train,
)
cifar_testset = datasets.CIFAR100(
    root="./data",
    train=False,
    download=True,
    transform=transform_test,
)

# Only the training loader shuffles.
trainset_loader = torch.utils.data.DataLoader(
    cifar_trainset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
)
testset_loader = torch.utils.data.DataLoader(
    cifar_testset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
)

In [None]:
# Network on the selected device. nn.Module.to() moves the module in place.
model = ConvNet()
model.to(device)

# The loss uses the same cutoffs as the adaptive softmax inside ConvNet.
criterion = AdaptiveLoss(cutoff=[2000, 10000])

optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Make training mode explicit so dropout is active here regardless of the mode
# earlier code (for example an interrupted evaluation) left the model in.
model.train()

for epoch in range(num_epochs):
    for data, targets in tqdm(trainset_loader):
        data = data.to(device=device)
        targets = targets.to(device=device)

        # Forward pass and loss for this batch.
        scores = model(data)
        loss = criterion(scores, targets)

        # Clear stale gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


In [None]:
def check_accuracy(loader, model):
    """Return the fraction of samples in ``loader`` the model labels correctly.

    Args:
        loader: Iterable of ``(inputs, labels)`` batches.
        model: Module returning either a ``(batch, classes)`` score tensor or
            a list/tuple whose first element is such a tensor (the
            adaptive-softmax layout, where element 0 holds the head logits).
            With list output the prediction is the argmax over the head
            logits only, which is exact only when every label belongs to the
            head.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` for an empty loader.

    The model is put in eval mode for the duration of the call and is always
    left in training mode afterwards, even if evaluation raises.
    """
    # Follow the model's own device. A model without parameters leaves the
    # batches where they already are.
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    num_correct = 0
    num_samples = 0
    model.eval()

    try:
        with torch.no_grad():
            for x, y in loader:
                if model_device is not None:
                    x = x.to(device=model_device)
                    y = y.to(device=model_device)

                scores = model(x)
                if isinstance(scores, (list, tuple)):
                    scores = scores[0]

                predictions = scores.argmax(dim=1)
                num_correct += (predictions == y).sum().item()
                num_samples += predictions.size(0)
    finally:
        # Restore training mode even when evaluation fails part-way through.
        model.train()

    if num_samples == 0:
        return 0.0
    return num_correct / num_samples


# Evaluate and report each split in turn, as a percentage with two decimals.
train_accuracy = check_accuracy(trainset_loader, model)
print(f"Accuracy on training set: {train_accuracy*100:.2f}")

test_accuracy = check_accuracy(testset_loader, model)
print(f"Accuracy on test set: {test_accuracy*100:.2f}")

This is the part where I am getting an error; I will try to figure out how to fix it!