<a href="https://colab.research.google.com/github/SamuelMiller413/Deep-Learning-Course/blob/main/dl_categorical_data_with_samuel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
N_CATEGORICAL_COLUMNS = 1
N_ROWS = 1000
N_CATEGORIES_PER_COLUMN = 5
N_LABELS = 3
EMBED_DIM = 16

In [3]:
data = torch.randint(0, N_CATEGORIES_PER_COLUMN, (N_ROWS, N_CATEGORICAL_COLUMNS)).squeeze()

In [4]:
data[:5] # Think about this as category-encoded but not one-hot encoded

tensor([1, 4, 1, 3, 0])

In [5]:
emb = nn.Embedding(N_CATEGORIES_PER_COLUMN, EMBED_DIM)

In [6]:
emb(data)

tensor([[ 0.1319, -0.3124,  0.3684,  ..., -0.9350, -1.2497, -1.4622],
        [-0.1704, -0.7308,  0.6043,  ..., -0.3238,  0.2762, -0.1124],
        [ 0.1319, -0.3124,  0.3684,  ..., -0.9350, -1.2497, -1.4622],
        ...,
        [ 1.7571, -1.2855, -1.0835,  ..., -1.1242, -0.9603, -0.1308],
        [ 0.1319, -0.3124,  0.3684,  ..., -0.9350, -1.2497, -1.4622],
        [ 0.1319, -0.3124,  0.3684,  ..., -0.9350, -1.2497, -1.4622]],
       grad_fn=<EmbeddingBackward0>)

In [7]:
emb.weight.data.shape
# we are projecting from 5 categories -> 16 continuous values

torch.Size([5, 16])

In [8]:
# if the category for a row is 4, embedding is doing the following:
emb.weight.data[4,:]

tensor([-0.1704, -0.7308,  0.6043, -0.0252, -0.8174, -0.4566, -1.7945, -0.3627,
         0.9852, -1.7076,  0.2936, -0.9353, -0.8309, -0.3238,  0.2762, -0.1124])

In [9]:
emb.weight.data

tensor([[-1.0977, -0.6235,  1.5395, -0.1441,  0.4608,  0.4374, -1.1307,  0.4807,
          0.0903,  0.3259,  0.5581, -0.7109,  0.7812, -0.8843,  0.1762,  0.0488],
        [ 0.1319, -0.3124,  0.3684,  1.0828,  2.1708,  0.7908, -0.6787, -0.7855,
         -1.6660, -1.6110,  0.7266, -0.3517, -0.2995, -0.9350, -1.2497, -1.4622],
        [ 1.7571, -1.2855, -1.0835, -0.1057, -1.1563, -0.7365, -1.2056, -0.6754,
          0.8086,  0.3433, -0.0413,  0.8291, -0.4298, -1.1242, -0.9603, -0.1308],
        [-2.2820, -0.3605, -1.6398, -0.5697,  0.9269,  0.5104, -0.1483, -0.2359,
         -0.1281,  0.2689,  1.9507,  0.1574,  2.1857,  0.4530,  0.3748,  1.8512],
        [-0.1704, -0.7308,  0.6043, -0.0252, -0.8174, -0.4566, -1.7945, -0.3627,
          0.9852, -1.7076,  0.2936, -0.9353, -0.8309, -0.3238,  0.2762, -0.1124]])

In [11]:
# If we did it the slow way with matmul, it would look like this...
torch.tensor([0,0,0,1,0]).float()@emb.weight.data

tensor([-2.2820, -0.3605, -1.6398, -0.5697,  0.9269,  0.5104, -0.1483, -0.2359,
        -0.1281,  0.2689,  1.9507,  0.1574,  2.1857,  0.4530,  0.3748,  1.8512])

In [12]:
data = torch.randint(0, N_CATEGORIES_PER_COLUMN, (N_ROWS, 3))

In [13]:
targets = torch.randn(N_ROWS)

In [14]:
data.shape

torch.Size([1000, 3])

In [15]:
# we need to tell our module how many catego
categories_dict = {
    0:5,
    1:5,
    2:5
}

In [16]:
class MultiCategoryEmbedding(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb_1 = nn.Embedding(5, EMBED_DIM)
        self.emb_2 = nn.Embedding(5, EMBED_DIM)
        self.emb_3 = nn.Embedding(5, EMBED_DIM)
        # if you have continuous data, create a single input for those:
        # self.lin_input = nn.Linear(n_continuous_input_features, hidden_dim)

        # Concat embeddings
        self.proj = nn.Linear(EMBED_DIM * 3, 1)
        # self.proj = nn.Linear(EMBED_DIM * 3 + hidden_dim, 1)
        # If you want to add embeddings, do this:
        # self.proj = nn.Linear(EMBED_DIM, 1)
        # with additional continuous features, you'd need to concatenate (or add)
        # them to your embeddings

    def forward(self, x):
        x1, x2, x3 = x[:, 0], x[:,1], x[:,2]
        # x1, x2, x3 = x
        x1 = self.emb_1(x1)
        x2 = self.emb_2(x2)
        x3 = self.emb_3(x3)
        # lin_input = self.lin_input(lin_features) # if we had continuous input
        cat = torch.cat((x1, x2, x3), dim=-1)
        # cat = torch.cat((x1, x2, x3, lin_input), dim=-1) # if we had continuous inputs
        cat = F.relu(cat)
        out = self.proj(cat)
        return out



In [17]:
model = MultiCategoryEmbedding()

In [18]:
class DummyDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        super().__init__()
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)
    def __getitem__(self, idx):
        return self.x[idx, :], self.y[idx]

In [19]:
ds = DummyDataset(data, targets)
dl = torch.utils.data.DataLoader(ds, batch_size=8)

In [20]:
x, y = next(iter(dl))

In [21]:
x

tensor([[3, 4, 4],
        [0, 1, 4],
        [0, 2, 0],
        [4, 1, 2],
        [4, 3, 4],
        [2, 4, 1],
        [0, 1, 4],
        [1, 0, 3]])

In [22]:
outputs = model(x)
outputs

tensor([[ 0.1831],
        [ 0.2394],
        [-0.6227],
        [ 0.7101],
        [ 0.5873],
        [ 0.2400],
        [ 0.2394],
        [ 0.4283]], grad_fn=<AddmmBackward0>)

In [24]:
y

tensor([ 1.3269,  0.2697, -0.9768,  0.3092,  0.2830,  1.3253, -0.8072, -0.0094])

In [25]:
loss = F.mse_loss(outputs, y)

  """Entry point for launching an IPython kernel.


In [26]:
loss.backward()

In [27]:
from torch import optim

In [28]:
opt = optim.Adam(model.parameters(), lr=1e-3)

In [29]:
opt.step()
opt.zero_grad()

In [30]:
nn.EmbeddingBag(3,EMBED_DIM)

EmbeddingBag(3, 16, mode=mean)

# Tabular


In [31]:
from fastai import tabular