In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [28]:
# Tiny labelled corpus: each entry is (list of tokens, language label).
data = [
    ("me gusta comer en la cafeteria".split(), "SPANISH"),
    ("Give it to me".split(), "ENGLISH"),
    ("No creo que sea una buena idea".split(), "SPANISH"),
    ("No it is not a good idea to get lost at sea".split(), "ENGLISH"),
]

test_data = [
    ("Yo creo que si".split(), "SPANISH"),
    ("it is lost on me".split(), "ENGLISH"),
]

# Build the vocabulary by assigning each unique word its own index, in order of
# first appearance. Labels get indices the same way. Test sentences are included
# so that every test word also has a slot in the bag-of-words vector.
vocab = {}
labels = {}
for sent, lab in data + test_data:
    for word in sent:
        # setdefault only inserts when the word is new, so existing indices are kept
        vocab.setdefault(word, len(vocab))
    labels.setdefault(lab, len(labels))

VOCAB_SIZE = len(vocab)
NUM_LABELS = len(labels)

print(vocab)
print(labels)

{'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22, 'Yo': 23, 'si': 24, 'on': 25}
{'SPANISH': 0, 'ENGLISH': 1}


In [29]:
class BoWClassifier(nn.Module):
    """Bag-of-words classifier: a single affine map followed by log-softmax.

    The model computes ``log_softmax(A x + b)`` where ``x`` is a bag-of-words
    count vector. ``A`` and ``b`` are the only learnable parameters and both
    live inside ``self.linear``.
    """

    def __init__(self, num_labels, vocab_size):
        """Create the affine layer.

        Args:
            num_labels: number of output classes (output dimension of the layer).
            vocab_size: number of distinct words (input dimension of the layer),
                one input feature per vocabulary entry.
        """
        # nn.Module must be initialised before any sub-module is attached to self.
        super().__init__()

        # nn.Linear provides the affine map: one input per vocabulary word,
        # one output score per label.
        self.linear = nn.Linear(in_features=vocab_size, out_features=num_labels)

        # Log-softmax has no parameters, so nothing else is registered here.

    def forward(self, bow_vec):
        """Return log-probabilities over the labels for ``bow_vec``.

        Log-softmax is taken over dimension 1, so ``bow_vec`` is expected to be
        2-D with the vocabulary features along that dimension.
        """
        label_scores = self.linear(bow_vec)
        return F.log_softmax(label_scores, dim=1)

def get_key(dictionary, val):
    """Reverse lookup: return the first key in ``dictionary`` whose value equals ``val``.

    Keys are examined in the dictionary's iteration order. If no value matches,
    the string "key doesn't exist" is returned instead of raising.
    """
    matching_keys = (key for key, value in dictionary.items() if val == value)
    return next(matching_keys, "key doesn't exist")
  

In [30]:
def make_bow_vector(sentence, vocab):
    """Convert a tokenised sentence into a bag-of-words count vector.

    Args:
        sentence: iterable of word tokens.
        vocab: mapping from word to its index in the vector. The vector length
            is taken from this mapping, so the function works with whichever
            vocabulary is passed in.

    Returns:
        A float tensor of shape ``(1, len(vocab))`` where entry ``vocab[w]``
        holds the number of times ``w`` occurs in ``sentence``.

    Raises:
        KeyError: if ``vocab`` is a dict and a word in ``sentence`` is not in it.
    """
    # Size the vector from the vocabulary argument rather than a module-level
    # constant, so the two can never disagree.
    vec = torch.zeros(len(vocab))
    for word in sentence:
        vec[vocab[word]] += 1
    # Add a leading dimension of size 1 so the result is a single-row 2-D tensor.
    return vec.view(1, -1)


def make_target(lab, labels):
    """Wrap the integer index of label ``lab`` in a one-element LongTensor.

    ``labels`` maps label names to integer class indices; the result is the
    index ``labels[lab]`` as a 1-element tensor of 64-bit integers.
    """
    label_index = labels[lab]
    return torch.LongTensor([label_index])

In [31]:
model = BoWClassifier(num_labels=NUM_LABELS, vocab_size=VOCAB_SIZE)

# Show the freshly initialised parameters of the model's linear layer.
for param in model.parameters():
    print(param)

Parameter containing:
tensor([[-0.0961,  0.0703,  0.1912,  0.1877,  0.1613, -0.1460,  0.0187,  0.1140,
         -0.1740,  0.1363,  0.1667,  0.0098,  0.1887,  0.1470, -0.1912,  0.1458,
         -0.0378, -0.1044,  0.1218,  0.0467, -0.1201, -0.1559, -0.0839, -0.0589,
          0.1567,  0.1368],
        [ 0.0110, -0.0164,  0.0794,  0.0679, -0.0557,  0.0086, -0.1139, -0.0593,
         -0.0156,  0.1427, -0.0043,  0.1025, -0.1779, -0.1048,  0.1961,  0.1876,
         -0.1954, -0.0689, -0.1646, -0.1470, -0.1670,  0.0085, -0.1381,  0.0012,
          0.1302,  0.0108]], requires_grad=True)
Parameter containing:
tensor([-0.0245,  0.1824], requires_grad=True)


In [32]:
# Baseline: score the test sentences with the untrained model so there is a
# "before" to compare against once training is done.
with torch.no_grad():
    for instance, label in test_data:
        bow_vec = make_bow_vector(instance, vocab)
        log_probs = model(bow_vec)
        print(log_probs)
        # The predicted class is the index with the highest log-probability;
        # get_key maps that index back to its label name.
        best_index = log_probs.view(1, -1).argmax(dim=1).item()
        label_pred = get_key(labels, best_index)
        print('Predicted :: ' , label_pred, '  Ground truth :: ', label)

# Print the column of the first parameter (the weight matrix) that corresponds
# to "creo": one weight per label.
creo_column = next(model.parameters())[:, vocab["creo"]]
print(creo_column)

tensor([[-0.7776, -0.6153]])
Predicted ::  ENGLISH   Ground truth ::  SPANISH
tensor([[-0.7039, -0.6825]])
Predicted ::  ENGLISH   Ground truth ::  ENGLISH
tensor([ 0.1667, -0.0043], grad_fn=<SelectBackward>)


In [33]:
# Negative log-likelihood loss, applied to the log-probabilities the model
# returns; plain SGD updates the model's parameters.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Usually you want to pass over the training data several times; here we make
# 10 passes (epochs). On a real data set, somewhere between 5 and 30 epochs is
# usually reasonable.
for epoch in range(10):
    for instance, label in data:
        # Build the inputs: the BOW vector for the sentence, and the target
        # wrapped in a tensor as an integer. For example, if the target is
        # SPANISH we wrap the integer 0, and the loss function then knows that
        # the 0th element of the log probabilities is the one for SPANISH.
        bow_vec = make_bow_vector(instance, vocab)
        target = make_target(label, labels)

        # PyTorch accumulates gradients, so clear out whatever the previous
        # instance left behind before computing new ones.
        model.zero_grad()

        # Forward pass.
        log_probs = model(bow_vec)

        # Compute the loss, backpropagate to get the gradients, and let the
        # optimizer update the parameters.
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()


In [35]:
# Re-run the test sentences through the trained model to compare against the
# predictions made before training.
with torch.no_grad():
    for instance, label in test_data:
        bow_vec = make_bow_vector(instance, vocab)
        log_probs = model(bow_vec)
        print(log_probs)
        # argmax picks the most likely class index; get_key maps it back to a label name
        label_pred = get_key(labels, log_probs.view(1, -1).argmax(1).item())
        print('Predicted :: ' , label_pred, '  Ground truth :: ', label)

# Index corresponding to Spanish goes up, English goes down!
# The word-to-index mapping in this notebook is `vocab`, so the weight-matrix
# column for "creo" is looked up through it.
print(next(model.parameters())[:, vocab["creo"]])

tensor([[-0.3510, -1.2174]])
Predicted ::  SPANISH   Ground truth ::  SPANISH
tensor([[-1.6799, -0.2063]])
Predicted ::  ENGLISH   Ground truth ::  ENGLISH
tensor([ 0.4006, -0.2383], grad_fn=<SelectBackward>)
