In [3]:
import torch

ModuleNotFoundError: No module named 'torch'

In [25]:
# Read dataset
def _read_tsv(path):
    """Load one ``label<TAB>text`` file as a list of ``(label, tokens)`` pairs."""
    examples = []
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, encoding="utf-8") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines, e.g. an extra newline at EOF
            # Split on the first tab only, so a tab inside the text stays in the text
            # instead of breaking the two-way unpacking.
            label, text = line.split("\t", 1)
            tokens = text.split()  # could do better tokenization here
            examples.append((label, tokens))
    return examples


def read_data(train_path="SH-TTC/train.tsv", dev_path="SH-TTC/dev.tsv"):
    """
    Read the training and development splits.

    Each non-blank line must look like ``label<TAB>text``; the text is
    tokenized on whitespace.

    Args:
        train_path: path of the training TSV file.
        dev_path: path of the development TSV file.

    Returns:
        (train, dev), each a list of (label, tokens) tuples.

    Raises:
        ValueError: if a non-blank line contains no tab separator.
    """
    return _read_tsv(train_path), _read_tsv(dev_path)

# Load both splits once; each is a list of (label, tokens) pairs.
train_data_raw, dev_data_raw = read_data()

In [8]:
# Peek at the first training example: a (label, tokens) pair.
train_data_raw[0]

In [9]:
def build_voc(data):
    """
    Map every distinct feature in `data` to a unique integer id.

    Id 0 is reserved for "[UNK]"; all other features are numbered
    1, 2, 3, ... in order of first appearance.

    Args:
        data: iterable of (label, features) pairs; the label is ignored.

    Returns:
        dict from feature to integer id.
    """
    feat2idx = {"[UNK]": 0}
    for _label, features in data:
        for feat in features:
            # Ids are dense, so the next free id always equals the current size;
            # setdefault leaves features that are already present untouched.
            feat2idx.setdefault(feat, len(feat2idx))
    return feat2idx

# Tokens that occur only in dev are absent from this mapping; to_id sends them to [UNK].
feat2idx = build_voc(train_data_raw)  # only use train, not dev (why?)

In [26]:
def to_id(feat):
    """Return the vocabulary id of `feat`, or the id of "[UNK]" if it is not in feat2idx."""
    unk_id = feat2idx["[UNK]"]
    return feat2idx.get(feat, unk_id)

In [13]:
# Number of vocabulary entries, including the reserved [UNK] slot.
VOC_SIZE = len(feat2idx)
VOC_SIZE

In [27]:
def process_data(raw_data):
    """
    Turn (label, features) pairs into (x, y) tensor pairs.

    x is a bag-of-words count vector of length VOC_SIZE (features that are
    not in the vocabulary are counted in the [UNK] slot via to_id).
    y is a one-element tensor: 0 for the label "SH", 1 for anything else (TTC).

    Args:
        raw_data: iterable of (label, features) pairs.

    Returns:
        list of (x, y) tuples, in the same order as `raw_data`.
    """
    def encode(label, features):
        # target: SH -> 0, TTC -> 1
        target = torch.Tensor([0]) if label == "SH" else torch.Tensor([1])

        # input: one count per vocabulary entry
        counts = torch.zeros(VOC_SIZE)
        for feat in features:
            counts[to_id(feat)] += 1
        return counts, target

    return [encode(label, features) for label, features in raw_data]

In [28]:
# Convert both splits into lists of (x, y) tensor pairs.
train_data = process_data(train_data_raw)
dev_data = process_data(dev_data_raw)

In [29]:
# First training example as an (x, y) pair of tensors.
train_data[0]

In [22]:
# x is created with torch.zeros(VOC_SIZE), so it has one entry per vocabulary item.
train_data[0][0].shape

In [23]:
# y is created from a one-element list, so it is a single-element tensor.
train_data[0][1].shape

## Model!

Logistic regression computes $\sigma(Wx + b)$. `torch.nn.Linear` is a PyTorch module that implements $Wx + b$, and `torch.sigmoid` is the sigmoid function $\sigma$.

In [30]:
# Standalone linear layer, used in the next few cells to see what Wx + b produces.
lin = torch.nn.Linear(VOC_SIZE, 1)  # input = |V|, output = 1

In [31]:
# what are its parameters?
# Prints each parameter tensor of the layer (presumably the weight W and the bias b).
for p in lin.parameters():
    print(p)

In [33]:
# test out linear
# Build a count vector for a toy sentence the same way process_data does:
# each token increments the slot given by to_id (unknown tokens share the [UNK] slot).
features = ["hello", "this", "is", "a", "test"]
x = torch.zeros(VOC_SIZE)
for feat in features:
    x[to_id(feat)] += 1

In [34]:
# Apply the (untrained) linear layer to the toy count vector.
lin(x)  # Wx + b

Notice the `grad_fn` in the output: PyTorch automatically records the operations that produced this tensor so that it can compute gradients later. Handy!

In [35]:
# Pass the linear output through the sigmoid to get the logistic regression output.
torch.sigmoid(lin(x))  # sigmoid(Wx + b)

In [36]:
class LogisticRegressionClassifier(torch.nn.Module):
    """Binary logistic regression: sigmoid(Wx + b) over a voc_size-dimensional input."""

    def __init__(self, voc_size):
        """Create the single linear layer mapping `voc_size` inputs to one output."""
        super().__init__()
        # This layer holds both W and b.
        self.linear = torch.nn.Linear(in_features=voc_size, out_features=1)

    def forward(self, x):
        """Run when the model is called as model(x); returns sigmoid(Wx + b)."""
        return self.linear(x).sigmoid()

In [37]:
# Instantiate the classifier with one input dimension per vocabulary entry.
model = LogisticRegressionClassifier(VOC_SIZE)

In [38]:
# what are the model parameters? just the ones in linear
# (the model's only submodule is self.linear, so these are its parameter tensors)
for p in model.parameters():
    print(p)

In [41]:
# get prediction for a single data point
# no_grad means we don't need to calculate gradients
# (do this when testing the model)
with torch.no_grad():
    # count vector of the first training example
    x = train_data[0][0]
    # the model has not been trained yet at this point in the notebook
    print(model(x))

In [42]:
# setup the training
# Binary cross-entropy between the model's sigmoid output and the 0/1 target.
loss_func = torch.nn.BCELoss()
# Pass the learning rate explicitly: older PyTorch releases require `lr` for SGD,
# and spelling it out makes the step size easy to find and tune.
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [43]:
# train!
NUM_EPOCHS = 10  # number of full passes over the training data


def average_loss(model, loss_func, data):
    """
    Return the mean per-example loss of `model` on `data` as a Python float.

    Args:
        model: callable mapping an input tensor x to a prediction.
        loss_func: callable mapping (prediction, target) to a scalar loss tensor.
        data: sequence of (x, y) tensor pairs.

    Returns:
        The average loss, or NaN if `data` is empty.
    """
    if not data:
        return float("nan")
    # no_grad: we only measure the loss here, nothing is updated
    with torch.no_grad():
        # .item() turns each scalar tensor into a float, so the result prints
        # as a plain number instead of tensor(...)
        total = sum(loss_func(model(x), y).item() for x, y in data)
    return total / len(data)


for epoch in range(NUM_EPOCHS):
    print("Epoch", epoch)

    for x, y in train_data:
        model.zero_grad()  # do this before running

        pred = model(x)
        loss = loss_func(pred, y)
        loss.backward()  # calculate gradients
        optimizer.step()  # updates thetas

    # after each epoch, check how we're doing
    # compute avg loss over train and dev sets
    print("train loss:", average_loss(model, loss_func, train_data))
    print("dev loss:", average_loss(model, loss_func, dev_data))

In [44]:
# look at some model predictions
import random

# run on dev data
preds = []        # raw model outputs, one float per dev example
pred_labels = []  # the corresponding label strings
with torch.no_grad():
    for x, y in dev_data:
        prob = model(x).item()  # convert the one-element tensor to a float once
        preds.append(prob)
        # targets were encoded as SH = 0 and TTC = 1, so threshold at 0.5
        pred_labels.append("SH" if prob < 0.5 else "TTC")

# print a sample of predictions
for _ in range(10):
    # randrange excludes the upper bound; randint(0, n) can return n itself,
    # which is one past the last valid index and raises IndexError
    idx = random.randrange(len(dev_data_raw))
    print(dev_data_raw[idx][1])
    print("Gold:", dev_data_raw[idx][0])
    print("Pred:", pred_labels[idx], preds[idx])
    print()

In [1]:
import evaluate  # conda install evaluate scikit-learn
# Load the three metrics by name; each object is used later through its .compute() method.
precision = evaluate.load("precision")
recall = evaluate.load("recall")
accuracy = evaluate.load("accuracy")

In [25]:
# evaluate functions require numeric data
# so convert labels to 0 and 1
# gold labels from the raw dev set: "SH" -> 0, anything else -> 1
refs = [0 if label == "SH" else 1 for label, _tokens in dev_data_raw]

# predicted labels, encoded the same way
preds_binary = [0 if label == "SH" else 1 for label in pred_labels]

In [None]:
# Score the dev-set predictions against the gold labels (both encoded as 0 = SH, 1 = TTC).
print(precision.compute(references=refs, predictions=preds_binary))
print(recall.compute(references=refs, predictions=preds_binary))
print(accuracy.compute(references=refs, predictions=preds_binary))

## Your Tasks

Improve the performance of your logistic regression classifier! Try some of the following:
- train for more epochs
- use a different optimizer (ADAM is a good one)
- try a different learning rate by passing it as an argument to the optimizer, e.g. `optimizer = torch.optim.SGD(model.parameters(), lr=0.1)`
- make better features (tokenization, bigrams, etc)

Note: if you want to train from scratch, you need to reinitialize your model and optimizer (`model = ...`, `optimizer = ...`). If you rerun the training loop without reinitializing them, it will print epoch 0, 1, 2, etc., but training will actually continue from where it left off.