In [100]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [101]:


import torch
import numpy as np
import os
import pandas as pd
import spacy
import sys
sys.path.append('../')
from classifierModel import EncoderModel, ClassifierModel
import torch.nn as nn
import torch.nn.functional as F
import pickle

from data_utils import (IndexVectorizer,
                        SpacyTokenizer,
                        TextDataset,
                        LMDataLoader,
                        CLFDataLoader)

In [102]:
BATCH_SIZE = 50

# GPU setup
use_gpu = torch.cuda.is_available()
device_num = 0
device = torch.device(f"cuda:{device_num}" if use_gpu else "cpu")
print(device)

cuda:0


In [103]:
DATA_DIR = '../../data/baird_data/'
TRAIN_PATH = os.path.join(DATA_DIR, 'clinton_train.csv')
VALID_PATH = os.path.join(DATA_DIR, 'clinton_test.csv')

train = pd.read_csv(TRAIN_PATH)
valid = pd.read_csv(VALID_PATH)
# Make labels numeric

In [126]:
train.shape[0]

1961

In [104]:
train.replace({'negative': 0, 'neutral': 1, 'positive': 2}, inplace=True)
valid.replace({'negative': 0, 'neutral': 1, 'positive': 2}, inplace=True)

In [105]:
vectorizer = pickle.load(open(os.path.join(DATA_DIR, "lm_vectorizer.pkl"), "rb"))
train_ds = TextDataset(data=train, vectorizer=vectorizer, 
                       text_col='text', label_col='label')
valid_ds = TextDataset(data=valid, vectorizer=vectorizer, 
                       text_col='text', label_col='label')

In [106]:
train_dl = CLFDataLoader(dataset=train_ds, batch_size=BATCH_SIZE)
valid_dl = CLFDataLoader(dataset=valid_ds, batch_size=BATCH_SIZE)

In [107]:
# Load the language model weights  
# but remove the decoder 
d = torch.load(DATA_DIR+"models/LM__2019-03-21.json")
del d["decoder.weight"]
del d["decoder.bias"]

In [121]:
# Generate the (document)Encoder model 
# from the language model weights
embedding_size = d['encoder.weight'].shape[1]
hidden_size = d['rnns.1.weight_hh_l0'].shape[1]
m = EncoderModel(device, vectorizer, hidden_size, embedding_size, bidirectional=False, tie_weights=True,
                batch_size=BATCH_SIZE)
m.load_state_dict(d)
m.requires_grad = False

In [118]:
# Put it together with the classification head
c = ClassifierModel(lm_hidden_size=embedding_size, hidden_size=200, output_size=3)
final = nn.Sequential(m,c)
if use_gpu:
    final = final.to(device)

In [119]:
def get_accuracy(pred_probs, true_class):
    '''Calculates average accuracy over batch'''
    pred_class = torch.argmax(pred_probs, dim=1)
    errors = pred_class == y
    return torch.mean(errors.type(torch.float)).item()

In [120]:
optimizer = torch.optim.Adam(final.parameters(), lr = 1e-5)
criterion = nn.CrossEntropyLoss()

n_epochs = 100

for epoch in range(n_epochs):
    final.train()
    epoch_train_accs = []
    m.init_hidden()
    for x, y in train_dl:
        x = x.to(device)
        y = y.to(device)
        if x.shape[0] != BATCH_SIZE:
            continue
        #m.init_hidden(bsz=x.shape[0])
        final.zero_grad()
        res = final(x)
        error = criterion(res, y)
        error.backward()
        optimizer.step()
        epoch_train_accs.append(get_accuracy(res, y))
        del error
    print('done training')
    # Validation accuracy
    with torch.no_grad():
        final.eval()
        epoch_train_acc = round(np.mean(epoch_train_accs), 3)
        valid_accs = []

        for x, y in valid_dl:
            x = x.to(device)
            y = y.to(device)
            if x.shape[0] != BATCH_SIZE:
                continue
            #m.init_hidden(bsz=x.shape[0])
            pred_prob = final(x)
            valid_accs.append(get_accuracy(pred_prob, y))
        valid_acc = round(np.mean(valid_accs), 3)
        print(f'Epoch {epoch}:\n\tTraining accuracy: {epoch_train_acc}\n\tValidation accuracy: {valid_acc}')

done training
Epoch 0:
	Training accuracy: 0.358
	Validation accuracy: 0.657
done training
Epoch 1:
	Training accuracy: 0.6
	Validation accuracy: 0.657
done training
Epoch 2:
	Training accuracy: 0.638
	Validation accuracy: 0.657
done training
Epoch 3:
	Training accuracy: 0.635
	Validation accuracy: 0.655
done training
Epoch 4:
	Training accuracy: 0.636
	Validation accuracy: 0.654
done training
Epoch 5:
	Training accuracy: 0.636
	Validation accuracy: 0.657
done training
Epoch 6:
	Training accuracy: 0.635
	Validation accuracy: 0.652
done training
Epoch 7:
	Training accuracy: 0.645
	Validation accuracy: 0.705


KeyboardInterrupt: 