In [1]:
#| default_exp run
#|  export

def yield_tokens(
        data_iter # Iterable database instance
):
    tokenizer = get_tokenizer('basic_english')
    for _, text in data_iter:
        yield tokenizer(text)

def split_string(string):
    parts = string[1:-1].split(",")
    parts = [part.strip().strip("'") for part in parts]
    return parts[0], parts[1]

def return_iters(_db:str # Path to db
                 ):
    train_iter = []
    test_iter = []
    file = open(_db, 'r', encoding='latin1')
    mapping = {
        "Libertarian Left": 1,
        "Libertarian Right": 2,
        "Authoritarian Left": 3,
        "Authoritarian Right": 4,
        "Centrist": 5,
        "Authoritarian Center": 6,
        "Left": 7,
        "Right": 8,
        "Libertarian Center": 9,
    }
    lines = file.readlines()
    for line in lines:
        opinion,text = split_string(line)
        train_iter+=[(mapping[opinion],text)]
        test_iter+=[(mapping[opinion],text)]
    train_iter = iter(train_iter)
    test_iter = iter(test_iter)
    file.close()
    return train_iter, test_iter

In [13]:
#|  export
from torchtext.data.utils import get_tokenizer
from Political_Compass_AI.model import TextClassificationModel
from Political_Compass_AI.training import train
from Political_Compass_AI.training import evaluate
from torchtext.data.functional import to_map_style_dataset
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
import time
import torch
import torch.optim as optim
import torch.utils.data
import pandas as pd

def collate_batch(
        batch
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    label_pipeline = lambda x: int(x) - 1
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

def train_model(
    db,
    batch_size,
    emsize,
    lr,
    optimizer_name,
    epochs
):
    global text_pipeline
    tokenizer = get_tokenizer('basic_english')
    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: int(x) - 1
    train_iter, test_iter = return_iters(db)
    vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    train_iter, test_iter = return_iters(db)
    dataloader = DataLoader(train_iter, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
    train_iter, test_iter = return_iters(db)
    num_class = len(set([label for (label, text) in train_iter]))
    vocab_size = len(vocab)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = TextClassificationModel(vocab_size, emsize, num_class).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)
    _optim = optimizer_name
    total_accu = None
    train_iter, test_iter = return_iters(db)
    train_dataset = to_map_style_dataset(train_iter)
    test_dataset = to_map_style_dataset(test_iter)
    num_train = int(len(train_dataset) * 0.95)
    split_train_, split_valid_ = \
        random_split(train_dataset, [num_train, len(train_dataset) - num_train])

    train_dataloader = DataLoader(split_train_, batch_size=batch_size,
                                  shuffle=True, collate_fn=collate_batch)
    valid_dataloader = DataLoader(split_valid_, batch_size=batch_size,
                                  shuffle=True, collate_fn=collate_batch)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size,
                                 shuffle=True, collate_fn=collate_batch)
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train(train_dataloader, model, optimizer, epoch)
        accu_val = evaluate(valid_dataloader, model)
        if total_accu is not None and total_accu > accu_val:
            scheduler.step()
        else:
            total_accu = accu_val
        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s | '
              'valid accuracy {:8.3f} '.format(epoch,
                                               time.time() - epoch_start_time,
                                               accu_val))
        print('-' * 59)
    df_Log = {"Database_file":[],"Epochs":[],"LR":[],"Batch_Size":[],
              "Final_accu":[],"Optimzer":[],"Test Accuracy":[],"Embed Size":[],"Vocabulary Size":[]}
    accu_test = evaluate(test_dataloader,model)
    out = 'test accuracy {:8.3f}'.format(accu_test)
    df_Log["Database_file"].append(db)
    df_Log["Epochs"].append(str(epochs))
    df_Log["LR"].append(str(lr))
    df_Log["Batch_Size"].append(str(batch_size))
    df_Log["Final_accu"].append(str(accu_val))
    df_Log["Optimzer"].append(optimizer_name)
    df_Log["Test Accuracy"].append(accu_test)
    df_Log["Embed Size"].append(emsize)
    df_Log["Vocabulary Size"].append(vocab_size)
    dataframe = pd.DataFrame(df_Log)
    dataframe.to_csv('Run_Ledger.csv',mode='a', index=False,sep="\t")
    return model

In [15]:
args ={
    "db":"../../../data/9_labels/linear/full",
    "batch_size":32,
    "emsize":64,
    "lr":5,
    "optimizer_name":"Adagrad",
    "epochs":20
}
trained_model = train_model(**args)
torch.save(trained_model.state_dict(),"model.pth")

| epoch   1 |    50/ 5775 batches | accuracy    0.125
| epoch   1 |   100/ 5775 batches | accuracy    0.132
| epoch   1 |   150/ 5775 batches | accuracy    0.126
| epoch   1 |   200/ 5775 batches | accuracy    0.133
| epoch   1 |   250/ 5775 batches | accuracy    0.151
| epoch   1 |   300/ 5775 batches | accuracy    0.131
| epoch   1 |   350/ 5775 batches | accuracy    0.134
| epoch   1 |   400/ 5775 batches | accuracy    0.132
| epoch   1 |   450/ 5775 batches | accuracy    0.131
| epoch   1 |   500/ 5775 batches | accuracy    0.129
| epoch   1 |   550/ 5775 batches | accuracy    0.129
| epoch   1 |   600/ 5775 batches | accuracy    0.118
| epoch   1 |   650/ 5775 batches | accuracy    0.134
| epoch   1 |   700/ 5775 batches | accuracy    0.127
| epoch   1 |   750/ 5775 batches | accuracy    0.161
| epoch   1 |   800/ 5775 batches | accuracy    0.141
| epoch   1 |   850/ 5775 batches | accuracy    0.123
| epoch   1 |   900/ 5775 batches | accuracy    0.133
| epoch   1 |   950/ 5775 ba

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TextClassificationModel(vocab_size=100124, embed_dim=64, num_class=9).to(device)
model.load_state_dict(torch.load("model.pth"))
model.eval()
train,test = return_iters("../../../data/9_labels/linear/full")
test_dataset = to_map_style_dataset(test)

test_dataloader = DataLoader(test_dataset, batch_size=32,
                                 shuffle=True, collate_fn=collate_batch)
test_accuracy= evaluate(test_dataloader,model)
print(test_accuracy)

0.5294404943679277
