In [23]:
#| default_exp run
#|  export
from fastcore.script import call_parse
def split_string(string):
    """Split one database line of the form ``('<label>', '<text>')`` into ``(label, text)``.

    Surrounding whitespace (including the trailing newline that ``readlines``
    keeps) is removed first; the first and last remaining characters are then
    dropped on the assumption that they are the enclosing parentheses. Only the
    first comma separates the label from the text, so commas inside the text
    are preserved.

    Raises ``IndexError`` when the line contains no comma.
    """
    # Trim before slicing so that "(" and ")" are removed, not "(" and "\n".
    inner = string.strip()[1:-1]
    # maxsplit=1: everything after the first comma belongs to the text field.
    parts = [part.strip().strip("'") for part in inner.split(",", 1)]
    return parts[0], parts[1]

def return_iters(db:str # Path to db
                 ):
    """Read the labelled database at `db` and return `(train_iter, test_iter)`.

    Every non-blank line is parsed with `split_string` into a label name and a
    text; the label name is translated to its integer id (1-4) and the pair
    `(id, text)` is collected. Raises `KeyError` for a label name that is not
    one of the four known quadrants.

    NOTE: both returned iterators walk over the same samples -- no held-out
    split is made here.
    """
    mapping = {
        "Libertarian Left": 1,
        "Libertarian Right": 2,
        "Authoritarian Left": 3,
        "Authoritarian Right": 4,
    }
    samples = []
    # `with` guarantees the handle is closed even when a line fails to parse.
    with open(db, 'r', encoding='latin1') as file:
        for line in file:
            # Blank lines (e.g. a trailing newline at end of file) carry no sample.
            if not line.strip():
                continue
            opinion, text = split_string(line)
            samples.append((mapping[opinion], text))
    # Two independent iterators, each backed by its own list as before.
    train_iter = iter(samples)
    test_iter = iter(list(samples))
    return train_iter, test_iter

In [24]:
#|  export
from torchtext.data.utils import get_tokenizer
# from Political_Compass_AI.data_processing import return_iters
# from Political_Compass_AI.data_processing import split_string
from Political_Compass_AI.data_processing import yield_tokens
from Political_Compass_AI.data_processing import collate_batch
from Political_Compass_AI.model import TextClassificationModel
from Political_Compass_AI.training import train
from Political_Compass_AI.training import evaluate
from torchtext.data.functional import to_map_style_dataset
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
import time
import torch

def collate_batch(
        batch
):
    """Collate `(label, text)` pairs into `(labels, tokens, offsets)` tensors.

    Labels are shifted from 1-based to 0-based. Each text is converted to token
    ids with the module-level `text_pipeline`; all ids are concatenated into a
    single 1-D tensor and `offsets` holds the start position of every sample
    inside it. The three tensors are moved to CUDA when available, else CPU.

    This definition shadows the `collate_batch` imported from
    `Political_Compass_AI.data_processing` earlier in the same cell.
    """
    # `text_pipeline` is only read here, so no `global` declaration is needed.
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    labels, token_tensors, starts = [], [], [0]
    for raw_label, raw_text in batch:
        labels.append(int(raw_label) - 1)
        token_ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        token_tensors.append(token_ids)
        starts.append(token_ids.size(0))
    label_tensor = torch.tensor(labels, dtype=torch.int64)
    # Drop the last length: the running sum of the preceding lengths is where
    # each sample begins in the flattened token tensor.
    offset_tensor = torch.tensor(starts[:-1]).cumsum(dim=0)
    flat_tokens = torch.cat(token_tensors)
    return (
        label_tensor.to(target_device),
        flat_tokens.to(target_device),
        offset_tensor.to(target_device),
    )

@call_parse
def run(
    _db:str # db path to run alignment distribution

):
    """Train the text classifier on the database at `_db` and log the outcome.

    Builds the vocabulary from the database, trains for a fixed number of
    epochs while printing the validation accuracy of each epoch, evaluates on
    the test set and appends a summary of the run to `Run_Ledger.txt`.

    Side effects: sets the module globals `text_pipeline`, `db` and `model`.
    """
    global text_pipeline
    global db
    # `model` is published like `text_pipeline` because later notebook code
    # (the module-level `predict` cell) reads both names as globals.
    global model
    db=_db
    tokenizer = get_tokenizer('basic_english')
    # `vocab` is looked up when the lambda is called, i.e. after it is built below.
    text_pipeline = lambda x: vocab(tokenizer(x))

    # Parse the database once and keep the samples in memory; fresh iterators
    # are handed out below wherever an iterator was passed before.
    train_iter, test_iter = return_iters(db)
    train_samples = list(train_iter)
    test_samples = list(test_iter)

    vocab = build_vocab_from_iterator(yield_tokens(iter(train_samples)), specials=["<unk>"])
    # Tokens that are not in the vocabulary map to "<unk>".
    vocab.set_default_index(vocab["<unk>"])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    num_class = len({label for (label, _text) in train_samples})
    vocab_size = len(vocab)
    emsize = 128
    LR = 5
    BATCH_SIZE = 32
    EPOCHS = 20
    model = TextClassificationModel(vocab_size, emsize, num_class).to(device)
    # optimizer = torch.optim.SGD(model.parameters(), lr=LR)
    optimizer = torch.optim.Adagrad(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    function = "Linear with  weight init"
    # Take the ledger label from the optimizer actually in use so the record
    # cannot drift from the code.
    _optim = type(optimizer).__name__
    total_accu = None

    train_dataset = to_map_style_dataset(iter(train_samples))
    test_dataset = to_map_style_dataset(iter(test_samples))
    # 95% of the samples are used for training, the remainder for validation.
    num_train = int(len(train_dataset) * 0.95)
    split_train_, split_valid_ = \
        random_split(train_dataset, [num_train, len(train_dataset) - num_train])

    train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                                 shuffle=True, collate_fn=collate_batch)
    first_flag = True

    # The context manager closes the ledger even if training raises.
    with open("Run_Ledger.txt", 'a') as run_ledger:
        for epoch in range(1, EPOCHS + 1):
            epoch_start_time = time.time()
            train(train_dataloader, model, optimizer, epoch)
            accu_val = evaluate(valid_dataloader, model)
            # Step the scheduler only when validation accuracy fell below the
            # stored reference; otherwise store the new accuracy as reference.
            if total_accu is not None and total_accu > accu_val:
                scheduler.step()
            else:
                total_accu = accu_val
            if first_flag:
                # Header line, written once per run after the first epoch.
                run_ledger.write("Database file: " + db + "\t" + "Epochs:" + str(EPOCHS) + "\t" + "LR: " + str(
                    LR) + "\t" + "Batch Size: " + str(BATCH_SIZE) + "\tinit accu_val:" + str(accu_val) + "\n")
                first_flag = False
            print('-' * 59)
            print('| end of epoch {:3d} | time: {:5.2f}s | '
                  'valid accuracy {:8.3f} '.format(epoch,
                                                   time.time() - epoch_start_time,
                                                   accu_val))
            print('-' * 59)
        run_ledger.write("Final accu:\t" + str(accu_val) + "\n\n")
        accu_test = evaluate(test_dataloader,model)
        out = 'test accuracy {:8.3f}'.format(accu_test)
        print(out)
        # The "Optimzer" key is kept verbatim to match earlier ledger entries.
        run_ledger.write(out + "\tfunction:" + function + "\tOptimzer:" + _optim + '\n')

In [28]:
# Launch a training run on the database file one directory up.
run(_db="../uniqueDB.txt")
# Snippets for saving / restoring the trained weights (fill in the path):
#torch.save(model.state_dict(), <path_to>)
# model.load_state_dict(torch.load(<path_to>))


cuda
| epoch   1 |    50/  991 batches | accuracy    0.306
| epoch   1 |   100/  991 batches | accuracy    0.292
| epoch   1 |   150/  991 batches | accuracy    0.327
| epoch   1 |   200/  991 batches | accuracy    0.316
| epoch   1 |   250/  991 batches | accuracy    0.311
| epoch   1 |   300/  991 batches | accuracy    0.321
| epoch   1 |   350/  991 batches | accuracy    0.321
| epoch   1 |   400/  991 batches | accuracy    0.316
| epoch   1 |   450/  991 batches | accuracy    0.318
| epoch   1 |   500/  991 batches | accuracy    0.315
| epoch   1 |   550/  991 batches | accuracy    0.328
| epoch   1 |   600/  991 batches | accuracy    0.316
| epoch   1 |   650/  991 batches | accuracy    0.325
| epoch   1 |   700/  991 batches | accuracy    0.321
| epoch   1 |   750/  991 batches | accuracy    0.321
| epoch   1 |   800/  991 batches | accuracy    0.351
| epoch   1 |   850/  991 batches | accuracy    0.333
| epoch   1 |   900/  991 batches | accuracy    0.319
| epoch   1 |   950/  9

In [15]:
def predict(text, text_pipeline):
    """Return the 1-based class id that the module-level `model` predicts for `text`.

    `text_pipeline` converts the raw string into a list of token ids. The model
    is called with those ids and a single offset of 0, i.e. the whole token
    sequence is treated as one sample. Gradients are disabled for the call.
    """
    with torch.no_grad():
        # Explicit int64, matching collate_batch: without it an empty token
        # list would become a float tensor instead of a tensor of ids.
        token_ids = torch.tensor(text_pipeline(text), dtype=torch.int64)
        output = model(token_ids, torch.tensor([0]))
        # argmax is 0-based; the class ids used by the notebook start at 1.
        return output.argmax(1).item() + 1

# Class id -> quadrant name (the reverse of the encoding in return_iters).
mapping = {
    1: "Libertarian Left",
    2: "Libertarian Right",
    3: "Authoritarian Left",
    4: "Authoritarian Right",
}
# predict() creates its input tensors on the CPU, so keep the model there too.
model = model.to("cpu")
# https://old.reddit.com/r/PoliticalCompassMemes/comments/x774os/conservative_you_say_sounds_fine_to_me/inbbz52/
ex_text_str = """
deo's mom
"""
predicted_id = predict(ex_text_str, text_pipeline)
print("This is a %s comment" % mapping[predicted_id])

This is a Libertarian Right comment
