In [1]:
#| default_exp run
#|  export
from fastcore.script import call_parse



def split_string(string):
    """Parse one ``('<label>', '<text>')`` record into its two fields.

    Only the first comma is treated as the field separator, so commas that
    appear inside the text are preserved. Surrounding whitespace (including
    the trailing newline left by ``readlines``) is removed before the
    enclosing parentheses are dropped.

    Args:
        string: One record, e.g. ``"('Libertarian Left', 'some text')"``.

    Returns:
        A ``(label, text)`` tuple of strings with surrounding whitespace and
        single quotes removed from each field.

    Raises:
        IndexError: If the record contains no comma.
    """
    # Strip first, otherwise a trailing newline is dropped in place of the
    # closing parenthesis and "')" leaks into the text field.
    record = string.strip()[1:-1]
    # maxsplit=1: everything after the first comma belongs to the text.
    parts = [part.strip().strip("'") for part in record.split(",", 1)]
    return parts[0], parts[1]

def return_iters(db:str # Path to db
                 ):
    """Load labelled comments from ``db`` and return two iterators over them.

    Every non-blank line is parsed with ``split_string`` and its label name is
    translated to an integer class id (1-4) via the mapping below.

    Returns:
        ``(train_iter, test_iter)``: two independent iterators that both yield
        the same ``(label_id, text)`` tuples in file order. No train/test
        split is performed here; both iterators cover the whole file.

    Raises:
        KeyError: If a line carries a label that is not in the mapping.
        IndexError: If a non-blank line cannot be split into label and text.
    """
    mapping = {
        "Libertarian Left": 1,
        "Libertarian Right": 2,
        "Authoritarian Left": 3,
        "Authoritarian Right": 4,
    }
    samples = []
    # `with` guarantees the handle is released even if a malformed line
    # raises part-way through the file.
    with open(db, 'r', encoding='latin1') as file:
        for line in file:
            # Blank lines (e.g. a trailing newline at EOF) carry no record.
            if not line.strip():
                continue
            opinion, text = split_string(line)
            samples.append((mapping[opinion], text))
    # Separate list objects so that consuming one iterator never affects the other.
    return iter(samples), iter(list(samples))

In [2]:
#|  export
from torchtext.data.utils import get_tokenizer
# from Political_Compass_AI.data_processing import return_iters
# from Political_Compass_AI.data_processing import split_string
from Political_Compass_AI.data_processing import yield_tokens
from Political_Compass_AI.data_processing import collate_batch
from Political_Compass_AI.training import train
from Political_Compass_AI.training import evaluate
from Political_Compass_AI.model import TextClassificationModel
from torchtext.data.functional import to_map_style_dataset
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
import time
import torch
import optuna
from optuna.trial import TrialState
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torchvision import datasets
from torchvision import transforms
import pandas as pd
def define_model(trial,vocab_size, emsize, num_class):
    """Instantiate the text classifier for one trial.

    ``trial`` is accepted for signature compatibility but is not consulted
    here; the three sizes are forwarded unchanged to
    ``TextClassificationModel``.
    """
    return TextClassificationModel(vocab_size, emsize, num_class)
def collate_batch(
        batch
):
    """Turn a list of ``(label, text)`` pairs into flat tensors for the model.

    Relies on the module-level ``text_pipeline`` callable (assigned inside
    ``objective``) to convert a text into a sequence of integer ids.

    Returns:
        ``(labels, tokens, offsets)`` on the active device: zero-based int64
        labels, all token ids concatenated into one 1-D int64 tensor, and the
        start index of each sample inside that tensor.
    """
    global text_pipeline
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    labels, encoded, lengths = [], [], []
    for raw_label, raw_text in batch:
        # Labels in the data are 1-based; the loss expects 0-based classes.
        labels.append(int(raw_label) - 1)
        token_ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        encoded.append(token_ids)
        lengths.append(token_ids.size(0))

    label_tensor = torch.tensor(labels, dtype=torch.int64)
    # Start of sample i = sum of the lengths of samples 0..i-1.
    start_tensor = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)
    token_tensor = torch.cat(encoded)
    return label_tensor.to(target), token_tensor.to(target), start_tensor.to(target)

def _append_run_ledger(record, path='Run_Ledger.csv'):
    """Append one run summary (a flat dict) as a row of the tab-separated ledger."""
    import os  # local import so this cell does not depend on another cell's imports

    # Emit the column header only when starting a new (or empty) ledger file,
    # otherwise every appended row would be preceded by a duplicate header.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    pd.DataFrame([record]).to_csv(path, mode='a', index=False, sep="\t",
                                  header=needs_header)


def objective(
    trial,

):
    """Optuna objective: train one classifier and return its validation accuracy.

    Samples the embedding size, learning rate and optimizer from ``trial``,
    trains for a fixed number of epochs on a 95/5 train/validation split of
    the dataset, reports the validation accuracy to Optuna after every epoch
    (so the trial can be pruned), then evaluates on the test loader and
    appends a summary row to ``Run_Ledger.csv``.

    Side effects:
        Sets the module-level ``text_pipeline`` (read by ``collate_batch``)
        and ``db``; appends to ``Run_Ledger.csv`` in the working directory.

    Returns:
        The value returned by ``evaluate`` on the validation loader after the
        final epoch.

    Raises:
        optuna.exceptions.TrialPruned: When Optuna asks to stop the trial early.
    """
    global text_pipeline
    global db
    # BATCH_SIZE = trial.suggest_int('n_epochs', 8, 64)
    BATCH_SIZE = 32
    # EPOCHS = trial.suggest_int('n_epochs', 20, 50)
    EPOCHS = 20
    db="../uniqueDB.txt"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Parse the dataset file once and reuse the samples; the iterators
    # returned by return_iters are single-use.
    train_iter, test_iter = return_iters(db)
    train_samples = list(train_iter)
    test_samples = list(test_iter)

    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(yield_tokens(iter(train_samples)), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    # Published as a global because collate_batch looks it up at batch time.
    text_pipeline = lambda x: vocab(tokenizer(x))

    num_class = len({label for (label, _text) in train_samples})
    vocab_size = len(vocab)

    # `step` is passed by keyword; the positional form is deprecated in newer Optuna.
    emsize = trial.suggest_int("em_size", 64, 128, step=32)
    LR = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    model = define_model(trial,vocab_size, emsize, num_class).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD","Adagrad"])
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

    train_dataset = to_map_style_dataset(iter(train_samples))
    test_dataset = to_map_style_dataset(iter(test_samples))
    num_train = int(len(train_dataset) * 0.95)
    split_train_, split_valid_ = \
        random_split(train_dataset, [num_train, len(train_dataset) - num_train])

    train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                                 shuffle=True, collate_fn=collate_batch)

    # Best validation accuracy seen so far; None until the first epoch ends.
    total_accu = None
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train(train_dataloader, model, optimizer, epoch)
        accu_val = evaluate(valid_dataloader, model)
        # Decay the learning rate only when validation accuracy falls below
        # the best value seen so far; otherwise record the new best.
        if total_accu is not None and total_accu > accu_val:
            scheduler.step()
        else:
            total_accu = accu_val
        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s | '
              'valid accuracy {:8.3f} '.format(epoch,
                                               time.time() - epoch_start_time,
                                               accu_val))
        print('-' * 59)
        trial.report(accu_val, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    # Previously unreachable (it sat after the return): evaluate on the test
    # loader and record the finished run in the ledger before returning.
    accu_test = evaluate(test_dataloader,model)
    print('test accuracy {:8.3f}'.format(accu_test))
    _append_run_ledger({
        "Database_file": db,
        "Epochs": str(EPOCHS),
        "LR": str(LR),
        "Batch_Size": str(BATCH_SIZE),
        "Final_accu": str(accu_val),
        "Optimzer": optimizer_name,
        "accu_test": accu_test,
    })
    return accu_val

In [3]:
# run("../uniqueDB.txt")
#torch.save(model.state_dict(), <path_to>)
# model.load_state_dict(torch.load(<path_to>))
from optuna.pruners import MedianPruner

# Search for the hyper-parameters that maximise the validation accuracy
# returned by `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, timeout=None)

# Split the finished trials by outcome for the summary below.
pruned_trials = study.get_trials(states=[TrialState.PRUNED], deepcopy=False)
complete_trials = study.get_trials(states=[TrialState.COMPLETE], deepcopy=False)

print("Study statistics: ")
print(f"  Number of finished trials:  {len(study.trials)}")
print(f"  Number of pruned trials:  {len(pruned_trials)}")
print(f"  Number of complete trials:  {len(complete_trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value:  {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2023-03-27 19:03:52,671][0m A new study created in memory with name: no-name-14a9b8d0-2238-4bf1-9ea6-07df27fcbb0b[0m


| epoch   1 |    50/  991 batches | accuracy    0.265
| epoch   1 |   100/  991 batches | accuracy    0.293
| epoch   1 |   150/  991 batches | accuracy    0.325
| epoch   1 |   200/  991 batches | accuracy    0.327
| epoch   1 |   250/  991 batches | accuracy    0.320
| epoch   1 |   300/  991 batches | accuracy    0.333
| epoch   1 |   350/  991 batches | accuracy    0.352
| epoch   1 |   400/  991 batches | accuracy    0.332
| epoch   1 |   450/  991 batches | accuracy    0.362
| epoch   1 |   500/  991 batches | accuracy    0.344
| epoch   1 |   550/  991 batches | accuracy    0.343
| epoch   1 |   600/  991 batches | accuracy    0.350
| epoch   1 |   650/  991 batches | accuracy    0.358
| epoch   1 |   700/  991 batches | accuracy    0.364
| epoch   1 |   750/  991 batches | accuracy    0.361
| epoch   1 |   800/  991 batches | accuracy    0.358
| epoch   1 |   850/  991 batches | accuracy    0.361
| epoch   1 |   900/  991 batches | accuracy    0.359
| epoch   1 |   950/  991 ba

[32m[I 2023-03-27 19:17:01,362][0m Trial 0 finished with value: 0.4110245656081486 and parameters: {'em_size': 128, 'lr': 7.077611784333668e-05, 'optimizer': 'Adam'}. Best is trial 0 with value: 0.4110245656081486.[0m


-----------------------------------------------------------
| end of epoch  20 | time: 41.50s | valid accuracy    0.411 
-----------------------------------------------------------
| epoch   1 |    50/  991 batches | accuracy    0.268
| epoch   1 |   100/  991 batches | accuracy    0.260
| epoch   1 |   150/  991 batches | accuracy    0.281
| epoch   1 |   200/  991 batches | accuracy    0.284
| epoch   1 |   250/  991 batches | accuracy    0.289
| epoch   1 |   300/  991 batches | accuracy    0.307
| epoch   1 |   350/  991 batches | accuracy    0.303
| epoch   1 |   400/  991 batches | accuracy    0.305
| epoch   1 |   450/  991 batches | accuracy    0.309
| epoch   1 |   500/  991 batches | accuracy    0.309
| epoch   1 |   550/  991 batches | accuracy    0.304
| epoch   1 |   600/  991 batches | accuracy    0.319
| epoch   1 |   650/  991 batches | accuracy    0.323
| epoch   1 |   700/  991 batches | accuracy    0.319
| epoch   1 |   750/  991 batches | accuracy    0.323
| epoch  

[33m[W 2023-03-27 19:21:01,375][0m Trial 1 failed with parameters: {'em_size': 64, 'lr': 4.500990885330019e-05, 'optimizer': 'Adam'} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "C:\Users\turet\anaconda3\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-2-67bc4e4d2ee7>", line 99, in objective
    train(train_dataloader, model, optimizer, epoch)
  File "c:\users\turet\documents\docs\school\semsetera2023\pcm\political_compass_ai\Political_Compass_AI\training.py", line 25, in train
    for idx, (label, text, offsets) in enumerate(dataloader):
  File "C:\Users\turet\anaconda3\lib\site-packages\torch\utils\data\dataloader.py", line 681, in __next__
    data = self._next_data()
  File "C:\Users\turet\anaconda3\lib\site-packages\torch\utils\data\dataloader.py", line 721, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  Fil

| epoch  13 |   750/  991 batches | accuracy    0.404


KeyboardInterrupt: 

In [15]:
# def predict(text, text_pipeline):
#     with torch.no_grad():
#         text = torch.tensor(text_pipeline(text))
#         output = model(text, torch.tensor([0]))
#         return output.argmax(1).item() + 1
#
# mapping = {
# 1:"Libertarian Left",
# 2:"Libertarian Right",
# 3:"Authoritarian Left",
# 4:"Authoritarian Right",
# }
# model = model.to("cpu")
# # ex_text_str = """
# # """
# # https://old.reddit.com/r/PoliticalCompassMemes/comments/x774os/conservative_you_say_sounds_fine_to_me/inbbz52/
# ex_text_str = """
# deo's mom
# """
# print("This is a %s comment" % mapping[predict(ex_text_str, text_pipeline)])

This is a Libertarian Right comment
