In [1]:
#| default_exp run
#|  export
from fastcore.script import call_parse



def split_string(string):
    # Removing the parentheses and splitting the string by comma
    parts = string[1:-1].split(",")
    # Removing the whitespace and quotes from the parts
    parts = [part.strip().strip("'") for part in parts]
    return parts[0], parts[1]

def return_iters(db:str # Path to db
                 ):
    train_iter = []
    test_iter = []
    file = open(db, 'r', encoding='latin1')
    mapping = {
        "Libertarian Left": 1,
        "Libertarian Right": 2,
        "Authoritarian Left": 3,
        "Authoritarian Right": 4,
    }
    lines = file.readlines()
    for line in lines:
        opinion,text = split_string(line)
        train_iter+=[(mapping[opinion],text)]
        test_iter+=[(mapping[opinion],text)]
    train_iter = iter(train_iter)
    test_iter = iter(test_iter)
    file.close()
    return train_iter, test_iter

In [10]:
#|  export
from torchtext.data.utils import get_tokenizer

from Political_Compass_AI.training import train
from Political_Compass_AI.training import evaluate
from Political_Compass_AI.model import TextClassificationModel
from torchtext.data.functional import to_map_style_dataset
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
import time
import torch
import optuna
from optuna.trial import TrialState
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torchvision import datasets
from torchvision import transforms
import pandas as pd

def yield_tokens(
        data_iter # Iterable database instance
):
    tokenizer = get_tokenizer('basic_english')
    for _, text in data_iter:
        yield tokenizer(text)

def define_model(trial,vocab_size, emsize, num_class):
    model = TextClassificationModel(vocab_size, emsize, num_class)
    return model
def collate_batch(
        batch
):
    global text_pipeline
    global db
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    label_pipeline = lambda x: int(x) - 1
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

def objective(
    trial,

):
    global text_pipeline
    global db
    # BATCH_SIZE = trial.suggest_int('n_epochs', 8, 64)
    BATCH_SIZE = 32
    db="../../../data/4_labels/full"
    tokenizer = get_tokenizer('basic_english')
    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: int(x) - 1
    train_iter, test_iter = return_iters(db)
    vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])

    train_iter, test_iter = return_iters(db)
    dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
    train_iter, test_iter = return_iters(db)
    num_class = len(set([label for (label, text) in train_iter]))
    vocab_size = len(vocab)
    # emsize = trial.suggest_int("em_size",64,128,32)
    emsize = 128
    # LR = trial.suggest_float("lr", 5, 6)
    LR=5
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = define_model(trial,vocab_size, emsize, num_class).to(device)
    run_ledger = open("Run_Ledger.txt", 'a')
    criterion = torch.nn.CrossEntropyLoss()
    # optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD","Adagrad"])
    # optimizer_name = trial.suggest_categorical("optimizer", ["RMSprop","Adagrad"])
    optimizer_name = trial.suggest_categorical("optimizer", ["Adagrad"])
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    _optim = optimizer_name
    total_accu = None
    train_iter, test_iter = return_iters(db)

    train_dataset = to_map_style_dataset(train_iter)
    test_dataset = to_map_style_dataset(test_iter)
    num_train = int(len(train_dataset) * 0.95)
    split_train_, split_valid_ = \
        random_split(train_dataset, [num_train, len(train_dataset) - num_train])

    train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                                 shuffle=True, collate_fn=collate_batch)
    first_flag = True
    # EPOCHS = trial.suggest_int('n_epochs', 20, 50)
    EPOCHS = 20


    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train(train_dataloader, model, optimizer, epoch)
        accu_val = evaluate(valid_dataloader, model)
        if total_accu is not None and total_accu > accu_val:
            scheduler.step()
        else:
            total_accu = accu_val
        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s | '
              'valid accuracy {:8.3f} '.format(epoch,
                                               time.time() - epoch_start_time,
                                               accu_val))
        print('-' * 59)
        trial.report(accu_val, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    return accu_val
    df_Log = {"Database_file":[],"Epochs":[],"LR":[],"Batch_Size":[],
              "Final_accu":[],"Optimzer":[],"accu_test":[]}

    accu_test = evaluate(test_dataloader,model)
    out = 'test accuracy {:8.3f}'.format(accu_test)
    df_Log["Database_file"].append(db)
    df_Log["Epochs"].append(str(EPOCHS))
    df_Log["LR"].append( str(LR))
    df_Log["Batch_Size"].append(str(BATCH_SIZE))
    df_Log["Final_accu"].append(str(accu_val))
    df_Log["Optimzer"].append(optimizer_name)
    df_Log["accu_test"].append(accu_test)
    dataframe = pd.DataFrame(df_Log)
    dataframe.to_csv('Run_Ledger.csv',mode='a', index=False,sep="\t")

In [11]:
# run("../uniqueDB.txt")
#torch.save(model.state_dict(), <path_to>)
# model.load_state_dict(torch.load(<path_to>))
from optuna.pruners import MedianPruner

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10, timeout=None)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])


[I 2023-08-02 11:49:08,696] A new study created in memory with name: no-name-e61458d4-c2c8-43dc-94c1-c516f30ce500


| epoch   1 |    50/  991 batches | accuracy    0.292
| epoch   1 |   100/  991 batches | accuracy    0.319
| epoch   1 |   150/  991 batches | accuracy    0.315
| epoch   1 |   200/  991 batches | accuracy    0.338
| epoch   1 |   250/  991 batches | accuracy    0.325
| epoch   1 |   300/  991 batches | accuracy    0.292
| epoch   1 |   350/  991 batches | accuracy    0.304
| epoch   1 |   400/  991 batches | accuracy    0.344
| epoch   1 |   450/  991 batches | accuracy    0.307
| epoch   1 |   500/  991 batches | accuracy    0.322
| epoch   1 |   550/  991 batches | accuracy    0.319
| epoch   1 |   600/  991 batches | accuracy    0.334
| epoch   1 |   650/  991 batches | accuracy    0.344
| epoch   1 |   700/  991 batches | accuracy    0.341
| epoch   1 |   750/  991 batches | accuracy    0.336
| epoch   1 |   800/  991 batches | accuracy    0.338
| epoch   1 |   850/  991 batches | accuracy    0.351
| epoch   1 |   900/  991 batches | accuracy    0.321
| epoch   1 |   950/  991 ba

[I 2023-08-02 11:50:23,916] Trial 0 finished with value: 0.4343918514080288 and parameters: {'optimizer': 'Adagrad'}. Best is trial 0 with value: 0.4343918514080288.


-----------------------------------------------------------
| end of epoch  20 | time:  3.80s | valid accuracy    0.434 
-----------------------------------------------------------
| epoch   1 |    50/  991 batches | accuracy    0.305
| epoch   1 |   100/  991 batches | accuracy    0.278
| epoch   1 |   150/  991 batches | accuracy    0.304
| epoch   1 |   200/  991 batches | accuracy    0.331
| epoch   1 |   250/  991 batches | accuracy    0.323
| epoch   1 |   300/  991 batches | accuracy    0.323
| epoch   1 |   350/  991 batches | accuracy    0.320
| epoch   1 |   400/  991 batches | accuracy    0.305
| epoch   1 |   450/  991 batches | accuracy    0.338
| epoch   1 |   500/  991 batches | accuracy    0.326
| epoch   1 |   550/  991 batches | accuracy    0.334
| epoch   1 |   600/  991 batches | accuracy    0.307
| epoch   1 |   650/  991 batches | accuracy    0.364
| epoch   1 |   700/  991 batches | accuracy    0.316
| epoch   1 |   750/  991 batches | accuracy    0.306
| epoch  

[I 2023-08-02 11:51:31,236] Trial 1 finished with value: 0.4409826243259437 and parameters: {'optimizer': 'Adagrad'}. Best is trial 1 with value: 0.4409826243259437.


| epoch  20 |   950/  991 batches | accuracy    0.791
-----------------------------------------------------------
| end of epoch  20 | time:  3.08s | valid accuracy    0.441 
-----------------------------------------------------------
| epoch   1 |    50/  991 batches | accuracy    0.294
| epoch   1 |   100/  991 batches | accuracy    0.323
| epoch   1 |   150/  991 batches | accuracy    0.328
| epoch   1 |   200/  991 batches | accuracy    0.336
| epoch   1 |   250/  991 batches | accuracy    0.324
| epoch   1 |   300/  991 batches | accuracy    0.315
| epoch   1 |   350/  991 batches | accuracy    0.312
| epoch   1 |   400/  991 batches | accuracy    0.311
| epoch   1 |   450/  991 batches | accuracy    0.334
| epoch   1 |   500/  991 batches | accuracy    0.340
| epoch   1 |   550/  991 batches | accuracy    0.300
| epoch   1 |   600/  991 batches | accuracy    0.316
| epoch   1 |   650/  991 batches | accuracy    0.311
| epoch   1 |   700/  991 batches | accuracy    0.324
| epoch  

[I 2023-08-02 11:52:38,939] Trial 2 finished with value: 0.4284002396644697 and parameters: {'optimizer': 'Adagrad'}. Best is trial 1 with value: 0.4409826243259437.


-----------------------------------------------------------
| end of epoch  20 | time:  3.32s | valid accuracy    0.428 
-----------------------------------------------------------
| epoch   1 |    50/  991 batches | accuracy    0.296
| epoch   1 |   100/  991 batches | accuracy    0.308
| epoch   1 |   150/  991 batches | accuracy    0.309
| epoch   1 |   200/  991 batches | accuracy    0.318
| epoch   1 |   250/  991 batches | accuracy    0.304
| epoch   1 |   300/  991 batches | accuracy    0.319
| epoch   1 |   350/  991 batches | accuracy    0.324
| epoch   1 |   400/  991 batches | accuracy    0.310
| epoch   1 |   450/  991 batches | accuracy    0.300
| epoch   1 |   500/  991 batches | accuracy    0.326
| epoch   1 |   550/  991 batches | accuracy    0.344
| epoch   1 |   600/  991 batches | accuracy    0.335
| epoch   1 |   650/  991 batches | accuracy    0.330
| epoch   1 |   700/  991 batches | accuracy    0.336
| epoch   1 |   750/  991 batches | accuracy    0.332
| epoch  

[I 2023-08-02 11:53:47,549] Trial 3 finished with value: 0.4254044337926902 and parameters: {'optimizer': 'Adagrad'}. Best is trial 1 with value: 0.4409826243259437.


| epoch  20 |   950/  991 batches | accuracy    0.768
-----------------------------------------------------------
| end of epoch  20 | time:  2.95s | valid accuracy    0.425 
-----------------------------------------------------------
| epoch   1 |    50/  991 batches | accuracy    0.305
| epoch   1 |   100/  991 batches | accuracy    0.295
| epoch   1 |   150/  991 batches | accuracy    0.315
| epoch   1 |   200/  991 batches | accuracy    0.340
| epoch   1 |   250/  991 batches | accuracy    0.304
| epoch   1 |   300/  991 batches | accuracy    0.323
| epoch   1 |   350/  991 batches | accuracy    0.339
| epoch   1 |   400/  991 batches | accuracy    0.319
| epoch   1 |   450/  991 batches | accuracy    0.323
| epoch   1 |   500/  991 batches | accuracy    0.327
| epoch   1 |   550/  991 batches | accuracy    0.326
| epoch   1 |   600/  991 batches | accuracy    0.319
| epoch   1 |   650/  991 batches | accuracy    0.333
| epoch   1 |   700/  991 batches | accuracy    0.339
| epoch  

[I 2023-08-02 11:54:52,725] Trial 4 finished with value: 0.4092270820850809 and parameters: {'optimizer': 'Adagrad'}. Best is trial 1 with value: 0.4409826243259437.


| epoch   1 |    50/  991 batches | accuracy    0.280
| epoch   1 |   100/  991 batches | accuracy    0.323
| epoch   1 |   150/  991 batches | accuracy    0.292
| epoch   1 |   200/  991 batches | accuracy    0.319
| epoch   1 |   250/  991 batches | accuracy    0.335
| epoch   1 |   300/  991 batches | accuracy    0.328
| epoch   1 |   350/  991 batches | accuracy    0.329
| epoch   1 |   400/  991 batches | accuracy    0.332
| epoch   1 |   450/  991 batches | accuracy    0.325
| epoch   1 |   500/  991 batches | accuracy    0.323
| epoch   1 |   550/  991 batches | accuracy    0.306
| epoch   1 |   600/  991 batches | accuracy    0.326
| epoch   1 |   650/  991 batches | accuracy    0.318
| epoch   1 |   700/  991 batches | accuracy    0.324
| epoch   1 |   750/  991 batches | accuracy    0.311
| epoch   1 |   800/  991 batches | accuracy    0.328
| epoch   1 |   850/  991 batches | accuracy    0.314
| epoch   1 |   900/  991 batches | accuracy    0.331


[I 2023-08-02 11:54:56,973] Trial 5 pruned. 


| epoch   1 |   950/  991 batches | accuracy    0.338
-----------------------------------------------------------
| end of epoch   1 | time:  3.14s | valid accuracy    0.321 
-----------------------------------------------------------
| epoch   1 |    50/  991 batches | accuracy    0.322
| epoch   1 |   100/  991 batches | accuracy    0.293
| epoch   1 |   150/  991 batches | accuracy    0.326
| epoch   1 |   200/  991 batches | accuracy    0.318
| epoch   1 |   250/  991 batches | accuracy    0.314
| epoch   1 |   300/  991 batches | accuracy    0.301
| epoch   1 |   350/  991 batches | accuracy    0.308
| epoch   1 |   400/  991 batches | accuracy    0.329
| epoch   1 |   450/  991 batches | accuracy    0.324
| epoch   1 |   500/  991 batches | accuracy    0.299
| epoch   1 |   550/  991 batches | accuracy    0.353
| epoch   1 |   600/  991 batches | accuracy    0.304
| epoch   1 |   650/  991 batches | accuracy    0.338
| epoch   1 |   700/  991 batches | accuracy    0.327
| epoch  

[I 2023-08-02 11:55:01,055] Trial 6 pruned. 


| epoch   1 |   950/  991 batches | accuracy    0.310
-----------------------------------------------------------
| end of epoch   1 | time:  3.02s | valid accuracy    0.288 
-----------------------------------------------------------
| epoch   1 |    50/  991 batches | accuracy    0.287
| epoch   1 |   100/  991 batches | accuracy    0.308
| epoch   1 |   150/  991 batches | accuracy    0.298
| epoch   1 |   200/  991 batches | accuracy    0.324
| epoch   1 |   250/  991 batches | accuracy    0.341
| epoch   1 |   300/  991 batches | accuracy    0.321
| epoch   1 |   350/  991 batches | accuracy    0.314
| epoch   1 |   400/  991 batches | accuracy    0.335
| epoch   1 |   450/  991 batches | accuracy    0.326
| epoch   1 |   500/  991 batches | accuracy    0.326
| epoch   1 |   550/  991 batches | accuracy    0.327
| epoch   1 |   600/  991 batches | accuracy    0.336
| epoch   1 |   650/  991 batches | accuracy    0.336
| epoch   1 |   700/  991 batches | accuracy    0.319
| epoch  

[I 2023-08-02 11:55:12,154] Trial 7 pruned. 


-----------------------------------------------------------
| end of epoch   3 | time:  3.64s | valid accuracy    0.369 
-----------------------------------------------------------
| epoch   1 |    50/  991 batches | accuracy    0.311
| epoch   1 |   100/  991 batches | accuracy    0.301
| epoch   1 |   150/  991 batches | accuracy    0.319
| epoch   1 |   200/  991 batches | accuracy    0.316
| epoch   1 |   250/  991 batches | accuracy    0.318
| epoch   1 |   300/  991 batches | accuracy    0.306
| epoch   1 |   350/  991 batches | accuracy    0.306
| epoch   1 |   400/  991 batches | accuracy    0.338
| epoch   1 |   450/  991 batches | accuracy    0.321
| epoch   1 |   500/  991 batches | accuracy    0.323
| epoch   1 |   550/  991 batches | accuracy    0.325
| epoch   1 |   600/  991 batches | accuracy    0.318
| epoch   1 |   650/  991 batches | accuracy    0.317
| epoch   1 |   700/  991 batches | accuracy    0.311
| epoch   1 |   750/  991 batches | accuracy    0.316
| epoch  

[I 2023-08-02 11:55:16,924] Trial 8 pruned. 


-----------------------------------------------------------
| end of epoch   1 | time:  3.52s | valid accuracy    0.332 
-----------------------------------------------------------
| epoch   1 |    50/  991 batches | accuracy    0.312
| epoch   1 |   100/  991 batches | accuracy    0.313
| epoch   1 |   150/  991 batches | accuracy    0.334
| epoch   1 |   200/  991 batches | accuracy    0.331
| epoch   1 |   250/  991 batches | accuracy    0.336
| epoch   1 |   300/  991 batches | accuracy    0.335
| epoch   1 |   350/  991 batches | accuracy    0.319
| epoch   1 |   400/  991 batches | accuracy    0.313
| epoch   1 |   450/  991 batches | accuracy    0.320
| epoch   1 |   500/  991 batches | accuracy    0.318
| epoch   1 |   550/  991 batches | accuracy    0.330
| epoch   1 |   600/  991 batches | accuracy    0.316
| epoch   1 |   650/  991 batches | accuracy    0.342
| epoch   1 |   700/  991 batches | accuracy    0.331
| epoch   1 |   750/  991 batches | accuracy    0.300
| epoch  

[I 2023-08-02 11:55:21,607] Trial 9 pruned. 


In [13]:
prune_ledger = open("prune.txt",'a')
prune_ledger.write("Study statistics: "+'\n')
prune_ledger.write('\n')
prune_ledger.write("  Number of finished trials:"+str( len(study.trials)))
prune_ledger.write('\n')
prune_ledger.write("  Number of pruned trials: "+str(len(pruned_trials)))
prune_ledger.write('\n')
prune_ledger.write("  Number of complete trials: "+str(len(complete_trials)))

prune_ledger.write("Best trial:")
prune_ledger.write('\n')
trial = study.best_trial

prune_ledger.write("  Value: "+ str(trial.value))
prune_ledger.write('\n')

prune_ledger.write("  Params: ")
prune_ledger.write('\n')
for key, value in trial.params.items():
    prune_ledger.write("    {}: {}".format(key, value))
    prune_ledger.write('\n')