# Improving sequence classification via GAN

In [1]:
# Load the nb_black extension (presumably a cell auto-formatter — TODO confirm) and
# enable autoreload mode 2 so edited project modules are picked up live.
%load_ext nb_black
%load_ext autoreload
%autoreload 2

<IPython.core.display.Javascript object>

In [3]:
# Switch the working directory to the project folder; later cells use paths relative to it
# (e.g. "secret.json"). NOTE(review): the path is relative, so running this cell a second
# time from inside the folder will fail.
%cd gan-plus-nlp-main/

/home/valperovich/projects/other/std/gan-plus-nlp-main


<IPython.core.display.Javascript object>

In [4]:
import os
import gc
import sys
import json
import numpy as np
import pandas as pd
import importlib as imp
import neptune as neptune
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

import torch
import torch.nn.functional as F

from typing import List, Dict

import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")
# Make modules located in the parent directory importable.
sys.path.append("..")
# Credentials for experiment tracking; the keys "neptune_project" and "neptune_token"
# are read later when Neptune runs are created. The context manager closes the file
# handle right after parsing instead of leaving it open.
with open("secret.json") as secret_file:
    secret = json.load(secret_file)

<IPython.core.display.Javascript object>

# Data Loading

In [53]:
from datasets import load_dataset

# Dataset identifier to run on. Alternatives that were used with this notebook:
# ag_news, vmalperovich/QC, vmalperovich/SST5, vmalperovich/20ng
dataset_name = "vmalperovich/20ng"
# Load all splits and rename the target column from "label" to "labels" in one chain.
# NOTE(review): `ignore_verifications` is reported as deprecated in newer `datasets`
# releases (replaced by `verification_mode`) — confirm against the installed version.
dataset = load_dataset(dataset_name, ignore_verifications=True).rename_column(
    "label", "labels"
)
dataset

Found cached dataset 20ng (/home/valperovich/.cache/huggingface/datasets/vmalperovich___20ng/default/0.0.0/cb7b2bd44b9005a66152af78ef5bb91ca7d7babd62072d71848b705559ce64ac)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 9051
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 7532
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 2263
    })
})

<IPython.core.display.Javascript object>

In [54]:
# _indexes = np.random.permutation(len(dataset['train']))
# train_size = int(len(dataset['train']) * 0.65)
# # train_indexes = _indexes[:train_size]
# valid_indexes = _indexes[train_size:]
# dataset['validation'] = dataset['train'].select(valid_indexes)

# # if len(dataset['test']) > 10_000:
# #     _indexes = np.random.permutation(len(dataset['test']))
# #     dataset['test'] = dataset['test'].select(_indexes[:10_000])
# dataset

<IPython.core.display.Javascript object>

In [56]:
# Class metadata comes from the "labels" feature of the train split.
_labels_feature = dataset["train"].features["labels"]
LABEL_NAMES = _labels_feature.names
NUM_LABELS = _labels_feature.num_classes


def get_ids2label(ids):
    """Map an iterable of integer class ids to the corresponding label names."""
    return [LABEL_NAMES[t] for t in ids]


LABEL_NAMES, NUM_LABELS

(['comp.sys.mac.hardware',
  'comp.graphics',
  'sci.space',
  'talk.politics.guns',
  'sci.med',
  'comp.sys.ibm.pc.hardware',
  'comp.os.ms-windows.misc',
  'rec.motorcycles',
  'misc.forsale',
  'alt.atheism',
  'rec.autos',
  'sci.electronics',
  'comp.windows.x',
  'rec.sport.hockey',
  'rec.sport.baseball',
  'talk.politics.mideast',
  'sci.crypt',
  'soc.religion.christian',
  'talk.politics.misc',
  'talk.religion.misc'],
 20)

<IPython.core.display.Javascript object>

In [10]:
# `dataset_df` was never defined in the notebook (it only existed as leftover kernel
# state). Build it from the train split, whose size matches the recorded count of 9051.
dataset_df = dataset["train"].to_pandas()
# Distribution of whitespace-separated token counts per training text
# (presumably what informed the choice of max_seq_length — TODO confirm).
dataset_df["text"].str.split().apply(len).describe(percentiles=[0.5, 0.7, 0.9, 0.95])

count     9051.000000
mean       289.114462
std        558.313261
min         14.000000
50%        176.000000
70%        260.000000
90%        507.000000
95%        772.000000
max      11821.000000
Name: text, dtype: float64

## Experiment

In [58]:
import torch
import torch.nn.functional as F

# Expose only the GPU with index 7 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

# Choose the compute device and report what was found; CPU is the fallback.
if not torch.cuda.is_available():
    print("No GPU available, using the CPU instead.")
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print("There are %d GPU(s) available." % torch.cuda.device_count())
    print("We will use the GPU:", torch.cuda.get_device_name())
# Show how much GPU memory is currently allocated by tensors.
print(torch.cuda.memory_allocated())

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2080 Ti
0


<IPython.core.display.Javascript object>

In [5]:
# Pretrained encoder checkpoint for the first experiment; the commented lines are
# alternatives. `model_name` feeds CONFIG["encoder_name"] and the tokenizer below.
# model_name = "bert-base-cased"
# model_name = "distilbert-base-uncased"
model_name = "bert-base-uncased"
# model_name = "google/electra-base-discriminator"

<IPython.core.display.Javascript object>

In [7]:
import sys

# Drop cached project modules so the import below re-executes them from disk.
# Each entry is removed independently: the previous chained `del` statements sat in
# one try-block, so the first missing module skipped all remaining deletions, and
# the bare `except` hid every other kind of error as well.
for _stale_module in (
    "base",
    "model",
    "model.discriminator",
    "model.generator",
    "model.utils",
    "trainer",
):
    sys.modules.pop(_stale_module, None)

# Free Python garbage first, then hand cached GPU blocks back to the driver.
gc.collect()
torch.cuda.empty_cache()

import model

# Kept from the original flow; after the purge above this re-executes the package once more.
model = imp.reload(model)

pass


<IPython.core.display.Javascript object>

In [60]:
# Shared experiment configuration. The baseline and GAN configurations further down
# are shallow copies of this dictionary with a few overrides.
CONFIG = {
    "TASK": "classification",
    # Encoder and input handling
    "encoder_name": model_name,
    "frozen_backbone": False,
    "batch_size": 32,
    "max_seq_length": 172,  # truncation length used when tokenizing
    "noise_size": 100,  # input size of the generator
    # Dataset bookkeeping
    "dataset_train_size": len(dataset["train"]),
    "dataset_valid_size": len(dataset["validation"]),
    "dataset_test_size": len(dataset["test"]),
    "num_labels": NUM_LABELS,
    "label_names": LABEL_NAMES,
    # Optimisation settings (consumed by the project trainers)
    "lr_discriminator": 5e-5,
    "lr_generator": 5e-5,
    "epsilon": 1e-8,
    "num_train_epochs": 5,
    "multi_gpu": False,
    "dropout_rate": 0.2,
    "apply_scheduler": True,
    "warmup_proportion_d": 0.1,
    "warmup_proportion_g": 0.0,
    "fake_label_index": -1,
    # Provenance and checkpoint location
    "dataset": dataset_name,
    "save_path": "../weights/best_model.pth",
}

<IPython.core.display.Javascript object>

In [63]:
from copy import copy
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler



# Sizes of the labeled and unlabeled subsets sampled from the train split.
LABELED_SIZE = 200
UNLABELED_SIZE = 4000
FULL_SIZE = LABELED_SIZE + UNLABELED_SIZE
# Number of times the labeled rows are repeated in the GAN train set:
# floor(log2(full / labeled)), but never less than 1.
multiplier = max(1, int(np.log2(FULL_SIZE / LABELED_SIZE)))
print("Multiplier:", multiplier)

# Seed NumPy's global RNG (presumably so the DataFrame sampling below is repeatable).
np.random.seed(42)

# Tokenizer matching the chosen encoder, and a collator that pads each batch.
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def prepare_experiment_datasets(LABELED_SIZE, UNLABELED_SIZE, FULL_SIZE, multiplier):
    """Tokenize the corpus and build the semi-supervised training splits.

    Reads the notebook globals ``dataset``, ``tokenizer`` and ``CONFIG``.

    Args:
        LABELED_SIZE: number of train rows sampled to keep their labels.
        UNLABELED_SIZE: number of train rows sampled and stripped of their labels
            (``labels`` is overwritten with -100).
        FULL_SIZE: not used by this function; kept so existing calls keep working.
        multiplier: number of copies of the labeled rows added after the unlabeled
            rows in the ``train`` split.

    Returns:
        The tokenized dataset dictionary in which ``train`` holds the unlabeled rows
        followed by ``multiplier`` copies of the labeled rows, and the new split
        ``train_only_labeled`` holds the labeled rows once. Both carry a boolean
        ``labeled_mask`` column.
    """

    def tokenize(batch):
        return tokenizer(
            batch["text"], truncation=True, max_length=CONFIG["max_seq_length"]
        )

    tokenized_dataset = dataset.map(tokenize, batched=True)
    tokenized_dataset = tokenized_dataset.remove_columns(["text"])
    tokenized_train_df = tokenized_dataset["train"].to_pandas()

    # NOTE(review): the two samples are drawn independently from the same frame, so a
    # row can end up in both the labeled and the unlabeled subset.
    labeled_df = tokenized_train_df.sample(LABELED_SIZE)
    labeled_df["labeled_mask"] = True

    unlabeled_df = tokenized_train_df.sample(UNLABELED_SIZE)
    unlabeled_df["labeled_mask"] = False
    unlabeled_df["labels"] = -100

    # One concat replaces the repeated DataFrame.append calls: append was removed in
    # pandas 2.0 and re-copied the growing frame on every iteration. Row order is
    # unchanged: unlabeled rows first, then the labeled rows `multiplier` times.
    gan_train_df = pd.concat([unlabeled_df] + [labeled_df] * multiplier)

    tokenized_dataset["train"] = Dataset.from_pandas(gan_train_df, preserve_index=False)
    tokenized_dataset["train_only_labeled"] = Dataset.from_pandas(
        labeled_df, preserve_index=False
    )
    print(
        "TRAIN (FOR only discriminator):", len(tokenized_dataset["train_only_labeled"])
    )
    print("TRAIN (FOR GAN):", len(tokenized_dataset["train"]))
    return tokenized_dataset


# Build the tokenized splits for the sizes chosen above and display the result.
tokenized_dataset = prepare_experiment_datasets(
    LABELED_SIZE=LABELED_SIZE,
    UNLABELED_SIZE=UNLABELED_SIZE,
    FULL_SIZE=FULL_SIZE,
    multiplier=multiplier,
)
tokenized_dataset

Multiplier: 4


Map:   0%|          | 0/9051 [00:00<?, ? examples/s]

Map:   0%|          | 0/7532 [00:00<?, ? examples/s]

Map:   0%|          | 0/2263 [00:00<?, ? examples/s]

TRAIN (FOR only discriminator): 200
TRAIN (FOR GAN): 4800


DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'labeled_mask'],
        num_rows: 4800
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7532
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2263
    })
    train_only_labeled: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'labeled_mask'],
        num_rows: 200
    })
})

<IPython.core.display.Javascript object>

In [18]:
def _make_loader(split, batch_size, sampler_cls):
    """Wrap one tokenized split in a DataLoader that pads batches with `data_collator`."""
    return DataLoader(
        split,
        batch_size=batch_size,
        sampler=sampler_cls(split),
        collate_fn=data_collator,
        pin_memory=True,
    )


# Training loaders sample randomly; evaluation loaders keep the dataset order.
train_only_labeled_dataloader = _make_loader(
    tokenized_dataset["train_only_labeled"], CONFIG["batch_size"], RandomSampler
)
train_dataloader = _make_loader(
    tokenized_dataset["train"], CONFIG["batch_size"], RandomSampler
)
valid_dataloader = _make_loader(
    tokenized_dataset["validation"], CONFIG["batch_size"], SequentialSampler
)
# The test loader uses a fixed batch size of 32 rather than the configured one.
test_dataloader = _make_loader(tokenized_dataset["test"], 32, SequentialSampler)

### Train only discriminator

In [34]:
# Release cached GPU memory and run the garbage collector before building models.
torch.cuda.empty_cache()
gc.collect()
# Import both trainer modules under explicit aliases.
from trainer import trainer as trainer_module
from trainer import gan_trainer as gan_trainer_module

# Re-execute the modules so that changes made on disk are visible in this kernel.
trainer_module = imp.reload(trainer_module)
gan_trainer_module = imp.reload(gan_trainer_module)

In [35]:
from copy import copy

# Baseline (no GAN) settings: a shallow copy of CONFIG with the GAN switches off.
BASE_CONFIG = copy(CONFIG)
BASE_CONFIG.update(
    {
        "GAN": False,
        "gan_training": False,
        "num_labels": NUM_LABELS,
        "num_train_epochs": 5,
        "LABELED_SIZE": LABELED_SIZE,
    }
)

# The classifier receives the whole configuration as keyword arguments.
discriminator = model.DiscriminatorForSequenceClassification(**BASE_CONFIG)

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [36]:
# Import the trainer module under an alias so that the name `trainer` only ever refers
# to the trainer instance created below. Previously the same name was bound to the
# module first and then rebound to the instance.
from trainer import trainer as trainer_module

trainer_module = imp.reload(trainer_module)

# The baseline is trained on the labeled-only split, so its size is what gets recorded.
BASE_CONFIG["num_train_examples"] = len(train_only_labeled_dataloader.dataset)
trainer = trainer_module.TrainerSequenceClassification(
    config=BASE_CONFIG,
    discriminator=discriminator,
    train_dataloader=train_only_labeled_dataloader,
    valid_dataloader=valid_dataloader,
    device=device,
)


Trainable layers 199


In [37]:
%%time
# Open a Neptune run for the baseline, tagged 'test', and store the trainer's config in it.
run = None
run = neptune.init_run(
    project=secret["neptune_project"], api_token=secret["neptune_token"], tags=['test']
)
run["config"] = trainer.config

# One training pass and one validation pass per epoch; `run` is passed as the logging target.
for epoch_i in range(BASE_CONFIG["num_train_epochs"]):
    print(f"======== Epoch {epoch_i + 1} / {BASE_CONFIG['num_train_epochs']} ========")
    train_info = trainer.train_epoch(log_env=run)
    valid_metrics = trainer.validation(log_env=run)
# The run stays open here; it is stopped in the following cell once test results are stored.
# run.stop()

https://app.neptune.ai/vmalperovich/gan-in-nlp/e/GAN2-293


You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


	Train loss discriminator: 3.007
	Test loss discriminator: 3.010
	Test accuracy discriminator: 0.076
	Test f1 discriminator: 0.027
	Train loss discriminator: 2.915
	Test loss discriminator: 2.999
	Test accuracy discriminator: 0.068
	Test f1 discriminator: 0.023
	Train loss discriminator: 2.800
	Test loss discriminator: 2.945
	Test accuracy discriminator: 0.090
	Test f1 discriminator: 0.035
	Train loss discriminator: 2.606
	Test loss discriminator: 2.853
	Test accuracy discriminator: 0.110
	Test f1 discriminator: 0.042
	Train loss discriminator: 2.397
	Test loss discriminator: 2.777
	Test accuracy discriminator: 0.161
	Test f1 discriminator: 0.091
CPU times: user 43 s, sys: 2.26 s, total: 45.3 s
Wall time: 46.4 s


In [8]:
# Evaluate the baseline discriminator on the test loader, store the result in the
# Neptune run under "test", close the run, and display the result.
predict_info = trainer.predict(
    discriminator, test_dataloader, label_names=CONFIG["label_names"]
)
run["test"] = predict_info
run.stop()
predict_info

<IPython.core.display.Javascript object>

### Train via GAN

In [39]:
import gc

# Collect unreachable Python objects first, then release cached GPU memory,
# before the GAN models are created.
gc.collect()
torch.cuda.empty_cache()

In [40]:
from copy import copy

# GAN settings: a shallow copy of CONFIG with the GAN switches on.
GAN_CONFIG = copy(CONFIG)
GAN_CONFIG.update(
    {
        "GAN": True,
        "gan_training": True,
        "GAN_TYPE": "dummy",
        "mixed_fake_ratio": 0.2,
        "LABELED_SIZE": LABELED_SIZE,
        "UNLABELED_SIZE": UNLABELED_SIZE,
        "num_train_epochs": 5,
    }
)


# GAN_CONFIG

In [41]:
# Create the GAN-mode discriminator first and size the generator from this
# discriminator's encoder. Previously the generator read the hidden size from the
# `discriminator` left over from the baseline section, which made this cell depend on
# that earlier cell and differed from the order used in the experiment loop.
discriminator = model.DiscriminatorForSequenceClassification(**GAN_CONFIG)
generator = model.SimpleSequenceGenerator(
    input_size=CONFIG["noise_size"],
    output_size=discriminator.encoder.config.hidden_size,
)
generator

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training with GAN mode on!
Default fake label index is -1


SimpleSequenceGenerator(
  (layers): Sequential(
    (0): Linear(in_features=100, out_features=768, bias=True)
    (1): LeakyReLU(negative_slope=0.2, inplace=True)
    (2): Dropout(p=0.2, inplace=False)
    (3): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (4): Linear(in_features=768, out_features=768, bias=True)
    (5): LeakyReLU(negative_slope=0.2, inplace=True)
    (6): Dropout(p=0.2, inplace=False)
    (7): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
)

In [42]:
# Import and reload the GAN trainer module so that on-disk edits are picked up.
from trainer import gan_trainer as gan_trainer_module

gan_trainer_module = imp.reload(gan_trainer_module)

# The GAN trainer uses the mixed train split (unlabeled rows plus repeated labeled rows),
# so the recorded example count is the size of that split.
GAN_CONFIG["num_train_examples"] = len(train_dataloader.dataset)
gan_trainer = gan_trainer_module.GANTrainerSequenceClassification(
    config=GAN_CONFIG,
    discriminator=discriminator,
    generator=generator,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
    device=device,
)

Trainable layers 199


In [43]:
%%time
# Open an untagged Neptune run for the GAN experiment and store the trainer's config in it.
run = None
tags = None
run = neptune.init_run(
    project=secret["neptune_project"], api_token=secret["neptune_token"], tags=tags
)
run["config"] = gan_trainer.config


# One training pass and one validation pass per epoch; `run` is passed as the logging target.
for epoch_i in range(GAN_CONFIG["num_train_epochs"]):
    print(f"======== Epoch {epoch_i + 1} / {GAN_CONFIG['num_train_epochs']} ========")
    train_info = gan_trainer.train_epoch(log_env=run)
    result = gan_trainer.validation(log_env=run)
# The run stays open here; it is stopped in the following cell once test results are stored.
# run.stop()

https://app.neptune.ai/vmalperovich/gan-in-nlp/e/GAN2-294
	Train loss discriminator: 4.386
	Train loss generator: 0.541
	Test loss discriminator: 2.961
	Test accuracy discriminator: 0.104
	Test f1 discriminator: 0.031
	Train loss discriminator: 3.042
	Train loss generator: 0.849
	Test loss discriminator: 2.323
	Test accuracy discriminator: 0.309
	Test f1 discriminator: 0.275
	Train loss discriminator: 1.726
	Train loss generator: 0.836
	Test loss discriminator: 2.030
	Test accuracy discriminator: 0.412
	Test f1 discriminator: 0.367
	Train loss discriminator: 1.565
	Train loss generator: 0.831
	Test loss discriminator: 2.000
	Test accuracy discriminator: 0.433
	Test f1 discriminator: 0.409
	Train loss discriminator: 0.998
	Train loss generator: 0.806
	Test loss discriminator: 2.183
	Test accuracy discriminator: 0.455
	Test f1 discriminator: 0.448
CPU times: user 6min 1s, sys: 45.9 s, total: 6min 47s
Wall time: 6min 47s


In [9]:
# Evaluate the GAN-trained discriminator on the test loader, store the result in the
# Neptune run under "test", close the run, and display the result.
predict_info = gan_trainer.predict(
    discriminator, test_dataloader, label_names=CONFIG["label_names"]
)
run["test"] = predict_info
run.stop()
predict_info

<IPython.core.display.Javascript object>

# Experiments

In [67]:
from trainer import trainer as trainer_module
from trainer import gan_trainer as gan_trainer_module


# Tag attached to every Neptune run created by the sweep below.
tags = ["FINAL"]

# Encoder used for the sweep; the commented names are alternatives.
# model_name = "bert-base-cased"
# model_name = "google/electra-small-discriminator"
# model_name = "google/electra-base-discriminator"
model_name = "distilbert-base-uncased"
# Rebuild the tokenizer and the padding collator so they match the new encoder.
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

<IPython.core.display.Javascript object>

In [68]:
# Point the shared configuration at the sweep encoder and set the noise options.
# Two update calls are used because the string form is derived from the stored range.
CONFIG.update(encoder_name=model_name, noise_range=(-2, 2))
CONFIG.update(
    noise_range_str=str(CONFIG["noise_range"]),
    noise_type="uniform",
    num_train_epochs=3,
)
# CONFIG["gen_multiplier"] = 4
print(CONFIG["num_labels"])

# Number of GAN repetitions per labeled-set size.
NUM_TRIALS_GAN = 2
# Unlabeled pool: 150 rows per class, capped at one less than the train-split size.
UNLABELED_SIZE = min(CONFIG["num_labels"] * 150, CONFIG["dataset_train_size"] - 1)
print(UNLABELED_SIZE)

20
3000


<IPython.core.display.Javascript object>

In [10]:
# Sweep over labeled-set sizes (samples per class). For every size the data splits are
# rebuilt once, then the baseline classifier and the GAN variant are each trained and
# evaluated several times, one Neptune run per trial.
# NOTE(review): the baseline is scored with its last-epoch weights, while the GAN model is
# scored after loading the checkpoint at CONFIG["save_path"] — confirm this is intended.

# Number of baseline (no GAN) repetitions per labeled-set size.
NUM_TRIALS_BASELINE = 2

for per_label in tqdm_notebook([5, 10, 20, 50, 100]):
    LABELED_SIZE = CONFIG["num_labels"] * per_label
    print(f"\n\n\n****************{LABELED_SIZE}***********\n\n\n")

    CONFIG["per_label_samples"] = per_label
    try:
        del discriminator
    except NameError:
        # No model from a previous iteration exists yet; nothing to free.
        pass
    torch.cuda.empty_cache()
    gc.collect()
    FULL_SIZE = LABELED_SIZE + UNLABELED_SIZE
    # Repetition factor for the labeled rows: floor(log2(full / labeled)) - 1, at least 1.
    multiplier = int(np.log2(FULL_SIZE / LABELED_SIZE)) - 1
    multiplier = max(1, multiplier)
    print("Multiplier:", multiplier)

    tokenized_dataset = prepare_experiment_datasets(
        LABELED_SIZE, UNLABELED_SIZE, FULL_SIZE, multiplier
    )

    train_only_labeled_dataloader = DataLoader(
        tokenized_dataset["train_only_labeled"],
        batch_size=CONFIG["batch_size"],
        sampler=RandomSampler(tokenized_dataset["train_only_labeled"]),
        collate_fn=data_collator,
        pin_memory=True,
    )

    train_dataloader = DataLoader(
        tokenized_dataset["train"],
        batch_size=CONFIG["batch_size"],
        sampler=RandomSampler(tokenized_dataset["train"]),
        collate_fn=data_collator,
        pin_memory=True,
    )

    valid_dataloader = DataLoader(
        tokenized_dataset["validation"],
        batch_size=CONFIG["batch_size"],
        sampler=SequentialSampler(tokenized_dataset["validation"]),
        collate_fn=data_collator,
        pin_memory=True,
    )

    test_dataloader = DataLoader(
        tokenized_dataset["test"],
        batch_size=32,
        sampler=SequentialSampler(tokenized_dataset["test"]),
        collate_fn=data_collator,
        pin_memory=True,
    )
    for _ in range(NUM_TRIALS_BASELINE):
        # NO GAN
        print("NO GAN...")
        try:
            del discriminator
        except NameError:
            # Already deleted at the top of the outer iteration.
            pass
        torch.cuda.empty_cache()
        gc.collect()
        BASE_CONFIG = copy(CONFIG)
        BASE_CONFIG["num_train_examples"] = len(train_only_labeled_dataloader.dataset)
        BASE_CONFIG["GAN"] = False
        BASE_CONFIG["gan_training"] = False
        BASE_CONFIG["LABELED_SIZE"] = LABELED_SIZE
        discriminator = model.DiscriminatorForSequenceClassification(**BASE_CONFIG)
        print(discriminator.encoder_name)
        trainer = trainer_module.TrainerSequenceClassification(
            config=BASE_CONFIG,
            discriminator=discriminator,
            train_dataloader=train_only_labeled_dataloader,
            valid_dataloader=valid_dataloader,
            device=device,
        )
        run = neptune.init_run(
            project=secret["neptune_project"],
            api_token=secret["neptune_token"],
            tags=tags,
        )
        try:
            run["config"] = trainer.config

            for epoch_i in range(BASE_CONFIG["num_train_epochs"]):
                print(f"== Epoch {epoch_i + 1} / {BASE_CONFIG['num_train_epochs']} ==")
                train_info = trainer.train_epoch(log_env=run)
                valid_metrics = trainer.validation(log_env=run)
            predict_info = trainer.predict(
                discriminator, test_dataloader, label_names=CONFIG["label_names"]
            )
            run["test"] = predict_info
        finally:
            # Close the Neptune run even when the trial fails, so it is not left open.
            run.stop()

    for _ in range(NUM_TRIALS_GAN):
        # GAN
        print("GAN...")
        del discriminator
        gc.collect()
        torch.cuda.empty_cache()
        GAN_CONFIG = copy(CONFIG)
        GAN_CONFIG["GAN"] = True
        GAN_CONFIG["gan_training"] = True
        GAN_CONFIG["GAN_TYPE"] = "dummy"
        GAN_CONFIG["LABELED_SIZE"] = LABELED_SIZE
        GAN_CONFIG["UNLABELED_SIZE"] = UNLABELED_SIZE
        GAN_CONFIG["FULL_SIZE"] = FULL_SIZE
        discriminator = model.DiscriminatorForSequenceClassification(**GAN_CONFIG)
        generator = model.SimpleSequenceGenerator(
            input_size=CONFIG["noise_size"],
            output_size=discriminator.encoder.config.hidden_size,
        )

        GAN_CONFIG["num_train_examples"] = len(train_dataloader.dataset)
        gan_trainer = gan_trainer_module.GANTrainerSequenceClassification(
            config=GAN_CONFIG,
            discriminator=discriminator,
            generator=generator,
            train_dataloader=train_dataloader,
            valid_dataloader=valid_dataloader,
            device=device,
            save_path=CONFIG["save_path"],
        )
        run = neptune.init_run(
            project=secret["neptune_project"],
            api_token=secret["neptune_token"],
            tags=tags,
        )
        try:
            run["config"] = gan_trainer.config

            for epoch_i in range(GAN_CONFIG["num_train_epochs"]):
                print(f"== Epoch {epoch_i + 1} / {GAN_CONFIG['num_train_epochs']} ==")
                train_info = gan_trainer.train_epoch(log_env=run)
                result = gan_trainer.validation(log_env=run)
            # Restore the checkpoint stored at the save path before testing; map_location
            # places the tensors on the active device regardless of where they were saved.
            discriminator.load_state_dict(
                torch.load(CONFIG["save_path"], map_location=device)
            )
            predict_info = gan_trainer.predict(
                discriminator, test_dataloader, label_names=CONFIG["label_names"]
            )
            run["test"] = predict_info
        finally:
            # Close the Neptune run even when the trial fails, so it is not left open.
            run.stop()

<IPython.core.display.Javascript object>