In [1]:
import json
import argparse
import torch
from torch import nn
import os
import numpy as np

from src.slurm import init_signal_handler, init_distributed_mode
from src.data.loader import check_data_params, load_data
from src.utils import bool_flag, initialize_exp, set_sampling_probs, shuf_order
from src.model import check_model_params, build_model
from src.trainer import SingleTrainer
from src.evaluation.evaluator import SingleEvaluator

import apex
from src.fp16 import network_to_half

# Restrict the GPUs visible to this process to devices 0 and 1.
# NOTE(review): this assignment runs after `import torch` / `import apex`. It
# presumably still takes effect as long as nothing has initialised CUDA yet,
# but setting it before those imports would be the safer ordering -- confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"

Argument overrides (applied only when running in a Jupyter environment)

In [3]:
## jupyter
## main parameters
def modified_params(params):
    """
    Apply the notebook's experiment settings on top of an already-parsed
    parameter object.

    Every entry of the override table below is written onto `params` as an
    attribute (replacing any existing value), in the order listed. The very
    same `params` object is returned, so the call mutates its argument.
    """
    overrides = {
        ## main parameters
        "exp_name": "cmlm",                             # experiment name
        "dump_path": "./dumped/",                       # where to store the experiment

        ## data location / training objective
        "data_path": "./data/processed/coco/",          # data location
        "lngs": 'wiki',                                 # lang pretraining source
        "imgs": 'vg',                                   # image pretraining source
        "cmodal": 'cap-img',                            # multi-modal
        "mlm_steps": 'wiki',                            # MLM objective for language
        "ipm_steps": 'vg',                              # MLM objective for image
        "cmlm_steps": 'cap-img',                        # CMLM objective

        ## image
        "n_bbox": 36,                                   # number of bounding boxes
        "spatial_feat": 6,                              # number of spatial features
        "features": 2048,                               # dimension of image feature vectors

        ## transformer parameters
        "emb_dim": 1024,                                # embeddings / model dimension
        "n_layers": 6,                                  # number of layers
        "n_heads": 8,                                   # number of heads
        "dropout": 0.1,                                 # dropout
        "attention_dropout": 0.1,                       # attention dropout
        "gelu_activation": True,                        # GELU instead of ReLU

        ## optimization
        "batch_size": 32,                               # sequences per batch
        "bptt_word": 256,                               # sequences length for language
        "bptt_img": 256,                                # sequences length for image
        "optimizer": "adam,lr=0.0001",                  # optimizer
        "epoch_size": 200000,                           # number of sentences per epoch
        "validation_metrics": "_valid_mlm_ppl,_ce",     # validation metric (when to save the best model)
        "stopping_criterion": "_valid_mlm_ppl;_ce,10",  # end experiment if stopping criterion does not improve

        # Disabled alternatives kept for reference:
        #   stopping_criterion_image  = "_cross_entropy,10"
        #   stopping_criterion_cmodal = "_combined,10"
    }

    for attr_name, attr_value in overrides.items():
        setattr(params, attr_name, attr_value)

    return params

In [4]:
def get_parser():
    """
    Generate a parameters parser.

    Builds and returns an `argparse.ArgumentParser` declaring every option of
    the cross-modal training script. Nothing is parsed here: the caller decides
    which argument list to parse (e.g. `parse_args(args=[])` in a notebook).

    The adaptive-softmax options (`--asm_cutoffs`, `--asm_div_value`) are always
    registered. They used to be added only when a preliminary
    `parser.parse_known_args()` call saw `--asm` in `sys.argv`; that call ran on
    a half-built parser, so `--help` printed a truncated option list and exited,
    and the outcome depended on `sys.argv` rather than on the arguments the
    caller eventually parses.
    """
    # parse parameters
    parser = argparse.ArgumentParser(description="Cross-modal learning")

    # main parameters
    parser.add_argument("--dump_path", type=str, default="./dumped/",
                        help="Experiment dump path")
    parser.add_argument("--exp_name", type=str, default="",
                        help="Experiment name")
    parser.add_argument("--save_periodic", type=int, default=0,
                        help="Save the model periodically (0 to disable)")
    parser.add_argument("--exp_id", type=str, default="",
                        help="Experiment ID")

    # float16
    parser.add_argument("--fp16", type=bool_flag, default=False,
                        help="Run model with float16")

    # only use an encoder (use a specific decoder for machine translation)
    parser.add_argument("--encoder_only", type=bool_flag, default=True,
                        help="Only use an encoder")

    # model parameters
    parser.add_argument("--emb_dim", type=int, default=512,
                        help="Embedding layer size")
    parser.add_argument("--n_layers", type=int, default=4,
                        help="Number of Transformer layers")
    parser.add_argument("--n_heads", type=int, default=8,
                        help="Number of Transformer heads")
    parser.add_argument("--dropout", type=float, default=0,
                        help="Dropout")
    parser.add_argument("--attention_dropout", type=float, default=0,
                        help="Dropout in the attention layer")
    parser.add_argument("--gelu_activation", type=bool_flag, default=False,
                        help="Use a GELU activation instead of ReLU")
    parser.add_argument("--share_inout_emb", type=bool_flag, default=True,
                        help="Share input and output embeddings")
    parser.add_argument("--sinusoidal_embeddings", type=bool_flag, default=False,
                        help="Use sinusoidal embeddings")

    # adaptive softmax
    # The two tuning options are declared unconditionally (see docstring); they
    # simply carry their defaults when --asm is off.
    parser.add_argument("--asm", type=bool_flag, default=False,
                        help="Use adaptive softmax")
    parser.add_argument("--asm_cutoffs", type=str, default="8000,20000",
                        help="Adaptive softmax cutoffs")
    parser.add_argument("--asm_div_value", type=float, default=4,
                        help="Adaptive softmax cluster sizes ratio")

    # causal language modeling task parameters
    parser.add_argument("--context_size", type=int, default=0,
                        help="Context size (0 means that the first elements in sequences won't have any context)")

    # masked language modeling task parameters
    parser.add_argument("--word_pred", type=float, default=0.15,
                        help="Fraction of words for which we need to make a prediction")
    parser.add_argument("--word_sample_alpha", type=float, default=0,
                        help="Exponent for transforming word counts to probabilities (~word2vec sampling)")
    parser.add_argument("--word_mask_keep_rand", type=str, default="0.8,0.1,0.1",
                        help="Fraction of words to mask out / keep / randomize, among the words to predict")

    # masked image pretraining modeling task parameters
    parser.add_argument("--img_pred", type=float, default=0.15,
                        help="Fraction of imgs for which we need to make a prediction")
    parser.add_argument("--img_sample_alpha", type=float, default=0,
                        help="Exponent for transforming img counts to probabilities. Should be zero")
    parser.add_argument("--img_mask_keep_rand", type=str, default="0.8,0.1,0.1",
                        help="Fraction of img features to mask out / keep / randomize, among the features to predict")
    parser.add_argument("--n_bbox", type=int, default=36,
                        help="number of extracted bounding boxes")
    parser.add_argument("--features", type=int, default=2048,
                        help="dimension of feature vector for each region")
    # The notebook override sets `params.spatial_feat`; declaring it here makes
    # the attribute available on command-line runs as well.
    parser.add_argument("--spatial_feat", type=int, default=6,
                        help="number of spatial features for each region")

    # input sentence noise
    parser.add_argument("--word_shuffle", type=float, default=0,
                        help="Randomly shuffle input words (0 to disable)")
    parser.add_argument("--word_dropout", type=float, default=0,
                        help="Randomly dropout input words (0 to disable)")
    parser.add_argument("--word_blank", type=float, default=0,
                        help="Randomly blank input words (0 to disable)")

    # data
    parser.add_argument("--data_path", type=str, default="",
                        help="Data path")
    parser.add_argument("--lngs", type=str, default="",
                        help="Languages sources , wiki")
    parser.add_argument("--imgs", type=str, default="",
                        help="Images sources , vg")
    parser.add_argument("--cmodal", type=str, default="",
                        help="Crossmodal sources , cap-img")
    parser.add_argument("--max_vocab", type=int, default=-1,
                        help="Maximum vocabulary size (-1 to disable)")
    parser.add_argument("--min_count", type=int, default=0,
                        help="Minimum vocabulary count")
    parser.add_argument("--lg_sampling_factor", type=float, default=-1,
                        help="Language sampling factor")

    # batch parameters for word
    parser.add_argument("--bptt_word", type=int, default=256,
                        help="Sequence length")
    parser.add_argument("--max_len", type=int, default=100,
                        help="Maximum length of sentences (after BPE)")
    parser.add_argument("--group_by_size", type=bool_flag, default=True,
                        help="Sort sentences by size during the training")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Number of sentences per batch")
    parser.add_argument("--max_batch_size", type=int, default=0,
                        help="Maximum number of sentences per batch (used in combination with tokens_per_batch, 0 to disable)")
    parser.add_argument("--tokens_per_batch", type=int, default=-1,
                        help="Number of tokens per batch (we can control using tokens_per_batch or batch_size)")

    # batch parameter for image
    parser.add_argument("--bptt_img", type=int, default=256,
                        help="Sequence length")

    # training parameters
    parser.add_argument("--split_data", type=bool_flag, default=False,
                        help="Split data across workers of a same node")
    parser.add_argument("--optimizer", type=str, default="adam,lr=0.0001",
                        help="Optimizer (SGD / RMSprop / Adam, etc.)")
    parser.add_argument("--clip_grad_norm", type=float, default=5,
                        help="Clip gradients norm (0 to disable)")
    parser.add_argument("--epoch_size", type=int, default=100000,
                        help="Epoch size / evaluation frequency (-1 for parallel data size)")
    parser.add_argument("--max_epoch", type=int, default=100000,
                        help="Maximum number of epochs")
    parser.add_argument("--stopping_criterion", type=str, default="",
                        help="Stopping criterion, and number of non-increase before stopping the experiment")
    parser.add_argument("--validation_metrics", type=str, default="",
                        help="Validation metrics")

    # training coefficients
    parser.add_argument("--lambda_mlm", type=str, default="1",
                        help="Prediction coefficient (MLM)")
    parser.add_argument("--lambda_clm", type=str, default="1",
                        help="Causal coefficient (LM)")
    parser.add_argument("--lambda_pc", type=str, default="1",
                        help="PC coefficient")
    parser.add_argument("--lambda_ae", type=str, default="1",
                        help="AE coefficient")
    parser.add_argument("--lambda_mt", type=str, default="1",
                        help="MT coefficient")
    parser.add_argument("--lambda_bt", type=str, default="1",
                        help="BT coefficient")
    parser.add_argument("--lambda_ipm", type=str, default="1",
                        help="Masked image prediction coefficient (IPM)")
    parser.add_argument("--lambda_cmlm", type=str, default="1",
                        help="Caption-image coefficient")

    # training steps
    parser.add_argument("--mlm_steps", type=str, default="",
                        help="Masked prediction steps (MLM / TLM)")
    parser.add_argument("--ipm_steps", type=str, default="",
                        help="Masked image prediction steps (IPM)")
    parser.add_argument("--cmlm_steps", type=str, default="",
                        help="Cross-modal prediction steps (CMLM)")
    parser.add_argument("--mt_steps", type=str, default="",
                        help="Machine translation steps")
    parser.add_argument("--ae_steps", type=str, default="",
                        help="Denoising auto-encoder steps")
    parser.add_argument("--bt_steps", type=str, default="",
                        help="Back-translation steps")
    parser.add_argument("--pc_steps", type=str, default="",
                        help="Parallel classification steps")

    # reload pretrained embeddings / pretrained model / checkpoint
    parser.add_argument("--reload_emb", type=str, default="",
                        help="Reload pretrained word embeddings")
    parser.add_argument("--reload_model", type=str, default="",
                        help="Reload a pretrained model")
    parser.add_argument("--reload_checkpoint", type=str, default="",
                        help="Reload a checkpoint")

    # beam search (for MT only)
    parser.add_argument("--beam_size", type=int, default=1,
                        help="Beam size, default = 1 (greedy decoding)")
    parser.add_argument("--length_penalty", type=float, default=1,
                        help="Length penalty, values < 1.0 favor shorter sentences, while values > 1.0 favor longer ones.")
    parser.add_argument("--early_stopping", type=bool_flag, default=False,
                        help="Early stopping, stop as soon as we have `beam_size` hypotheses, although longer ones may have better scores.")

    # evaluation
    parser.add_argument("--eval_bleu", type=bool_flag, default=False,
                        help="Evaluate BLEU score during MT training")
    parser.add_argument("--eval_only", type=bool_flag, default=False,
                        help="Only run evaluations")

    # debug
    parser.add_argument("--debug_train", type=bool_flag, default=False,
                        help="Use valid sets for train sets (faster loading)")
    parser.add_argument("--debug_slurm", type=bool_flag, default=False,
                        help="Debug multi-GPU / multi-node within a SLURM job")

    # multi-gpu / multi-node
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Multi-GPU - Local rank")
    parser.add_argument("--master_port", type=int, default=-1,
                        help="Master port (for multi-node SLURM jobs)")

    return parser

In [5]:
def main(params):
    """
    Run the cross-modal pretraining experiment described by `params`.

    Steps, in order: initialise distributed mode, the experiment logger and the
    SLURM signal handler; load the data; build the model (wrapping it in
    DistributedDataParallel when `params.multi_gpu` is set); create the trainer
    and evaluator. Then either

    * `params.eval_only`: run one evaluation pass, log the scores and return, or
    * train for up to `params.max_epoch` epochs. After every epoch the model is
      evaluated, scores are logged, and `save_best_model`, `save_periodic` and
      `end_epoch` are called on the trainer.

    Returns:
        (data, trainer): the loaded data and the trainer, so that a notebook can
        inspect them afterwards.

    Raises:
        NotImplementedError: if `params.multi_gpu` is set while
            `params.encoder_only` is false. That branch used to reference
            undefined `encoder` / `decoder` variables and could never work.
    """
    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    model = build_model(params, data['dico'])

    # distributed
    if params.multi_gpu:
        logger.info("Using nn.parallel.DistributedDataParallel ...")

        if params.encoder_only:
            model = nn.parallel.DistributedDataParallel(model, device_ids=[params.local_rank], output_device=params.local_rank, broadcast_buffers=True)
        else:
            # Only a single model object is built above and only SingleTrainer is
            # used below, so there is no encoder/decoder pair to wrap here.
            raise NotImplementedError(
                "Multi-GPU training is only supported with encoder_only=True: "
                "no separate encoder / decoder is built by this script."
            )

    # build trainer, reload potential checkpoints / build evaluator
    trainer = SingleTrainer(model, data, params)
    evaluator = SingleEvaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        # Return rather than exit(): exit() would break the
        # `data, trainer = main(params)` contract and, inside Jupyter,
        # shut the kernel down.
        return data, trainer

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" % trainer.epoch)

        # reset the per-epoch counters
        trainer.n_sentences = 0
        trainer.n_images = 0
        trainer.n_pairs = 0

        # An epoch lasts until `n_pairs` reaches `epoch_size`.
        # NOTE(review): presumably `cmlm_step` is what advances `n_pairs` -- confirm.
        while trainer.n_pairs < trainer.epoch_size:

            # MLM step for language
            for lang in params.mlm_steps:
                trainer.mlm_step(lang, params.lambda_mlm)

            # MLM step for image
            for img in params.ipm_steps:
                trainer.ipm_step(img, params.lambda_ipm)

            # CMLM pretraining step
            # NOTE(review): the ('cap', 'img') pair is hard-coded instead of being
            # read from params.cmlm_steps -- confirm this is intended.
            trainer.cmlm_step('cap', 'img', params.lambda_cmlm)

            trainer.iter()

        logger.info("============ End of epoch %i ============" % trainer.epoch)

        # evaluate perplexity and cross_entropy (combined accuracy)
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)

    # Returned only once training is over. The return used to sit inside the
    # epoch loop, which stopped after one epoch and made the evaluation and
    # checkpointing code above unreachable.
    return data, trainer

In [6]:
# Build the argument parser and parse an EMPTY argument list: `args=[]` keeps
# `parse_args` from reading the kernel's own sys.argv, so every option starts
# from its parser default.
parser = get_parser()
params = parser.parse_args(args=[])

# Notebook-only step: overwrite the defaults with the experiment settings
# hard-coded in `modified_params` (mutates and returns the same object).
params = modified_params(params)

# Check the data / model parameters.
# NOTE(review): these helpers presumably also normalise some fields in place
# (the logged `cmlm_steps` is a list of tuples, not the raw string) -- confirm.
check_data_params(params)
check_model_params(params)

# Run the experiment; keep the loaded data and the trainer around so that
# later cells can inspect them.
data, trainer = main(params)

INFO - 08/21/19 01:33:41 - 0:00:00 - ae_steps: 
                                     asm: False
                                     attention_dropout: 0.1
                                     batch_size: 32
                                     beam_size: 1
                                     bptt_img: 256
                                     bptt_word: 256
                                     bt_steps: 
                                     clip_grad_norm: 5
                                     cmlm_steps: [('cap', 'img')]
                                     cmodal: cap-img
                                     command: python /home/woenyon.lai/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py '-f' '/home/woenyon.lai/.local/share/jupyter/runtime/kernel-a2e83343-8c80-40b1-b5d1-dde4bdf5d573.json' --exp_id "wt80at17cs"
                                     context_size: 0
                                     crossmodal: ['cap', 'img']
                                     crossmo

SLURM job: False
0 - Number of nodes: 1
0 - Node ID        : 0
0 - Local rank     : 0
0 - Global rank    : 0
0 - World size     : 1
0 - GPUs per node  : 1
0 - Master         : True
0 - Multi-node     : False
0 - Multi-GPU      : False
0 - Hostname       : gpudev001


INFO - 08/21/19 01:34:54 - 0:01:14 - 2768679201 words (52654 unique) in 43460511 sentences. 0 unknown words (0 unique) covering 0.00% of the data.

INFO - 08/21/19 01:36:15 - 0:02:35 - Loading data from ./data/processed/coco/wiki/en.valid.pth ...
INFO - 08/21/19 01:36:15 - 0:02:35 - 322114 words (52654 unique) in 5000 sentences. 1 unknown words (1 unique) covering 0.00% of the data.

INFO - 08/21/19 01:36:16 - 0:02:35 - Loading data from ./data/processed/coco/wiki/en.test.pth ...
INFO - 08/21/19 01:36:16 - 0:02:35 - 310141 words (52654 unique) in 5000 sentences. 0 unknown words (0 unique) covering 0.00% of the data.



INFO - 08/21/19 01:36:16 - 0:02:35 - Loading data from ./data/processed/coco/cap/cap.train.pth ...
INFO - 08/21/19 01:36:16 - 0:02:35 - 6230364 words (52654 unique) in 566435 sentences. 0 unknown words (0 unique) covering 0.00% of the data.

INFO - 08/21/19 01:36:16 - 0:02:36 - Loading data from ./data/processed/coco/cap/cap.valid.pth ...
INFO - 08/21/19 01:36:16 - 0:02:

x			: torch.Size([256, 32, 2048])
y			: torch.Size([1208])
spatial_x		: torch.Size([256, 32, 6])
pred_mask		: torch.Size([256, 32])
candidates_before	: torch.Size([1208, 2048])
tensor_i		: torch.Size([256, 32, 1024])
candidates_after	: torch.Size([1208, 1024])


RuntimeError: CUDA out of memory. Tried to allocate 2.75 GiB (GPU 0; 15.90 GiB total capacity; 13.27 GiB already allocated; 329.88 MiB free; 1.73 GiB cached)

In [None]:
# Display the parameter namespace as the cell output for inspection.
params