# RATIO 2019 - Benchmarking Workshop

Reference: the XLNet IMDB Colab notebook — https://github.com/zihangdai/xlnet/blob/master/notebooks/colab_imdb_gpu.ipynb

### Setup

In [None]:
# Create the conda environment described by environment.yml, then install
# git-lfs and pull the LFS-tracked files.
# NOTE(review): every `!` line runs in its own subshell, so the `conda activate`
# below presumably does not change the environment of the notebook kernel and
# may fail outright in a non-initialised shell — confirm that the kernel is
# already running inside `argmining19-ssc`.
! conda env create -f environment.yml
! conda activate argmining19-ssc && conda install -y -c conda-forge git-lfs && git lfs install && git lfs pull

In [None]:
! pip install sentencepiece
! pip install absl-py
# ! pip install tensorflow-auto-detect
! pip install tensorflow-gpu

In [None]:
import os

# Download and unpack the pretrained cased XLNet model (L-24, H-1024, A-16)
# unless the extracted directory is already present under data/.
# NOTE(review): assumes `data/` already exists (the `mv` below does not create
# it) and that the archive unpacks to `xlnet_cased_L-24_H-1024_A-16` — confirm.
if not os.path.exists('data/xlnet_cased_L-24_H-1024_A-16'):
    ! wget https://storage.googleapis.com/xlnet/released_models/cased_L-24_H-1024_A-16.zip
    ! mv cased_L-24_H-1024_A-16.zip data/
    ! cd data/ && unzip cased_L-24_H-1024_A-16.zip
else:
    print('Have XLNet model already!')

In [None]:
# Clone the XLNet reference implementation into ./xlnet, or update an existing
# checkout.
# NOTE(review): the local patch to run_classifier.py documented at the end of
# this notebook was made against one specific upstream commit; `git pull` may
# conflict with it or move the code away from that commit — confirm.
if not os.path.exists('xlnet'):
    ! git clone https://github.com/zihangdai/xlnet.git
else:
    print('Should have repo already!')
    ! cd xlnet && git pull

### GLUE STS-B reproduction ?

https://github.com/zihangdai/xlnet#1-sts-b-sentence-pair-relevance-regression-with-gpus

In [1]:
%%bash
# Download all GLUE tasks into xlnet/glue (the --data_dir is relative to
# xlnet/scripts, where the command runs).
# NOTE(review): download_glue_data.py is expected inside xlnet/scripts —
# presumably it has to be placed there manually; confirm.
cd xlnet/scripts
python download_glue_data.py --data_dir ../glue --tasks all

Downloading and extracting CoLA...
	Completed!
Downloading and extracting SST...
	Completed!
Processing MRPC...
Local MRPC data not specified, downloading data from https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt
	Completed!
Downloading and extracting QQP...
	Completed!
Downloading and extracting STS...
	Completed!
Downloading and extracting MNLI...
	Completed!
Downloading and extracting SNLI...
	Completed!
Downloading and extracting QNLI...
	Completed!
Downloading and extracting RTE...
	Completed!
Downloading and extracting WNLI...
	Completed!
Downloading and extracting diagnostic...
	Completed!


In [13]:
%%bash

# Fine-tune the pretrained XLNet model on GLUE STS-B as a regression task
# (training only; evaluation is done in a separate cell).
# SCRIPT_DIR keeps its trailing slash because it is concatenated directly with
# the script name below.
export SCRIPT_DIR="xlnet/"
export GLUE_DIR="xlnet/glue"
export LARGE_DIR="data/xlnet_cased_L-24_H-1024_A-16"
export OUTPUT_DIR="data/xlnet-out/glue-stsb-out"
export CHECKPOINT_DIR="data/xlnet-chkp/glue-stsb-chkp"

# Only GPU 0 is made visible to the process; a checkpoint is saved to
# CHECKPOINT_DIR every 600 steps (--save_steps).
CUDA_VISIBLE_DEVICES=0 python3 ${SCRIPT_DIR}run_classifier.py \
  --do_train=True \
  --do_eval=False \
  --task_name=sts-b \
  --data_dir=${GLUE_DIR}/STS-B \
  --output_dir=${OUTPUT_DIR} \
  --model_dir=${CHECKPOINT_DIR} \
  --uncased=False \
  --spiece_model_file=${LARGE_DIR}/spiece.model \
  --model_config_path=${LARGE_DIR}/xlnet_config.json \
  --init_checkpoint=${LARGE_DIR}/xlnet_model.ckpt \
  --max_seq_length=128 \
  --train_batch_size=4 \
  --num_hosts=1 \
  --num_core_per_host=1 \
  --learning_rate=5e-5 \
  --train_steps=2400 \
  --warmup_steps=120 \
  --save_steps=600 \
  --is_regression=True

I0711 11:26:43.831812 140232776353600 model_utils.py:36] Single device mode.
W0711 11:26:44.281025 140232776353600 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

I0711 11:26:44.281518 140232776353600 estimator.py:209] Using config: {'_model_dir': 'data/xlnet-chkp/glue-stsb-chkp', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 600, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
, '_keep_checkpoint_max': 0, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute':

In [21]:
%%bash

# Evaluate the STS-B checkpoints written by the training cell. No
# --init_checkpoint is passed; the model is read from CHECKPOINT_DIR.
# SCRIPT_DIR keeps its trailing slash because it is concatenated directly with
# the script name below.
export SCRIPT_DIR="xlnet/"
export GLUE_DIR="xlnet/glue"
export LARGE_DIR="data/xlnet_cased_L-24_H-1024_A-16"
export OUTPUT_DIR="data/xlnet-out/glue-stsb-out"
export CHECKPOINT_DIR="data/xlnet-chkp/glue-stsb-chkp"

# Only GPU 0 is made visible to the process.
# NOTE(review): --eval_all_ckpt=True presumably evaluates every checkpoint found
# in CHECKPOINT_DIR rather than only the latest — confirm against
# run_classifier.py.
CUDA_VISIBLE_DEVICES=0 python3 ${SCRIPT_DIR}run_classifier.py \
  --do_train=False \
  --do_eval=True \
  --task_name=sts-b \
  --data_dir=${GLUE_DIR}/STS-B \
  --output_dir=${OUTPUT_DIR} \
  --model_dir=${CHECKPOINT_DIR} \
  --uncased=False \
  --spiece_model_file=${LARGE_DIR}/spiece.model \
  --model_config_path=${LARGE_DIR}/xlnet_config.json \
  --max_seq_length=128 \
  --eval_batch_size=8 \
  --num_hosts=1 \
  --num_core_per_host=1 \
  --eval_all_ckpt=True \
  --is_regression=True

# Expected performance: "eval_pearsonr 0.916+ "

I0711 11:40:55.583007 140643517712192 model_utils.py:36] Single device mode.
W0711 11:40:56.031778 140643517712192 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

I0711 11:40:56.032244 140643517712192 estimator.py:209] Using config: {'_model_dir': 'data/xlnet-chkp/glue-stsb-chkp', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
, '_keep_checkpoint_max': 0, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute'

### Convert data format

https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e

In [15]:
import csv

import pandas as pd

In [16]:
data_cross_path = 'data/same-side-classification/cross-topic/{}.csv'
data_within_path = 'data/same-side-classification/within-topic/{}.csv'

# Parser settings shared by all four files: '"' as quote character with
# QUOTE_ALL quoting, backslash as escape character, no doubled quotes, and the
# `id` column used as the index.
_read_options = dict(quotechar='"',
                     quoting=csv.QUOTE_ALL,
                     encoding='utf-8',
                     escapechar='\\',
                     doublequote=False,
                     index_col='id')

# Each task ships a `training` and a `test` file; they are read in that order.
cross_traindev_df, cross_test_df = (
    pd.read_csv(data_cross_path.format(split), **_read_options)
    for split in ('training', 'test'))

within_traindev_df, within_test_df = (
    pd.read_csv(data_within_path.format(split), **_read_options)
    for split in ('training', 'test'))

In [17]:
# Adding a tag for the topics in focus: "gay marriage" and "abortion"
def add_tag(row):
    """Attach a ``tag`` entry naming the focus topic found in ``row['topic']``.

    The topic text is lower-cased and stripped. The tag becomes ``'abortion'``
    if that word occurs in it, otherwise ``'gay marriage'`` if that phrase
    occurs, otherwise ``'NA'``. The row is modified in place and returned.
    """
    topic_text = row['topic'].lower().strip()
    # Keywords are checked in priority order; the first hit wins.
    for keyword in ('abortion', 'gay marriage'):
        if keyword in topic_text:
            row['tag'] = keyword
            break
    else:
        row['tag'] = 'NA'
    return row


# Apply add_tag row-wise (axis=1) to every split; each name is rebound to the
# frame returned by apply, which carries the additional `tag` column.
cross_traindev_df = cross_traindev_df.apply(add_tag, axis=1)
cross_test_df = cross_test_df.apply(add_tag, axis=1)

within_traindev_df = within_traindev_df.apply(add_tag, axis=1)
within_test_df = within_test_df.apply(add_tag, axis=1)

In [18]:
from sklearn.model_selection import train_test_split


def get_train_test_sets(df, ratio=0.30, random_state=1):
    """Shuffle ``df`` and split it into train and held-out features/labels.

    Args:
        df: DataFrame providing the columns ``argument1``, ``argument2``,
            ``argument1_id``, ``argument2_id``, ``topic`` and ``is_same_side``.
        ratio: forwarded to ``train_test_split`` as ``test_size``.
        random_state: seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``; the ``y_*`` parts are
        single-column DataFrames holding ``is_same_side``.
    """
    feature_columns = ['argument1', 'argument2', 'argument1_id', 'argument2_id', 'topic']
    label_columns = ['is_same_side']
    # tuple(...) keeps the tuple return type of the explicit unpack-and-return.
    return tuple(train_test_split(
        df[feature_columns], df[label_columns],
        test_size=ratio, random_state=random_state, shuffle=True))

In [19]:
# Within-topic task: split the labelled training data into train and dev parts
# (get_train_test_sets defaults: ratio=0.30, random_state=1); the test frame is
# used unchanged.
X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df)
X_test = within_test_df
# Cross-topic alternative — swap with the two lines above to switch tasks:
# X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df)
# X_test = cross_test_df

In [20]:
import os
from tqdm import tqdm_notebook as tqdm

# Directory that receives the TSV files read by run_classifier.py.
DATA_DIR = os.path.join('data/xlnet-in', 'ssc-within')
# DATA_DIR = os.path.join('data/xlnet-in', 'ssc-cross')
# makedirs creates missing parent directories and tolerates an existing target,
# so the cell can be re-run and does not depend on `data/xlnet-in` existing.
os.makedirs(DATA_DIR, exist_ok=True)

train_df = X_train.join(y_train)
dev_df = X_dev.join(y_dev)
test_df = X_test

# The files are written without quoting, so a tab or line break inside an
# argument would shift columns or split a record; such characters are replaced
# by single spaces.
_TSV_UNSAFE = str.maketrans({'\t': ' ', '\r': ' ', '\n': ' '})


def _tsv_field(value):
    """Return ``value`` as text with tabs and line breaks replaced by spaces."""
    return str(value).translate(_TSV_UNSAFE)


def _write_pairs_tsv(path, frame, first_column, first_value):
    """Write ``frame`` as a three-column TSV of argument pairs.

    Args:
        path: output file path (overwritten if it exists).
        frame: DataFrame with ``argument1`` and ``argument2`` columns.
        first_column: header name of the first column (``label`` or ``index``).
        first_value: callable ``(index, row) -> value`` producing the first
            field of every record.
    """
    with open(path, 'w', encoding='utf-8') as fh:
        # The header names exactly the three fields written per record
        # (previously it declared five columns while rows had three).
        fh.write("{}\t#1 String\t#2 String\n".format(first_column))
        for id_, row in tqdm(frame.iterrows(), total=len(frame)):
            fh.write("{}\t{}\t{}\n".format(
                first_value(id_, row),
                _tsv_field(row['argument1']),
                _tsv_field(row['argument2'])))


# train/dev carry the binary label (1 = same side); test carries the row id.
_write_pairs_tsv(os.path.join(DATA_DIR, 'train.tsv'), train_df, 'label',
                 lambda id_, row: 1 if row['is_same_side'] else 0)
_write_pairs_tsv(os.path.join(DATA_DIR, 'dev.tsv'), dev_df, 'label',
                 lambda id_, row: 1 if row['is_same_side'] else 0)
_write_pairs_tsv(os.path.join(DATA_DIR, 'test.tsv'), test_df, 'index',
                 lambda id_, row: id_)

44732it [00:02, 16427.73it/s]
19171it [00:01, 16312.33it/s]
31475it [00:01, 17884.15it/s]


In [None]:
! head -n 2 data/xlnet-in/ssc-within/train.tsv

In [None]:
import numpy as np
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()


def search_tab(row):
    """Return True if either argument text of ``row`` contains a tab character."""
    return any("\t" in row[column] for column in ('argument1', 'argument2'))


print(np.unique(train_df.progress_apply(search_tab, axis=1)))
print(np.unique(dev_df.progress_apply(search_tab, axis=1)))
print(np.unique(test_df.progress_apply(search_tab, axis=1)))

### Variables

In [62]:
# Run configuration for the fine-tuning command built further below.
# TASK_NAME must be a key of the `processors` table in the patched
# run_classifier.py ('ssc-within' or 'ssc-cross').
TASK_NAME = 'ssc-within' #@param{type:"string"}
SCRIPTS_DIR = 'xlnet' #@param {type:"string"}
DATA_DIR = 'data/xlnet-in/ssc-within' #@param {type:"string"}
OUTPUT_DIR = 'data/xlnet-out/ssc-within2' #@param {type:"string"}
PRETRAINED_MODEL_DIR = 'data/xlnet_cased_L-24_H-1024_A-16' #@param {type:"string"}
CHECKPOINT_DIR = 'data/xlnet-chkp/ssc-within2' #@param {type:"string"}

# Passed as --do_train / --do_eval.
DO_TRAIN = True
DO_EVAL = False

# Kept as strings because they are concatenated into the command line; the
# trailing comments hold the alternative values that were tried.
MAX_SEQ_LEN = '256'  # '128'
BATCH_SIZE = '3'  # '6'

### Run model (training & evaluation)

*Open question: it is not clear how XLNet uses the **train/dev/test** data split.*

##### Patch file `xlnet/run_classifier.py` for using our own data

See the diff/patch snippet at the end of this notebook.

--- 

```python
# at line: 343

class SSCProcessor(GLUEProcessor):
  def __init__(self):
    super(SSCProcessor, self).__init__()
    self.label_column = 0
    self.text_a_column = 1  # 3
    self.text_b_column = 2  # 4
```

---

```python
# in: def main(_)
# variable: processors = {}
# at line: 660

      'ssc-within': SSCProcessor,
      'ssc-cross': SSCProcessor,
```

---

In [None]:
train_command = "python3 " + SCRIPTS_DIR + "/run_classifier.py \
  --do_train=" + str(DO_TRAIN) + " \
  --do_eval=" + str(DO_EVAL) + " \
  --eval_all_ckpt=False \
  --task_name=" + TASK_NAME + " \
  --data_dir=" + DATA_DIR + " \
  --output_dir=" + OUTPUT_DIR + " \
  --model_dir=" + CHECKPOINT_DIR + " \
  --uncased=False \
  --spiece_model_file=" + PRETRAINED_MODEL_DIR + "/spiece.model \
  --model_config_path=" + PRETRAINED_MODEL_DIR + "/xlnet_config.json \
  --init_checkpoint=" + PRETRAINED_MODEL_DIR + "/xlnet_model.ckpt \
  --max_seq_length=" + MAX_SEQ_LEN + " \
  --train_batch_size=" + BATCH_SIZE + " \
  --eval_batch_size=" + BATCH_SIZE + " \
  --num_hosts=1 \
  --num_core_per_host=1 \
  --learning_rate=2e-5 \
  --train_steps=10000 \
  --warmup_steps=500 \
  --save_steps=1000 \
  --iterations=1000"

! {train_command}

I0712 11:36:55.190428 140190188955456 model_utils.py:36] Single device mode.
W0712 11:36:55.647490 140190188955456 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

I0712 11:36:55.648028 140190188955456 estimator.py:209] Using config: {'_model_dir': 'data/xlnet-chkp/ssc-within2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
, '_keep_checkpoint_max': 0, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': No

I0712 11:37:13.213042 140190188955456 run_classifier.py:436] Writing example 10000 of 19173
I0712 11:37:28.870205 140190188955456 run_classifier.py:480] Input tfrecord file data/xlnet-out/ssc-within2/spiece.model.len-256.dev.eval.tf_record
I0712 11:37:28.870457 140190188955456 run_classifier.py:789] Add data/xlnet-chkp/ssc-within2/model.ckpt-0 to eval list.
I0712 11:37:28.870515 140190188955456 run_classifier.py:789] Add data/xlnet-chkp/ssc-within2/model.ckpt-4000 to eval list.
I0712 11:37:28.870552 140190188955456 run_classifier.py:789] Add data/xlnet-chkp/ssc-within2/model.ckpt-500 to eval list.
I0712 11:37:28.870584 140190188955456 run_classifier.py:789] Add data/xlnet-chkp/ssc-within2/model.ckpt-2500 to eval list.
I0712 11:37:28.870616 140190188955456 run_classifier.py:789] Add data/xlnet-chkp/ssc-within2/model.ckpt-1500 to eval list.
I0712 11:37:28.870647 140190188955456 run_classifier.py:789] Add data/xlnet-chkp/ssc-within2/model.ckpt-3000 to eval list.
I0712 11:37:28.870677 1401

I0712 11:37:33.953444 140190188955456 estimator.py:1147] Done calling model_fn.
I0712 11:37:33.965065 140190188955456 evaluation.py:255] Starting evaluation at 2019-07-12T11:37:33Z
W0712 11:37:34.106655 140190188955456 deprecation.py:323] From /home/ekoerner/.conda/envs/argmining19-ssc/lib/python3.7/site-packages/tensorflow/python/ops/array_ops.py:1354: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
I0712 11:37:34.538035 140190188955456 monitored_session.py:240] Graph was finalized.
2019-07-12 11:37:34.538305: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2019-07-12 11:37:34.563815: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 3600000000 Hz
2019-07-12 11:37:34.565003: I tensor

I0712 11:54:17.248011 140190188955456 model_utils.py:91]   name = model/transformer/layer_4/ff/layer_1/kernel:0, shape = (1024, 4096), *INIT_FROM_CKPT*
I0712 11:54:17.248044 140190188955456 model_utils.py:91]   name = model/transformer/layer_4/ff/layer_1/bias:0, shape = (4096,), *INIT_FROM_CKPT*
I0712 11:54:17.248076 140190188955456 model_utils.py:91]   name = model/transformer/layer_4/ff/layer_2/kernel:0, shape = (4096, 1024), *INIT_FROM_CKPT*
I0712 11:54:17.248109 140190188955456 model_utils.py:91]   name = model/transformer/layer_4/ff/layer_2/bias:0, shape = (1024,), *INIT_FROM_CKPT*
I0712 11:54:17.248141 140190188955456 model_utils.py:91]   name = model/transformer/layer_4/ff/LayerNorm/beta:0, shape = (1024,), *INIT_FROM_CKPT*
I0712 11:54:17.248172 140190188955456 model_utils.py:91]   name = model/transformer/layer_4/ff/LayerNorm/gamma:0, shape = (1024,), *INIT_FROM_CKPT*
I0712 11:54:17.248202 140190188955456 model_utils.py:91]   name = model/transformer/layer_5/rel_attn/q/ke

I0712 11:54:17.852534 140190188955456 monitored_session.py:240] Graph was finalized.
2019-07-12 11:54:17.853397: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: 
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.62
pciBusID: 0000:17:00.0
2019-07-12 11:54:17.853825: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 1 with properties: 
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.62
pciBusID: 0000:65:00.0
2019-07-12 11:54:17.853872: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudart.so.10.0
2019-07-12 11:54:17.853882: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10.0
2019-07-12 11:54:17.853889: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcufft.so.10.0
2019-07-12 11:54:17.853897: I tensorflow/stream_execut

I0712 12:11:01.813145 140190188955456 estimator.py:1147] Done calling model_fn.
I0712 12:11:01.824653 140190188955456 evaluation.py:255] Starting evaluation at 2019-07-12T12:11:01Z
I0712 12:11:02.273972 140190188955456 monitored_session.py:240] Graph was finalized.
2019-07-12 12:11:02.274851: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: 
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.62
pciBusID: 0000:17:00.0
2019-07-12 12:11:02.275284: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 1 with properties: 
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.62
pciBusID: 0000:65:00.0
2019-07-12 12:11:02.275330: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudart.so.10.0
2019-07-12 12:11:02.275340: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10.0
2019-07-12 12:11:0

I0712 12:27:48.631597 140190188955456 model_utils.py:91]   name = model/transformer/layer_16/ff/layer_2/bias:0, shape = (1024,), *INIT_FROM_CKPT*
I0712 12:27:48.631628 140190188955456 model_utils.py:91]   name = model/transformer/layer_16/ff/LayerNorm/beta:0, shape = (1024,), *INIT_FROM_CKPT*
I0712 12:27:48.631660 140190188955456 model_utils.py:91]   name = model/transformer/layer_16/ff/LayerNorm/gamma:0, shape = (1024,), *INIT_FROM_CKPT*
I0712 12:27:48.631691 140190188955456 model_utils.py:91]   name = model/transformer/layer_17/rel_attn/q/kernel:0, shape = (1024, 16, 64), *INIT_FROM_CKPT*
I0712 12:27:48.631726 140190188955456 model_utils.py:91]   name = model/transformer/layer_17/rel_attn/k/kernel:0, shape = (1024, 16, 64), *INIT_FROM_CKPT*
I0712 12:27:48.631760 140190188955456 model_utils.py:91]   name = model/transformer/layer_17/rel_attn/v/kernel:0, shape = (1024, 16, 64), *INIT_FROM_CKPT*
I0712 12:27:48.631795 140190188955456 model_utils.py:91]   name = model/transformer/la

I0712 12:27:49.114676 140190188955456 monitored_session.py:240] Graph was finalized.
2019-07-12 12:27:49.115597: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: 
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.62
pciBusID: 0000:17:00.0
2019-07-12 12:27:49.116039: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 1 with properties: 
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.62
pciBusID: 0000:65:00.0
2019-07-12 12:27:49.116084: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudart.so.10.0
2019-07-12 12:27:49.116093: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10.0
2019-07-12 12:27:49.116102: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcufft.so.10.0
2019-07-12 12:27:49.116109: I tensorflow/stream_execut

I0712 12:44:34.562369 140190188955456 model_utils.py:91]   name = model/transformer/layer_3/ff/LayerNorm/gamma:0, shape = (1024,), *INIT_FROM_CKPT*
I0712 12:44:34.562400 140190188955456 model_utils.py:91]   name = model/transformer/layer_4/rel_attn/q/kernel:0, shape = (1024, 16, 64), *INIT_FROM_CKPT*
I0712 12:44:34.562433 140190188955456 model_utils.py:91]   name = model/transformer/layer_4/rel_attn/k/kernel:0, shape = (1024, 16, 64), *INIT_FROM_CKPT*
I0712 12:44:34.562466 140190188955456 model_utils.py:91]   name = model/transformer/layer_4/rel_attn/v/kernel:0, shape = (1024, 16, 64), *INIT_FROM_CKPT*
I0712 12:44:34.562499 140190188955456 model_utils.py:91]   name = model/transformer/layer_4/rel_attn/r/kernel:0, shape = (1024, 16, 64), *INIT_FROM_CKPT*
I0712 12:44:34.562533 140190188955456 model_utils.py:91]   name = model/transformer/layer_4/rel_attn/o/kernel:0, shape = (1024, 16, 64), *INIT_FROM_CKPT*
I0712 12:44:34.562566 140190188955456 model_utils.py:91]   name = model/tran

I0712 12:44:35.050238 140190188955456 monitored_session.py:240] Graph was finalized.
2019-07-12 12:44:35.051127: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: 
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.62
pciBusID: 0000:17:00.0
2019-07-12 12:44:35.051563: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 1 with properties: 
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.62
pciBusID: 0000:65:00.0
2019-07-12 12:44:35.051610: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudart.so.10.0
2019-07-12 12:44:35.051619: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10.0
2019-07-12 12:44:35.051627: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcufft.so.10.0
2019-07-12 12:44:35.051634: I tensorflow/stream_execut

I0712 13:01:18.346111 140190188955456 evaluation.py:255] Starting evaluation at 2019-07-12T13:01:18Z
I0712 13:01:18.789971 140190188955456 monitored_session.py:240] Graph was finalized.
2019-07-12 13:01:18.790850: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: 
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.62
pciBusID: 0000:17:00.0
2019-07-12 13:01:18.791281: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 1 with properties: 
name: GeForce GTX 1080 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.62
pciBusID: 0000:65:00.0
2019-07-12 13:01:18.791325: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudart.so.10.0
2019-07-12 13:01:18.791335: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10.0
2019-07-12 13:01:18.791343: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Succes

In [None]:
# Print the command-line flags accepted by run_classifier.py.
help_command = "python3 " + SCRIPTS_DIR + "/run_classifier.py --help"

! {help_command}

---

```python
# %load xlnet/patch.diff
```

Patch against https://github.com/zihangdai/xlnet.git at commit a4ea77132e2954a0b3e6d8db5f97cd198b056c3a (origin/master).  
It also replaces deprecated TensorFlow calls with their `tf.compat.v1` equivalents to silence deprecation warnings.

```diff
diff --git a/function_builder.py b/function_builder.py
index 54cf894..15cf917 100644
--- a/function_builder.py
+++ b/function_builder.py
@@ -95,7 +95,7 @@ def two_stream_loss(FLAGS, features, labels, mems, is_training):
 
   initializer = xlnet_model.get_initializer()
 
-  with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
+  with tf.variable_scope("model", reuse=tf.compat.v1.AUTO_REUSE):
     # LM loss
     lm_loss = modeling.lm_loss(
         hidden=output,
@@ -153,7 +153,7 @@ def get_classification_loss(
 
   summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj)
 
-  with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
+  with tf.variable_scope("model", reuse=tf.compat.v1.AUTO_REUSE):
 
     if FLAGS.cls_scope is not None and FLAGS.cls_scope:
       cls_scope = "classification_{}".format(FLAGS.cls_scope)
@@ -196,7 +196,7 @@ def get_regression_loss(
 
   summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj)
 
-  with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
+  with tf.variable_scope("model", reuse=tf.compat.v1.AUTO_REUSE):
     per_example_loss, logits = modeling.regression_loss(
         hidden=summary,
         labels=label,
diff --git a/model_utils.py b/model_utils.py
index c8e4295..a6a4d40 100644
--- a/model_utils.py
+++ b/model_utils.py
@@ -24,20 +24,20 @@ def configure_tpu(FLAGS):
     tpu_cluster = None
     master = FLAGS.master
 
-  session_config = tf.ConfigProto(allow_soft_placement=True)
+  session_config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
   # Uncomment the following line if you hope to monitor GPU RAM growth
   # session_config.gpu_options.allow_growth = True
 
   if FLAGS.use_tpu:
     strategy = None
-    tf.logging.info('Use TPU without distribute strategy.')
+    tf.compat.v1.logging.info('Use TPU without distribute strategy.')
   elif FLAGS.num_core_per_host == 1:
     strategy = None
-    tf.logging.info('Single device mode.')
+    tf.compat.v1.logging.info('Single device mode.')
   else:
     strategy = tf.contrib.distribute.MirroredStrategy(
         num_gpus=FLAGS.num_core_per_host)
-    tf.logging.info('Use MirroredStrategy with %d devices.',
+    tf.compat.v1.logging.info('Use MirroredStrategy with %d devices.',
                     strategy.num_replicas_in_sync)
 
   per_host_input = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
@@ -64,36 +64,36 @@ def init_from_checkpoint(FLAGS, global_vars=False):
   if FLAGS.init_checkpoint is not None:
     if FLAGS.init_checkpoint.endswith("latest"):
       ckpt_dir = os.path.dirname(FLAGS.init_checkpoint)
-      init_checkpoint = tf.train.latest_checkpoint(ckpt_dir)
+      init_checkpoint = tf.compat.v1.train.latest_checkpoint(ckpt_dir)
     else:
       init_checkpoint = FLAGS.init_checkpoint
 
-    tf.logging.info("Initialize from the ckpt {}".format(init_checkpoint))
+    tf.compat.v1.logging.info("Initialize from the ckpt {}".format(init_checkpoint))
 
     (assignment_map, initialized_variable_names
     ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
     if FLAGS.use_tpu:
       def tpu_scaffold():
-        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-        return tf.train.Scaffold()
+        tf.compat.v1.train.init_from_checkpoint(init_checkpoint, assignment_map)
+        return tf.compat.v1.train.Scaffold()
 
       scaffold_fn = tpu_scaffold
     else:
-      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+      tf.compat.v1.train.init_from_checkpoint(init_checkpoint, assignment_map)
 
     # Log customized initialization
-    tf.logging.info("**** Global Variables ****")
+    tf.compat.v1.logging.info("**** Global Variables ****")
     for var in tvars:
       init_string = ""
       if var.name in initialized_variable_names:
         init_string = ", *INIT_FROM_CKPT*"
-      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
+      tf.compat.v1.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                       init_string)
   return scaffold_fn
 
 
 def get_train_op(FLAGS, total_loss, grads_and_vars=None):
-  global_step = tf.train.get_or_create_global_step()
+  global_step = tf.compat.v1.train.get_or_create_global_step()
 
   # increase the learning rate linearly
   if FLAGS.warmup_steps > 0:
@@ -105,13 +105,13 @@ def get_train_op(FLAGS, total_loss, grads_and_vars=None):
 
   # decay the learning rate
   if FLAGS.decay_method == "poly":
-    decay_lr = tf.train.polynomial_decay(
+    decay_lr = tf.compat.v1.train.polynomial_decay(
         FLAGS.learning_rate,
         global_step=global_step - FLAGS.warmup_steps,
         decay_steps=FLAGS.train_steps - FLAGS.warmup_steps,
         end_learning_rate=FLAGS.learning_rate * FLAGS.min_lr_ratio)
   elif FLAGS.decay_method == "cos":
-    decay_lr = tf.train.cosine_decay(
+    decay_lr = tf.compat.v1.train.cosine_decay(
         FLAGS.learning_rate,
         global_step=global_step - FLAGS.warmup_steps,
         decay_steps=FLAGS.train_steps - FLAGS.warmup_steps,
@@ -128,7 +128,7 @@ def get_train_op(FLAGS, total_loss, grads_and_vars=None):
                      "training so far.")
 
   if FLAGS.weight_decay == 0:
-    optimizer = tf.train.AdamOptimizer(
+    optimizer = tf.compat.v1.train.AdamOptimizer(
         learning_rate=learning_rate,
         epsilon=FLAGS.adam_epsilon)
   else:
@@ -158,7 +158,7 @@ def get_train_op(FLAGS, total_loss, grads_and_vars=None):
         if "model/transformer/layer_{}/".format(l) in variables[i].name:
           abs_rate = FLAGS.lr_layer_decay_rate ** (n_layer - 1 - l)
           clipped[i] *= abs_rate
-          tf.logging.info("Apply mult {:.4f} to layer-{} grad of {}".format(
+          tf.compat.v1.logging.info("Apply mult {:.4f} to layer-{} grad of {}".format(
               abs_rate, l, variables[i].name))
           break
 
@@ -184,11 +184,11 @@ def clean_ckpt(_):
   for (name, shape) in var_list:
     if not name.startswith("global_step") and "adam" not in name.lower():
       var_values[name] = None
-      tf.logging.info("Include {}".format(name))
+      tf.compat.v1.logging.info("Include {}".format(name))
     else:
-      tf.logging.info("Exclude {}".format(name))
+      tf.compat.v1.logging.info("Exclude {}".format(name))
 
-  tf.logging.info("Loading from {}".format(input_ckpt))
+  tf.compat.v1.logging.info("Loading from {}".format(input_ckpt))
   reader = tf.contrib.framework.load_checkpoint(input_ckpt)
   for name in var_values:
     tensor = reader.get_tensor(name)
@@ -204,7 +204,7 @@ def clean_ckpt(_):
   assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)]
   global_step = tf.Variable(
       0, name="global_step", trainable=False, dtype=tf.int64)
-  saver = tf.train.Saver(tf.all_variables())
+  saver = tf.compat.v1.train.Saver(tf.all_variables())
 
   if not tf.gfile.Exists(output_model_dir):
     tf.gfile.MakeDirs(output_model_dir)
@@ -224,7 +224,7 @@ def clean_ckpt(_):
 def avg_checkpoints(model_dir, output_model_dir, last_k):
   tf.reset_default_graph()
 
-  checkpoint_state = tf.train.get_checkpoint_state(model_dir)
+  checkpoint_state = tf.compat.v1.train.get_checkpoint_state(model_dir)
   checkpoints = checkpoint_state.all_model_checkpoint_paths[- last_k:]
   var_list = tf.contrib.framework.list_variables(checkpoints[0])
   var_values, var_dtypes = {}, {}
@@ -237,7 +237,7 @@ def avg_checkpoints(model_dir, output_model_dir, last_k):
       tensor = reader.get_tensor(name)
       var_dtypes[name] = tensor.dtype
       var_values[name] += tensor
-    tf.logging.info("Read from checkpoint %s", checkpoint)
+    tf.compat.v1.logging.info("Read from checkpoint %s", checkpoint)
   for name in var_values:  # Average.
     var_values[name] /= len(checkpoints)
 
@@ -250,7 +250,7 @@ def avg_checkpoints(model_dir, output_model_dir, last_k):
   assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)]
   global_step = tf.Variable(
       0, name="global_step", trainable=False, dtype=tf.int64)
-  saver = tf.train.Saver(tf.all_variables())
+  saver = tf.compat.v1.train.Saver(tf.all_variables())
 
   # Build a model consisting only of variables, set them to the average values.
   with tf.Session() as sess:
@@ -276,12 +276,12 @@ def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
       name = m.group(1)
     name_to_variable[name] = var
 
-  init_vars = tf.train.list_variables(init_checkpoint)
+  init_vars = tf.compat.v1.train.list_variables(init_checkpoint)
 
   assignment_map = collections.OrderedDict()
   for x in init_vars:
     (name, var) = (x[0], x[1])
-    # tf.logging.info('original name: %s', name)
+    # tf.compat.v1.logging.info('original name: %s', name)
     if name not in name_to_variable:
       continue
     # assignment_map[name] = name
@@ -292,7 +292,7 @@ def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
   return (assignment_map, initialized_variable_names)
 
 
-class AdamWeightDecayOptimizer(tf.train.Optimizer):
+class AdamWeightDecayOptimizer(tf.compat.v1.train.Optimizer):
   """A basic Adam optimizer that includes "correct" L2 weight decay."""
 
   def __init__(self,
@@ -378,7 +378,7 @@ class AdamWeightDecayOptimizer(tf.train.Optimizer):
     if self.exclude_from_weight_decay:
       for r in self.exclude_from_weight_decay:
         if re.search(r, param_name) is not None:
-          tf.logging.info('Adam WD excludes {}'.format(param_name))
+          tf.compat.v1.logging.info('Adam WD excludes {}'.format(param_name))
           return False
     return True
 
diff --git a/modeling.py b/modeling.py
index a7d719c..d9b24b8 100644
--- a/modeling.py
+++ b/modeling.py
@@ -25,8 +25,8 @@ def gelu(x):
 def embedding_lookup(x, n_token, d_embed, initializer, use_tpu=True,
                      scope='embedding', reuse=None, dtype=tf.float32):
   """TPU and GPU embedding_lookup function."""
-  with tf.variable_scope(scope, reuse=reuse):
-    lookup_table = tf.get_variable('lookup_table', [n_token, d_embed],
+  with tf.compat.v1.variable_scope(scope, reuse=reuse):
+    lookup_table = tf.compat.v1.get_variable('lookup_table', [n_token, d_embed],
                                    dtype=dtype, initializer=initializer)
     if use_tpu:
       one_hot_idx = tf.one_hot(x, n_token, dtype=dtype)
@@ -61,7 +61,7 @@ def positionwise_ffn(inp, d_model, d_inner, dropout, kernel_initializer,
     raise ValueError('Unsupported activation type {}'.format(activation_type))
 
   output = inp
-  with tf.variable_scope(scope, reuse=reuse):
+  with tf.compat.v1.variable_scope(scope, reuse=reuse):
     output = tf.layers.dense(output, d_inner, activation=activation,
                              kernel_initializer=kernel_initializer,
                              name='layer_1')
@@ -79,7 +79,7 @@ def positionwise_ffn(inp, d_model, d_inner, dropout, kernel_initializer,
 
 def head_projection(h, d_model, n_head, d_head, kernel_initializer, name):
   """Project hidden states to a specific head with a 4D-shape."""
-  proj_weight = tf.get_variable('{}/kernel'.format(name),
+  proj_weight = tf.compat.v1.get_variable('{}/kernel'.format(name),
                                 [d_model, n_head, d_head], dtype=h.dtype,
                                 initializer=kernel_initializer)
   head = tf.einsum('ibh,hnd->ibnd', h, proj_weight)
@@ -91,7 +91,7 @@ def post_attention(h, attn_vec, d_model, n_head, d_head, dropout, is_training,
                    kernel_initializer, residual=True):
   """Post-attention processing."""
   # post-attention projection (back to `d_model`)
-  proj_o = tf.get_variable('o/kernel', [d_model, n_head, d_head],
+  proj_o = tf.compat.v1.get_variable('o/kernel', [d_model, n_head, d_head],
                            dtype=h.dtype, initializer=kernel_initializer)
   attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, proj_o)
 
@@ -258,7 +258,7 @@ def multihead_attn(q, k, v, attn_mask, d_model, n_head, d_head, dropout,
   """Standard multi-head attention with absolute positional embedding."""
 
   scale = 1 / (d_head ** 0.5)
-  with tf.variable_scope(scope, reuse=reuse):
+  with tf.compat.v1.variable_scope(scope, reuse=reuse):
     # attention heads
     q_head = head_projection(
         q, d_model, n_head, d_head, kernel_initializer, 'q')
@@ -286,7 +286,7 @@ def rel_multihead_attn(h, r, r_w_bias, r_r_bias, seg_mat, r_s_bias, seg_embed,
   """Multi-head attention with relative positional encoding."""
 
   scale = 1 / (d_head ** 0.5)
-  with tf.variable_scope(scope, reuse=reuse):
+  with tf.compat.v1.variable_scope(scope, reuse=reuse):
     if mems is not None and mems.shape.ndims > 1:
       cat = tf.concat([mems, h], 0)
     else:
@@ -323,7 +323,7 @@ def two_stream_rel_attn(h, g, r, mems, r_w_bias, r_r_bias, seg_mat, r_s_bias,
   """Two-stream attention with relative positional encoding."""
 
   scale = 1 / (d_head ** 0.5)
-  with tf.variable_scope(scope, reuse=False):
+  with tf.compat.v1.variable_scope(scope, reuse=False):
 
     # content based attention score
     if mems is not None and mems.shape.ndims > 1:
@@ -357,7 +357,7 @@ def two_stream_rel_attn(h, g, r, mems, r_w_bias, r_r_bias, seg_mat, r_s_bias,
     output_h = post_attention(h, attn_vec_h, d_model, n_head, d_head, dropout,
                               is_training, kernel_initializer)
 
-  with tf.variable_scope(scope, reuse=True):
+  with tf.compat.v1.variable_scope(scope, reuse=True):
     ##### g-stream
     # query-stream query head
     q_head_g = head_projection(
@@ -450,21 +450,21 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
     initializer: A tf initializer.
     scope: scope name for the computation graph.
   """
-  tf.logging.info('memory input {}'.format(mems))
+  tf.compat.v1.logging.info('memory input {}'.format(mems))
   tf_float = tf.bfloat16 if use_bfloat16 else tf.float32
-  tf.logging.info('Use float type {}'.format(tf_float))
+  tf.compat.v1.logging.info('Use float type {}'.format(tf_float))
 
   new_mems = []
-  with tf.variable_scope(scope):
+  with tf.compat.v1.variable_scope(scope):
     if untie_r:
-      r_w_bias = tf.get_variable('r_w_bias', [n_layer, n_head, d_head],
+      r_w_bias = tf.compat.v1.get_variable('r_w_bias', [n_layer, n_head, d_head],
                                  dtype=tf_float, initializer=initializer)
-      r_r_bias = tf.get_variable('r_r_bias', [n_layer, n_head, d_head],
+      r_r_bias = tf.compat.v1.get_variable('r_r_bias', [n_layer, n_head, d_head],
                                  dtype=tf_float, initializer=initializer)
     else:
-      r_w_bias = tf.get_variable('r_w_bias', [n_head, d_head],
+      r_w_bias = tf.compat.v1.get_variable('r_w_bias', [n_head, d_head],
                                  dtype=tf_float, initializer=initializer)
-      r_r_bias = tf.get_variable('r_r_bias', [n_head, d_head],
+      r_r_bias = tf.compat.v1.get_variable('r_r_bias', [n_head, d_head],
                                  dtype=tf_float, initializer=initializer)
 
     bsz = tf.shape(inp_k)[1]
@@ -525,8 +525,8 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
         scope='word_embedding')
 
     if inp_q is not None:
-      with tf.variable_scope('mask_emb'):
-        mask_emb = tf.get_variable('mask_emb', [1, 1, d_model], dtype=tf_float)
+      with tf.compat.v1.variable_scope('mask_emb'):
+        mask_emb = tf.compat.v1.get_variable('mask_emb', [1, 1, d_model], dtype=tf_float)
         if target_mapping is not None:
           word_emb_q = tf.tile(mask_emb, [tf.shape(target_mapping)[0], bsz, 1])
         else:
@@ -539,14 +539,14 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
     ##### Segment embedding
     if seg_id is not None:
       if untie_r:
-        r_s_bias = tf.get_variable('r_s_bias', [n_layer, n_head, d_head],
+        r_s_bias = tf.compat.v1.get_variable('r_s_bias', [n_layer, n_head, d_head],
                                    dtype=tf_float, initializer=initializer)
       else:
         # default case (tie)
-        r_s_bias = tf.get_variable('r_s_bias', [n_head, d_head],
+        r_s_bias = tf.compat.v1.get_variable('r_s_bias', [n_head, d_head],
                                    dtype=tf_float, initializer=initializer)
 
-      seg_embed = tf.get_variable('seg_embed', [n_layer, 2, n_head, d_head],
+      seg_embed = tf.compat.v1.get_variable('seg_embed', [n_layer, 2, n_head, d_head],
                                   dtype=tf_float, initializer=initializer)
 
       # Convert `seg_id` to one-hot `seg_mat`
@@ -583,7 +583,7 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
         r_s_bias_i = r_s_bias if not untie_r else r_s_bias[i]
         seg_embed_i = seg_embed[i]
 
-      with tf.variable_scope('layer_{}'.format(i)):
+      with tf.compat.v1.variable_scope('layer_{}'.format(i)):
         if inp_q is not None:
           output_h, output_g = two_stream_rel_attn(
               h=output_h,
@@ -660,16 +660,16 @@ def lm_loss(hidden, target, n_token, d_model, initializer, lookup_table=None,
             tie_weight=False, bi_data=True, use_tpu=False):
   """doc."""
 
-  with tf.variable_scope('lm_loss'):
+  with tf.compat.v1.variable_scope('lm_loss'):
     if tie_weight:
       assert lookup_table is not None, \
           'lookup_table cannot be None for tie_weight'
       softmax_w = lookup_table
     else:
-      softmax_w = tf.get_variable('weight', [n_token, d_model],
+      softmax_w = tf.compat.v1.get_variable('weight', [n_token, d_model],
                                   dtype=hidden.dtype, initializer=initializer)
 
-    softmax_b = tf.get_variable('bias', [n_token], dtype=hidden.dtype,
+    softmax_b = tf.compat.v1.get_variable('bias', [n_token], dtype=hidden.dtype,
                                 initializer=tf.zeros_initializer())
 
     logits = tf.einsum('ibd,nd->ibn', hidden, softmax_w) + softmax_b
@@ -696,7 +696,7 @@ def summarize_sequence(summary_type, hidden, d_model, n_head, d_head, dropout,
       Otherwise, one should specify a different `scope` for each task.
   """
 
-  with tf.variable_scope(scope, 'sequnece_summary', reuse=reuse):
+  with tf.compat.v1.variable_scope(scope, 'sequnece_summary', reuse=reuse):
     if summary_type == 'last':
       summary = hidden[-1]
     elif summary_type == 'first':
@@ -706,7 +706,7 @@ def summarize_sequence(summary_type, hidden, d_model, n_head, d_head, dropout,
     elif summary_type == 'attn':
       bsz = tf.shape(hidden)[1]
 
-      summary_bias = tf.get_variable('summary_bias', [d_model],
+      summary_bias = tf.compat.v1.get_variable('summary_bias', [d_model],
                                      dtype=hidden.dtype,
                                      initializer=initializer)
       summary_bias = tf.tile(summary_bias[None, None], [1, bsz, 1])
@@ -748,7 +748,7 @@ def classification_loss(hidden, labels, n_class, initializer, scope, reuse=None,
       the classification weights.
   """
 
-  with tf.variable_scope(scope, reuse=reuse):
+  with tf.compat.v1.variable_scope(scope, reuse=reuse):
     logits = tf.layers.dense(
         hidden,
         n_class,
@@ -766,7 +766,7 @@ def classification_loss(hidden, labels, n_class, initializer, scope, reuse=None,
 
 def regression_loss(hidden, labels, initializer, scope, reuse=None,
                     return_logits=False):
-  with tf.variable_scope(scope, reuse=reuse):
+  with tf.compat.v1.variable_scope(scope, reuse=reuse):
     logits = tf.layers.dense(
         hidden,
         1,
diff --git a/run_classifier.py b/run_classifier.py
index c6eb1ba..ed0a700 100644
--- a/run_classifier.py
+++ b/run_classifier.py
@@ -183,7 +183,7 @@ class DataProcessor(object):
   @classmethod
   def _read_tsv(cls, input_file, quotechar=None):
     """Reads a tab separated value file."""
-    with tf.gfile.Open(input_file, "r") as f:
+    with tf.io.gfile.GFile(input_file, "r") as f:
       reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
       lines = []
       for line in reader:
@@ -246,13 +246,13 @@ class GLUEProcessor(DataProcessor):
 
       # there are some incomplete lines in QNLI
       if len(line) <= a_column:
-        tf.logging.warning('Incomplete line, ignored.')
+        tf.compat.v1.logging.warning('Incomplete line, ignored.')
         continue
       text_a = line[a_column]
 
       if b_column is not None:
         if len(line) <= b_column:
-          tf.logging.warning('Incomplete line, ignored.')
+          tf.compat.v1.logging.warning('Incomplete line, ignored.')
           continue
         text_b = line[b_column]
       else:
@@ -262,7 +262,7 @@ class GLUEProcessor(DataProcessor):
         label = self.get_labels()[0]
       else:
         if len(line) <= self.label_column:
-          tf.logging.warning('Incomplete line, ignored.')
+          tf.compat.v1.logging.warning('Incomplete line, ignored.')
           continue
         label = line[self.label_column]
       examples.append(
@@ -309,7 +309,7 @@ class ImdbProcessor(DataProcessor):
     examples = []
     for label in ["neg", "pos"]:
       cur_dir = os.path.join(data_dir, label)
-      for filename in tf.gfile.ListDirectory(cur_dir):
+      for filename in tf.io.gfile.listdir(cur_dir):
         if not filename.endswith("txt"): continue
 
         path = os.path.join(cur_dir, filename)
@@ -340,6 +340,14 @@ class MnliMismatchedProcessor(MnliMatchedProcessor):
     self.test_file = "test_mismatched.tsv"
 
 
+class SSCProcessor(GLUEProcessor):  # registered in main() for 'ssc-within' and 'ssc-cross'
+  def __init__(self):
+    super(SSCProcessor, self).__init__()  # NOTE(review): only these three columns are overridden; if the base class keeps separate test-split columns, confirm they are valid for do_predict
+    self.label_column = 0  # index into each parsed TSV row that the label is read from
+    self.text_a_column = 3  # presumably the first text of the pair - confirm against the SSC TSV layout
+    self.text_b_column = 4  # presumably the second text of the pair - confirm against the SSC TSV layout
+
+
 class StsbProcessor(GLUEProcessor):
   def __init__(self):
     super(StsbProcessor, self).__init__()
@@ -367,13 +375,13 @@ class StsbProcessor(GLUEProcessor):
 
       # there are some incomplete lines in QNLI
       if len(line) <= a_column:
-        tf.logging.warning('Incomplete line, ignored.')
+        tf.compat.v1.logging.warning('Incomplete line, ignored.')
         continue
       text_a = line[a_column]
 
       if b_column is not None:
         if len(line) <= b_column:
-          tf.logging.warning('Incomplete line, ignored.')
+          tf.compat.v1.logging.warning('Incomplete line, ignored.')
           continue
         text_b = line[b_column]
       else:
@@ -383,7 +391,7 @@ class StsbProcessor(GLUEProcessor):
         label = self.get_labels()[0]
       else:
         if len(line) <= self.label_column:
-          tf.logging.warning('Incomplete line, ignored.')
+          tf.compat.v1.logging.warning('Incomplete line, ignored.')
           continue
         label = float(line[self.label_column])
       examples.append(
@@ -398,20 +406,20 @@ def file_based_convert_examples_to_features(
   """Convert a set of `InputExample`s to a TFRecord file."""
 
   # do not create duplicated records
-  if tf.gfile.Exists(output_file) and not FLAGS.overwrite_data:
-    tf.logging.info("Do not overwrite tfrecord {} exists.".format(output_file))
+  if tf.io.gfile.exists(output_file) and not FLAGS.overwrite_data:
+    tf.compat.v1.logging.info("Do not overwrite tfrecord {} exists.".format(output_file))
     return
 
-  tf.logging.info("Create new tfrecord {}.".format(output_file))
+  tf.compat.v1.logging.info("Create new tfrecord {}.".format(output_file))
 
-  writer = tf.python_io.TFRecordWriter(output_file)
+  writer = tf.io.TFRecordWriter(output_file)
 
   if num_passes > 1:
     examples *= num_passes
 
   for (ex_index, example) in enumerate(examples):
     if ex_index % 10000 == 0:
-      tf.logging.info("Writing example {} of {}".format(ex_index,
+      tf.compat.v1.logging.info("Writing example {} of {}".format(ex_index,
                                                         len(examples)))
 
     feature = convert_single_example(ex_index, example, label_list,
@@ -447,20 +455,20 @@ def file_based_input_fn_builder(input_file, seq_length, is_training,
 
 
   name_to_features = {
-      "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
-      "input_mask": tf.FixedLenFeature([seq_length], tf.float32),
-      "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
-      "label_ids": tf.FixedLenFeature([], tf.int64),
-      "is_real_example": tf.FixedLenFeature([], tf.int64),
+      "input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
+      "input_mask": tf.io.FixedLenFeature([seq_length], tf.float32),
+      "segment_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
+      "label_ids": tf.io.FixedLenFeature([], tf.int64),
+      "is_real_example": tf.io.FixedLenFeature([], tf.int64),
   }
   if FLAGS.is_regression:
-    name_to_features["label_ids"] = tf.FixedLenFeature([], tf.float32)
+    name_to_features["label_ids"] = tf.io.FixedLenFeature([], tf.float32)
 
-  tf.logging.info("Input tfrecord file {}".format(input_file))
+  tf.compat.v1.logging.info("Input tfrecord file {}".format(input_file))
 
   def _decode_record(record, name_to_features):
     """Decodes a record to a TensorFlow example."""
-    example = tf.parse_single_example(record, name_to_features)
+    example = tf.io.parse_single_example(record, name_to_features)
 
     # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
     # So cast all int64 to int32.
@@ -486,7 +494,7 @@ def file_based_input_fn_builder(input_file, seq_length, is_training,
     d = tf.data.TFRecordDataset(input_file)
     # Shard the dataset to difference devices
     if input_context is not None:
-      tf.logging.info("Input pipeline id %d out of %d",
+      tf.compat.v1.logging.info("Input pipeline id %d out of %d",
           input_context.input_pipeline_id, input_context.num_replicas_in_sync)
       d = d.shard(input_context.num_input_pipelines,
                   input_context.input_pipeline_id)
@@ -523,8 +531,8 @@ def get_model_fn(n_class):
           FLAGS, features, n_class, is_training)
 
     #### Check model parameters
-    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
-    tf.logging.info('#params: {}'.format(num_params))
+    num_params = sum([np.prod(v.shape) for v in tf.compat.v1.trainable_variables()])
+    tf.compat.v1.logging.info('#params: {}'.format(num_params))
 
     #### load pretrained models
     scaffold_fn = model_utils.init_from_checkpoint(FLAGS)
@@ -540,16 +548,16 @@ def get_model_fn(n_class):
             'predictions': predictions,
             'weights': is_real_example
         }
-        accuracy = tf.metrics.accuracy(**eval_input_dict)
+        accuracy = tf.compat.v1.metrics.accuracy(**eval_input_dict)
 
-        loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
+        loss = tf.compat.v1.metrics.mean(values=per_example_loss, weights=is_real_example)
         return {
             'eval_accuracy': accuracy,
             'eval_loss': loss}
 
       def regression_metric_fn(
           per_example_loss, label_ids, logits, is_real_example):
-        loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
+        loss = tf.compat.v1.metrics.mean(values=per_example_loss, weights=is_real_example)
         pearsonr = tf.contrib.metrics.streaming_pearson_correlation(
             logits, label_ids, weights=is_real_example)
         return {'eval_loss': loss, 'eval_pearsonr': pearsonr}
@@ -634,7 +642,7 @@ def get_model_fn(n_class):
 
 
 def main(_):
-  tf.logging.set_verbosity(tf.logging.INFO)
+  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
 
   #### Validate flags
   if FLAGS.save_steps is not None:
@@ -642,13 +650,15 @@ def main(_):
 
   if FLAGS.do_predict:
     predict_dir = FLAGS.predict_dir
-    if not tf.gfile.Exists(predict_dir):
-      tf.gfile.MakeDirs(predict_dir)
+    if not tf.io.gfile.exists(predict_dir):
+      tf.io.gfile.makedirs(predict_dir)
 
   processors = {
       "mnli_matched": MnliMatchedProcessor,
       "mnli_mismatched": MnliMismatchedProcessor,
       'sts-b': StsbProcessor,
+      'ssc-within': SSCProcessor,
+      'ssc-cross': SSCProcessor,
       'imdb': ImdbProcessor,
       "yelp5": Yelp5Processor
   }
@@ -658,7 +668,7 @@ def main(_):
         "At least one of `do_train`, `do_eval, `do_predict` or "
         "`do_submit` must be True.")
 
-  if not tf.gfile.Exists(FLAGS.output_dir):
-    tf.gfile.MakeDirs(FLAGS.output_dir)
+  if not tf.io.gfile.exists(FLAGS.output_dir):
+    tf.io.gfile.makedirs(FLAGS.output_dir)
 
   task_name = FLAGS.task_name.lower()
@@ -700,11 +710,11 @@ def main(_):
     train_file_base = "{}.len-{}.train.tf_record".format(
         spm_basename, FLAGS.max_seq_length)
     train_file = os.path.join(FLAGS.output_dir, train_file_base)
-    tf.logging.info("Use tfrecord file {}".format(train_file))
+    tf.compat.v1.logging.info("Use tfrecord file {}".format(train_file))
 
     train_examples = processor.get_train_examples(FLAGS.data_dir)
     np.random.shuffle(train_examples)
-    tf.logging.info("Num of train samples: {}".format(len(train_examples)))
+    tf.compat.v1.logging.info("Num of train samples: {}".format(len(train_examples)))
 
     file_based_convert_examples_to_features(
         train_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
@@ -724,7 +734,7 @@ def main(_):
     else:
       eval_examples = processor.get_test_examples(FLAGS.data_dir)
 
-    tf.logging.info("Num of eval samples: {}".format(len(eval_examples)))
+    tf.compat.v1.logging.info("Num of eval samples: {}".format(len(eval_examples)))
 
   if FLAGS.do_eval:
     # TPU requires a fixed batch size for all batches, therefore the number
@@ -756,14 +766,14 @@ def main(_):
 
     # Filter out all checkpoints in the directory
     steps_and_files = []
-    filenames = tf.gfile.ListDirectory(FLAGS.model_dir)
+    filenames = tf.io.gfile.listdir(FLAGS.model_dir)
 
     for filename in filenames:
       if filename.endswith(".index"):
         ckpt_name = filename[:-6]
         cur_filename = join(FLAGS.model_dir, ckpt_name)
         global_step = int(cur_filename.split("-")[-1])
-        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        tf.compat.v1.logging.info("Add {} to eval list.".format(cur_filename))
         steps_and_files.append([global_step, cur_filename])
     steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
 
@@ -783,20 +793,20 @@ def main(_):
 
       eval_results.append(ret)
 
-      tf.logging.info("=" * 80)
+      tf.compat.v1.logging.info("=" * 80)
       log_str = "Eval result | "
       for key, val in sorted(ret.items(), key=lambda x: x[0]):
         log_str += "{} {} | ".format(key, val)
-      tf.logging.info(log_str)
+      tf.compat.v1.logging.info(log_str)
 
     key_name = "eval_pearsonr" if FLAGS.is_regression else "eval_accuracy"
     eval_results.sort(key=lambda x: x[key_name], reverse=True)
 
-    tf.logging.info("=" * 80)
+    tf.compat.v1.logging.info("=" * 80)
     log_str = "Best result | "
     for key, val in sorted(eval_results[0].items(), key=lambda x: x[0]):
       log_str += "{} {} | ".format(key, val)
-    tf.logging.info(log_str)
+    tf.compat.v1.logging.info(log_str)
 
   if FLAGS.do_predict:
     eval_file_base = "{}.len-{}.{}.predict.tf_record".format(
@@ -823,7 +833,7 @@ def main(_):
           yield_single_examples=True,
           checkpoint_path=FLAGS.predict_ckpt)):
         if pred_cnt % 1000 == 0:
-          tf.logging.info("Predicting submission for example: {}".format(
+          tf.compat.v1.logging.info("Predicting submission for example: {}".format(
               pred_cnt))
 
         logits = [float(x) for x in result["logits"].flat]
@@ -852,4 +862,4 @@ def main(_):
 
 
 if __name__ == "__main__":
-  tf.app.run()
+  tf.compat.v1.app.run()
diff --git a/xlnet.py b/xlnet.py
index 4341e24..dfd8885 100644
--- a/xlnet.py
+++ b/xlnet.py
@@ -60,7 +60,7 @@ class XLNetConfig(object):
       setattr(self, key, getattr(FLAGS, key))
 
   def init_from_json(self, json_path):
-    with tf.gfile.Open(json_path) as f:
+    with tf.io.gfile.GFile(json_path) as f:
       json_data = json.load(f)
       for key in self.keys:
         setattr(self, key, json_data[key])
@@ -74,7 +74,7 @@ class XLNetConfig(object):
     json_dir = os.path.dirname(json_path)
     if not tf.gfile.Exists(json_dir):
       tf.gfile.MakeDirs(json_dir)
-    with tf.gfile.Open(json_path, "w") as f:
+    with tf.io.gfile.GFile(json_path, "w") as f:
       json.dump(json_data, f, indent=4, sort_keys=True)
 
 
@@ -217,7 +217,7 @@ class XLNetModel(object):
         inp_q=inp_q)
     tfm_args.update(input_args)
 
-    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
+    with tf.compat.v1.variable_scope("model", reuse=tf.compat.v1.AUTO_REUSE):
       (self.output, self.new_mems, self.lookup_table
           ) = modeling.transformer_xl(**tfm_args)
 
@@ -240,7 +240,7 @@ class XLNetModel(object):
     xlnet_config = self.xlnet_config
     run_config = self.run_config
 
-    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
+    with tf.compat.v1.variable_scope("model", reuse=tf.compat.v1.AUTO_REUSE):
       summary = modeling.summarize_sequence(
           summary_type=summary_type,
           hidden=self.output,
```