# LearnAI - Introduction to Modern NLP with Transformers and CORD-19

**IMPORTANT: Make sure that you have GPU set as your Hardware Accelerator in `Runtime > Change runtime type` before running this Colab.**

## Setup

### Install HuggingFace's Transformers library.

In [None]:
!git clone https://github.com/huggingface/transformers

import os
os.chdir('/content/transformers')
os.mkdir('/content/data')
!pip install .
!pip install -r ./examples/requirements.txt

os.chdir('/content/transformers/examples')

!pip install dict_to_obj


In [None]:
import os
import json
import tqdm
import torch
import random
import collections
import numpy as np
import pandas as pd
import tensorflow as tf
from dict_to_obj import DictToObj
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel, TFGPT2LMHeadModel

### Download CORD-19


In [None]:
# Download the train and test set.
# Both files are stored under /content/data, the directory created by the
# setup cell. -nc (no-clobber) keeps an already-downloaded file in place.
# Note that the remote training file is named abstractstrain3.txt but is saved
# locally as abstractstrain.txt, the name the later cells read.
!wget -nc -O /content/data/abstractstest.txt https://raw.githubusercontent.com/PubChimps/CORD-19/master/abstractstest.txt
!wget -nc -O /content/data/abstractstrain.txt https://raw.githubusercontent.com/PubChimps/CORD-19/master/abstractstrain3.txt


### Begin Finetuning

In [None]:
# Fine-tune the pretrained `gpt2` checkpoint on the downloaded training
# abstracts for one epoch. The script is resolved relative to the current
# working directory (the setup cell changes into transformers/examples).
# The result is written to /content/transformers/output, which is the path the
# "Generating abstracts with Transformers" cell loads its model from.
!python run_language_modeling.py \
    --output_dir='/content/transformers/output' \
    --model_type=gpt2 \
    --model_name_or_path=gpt2 \
    --num_train_epochs=1.0 \
    --do_train \
    --train_data_file=/content/data/abstractstrain.txt \
    --per_gpu_train_batch_size=2 \
    --block_size=512 \
    --gradient_accumulation_steps=5

## **Download and Preprocess CORD-19**
CORD-19 is a collection of JSON files spread across 4 different sub-directories, depending on the paper's license. This section transforms CORD-19 from JSON into a single string containing all of the papers' abstracts, split into a train and test set. This section will be skipped during LearnAI.

### Download and unzip dataset


In [None]:
# Fetch the raw CORD-19 archive (kept if already present, via -nc) and unpack
# it into /content/data, where the directory constants used below point.
!wget -nc -O /content/CORD-19.zip https://ibm.box.com/shared/static/m8ualk8ke9bqxtzqz19osmnd5ryqeflu.zip
!unzip /content/CORD-19.zip -d /content/data

### Many jsons -> single string
The following functions crawl through the directories of the CORD-19 data set, load each file in the directory, and place its abstract into a Pandas DataFrame, where the abstracts are converted to a single string.

In [None]:
def format_body(body_text):
    """Merge same-named sections and render them as one text block.

    Args:
        body_text: Sequence of mappings, each providing a ``'section'`` name
            and a ``'text'`` string.

    Returns:
        A string in which every distinct section appears once, in order of
        first appearance, as its name, a blank line, the concatenated text of
        all entries with that name, and another blank line. An empty input
        yields an empty string.
    """
    merged = {}
    for entry in body_text:
        heading, paragraph = entry['section'], entry['text']
        # Entries sharing a heading are glued together without a separator.
        merged[heading] = merged.get(heading, "") + paragraph

    return "".join(
        heading + "\n\n" + paragraph + "\n\n"
        for heading, paragraph in merged.items()
    )

def load_files(dirname):
    """Parse every file in ``dirname`` as JSON.

    Args:
        dirname: Path of the directory to read. A trailing path separator is
            accepted but not required.

    Returns:
        A list with one parsed JSON document per directory entry, in the
        order reported by ``os.listdir``.

    Raises:
        OSError: If the directory or one of its entries cannot be read.
        json.JSONDecodeError: If an entry does not contain valid JSON.
    """
    raw_files = []

    for filename in os.listdir(dirname):
        # os.path.join works whether or not dirname ends with a separator.
        path = os.path.join(dirname, filename)
        # The context manager closes each handle; a directory can hold
        # thousands of files, so leaked handles would add up.
        with open(path, 'rb') as handle:
            raw_files.append(json.load(handle))

    return raw_files

def generate_clean_df(all_files):
    """Build a one-column DataFrame holding each paper's formatted abstract.

    Args:
        all_files: Iterable of parsed paper documents. Each must have an
            ``'abstract'`` entry in the form ``format_body`` accepts.

    Returns:
        A ``pandas.DataFrame`` with a single ``'abstract'`` column and one row
        per input document.

    Raises:
        KeyError: If a document has no ``'abstract'`` entry.
    """
    rows = [[format_body(paper['abstract'])] for paper in all_files]
    return pd.DataFrame(rows, columns=['abstract'])

In [None]:

# Locations of the four CORD-19 subsets under /content/data. The trailing
# slash is kept so the paths work with plain string concatenation too.
biomed_dir = '/content/data/biorxiv_medrxiv/biorxiv_medrxiv/'
comm_dir = '/content/data/comm_use_subset/comm_use_subset/'
custom_dir = '/content/data/custom_license/custom_license/'
noncomm_dir = '/content/data/noncomm_use_subset/noncomm_use_subset/'

directories = [biomed_dir, comm_dir, custom_dir, noncomm_dir]

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; collect the per-directory frames and concatenate them once.
# As before, each frame keeps its own row index.
frames = []
for directory in directories:
    files = load_files(directory)
    frames.append(generate_clean_df(files))
df = pd.concat(frames)
    

In [None]:
# Join every formatted abstract into one long string, separated by a single
# space, and report its length in characters.
abstract_column = df['abstract']
abstracts = abstract_column.str.cat(sep=' ')
print(len(abstracts))

### Filter non-English Characters

#### Characters to ignore

In [None]:
CHARS = [
 '¦',
 '§',
 '¨',
 '©',
 'ª',
 '«',
 '®',
 '¯',
 '°',
 '±',
 '²',
 '³',
 '´',
 'µ',
 '¶',
 '·',
 'º',
 '»',
 '¼',
 '½',
 '¿',
 '×',
 'Ø',
 '÷',
 'ø',
 'Ɵ',
 'Ƶ',
 'ǁ',
 'ǆ',
 'Ǉ',
 'ǌ',
 'ʹ',
 'ʼ',
 'ˆ',
 'ˇ',
 'À',
 'Á',
 'Â',
 'Ã',
 'Ä',
 'Å',
 'Ç',
 'È',
 'É',
 'Ê',
 'Í',
 'Ð',
 'Ñ',
 'Ò',
 'Ó',
 'Ô',
 'Õ',
 'Ö',
 'Ú',
 'Û',
 'Ü',
 'Þ',
 'ß',
 'à',
 'á',
 'â',
 'ã',
 'ä',
 'å',
 'ç',
 'è',
 'é',
 'ê',
 'ë',
 'ì',
 'í',
 'î',
 'ï',
 'ð',
 'ñ',
 'ò',
 'ó',
 'ô',
 'õ',
 'ö',
 'ù',
 'ú',
 'û',
 'ü',
 'ý',
 'þ',
 'ÿ',
 'ā',
 'Ă',
 'ą',
 'Ć',
 'ć',
 'Č',
 'č',
 'ď',
 'Đ',
 'ē',
 'ę',
 'Ě',
 'ě',
 'Ğ',
 'Ĩ',
 'Į',
 'ı',
 'ĸ',
 'Ĺ',
 'ł',
 'ń',
 'Ň',
 'Ō',
 'ō',
 'Ő',
 'ő',
 'Ś',
 'ś',
 'ŝ',
 'ş',
 'Š',
 'š',
 'Ŭ',
 'ů',
 'ŵ',
 'Ŷ',
 'ź',
 'ż',
 'Ž',
 'ž',
 'Ɖ',
 'Ƌ',
 'ƌ',
 'Ɛ',
 'ƚ',
 'ǎ',
 'ǐ',
 'ǒ',
 'ǔ',
 'ǡ',
 'ș',
 'ɑ',
 'ɛ',
 'ɣ',
 'ʋ',
 '˘',
 '˚',
 '˛',
 '˝',
 '́',
 '̇',
 '͕',
 '͖',
 '͗',
 '͘',
 'ͬ',
 'Ͳ',
 'а',
 'б',
 'в',
 'г',
 'д',
 'е',
 'ж',
 'з',
 'и',
 'й',
 'к',
 'л',
 'м',
 'н',
 'о',
 'п',
 'р',
 'с',
 'т',
 'у',
 'ф',
 'х',
 'ц',
 'ч',
 'ш',
 'щ',
 'ы',
 'ь',
 'э',
 'ю',
 'я',
 'ӧ',
 'Յ',
 'Ն',
 '؉',
 '؊',
 '؋',
 '،',
 '؍',
 '؎',
 'ء',
 'آ',
 'أ',
 'ؤ',
 'إ',
 'ئ',
 'ا',
 'ب',
 'ة',
 'ت',
 'ث',
 'ج',
 'ح',
 'خ',
 'د',
 'ذ',
 'ر',
 'ز',
 'س',
 'ش',
 'ص',
 'ض',
 'ط',
 'ظ',
 'ع',
 'غ',
 'ف',
 'ق',
 'ك',
 'ل',
 'م',
 'ن',
 'ه',
 'و',
 'ى',
 'ي',
 'ً',
 'ٌ',
 'ٍ',
 'َ',
 'ُ',
 'ِ',
 'ّ',
 'ْ',
 'ܰ',
 'ܴ',
 '݅',
 '݇',
 'ݏ',
 'ݑ',
 'ݕ',
 'ߚ',
 'ߜ',
 'ߤ',
 'ߪ',
 'ଝ',
 'ଵ',
 'ଶ',
 '᭧',
 'ᮊ',
 'ᵒ',
 'Ḡ',
 'ỹ',
 '‖',
 '‚',
 '†',
 '‡',
 '•',
 '…',
 '‰',
 '′',
 '″',
 '⁄',
 '⁎',
 '⁶',
 '⁹',
 '₀',
 '€',
 '℃',
 'ℜ',
 '™',
 'Ω',
 'Ⅰ',
 'Ⅱ',
 'Ⅲ',
 '→',
 '↓',
 '↵',
 '⇑',
 '⌬',
 '⌿',
 '⍀',
 '␣',
 '␤',
 '␥',
 '␦',
 '■',
 '▪',
 '▶',
 '▸',
 '►',
 '○',
 '◗',
 '★',
 '☆',
 '✔',
 '✜',
 '✩',
 '➜',
 '⩾',
 '、',
 '・',
 'Ϳ',
 '΄',
 '·',
 'Ί',
 'Α',
 'Γ',
 'Ε',
 'Θ',
 'Ι',
 'Λ',
 'Μ',
 'ϩ',
 'Ϫ',
 'ϫ',
 'Ϭ',
 'ϭ',
 'Ϯ',
 'ϯ',
 'ϰ',
 'ϱ',
 'ϲ',
 'ϳ',
 'ϵ',
 'Ϸ',
 'Ͻ',
 'Ͼ',
 'Ј',
 'Љ',
 'Њ',
 'А',
 'Б',
 'В',
 'Д',
 'И',
 'К',
 'Н',
 'О',
 'Р',
 'С',
 'Т',
 'У',
 'Ф',
 'Х',
 'Ц',
 'Ч',
 'Ш',
 '中',
 '乌',
 '亏',
 '代',
 '何',
 '充',
 '冒',
 '吃',
 '國',
 '型',
 '子',
 '學',
 '寄',
 '寒',
 '山',
 '感',
 '扬',
 '方',
 '明',
 '是',
 '暑',
 '替',
 '板',
 '根',
 '桑',
 '民',
 '決',
 '熱',
 '狗',
 '理',
 '生',
 '福',
 '脊',
 '膽',
 '與',
 '良',
 '芳',
 '藍',
 '藥',
 '處',
 '補',
 '論',
 '醫',
 '钟',
 '間',
 '風',
 '首',
 '龍',
 '가',
 '각',
 '간',
 '감',
 '갑',
 '강',
 '같',
 '개',
 '객',
 '거',
 '걱',
 '건',
 '걸',
 '검',
 '것',
 '게',
 '겨',
 '격',
 '겪',
 '결',
 '겼',
 '경',
 '계',
 '고',
 '공',
 '과',
 '관',
 '교',
 '구',
 '국',
 '군',
 '그',
 '근',
 '글',
 '급',
 '기',
 '긴',
 '길',
 '까',
 '꺼',
 '꼈',
 '나',
 '낙',
 '난',
 '남',
 '났',
 '내',
 '넷',
 '년',
 '노',
 '높',
 '누',
 '느',
 '는',
 '능',
 '니',
 '다',
 '단',
 '달',
 '당',
 '대',
 '던',
 '도',
 '동',
 '되',
 '된',
 '두',
 '드',
 '든',
 '들',
 '등',
 '따',
 '때',
 '또',
 '라',
 '람',
 '램',
 '략',
 '량',
 '러',
 '렇',
 '레',
 '려',
 '력',
 '련',
 '령',
 '로',
 '록',
 '론',
 '롯',
 '료',
 '루',
 '률',
 '르',
 '른',
 '를',
 '리',
 '립',
 '마',
 '만',
 '말',
 '망',
 '매',
 '머',
 '멀',
 '메',
 '며',
 '면',
 '명',
 '모',
 '목',
 '못',
 '무',
 '문',
 '물',
 '미',
 '밀',
 '및',
 '바',
 '반',
 '받',
 '발',
 '방',
 '배',
 '백',
 '번',
 '범',
 '법',
 '별',
 '병',
 '보',
 '복',
 '본',
 '부',
 '분',
 '불',
 '비',
 '빈',
 '사',
 '산',
 '상',
 '생',
 '서',
 '석',
 '선',
 '설',
 '성',
 '세',
 '소',
 '속',
 '손',
 '쇄',
 '수',
 '순',
 '술',
 '슈',
 '스',
 '시',
 '식',
 '신',
 '실',
 '심',
 '써',
 '아',
 '악',
 '안',
 '않',
 '알',
 '았',
 '애',
 '야',
 '약',
 '양',
 '어',
 '언',
 '얼',
 '없',
 '었',
 '에',
 '여',
 '역',
 '연',
 '염',
 '였',
 '영',
 '예',
 '와',
 '왔',
 '외',
 '요',
 '욕',
 '용',
 '우',
 '운',
 '울',
 '움',
 '원',
 '월',
 '웠',
 '위',
 '유',
 '육',
 '율',
 '으',
 '은',
 '을',
 '음',
 '응',
 '의',
 '이',
 '인',
 '일',
 '임',
 '입',
 '있',
 '자',
 '작',
 '잘',
 '잠',
 '장',
 '재',
 '저',
 '적',
 '전',
 '절',
 '점',
 '접',
 '정',
 '제',
 '조',
 '족',
 '존',
 '종',
 '주',
 '준',
 '줄',
 '중',
 '증',
 '지',
 '직',
 '진',
 '질',
 '징',
 '차',
 '착',
 '찰',
 '참',
 '처',
 '척',
 '철',
 '첫',
 '청',
 '체',
 '쳐',
 '촉',
 '총',
 '최',
 '추',
 '축',
 '출',
 '충',
 '취',
 '측',
 '치',
 '칠',
 '코',
 '콩',
 '크',
 '타',
 '태',
 '택',
 '터',
 '토',
 '통',
 '트',
 '특',
 '파',
 '판',
 '퍼',
 '편',
 '평',
 '폐',
 '포',
 '폭',
 '푛',
 '표',
 '품',
 '프',
 '피',
 '하',
 '학',
 '한',
 '할',
 '함',
 '항',
 '해',
 '핵',
 '했',
 '행',
 '향',
 '헌',
 '험',
 '혀',
 '현',
 '형',
 '호',
 '혹',
 '홍',
 '화',
 '확',
 '환',
 '활',
 '황',
 '회',
 '효',
 '후',
 '휴',
 '흡',
 '\u202a',
 '\u202b',
 '\u202c',
 '\ue024',
 '\ue02c',
 '\ue02e',
 '\ue031',
 '\ue032',
 '\ue033',
 '\ue035',
 '\ue061',
 '\ue062',
 '\ue06d',
 '\ue152',
 '\uf020',
 '\uf02b',
 '\uf02d',
 '\uf02f',
 '\uf03d',
 '\uf044',
 '\uf046',
 '\uf05b',
 '\uf05d',
 '\uf061',
 '\uf062',
 '\uf063',
 '\uf065',
 '\uf067',
 '\uf06b',
 '\uf06c',
 '\uf06d',
 '\uf09f',
 '\uf0a2',
 '\uf0a3',
 '\uf0a7',
 '\uf0ae',
 '\uf0b0',
 '\uf0b4',
 '\uf0b7',
 '\uf0bb',
 '\uf0d7',
 '\uf0e0',
 '\uf6d9',
 '\uf761',
 '\uf762',
 '\uf764',
 '\uf765',
 '\uf766',
 '\uf767',
 '\uf768',
 '\uf769',
 '\uf76b',
 '\uf76c',
 '\uf76e',
 '\uf76f',
 '\uf770',
 '\uf772',
 '\uf773',
 '\uf774',
 '\uf775',
 '\uf776',
 '\uf777',
 '\uf778',
 '\uf779',
 '\uf77a',
'�']

#### Filtering String

In [None]:
# Strip every ignored character in a single pass over the text instead of
# running one full-string replace per entry in CHARS.
single_chars = ''.join(c for c in CHARS if len(c) == 1)
abstracts = abstracts.translate(str.maketrans('', '', single_chars))

# str.translate only handles single code points; any longer entry (none are
# expected in CHARS) still goes through replace so the result is unchanged.
for c in CHARS:
    if len(c) > 1:
        abstracts = abstracts.replace(c, '')

In [None]:
# Length in characters after filtering; compare with the length printed
# before the filter to see how much was removed.
len(abstracts)

In [None]:
# 90/10 train/test split by character position. The cut is made on the raw
# string, so it can fall in the middle of an abstract.
split_at = int(len(abstracts) * .9)
abstracts_train = abstracts[:split_at]
abstracts_test = abstracts[split_at:]

# Write UTF-8 explicitly: the text can still contain non-ASCII characters and
# the platform default encoding is not guaranteed to represent them.
with open('/content/data/abstractstrain.txt', 'w', encoding='utf-8') as abstracts_file:
    abstracts_file.write(abstracts_train)

with open('/content/data/abstractstest.txt', 'w', encoding='utf-8') as abstracts_file:
    abstracts_file.write(abstracts_test)

## **Generating abstracts without Transformers**

In [None]:
# Read the whole training text into memory. The encoding is given explicitly
# so the result does not depend on the platform default.
with open('/content/data/abstractstrain.txt', 'r', encoding='utf-8') as abstracts_file:
    abstractstrain = abstracts_file.read()

# Character-level vocabulary: every distinct character, in sorted order.
vocab = sorted(set(abstractstrain))

def split_input_target(chunk):
    """Turn one sequence into a next-element prediction pair.

    Args:
        chunk: Any sliceable sequence.

    Returns:
        A tuple ``(inputs, targets)`` where ``inputs`` is ``chunk`` without
        its last element and ``targets`` is ``chunk`` without its first, so
        ``targets[i]`` is the element that follows ``inputs[i]``.
    """
    return chunk[:-1], chunk[1:]

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level model: Embedding -> GRU -> Dense.

    Args:
        vocab_size: Number of distinct characters; used as both the embedding
            input size and the width of the output layer.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch dimension of the model input; the sequence
            dimension is left unspecified.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model. The final Dense layer has
        no activation, so it emits one raw score per vocabulary entry at each
        time step.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    # stateful=True carries the recurrent state over between calls until
    # reset_states() is invoked on the model.
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')
    projection = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, projection])

# Lookup tables between characters and integer ids: char2idx maps a character
# to its position in vocab, idx2char maps a position back to the character.
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)
# The full training text encoded as one array of character ids.
text_as_int = np.array([char2idx[c] for c in abstractstrain])

# Each training example covers seq_length characters of input; chunks are cut
# one element longer so split_input_target can derive the shifted target.
seq_length = 100
# Not referenced again in the visible cells.
examples_per_epoch = len(abstractstrain)//(seq_length+1)
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
# drop_remainder discards the final chunk if it is shorter than seq_length+1.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Every chunk becomes an (input, target) pair.
dataset = sequences.map(split_input_target)
BATCH_SIZE = 128
BUFFER_SIZE = 10000

# drop_remainder keeps every batch at exactly BATCH_SIZE examples, matching
# the fixed batch dimension the model is built with below.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
# These three names are reused when the model is rebuilt for generation.
vocab_size = len(vocab)
embedding_dim = 128
rnn_units = 512

# Training model, built for batches of BATCH_SIZE sequences.
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

def loss(labels, logits):
    """Sparse categorical cross-entropy computed on raw model outputs.

    Args:
        labels: Integer target ids.
        logits: Unnormalised scores produced by the model (``from_logits`` is
            set, so no softmax is expected to have been applied).

    Returns:
        The per-element loss tensor returned by
        ``tf.keras.losses.sparse_categorical_crossentropy``.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

# Train the character-level model with Adam and the custom loss defined above.
model.compile(optimizer='adam', loss=loss)
EPOCHS=3
# Weights-only checkpoints are written under checkpoint_dir; the generation
# cell later restores the newest one from the same directory.
checkpoint_dir = '/content/tensorflow/output'
# "{epoch}" is left as a literal placeholder here -- presumably filled in by
# the ModelCheckpoint callback when it saves; confirm against the Keras docs.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

# Pin training to the first GPU (the notebook asks for a GPU runtime).
with tf.device('/device:GPU:0'):
  history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

### Function to generate text

In [None]:
def generate_text(model, start_string, num_generate=500, temperature=.5):
    """Sample text from the character-level model, one character at a time.

    Args:
        model: Model built by ``build_model`` with ``batch_size=1``; the seed
            is fed as a single-element batch.
        start_string: Non-empty seed text. Every character must be present in
            ``char2idx``.
        num_generate: Number of characters to generate after the seed.
        temperature: Positive divisor applied to the model's output scores
            before sampling. Values below 1 sharpen the distribution, values
            above 1 flatten it.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_string`` contains a character missing from
            ``char2idx``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Encode the seed and add a leading batch dimension of size 1.
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
    text_generated = []

    # Start from a clean recurrent state for every generation run.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension, leaving one row of scores per time step.
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature

        # Sample (rather than take the argmax) from the last time step.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
        # Only the new character is fed back; the model is stateful, so the
        # earlier context lives in its recurrent state.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))
# Rebuild the network for a batch of one sequence (what generate_text feeds
# it) and restore the most recent training checkpoint.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    # Fail with a clear message instead of passing None to load_weights.
    raise FileNotFoundError(
        "No checkpoint found in {!r}; run the training cell first.".format(checkpoint_dir))
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

### Creating new abstracts

In [None]:
# Seed the model with the start of an abstract (heading, blank line, first
# words) and print the sampled continuation.
print(generate_text(model, start_string="Abstract\n\nThe corona"))

## Generating abstracts with Transformers

In [None]:
# Use the stock GPT-2 tokenizer, and load the model from the directory the
# fine-tuning cell wrote to. from_pt=True is passed because that directory is
# presumably a PyTorch checkpoint being loaded into the TensorFlow model
# class -- confirm against the fine-tuning script's output.
# NOTE(review): pad_token_id is set to the end-of-sequence id, presumably
# because GPT-2 defines no dedicated padding token -- confirm.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = TFGPT2LMHeadModel.from_pretrained('/content/transformers/output/', pad_token_id=tokenizer.eos_token_id, from_pt=True)

# Encode the prompt as a TensorFlow tensor of token ids.
input_ids = tokenizer.encode('Abstract\n\nCovid-19', return_tensors='tf')
# Draw three sampled continuations, each capped at max_length=150, with
# sampling restricted by the top_k and top_p settings.
sample_outputs = model.generate(
    input_ids,
    do_sample=True, 
    max_length=150, 
    top_k=50, 
    top_p=0.92, 
    num_return_sequences=3
)

# Decode each sample back to text, omitting special tokens.
print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))