# Natural Language Processing TP
###### Thomas Cochou

### BPE

In [1]:
from glob import glob
import os
import re
from collections import Counter, defaultdict

In [5]:
folder_name = []
corpus_str = ""

for dir in os.walk(".\corpus_sample"):
    folder_name.append(dir[1])

for name in folder_name[0] :
    text_path = "corpus_sample/"+name+"/"+name+".txt"
    if os.path.exists(text_path) :
        text_file = open(text_path, "r", encoding='utf-8')
        data = text_file.read()
        corpus_str = corpus_str + data


In [8]:
print(corpus_str)

Virchows Arch. A Path. Anat. and Histol. 366, 149--175 (1975)
9 by Springer-Verlag 1975

Etude histochimique et histoenzymologique
de l'infarctus exp6rimental du rat apr6s ligature
permanente ou temporaire de la coronaire gauche*
J . P. Camilleri, J . N. F a b i a n i , A. D e l o e h e et C. G u r d j i a n
Avec l'aide technique de Mme M. Douheret et
M. ~ . Wolfelsperger (photographies)
Laboratoire d'Anatemie pathologique, U.E.R. Broussais-H6tel-Dieu
(Prof. ag. J. Diebold), Paris
Laborateire d'Etude des Greffes et Proth~ses valvulaires et cardiaques
(Prof. ag. A. Carpentier) CNRS EI~A 420, Paris
l~egu le 18 octobre 1974
H i s t o c h e m i c a l a n d t t i s t o e n z y m a t i c S t u d y of E x p e r i m e n t a l M y o c a r d i a l I n f a r c t i o n
in t h e t~at b y T e m p o r a r y a n d P e r m a n e n t L i g a t i o n of t h e L e f t C o r o n a r y A r t e r y

Summary. The evolution of experimental myocardial infarction in the Rat with or without revascularization has 

In [78]:
def build_vocab(corpus: str) -> dict:
    """Step 1. Build vocab from text corpus"""

    corpus_splitted = corpus
    
    # Separate each char in word by space and add mark end of token
    tokens = [" ".join(word) + " </w>" for word in corpus_splitted.split()]
    
    # Count frequency of tokens in corpus
    vocab = Counter(tokens)  

    return vocab


def get_stats(vocab: dict) -> dict:
    """Step 2. Get counts of pairs of consecutive symbols"""

    pairs = defaultdict(int)
    for word, frequency in vocab.items():
        symbols = word.split()

        # Counting up occurrences of pairs
        for i in range(len(symbols) - 1):
            pairs[symbols[i], symbols[i + 1]] += frequency

    return pairs


def merge_vocab(pair: tuple, v_in: dict) -> dict:
    """Step 3. Merge all occurrences of the most frequent pair"""
    
    v_out = {}
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    
    for word in v_in:
        # replace most frequent pair in all vocabulary
        w_out = p.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]

    return v_out


vocab = build_vocab(corpus_str)  # Step 1

num_merges = 50  # Hyperparameter
for i in range(num_merges):

    pairs = get_stats(vocab)  # Step 2

    if not pairs:
        break

    # step 3
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)


In [None]:
print(best)
print(vocab)

### AI21 studio

In [79]:
import requests

In [87]:
# Authorization: Bearer YOUR_API_KEY
Authorization = "Bearer 5L9NUHBOVQkIllqNlKmkpQQgzqdZTF2T"

Virchows Arch. A Path. Anat. and Histol. 366, 149--175 (1975)
9 by Springer-Verlag 1975

Etude histochimique et histoenzymologique
de l'infarctus exp6rimental du rat apr6s ligature
permanente ou tempo


### Completation de texte
#### j1-large with high temperature
result in increasing creativity and variety

In [99]:
text_to_complete = "roses are red"

answer = requests.post(
    "https://api.ai21.com/studio/v1/j1-large/complete",
    headers={"Authorization": Authorization},
    json={
        "prompt": text_to_complete, 
        "numResults": 2, 
        "maxTokens": 8, 
        "stopSequences": ["."],
        "topKReturn": 0,
        "temperature": 8.0
    }
)

In [100]:
answer = answer.json()
print(answer["completions"][0]["data"]["text"])


KeyError: 'completions'

#### j1-jumbo with low temperature
result in low creativity and variety

In [101]:
text_to_complete = "roses are red"

answer = requests.post(
    "https://api.ai21.com/studio/v1/j1-jumbo/complete",
    headers={"Authorization": Authorization},
    json={
        "prompt": text_to_complete, 
        "numResults": 2, 
        "maxTokens": 8, 
        "stopSequences": ["."],
        "topKReturn": 0,
        "temperature": 0.0
    }
)

In [6]:
answer = answer.json()
print(answer["completions"][0]["data"]["text"])


NameError: name 'answer' is not defined

### mT5 fine-tuning

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
folder_name = []
corpus_str = ""

abstracts = []
texts = []


for dir in os.walk(".\corpus_sample_tei"):
    folder_name.append(dir[1])

for name in folder_name[0] :
    text_path = "corpus_sample_tei/"+name+"/"+name+".tei.xml"
    if os.path.exists(text_path) :
        text_file = open(text_path, "r", encoding='utf-8')
        data = text_file.read()
        
        abstract_str = data.split("<abstract xml:lang="fr">")[0]
        abstract_str = abstract_str.split("</abstract>")[0]
        
        abstracts.append(abstract_str)
        
        text_str = data.split("<body>")[0]
        text_str = text_str.split("</body>")[0]    
        
        texts.append(text_str)

X_train, X_test, y_train, y_test = train_test_split(abstracts, texts, test_size=0.33, random_state=42)

corpus_array_train = [X_train, y_train]        

train_df = pd.DataFrame(corpus_array_train)

corpus_array_test = [X_test, y_test]        

eval_df = pd.DataFrame(corpus_array_test)

In [5]:
# import
from simplet5 import SimpleT5

# instantiate
model = SimpleT5()

# load (supports t5, mt5, byT5 models)
model.from_pretrained("t5","t5-base")

# train
model.train(train_df=train_df, # pandas dataframe with 2 columns: source_text & target_text
            eval_df=eval_df, # pandas dataframe with 2 columns: source_text & target_text
            source_max_token_len = 512, 
            target_max_token_len = 128,
            batch_size = 8,
            max_epochs = 5,
            use_gpu = True,
            outputdir = "outputs",
            early_stopping_patience_epochs = 0,
            precision = 32
            )

# load trained T5 model
model.load_model("t5","path/to/trained/model/directory", use_gpu=False)

# predict
model.predict("input text for prediction")

Global seed set to 42


KeyboardInterrupt: 