In [1]:
import os
from glob import glob

In [2]:
# Show the entries of the current working directory (no path argument is given).
os.listdir()

['.DS_Store',
 'requirements.txt',
 'README.md',
 'dataloader.py',
 '.ipynb_checkpoints',
 '.git',
 'data',
 'notebooks',
 'Runs.ipynb']

In [31]:
# Directory (relative to the working directory) that is searched for CSV files.
DATA_PATH = "data"

def csv_concat(dir):
    """Recursively collect the paths of every ``.csv`` file below ``dir``.

    Despite its name, this function does not concatenate anything; it only
    gathers file paths.

    Parameters
    ----------
    dir : str or path-like
        Root directory to walk. A directory that does not exist simply
        produces an empty result, because ``os.walk`` yields nothing for it.

    Returns
    -------
    list of str
        Paths (``root`` joined with the file name) of all files whose name
        ends with ``'.csv'`` (case-sensitive), in sorted order.
    """
    csv_files = [
        os.path.join(root, name)
        for root, _subdirs, files in os.walk(dir)
        for name in files
        if name.endswith('.csv')
    ]

    # os.walk reports entries in an arbitrary, filesystem-dependent order.
    # Sorting makes the file order -- and therefore the row order of anything
    # built from these files -- the same on every machine and every run.
    return sorted(csv_files)

# Paths of all CSV files found under DATA_PATH (searched recursively).
csv_files = csv_concat(DATA_PATH)

In [8]:
import pandas as pd

In [9]:
# Load every CSV listed in `csv_files` and stack the frames into a single DataFrame.
df = pd.concat(pd.read_csv(csv_path) for csv_path in csv_files)


In [56]:
def file_structuring(csv_files: list) -> pd.DataFrame:
    """Reshape each CSV into long form and stack the results.

    Every input file is read with ``pd.read_csv`` and must provide the columns
    ``notes``, ``aug_text``, ``paraphrase``, ``code`` and ``desc``. The three
    text columns are stacked on top of each other (in that order) into a
    single ``notes`` column, each row keeping its ``code`` (renamed to
    ``codes``) and ``desc``.

    Parameters
    ----------
    csv_files : list
        Paths of the CSV files to read.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``['notes', 'codes', 'desc']``. The index is
        renumbered from 0 *within* each file but not across files, so index
        labels repeat when more than one file is given.

    Raises
    ------
    ValueError
        If ``csv_files`` is empty.
    KeyError
        If a file lacks one of the required columns (raised by pandas).
    """
    text_columns = ('notes', 'aug_text', 'paraphrase')
    per_file_frames = []

    for path in csv_files:
        raw = pd.read_csv(path)
        # One three-column view per text variant, all sharing the same
        # output column names so they can be stacked vertically.
        variants = [
            raw[[text_col, 'code', 'desc']].rename(
                columns={text_col: 'notes', 'code': 'codes'})
            for text_col in text_columns
        ]
        per_file_frames.append(
            pd.concat(variants, axis=0, ignore_index=True))

    if not per_file_frames:
        # pd.concat would raise an opaque "No objects to concatenate";
        # fail with a message that points at the actual cause instead.
        raise ValueError("file_structuring received no CSV files to read")

    return pd.concat(per_file_frames)
    
# Fixed seed so the shuffled row order below is identical on every run.
SEED = 42

df = file_structuring(csv_files)
# frac=1 returns all rows in random order, i.e. a full shuffle.
df = df.sample(frac=1, random_state=SEED)
print(df.shape)
df.head()

(16779, 3)


Unnamed: 0,notes,codes,desc
20,5. Patient with Other Myositis developed a sev...,M60.8,Other myositis
379,"10. Operative Note: A surgical intervention, a...",M40.1,Other secondary kyphosis
1243,4. Procedure: Arthroscopic hip debridement. In...,M02.0,Arthropathy following intestinal bypass
13,5. Operative Note: The patient with spinal ent...,M46.0,Spinal enthesopathy
3751,"2. synovitis, erosions, and mycotic invasion w...",M01.6,Arthritis in mycoses


In [54]:
# Show every row whose `notes` text occurs more than once (all occurrences are kept).
df.loc[df.duplicated(subset=['notes'], keep=False)]

Unnamed: 0,notes,codes,desc
165,6. Operative Note: A joint resurfacing procedu...,M46.2,Osteomyelitis of vertebra
189,The surgical intervention was performed on a p...,M62.3,Immobility syndrome (paraplegic)
207,"""Contracture release surgery was performed on ...",M62.4,Contracture of muscle
48,5. Operative Note: Sacroiliitis Sacroiliac Joi...,M46.1,"Sacroiliitis, not elsewhere classified"
190,There are 9. A surgical intervention was used ...,M46.2,Osteomyelitis of vertebra
...,...,...,...
406,4.,M45,Ankylosing spondylitis
3470,"The procedure consisted of an operation, thoro...",M01.3,Arthritis in other bacterial diseases classifi...
161,1. AS patient diagnosed with mild disease seve...,M45,Ankylosing spondylitis
522,3. A minimally-invagant procedure was carried ...,M40.4,Other lordosis


In [50]:
# Number of distinct values in the `codes` column.
len(df["codes"].unique())

63

In [57]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

  from .autonotebook import tqdm as notebook_tqdm


In [58]:
# Short sample sentence used as input for the augmentation experiments.
text = "The quick brown fox jumps over the lazy dog ."
print(text)

The quick brown fox jumps over the lazy dog .


In [69]:
from transformers import pipeline
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

def text_aug(text, model="bert-base-uncased", action="substitute", 
             n_samples=1, max_length=130, min_length=30):
    """Augment or summarise ``text`` with one of three back-ends.

    The back-end is chosen in this order:

    1. ``model == "facebook/bart-large-cnn"``: Hugging Face summarisation
       pipeline, honouring ``max_length`` and ``min_length``.
    2. ``action == "summary"``: nlpaug ``AbstSummAug`` with the ``t5-base``
       checkpoint.
    3. anything else: nlpaug ``ContextualWordEmbsAug`` using ``model`` as the
       checkpoint and ``action`` as the word-level operation.

    Parameters
    ----------
    text : str or list of str
        Input text.
    model : str
        Checkpoint name; see the selection order above.
    action : str
        ``"summary"`` selects the T5 summariser; any other value is passed
        on to ``ContextualWordEmbsAug``.
    n_samples : int
        Number of augmented variants to request. Only used by the
        contextual-word back-end.
    max_length, min_length : int
        Summary length bounds. Only used by the BART back-end.

    Returns
    -------
    str or list
        BART: the summary string for a single input string, or a list of
        summary strings for a list input. Other back-ends: whatever the
        augmenter's ``augment`` method returns.
    """
    if model == "facebook/bart-large-cnn":
        summarizer = pipeline("summarization", model=model)
        out = summarizer(text, max_length=max_length, min_length=min_length,
                         do_sample=False)
        # The pipeline returns one {"summary_text": ...} dict per input,
        # wrapped in a list -- even when a single string is passed.
        summaries = [item["summary_text"] for item in out]
        return summaries[0] if isinstance(text, str) else summaries

    # Checked before the word-level branch: "summary" is not an action the
    # contextual word augmenter is meant to receive.
    if action == "summary":
        aug = nas.AbstSummAug(model_path='t5-base')
        return aug.augment(text)

    # Any remaining checkpoint name is handed to the contextual word
    # augmenter instead of falling through with no result.
    aug = naw.ContextualWordEmbsAug(model_path=model, action=action)
    return aug.augment(text, n=n_samples)


Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
['the quick talking fox jumps for their lazy dog.', 'one quick brown fox jumps for the fleeing dog.', 'the quick brown fox jumps into this wild dog.']


In [71]:
# Multi-sentence sample passage used as input for the summarisation augmenter.
article = """
The history of natural language processing (NLP) generally started in the 1950s, although work can be 
found from earlier periods. In 1950, Alan Turing published an article titled "Computing Machinery and 
Intelligence" which proposed what is now called the Turing test as a criterion of intelligence. 
The Georgetown experiment in 1954 involved fully automatic translation of more than sixty Russian 
sentences into English. The authors claimed that within three or five years, machine translation would
be a solved problem. However, real progress was much slower, and after the ALPAC report in 1966, 
which found that ten-year-long research had failed to fulfill the expectations, funding for machine 
translation was dramatically reduced. Little further research in machine translation was conducted 
until the late 1980s when the first statistical machine translation systems were developed.
"""

# Summarise the sample passage with nlpaug's abstractive summariser (t5-base checkpoint).
aug = nas.AbstSummAug(model_path='t5-base')
augmented_text = aug.augment(article)

# Single print call; the newline separator yields the same four-part output
# as printing each item on its own.
print("Original:", article, "Augmented Text:", augmented_text, sep="\n")

Downloading (…)lve/main/config.json: 100%|█████████████████████| 1.21k/1.21k [00:00<00:00, 1.61MB/s]
Downloading (…)ve/main/spiece.model: 100%|████████████████████████| 792k/792k [00:00<00:00, 904kB/s]
Downloading (…)/main/tokenizer.json: 100%|█████████████████████| 1.39M/1.39M [00:01<00:00, 1.27MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Downloading model.safetensors: 100%|█████████████████████████████| 892M/892M [01:22<00:00, 10.8MB/s]
Downloading (…)neration_config.json: 100%|██████████████████████████| 147/147 [00:00<00:00, 207kB/s]


Original:

The history of natural language processing (NLP) generally started in the 1950s, although work can be 
found from earlier periods. In 1950, Alan Turing published an article titled "Computing Machinery and 
Intelligence" which proposed what is now called the Turing test as a criterion of intelligence. 
The Georgetown experiment in 1954 involved fully automatic translation of more than sixty Russian 
sentences into English. The authors claimed that within three or five years, machine translation would
be a solved problem. However, real progress was much slower, and after the ALPAC report in 1966, 
which found that ten-year-long research had failed to fulfill the expectations, funding for machine 
translation was dramatically reduced. Little further research in machine translation was conducted 
until the late 1980s when the first statistical machine translation systems were developed.

Augmented Text:
['the history of natural language processing (NLP) generally started in the 

In [72]:
# Contextual word substitution on the sample sentence using the DistilBERT checkpoint.
aug = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased',
                                action="substitute")
augmented_text = aug.augment(text)

# Display the original sentence next to its augmented version.
(text, augmented_text)

Downloading (…)okenizer_config.json: 100%|███████████████████████| 28.0/28.0 [00:00<00:00, 45.2kB/s]
Downloading (…)lve/main/config.json: 100%|██████████████████████████| 483/483 [00:00<00:00, 800kB/s]
Downloading (…)solve/main/vocab.txt: 100%|████████████████████████| 232k/232k [00:00<00:00, 499kB/s]
Downloading (…)/main/tokenizer.json: 100%|████████████████████████| 466k/466k [00:00<00:00, 692kB/s]
Downloading model.safetensors: 100%|█████████████████████████████| 268M/268M [00:25<00:00, 10.6MB/s]


('The quick brown fox jumps over the lazy dog .',
 ['the noisy brown fox jumps alongside the lazy fox.'])