In [1]:
# Change directory
import os
# path = '/home/module/'
# os.chdir(path)
import glob
from sklearn.model_selection import train_test_split
from collections import Counter

# Utils

In [2]:
def Dataloader(files, batch_size=1):
    """Lazily read text files and yield their lines in batches.

    Args:
        files: iterable of file paths, read one after another.
        batch_size: number of lines collected before a batch is yielded.

    Yields:
        Lists of whitespace-stripped lines. The line counter restarts for
        every file, so a batch never spans two files; the lines left over at
        the end of a file are yielded as a shorter batch.
    """
    for file_path in files:
        with open(file_path, "r") as handle:
            batch = []

            for line_no, raw_line in enumerate(handle, start=1):
                batch.append(raw_line.strip())

                # A batch is complete whenever the running line count
                # reaches a multiple of batch_size.
                if line_no % batch_size == 0:
                    yield batch
                    batch = []

            # Remainder of the file that did not fill a whole batch.
            if batch:
                yield batch

def get_filename(root_path, pattern='*'):
    """Return the paths matched by the glob expression ``root_path + pattern``.

    The two strings are concatenated as-is (no path separator is inserted),
    and the search is run with ``recursive=True`` so a ``**`` component in
    the expression matches nested directories.
    """
    search_expr = root_path + pattern
    return list(glob.glob(search_expr, recursive=True))

def save_flatten_data(path_name, sentences):
    """Write the first element of every item in `sentences`, one per line.

    Args:
        path_name: destination file; it is created or truncated.
        sentences: iterable of indexable items (e.g. the one-line batches
            produced by ``Dataloader(..., batch_size=1)``); only ``item[0]``
            is written.

    Raises:
        IndexError: if an item is empty. The file is still closed in that
            case because it is managed by a ``with`` block.
    """
    with open(path_name, 'w') as out_file:
        for data in sentences:
            out_file.write('{}\n'.format(data[0]))

In [3]:
def save_nested_data(path_name, sentences):
    """Write sentences token by token, separating sentences with a blank line.

    Args:
        path_name: destination file; it is created or truncated.
        sentences: iterable of sentences, each an iterable of token lines.
            Every token is written on its own line and every sentence
            (including the last one) is followed by an empty line.

    The file is managed by a ``with`` block so it is closed even when
    writing fails part-way through.
    """
    with open(path_name, 'w') as out_file:
        for sent in sentences:
            for token in sent:
                out_file.write('{}\n'.format(token))

            # Blank line marks the end of the sentence.
            out_file.write('\n')

# Create dicts

In [4]:
def get_counter_tokens(data):
    """Count '|'-separated tokens over batches of sentences.

    Args:
        data: iterable of batches, each batch an iterable of strings in
            which tokens are separated by '|'.

    Returns:
        collections.Counter mapping each token to its number of occurrences.
    """
    token_counts = Counter()
    for batch in data:
        for sentence in batch:
            token_counts.update(sentence.split('|'))
    return token_counts

def get_counter_nested_tokens(data, tag=False):
    """Count either the word field or the tag fields of '|'-separated lines.

    Args:
        data: iterable of sentences, each an iterable of lines whose fields
            are separated by '|'.
        tag: when equal to True, every field after the first one is counted;
            otherwise only the first field of each line is counted.

    Returns:
        collections.Counter with the frequency of the selected fields.
    """
    field_counts = Counter()
    for sentence in data:
        for token_line in sentence:
            fields = token_line.split('|')
            # split() always returns at least one field, so fields[:1]
            # is exactly the single leading field.
            selected = fields[1:] if tag == True else fields[:1]
            field_counts.update(selected)
    return field_counts

def get_counter_chars(data):
    """Count every character occurring in batches of strings.

    Args:
        data: iterable of batches, each batch an iterable of strings.

    Returns:
        collections.Counter mapping each character to its frequency.
    """
    char_counts = Counter()
    for batch in data:
        for text in batch:
            # Tally the string one character at a time.
            for char in list(text):
                char_counts[char] += 1
    return char_counts

def Sort_dict_freq(counter_dict, MINCOUNT=1):
    """Return the keys of a frequency mapping ordered from most to least frequent.

    Args:
        counter_dict: mapping of token -> count (e.g. a collections.Counter).
        MINCOUNT: tokens whose count is below this threshold are dropped.

    Returns:
        List of tokens sorted by descending count. Tokens with equal counts
        keep the order in which they appear in `counter_dict`, so the result
        is reproducible from run to run.
    """
    # A list (not a set) keeps the mapping's iteration order; going through a
    # set would make the order of equally frequent tokens depend on string
    # hash randomisation.
    kept = [(word, count) for word, count in counter_dict.items() if count >= MINCOUNT]

    # list.sort is stable, also with reverse=True, so ties stay in input order.
    kept.sort(key=lambda item: item[1], reverse=True)
    return [word for word, _ in kept]

def Save_vocab(path_name, Dict, Add_dict=None):
    """Write a vocabulary file with one entry per line.

    Args:
        path_name: destination file; it is created or truncated.
        Dict: iterable of vocabulary entries, written in iteration order.
        Add_dict: optional iterable of extra entries (e.g. ['pad', 'unk'])
            written before the entries of `Dict`.

    The file is handled by a ``with`` block so it is flushed and closed
    before the function returns; callers may read it back immediately.
    """
    with open(path_name, 'w') as out_file:
        # Extra entries go first so they occupy the leading lines.
        if Add_dict is not None:
            for entry in Add_dict:
                out_file.write('{}\n'.format(entry))

        for entry in Dict:
            out_file.write('{}\n'.format(entry))
    
def Save_logs(path_name, Dict):
    """Dump the (key, value) pairs of a mapping, one pair per line.

    Args:
        path_name: destination file; it is created or truncated.
        Dict: mapping (e.g. a collections.Counter); each item is written
            using the string form of its ``(key, value)`` tuple.

    The file is handled by a ``with`` block so it is flushed and closed
    before the function returns.
    """
    with open(path_name, 'w') as out_file:
        for pair in Dict.items():
            out_file.write('{}\n'.format(pair))

In [5]:
def split_flatten_datas(directory, tag):
    """Split the flattened NER corpus into train/test/valid and build vocabularies.

    Reads every ``*.src`` and ``*.trg`` file found directly under
    ``<directory>/flatten_ner/<tag>/`` line by line, splits the paired lines
    60/20/20 with a fixed seed, and writes into the ``datas/`` sub-folder:

    * ``train|test|valid.src`` and ``train|test|valid.trg``
    * ``words.txt`` / ``tags.txt`` / ``chars.txt`` built from the training
      split only, each with a ``-logs`` file holding the raw frequencies.

    Args:
        directory: corpus root folder (without trailing slash).
        tag: name of the tag-set sub-folder, e.g. 'maintags'.
    """
    print('\nFlatten     :',tag)
    path_data      = directory+'/flatten_ner/'+tag+'/'
    save_path_data = directory+'/flatten_ner/'+tag+'/datas/'

    # open(..., 'w') does not create missing folders, so make sure the
    # output folder exists before anything is written.
    os.makedirs(save_path_data, exist_ok=True)

    # glob returns matches in arbitrary order. Sorting both listings keeps
    # the source and target files in the same relative order and makes the
    # seeded split reproducible.
    # NOTE(review): assumes paired .src/.trg files share a base name - confirm.
    data_src       = Dataloader(sorted(get_filename(path_data, '*.src')), 1)
    data_trg       = Dataloader(sorted(get_filename(path_data, '*.trg')), 1)

    # With batch_size=1 every sample is a one-element list holding one line.
    X_train, X_test, y_train, y_test = train_test_split(list(data_src), list(data_trg), test_size=0.2, random_state=42)
    X_train, X_val,  y_train, y_val  = train_test_split(X_train, y_train, test_size=0.25, random_state=42)                # 0.25 x 0.8 = 0.2

    print('Train datas :', len(X_train))
    print('Test  datas :', len(X_test))
    print('Valid datas :', len(X_val))

    ## Save dataset ##
    # Save train datas
    save_flatten_data(save_path_data+'train.src',X_train)
    save_flatten_data(save_path_data+'train.trg',y_train)

    # Save test datas
    save_flatten_data(save_path_data+'test.src',X_test)
    save_flatten_data(save_path_data+'test.trg',y_test)

    # Save valid datas
    save_flatten_data(save_path_data+'valid.src',X_val)
    save_flatten_data(save_path_data+'valid.trg',y_val)

    ## Create dicts ##
    # Words: '|'-separated tokens of the training sources
    filenames_src  = save_path_data+'words.txt'
    counter_dict   = get_counter_tokens(X_train)
    Dict           = Sort_dict_freq(counter_dict)
    Save_vocab(filenames_src, Dict, ['pad','unk'])
    Save_logs(filenames_src+'-logs', counter_dict)
    print('Words       :', len(Dict)+2)       # +2 for 'pad' and 'unk'

    # Tags: '|'-separated tokens of the training targets
    filenames_trg  = save_path_data+'tags.txt'
    counter_dict   = get_counter_tokens(y_train)
    Dict           = Sort_dict_freq(counter_dict)
    Save_vocab(filenames_trg, Dict, ['pad'])
    Save_logs(filenames_trg+'-logs', counter_dict)
    print('Tags        :', len(Dict)+1)       # +1 for 'pad'

    # Chars: counted over the lines of the words.txt file written above, so
    # the characters of the 'pad' and 'unk' entries are included as well.
    filenames_src  = save_path_data+'words.txt'
    filenames_src  = get_filename(filenames_src)

    filenames_chs  = save_path_data+'chars.txt'
    data_words     = Dataloader(filenames_src)

    counter_dict   = get_counter_chars(data_words)
    Dict           = Sort_dict_freq(counter_dict)

    # Save
    Save_vocab(filenames_chs, Dict, ['pad','unk'])
    Save_logs(filenames_chs+'-logs', counter_dict)
    print('Chars       :', len(Dict)+2)       # +2 for 'pad' and 'unk'

In [6]:
def split_nested_datas(directory, tag):
    """Split the nested NER corpus into train/test/valid and build vocabularies.

    Reads every ``*.data`` file found directly under
    ``<directory>/nested_ner/<tag>/`` one line at a time. Lines are grouped
    into sentences, a blank line ending the current sentence. The sentences
    are split 60/20/20 with a fixed seed and written into the ``datas/``
    sub-folder as ``train.txt`` / ``test.txt`` / ``valid.txt``, together with
    ``words.txt`` / ``tags.txt`` / ``chars.txt`` (built from the training
    split only) and a ``-logs`` frequency file for each vocabulary.

    Args:
        directory: corpus root folder (without trailing slash).
        tag: name of the tag-set sub-folder, e.g. 'maintags'.
    """
    print('\nNested      :',tag)
    path_data      = directory+'/nested_ner/'+tag+'/'
    save_path_data = directory+'/nested_ner/'+tag+'/datas/'

    # open(..., 'w') does not create missing folders, so make sure the
    # output folder exists before anything is written.
    os.makedirs(save_path_data, exist_ok=True)

    # Load dataset. glob returns matches in arbitrary order; sorting makes
    # the sentence order, and therefore the seeded split, reproducible.
    datas = Dataloader(sorted(get_filename(path_data, '*.data')))

    # Get sentences and tokens: each batch holds exactly one stripped line,
    # and an empty line closes the sentence collected so far.
    sentences   = []
    temp_tokens = []
    for token in datas:
        if token[0] == '':
            sentences.append(temp_tokens)
            temp_tokens = []
        else:
            temp_tokens.append(token[0])

    # Input that does not end with a blank line would otherwise silently
    # lose its final sentence.
    if temp_tokens:
        sentences.append(temp_tokens)

    # Train test splits
    X_train, X_test = train_test_split(sentences, test_size=0.2,  random_state=42)
    X_train, X_val  = train_test_split(X_train,   test_size=0.25, random_state=42)              # 0.25 x 0.8 = 0.2

    print('Train datas :', len(X_train))
    print('Test  datas :', len(X_test))
    print('Valid datas :', len(X_val))

    save_nested_data(save_path_data+'train.txt',X_train)
    save_nested_data(save_path_data+'test.txt',X_test)
    save_nested_data(save_path_data+'valid.txt',X_val)

    # Create words dicts: first '|'-separated field of every training line
    filenames_src  = save_path_data+'words.txt'
    counter_dict   = get_counter_nested_tokens(X_train)
    Dict           = Sort_dict_freq(counter_dict)
    Save_vocab(filenames_src, Dict, ['pad','unk'])
    Save_logs(filenames_src+'-logs', counter_dict)
    print('Words       :', len(Dict)+2)       # +2 for 'pad' and 'unk'

    # Create tags dicts: every field after the first one
    filenames_trg  = save_path_data+'tags.txt'
    counter_dict   = get_counter_nested_tokens(X_train, tag=True)
    Dict           = Sort_dict_freq(counter_dict)
    Save_vocab(filenames_trg, Dict, ['pad'])
    Save_logs(filenames_trg+'-logs', counter_dict)
    print('Tags        :', len(Dict)+1)       # +1 for 'pad'

    # Create chars dicts: counted over the lines of the words.txt file
    # written above, so the characters of 'pad' and 'unk' are included.
    filenames_src  = get_filename(save_path_data+'words.txt')
    filenames_chs  = save_path_data+'chars.txt'
    data_words     = Dataloader(filenames_src)
    counter_dict   = get_counter_chars(data_words)
    Dict           = Sort_dict_freq(counter_dict)

    Save_vocab(filenames_chs, Dict, ['pad','unk'])
    Save_logs(filenames_chs+'-logs', counter_dict)
    print('Chars       :', len(Dict)+2)       # +2 for 'pad' and 'unk'

In [7]:
# Run configuration: corpus root folder and the stage to execute.
directory      = '/home/module/data/vistec_newmm4L'
state          = 'split_flatten_datas'

if __name__ == '__main__' and state == 'split_flatten_datas':
    # Flattened corpus first: main tags, then sub tags.
    split_flatten_datas(directory, 'maintags')
    split_flatten_datas(directory, 'subtags')

    # Nested corpus afterwards, in the same tag order.
    split_nested_datas(directory, 'maintags')
    split_nested_datas(directory, 'subtags')


Flatten     : maintags
Train datas : 1719
Test  datas : 573
Valid datas : 573
Words       : 21912
Tags        : 42
Chars       : 206

Flatten     : subtags
Train datas : 1719
Test  datas : 573
Valid datas : 573
Words       : 21912
Tags        : 389
Chars       : 206

Nested      : maintags
Train datas : 1719
Test  datas : 573
Valid datas : 573
Words       : 21912
Tags        : 42
Chars       : 206

Nested      : subtags
Train datas : 1719
Test  datas : 573
Valid datas : 573
Words       : 21912
Tags        : 399
Chars       : 206


In [8]:
# Recorded output (tokenizer: newmm)
# Flatten     : maintags
# Train datas : 1200
# Test  datas : 400
# Valid datas : 400
# Words       : 18454
# Tags        : 387
# Chars       : 206

# Flatten     : subtags
# Train datas : 1200
# Test  datas : 400
# Valid datas : 400
# Words       : 18454
# Tags        : 42
# Chars       : 206

# Nested      : maintags
# Train datas : 1200
# Test  datas : 400
# Valid datas : 400
# Words       : 18454
# Tags        : 400
# Chars       : 206

# Nested      : subtags
# Train datas : 1200
# Test  datas : 400
# Valid datas : 400
# Words       : 18454
# Tags        : 42
# Chars       : 206
pass

In [9]:
# ## Attacut
# Flatten     : maintags
# Train datas : 1200
# Test  datas : 400
# Valid datas : 400
# Words       : 17459
# Tags        : 387
# Chars       : 206

# Flatten     : subtags
# Train datas : 1200
# Test  datas : 400
# Valid datas : 400
# Words       : 17459
# Tags        : 42
# Chars       : 206

# Nested      : maintags
# Train datas : 1200
# Test  datas : 400
# Valid datas : 400
# Words       : 17459
# Tags        : 402
# Chars       : 206

# Nested      : subtags
# Train datas : 1200
# Test  datas : 400
# Valid datas : 400
# Words       : 17459
# Tags        : 42
# Chars       : 206
pass

# Cleaned data

In [10]:
# Attacut
# Flatten     : maintags
# Train datas : 1077
# Test  datas : 360
# Valid datas : 359
# Words       : 16103
# Tags        : 42
# Chars       : 184

# Flatten     : subtags
# Train datas : 1077
# Test  datas : 360
# Valid datas : 359
# Words       : 16103
# Tags        : 374
# Chars       : 184

# Nested      : maintags
# Train datas : 1077
# Test  datas : 360
# Valid datas : 359
# Words       : 16103
# Tags        : 42
# Chars       : 184

# Nested      : subtags
# Train datas : 1077
# Test  datas : 360
# Valid datas : 359
# Words       : 16103
# Tags        : 388
# Chars       : 184

In [11]:
# Newmm

# Flatten     : maintags
# Train datas : 1077
# Test  datas : 360
# Valid datas : 359
# Words       : 17348
# Tags        : 42
# Chars       : 184

# Flatten     : subtags
# Train datas : 1077
# Test  datas : 360
# Valid datas : 359
# Words       : 17348
# Tags        : 373
# Chars       : 184

# Nested      : maintags
# Train datas : 1077
# Test  datas : 360
# Valid datas : 359
# Words       : 17348
# Tags        : 42
# Chars       : 184

# Nested      : subtags
# Train datas : 1077
# Test  datas : 360
# Valid datas : 359
# Words       : 17348
# Tags        : 385
# Chars       : 184
