In [1]:
import polars as pl
import spacy
import torch
from collections import namedtuple
from preprocess import load_data
from transformers import  AutoTokenizer

import os  # used only for the environment override below

# Directory holding the pretrained tokenizer files; `tokenize` passes it to
# AutoTokenizer.from_pretrained. The original developer path stays the default
# so existing runs are unchanged; set BERT_TINY_DIR to point at a copy of the
# model on another machine instead of editing the notebook.
model_directory = os.environ.get(
    "BERT_TINY_DIR",
    "C:/Users/prl90/PycharmProjects/kaggle_lstm/bert_tiny/",
)


In [4]:

def load_data(file_path, train=True):
    """Read a token-level JSON dataset and rebuild one text string per document.

    NOTE: this definition shadows the ``load_data`` imported from
    ``preprocess`` in the notebook's import cell.

    The input rows must provide ``document``, ``tokens`` and
    ``trailing_whitespace`` (and ``labels`` when ``train`` is true); those are
    the only columns read here.

    Parameters
    ----------
    file_path : str
        Path of the JSON file, read with ``pl.read_json``.
    train : bool, default True
        When true, the ``labels`` column of the input is joined back onto the
        result by ``document``.

    Returns
    -------
    pl.LazyFrame
        One row per document with the columns

        * ``document``   - document id,
        * ``tokens``     - the original tokens,
        * ``token_key``  - one entry per character of ``token_rec``: the index
          of the token that character belongs to, or ``-1`` for the space
          appended after a token flagged with trailing whitespace,
        * ``tokens_rec`` - tokens with that trailing space appended,
        * ``token_rec``  - ``tokens_rec`` concatenated into a single string,
        * ``labels``     - only when ``train`` is true.
    """
    raw = pl.read_json(file_path).lazy()

    has_trailing_space = pl.col('trailing_whitespace')

    per_token = (
        raw
        # Position of every token inside its document: 0, 1, ..., n - 1.
        .with_columns(
            token_keys=pl.int_ranges(0, pl.col('tokens').list.len()),
        )
        .explode(['tokens', 'trailing_whitespace', 'token_keys'])
        # Repeat the token index once per *character* of the token. Characters
        # (not UTF-8 bytes) are counted so that the map stays aligned with the
        # reconstructed string and with character-based tokenizer offsets even
        # when a token contains non-ASCII text.
        .with_columns(
            char_keys=pl.col('token_keys')
            .repeat_by(pl.col('tokens').str.len_chars()),
        )
        # A trailing space adds one character to the text, so it also adds one
        # (-1) entry to the character map.
        .with_columns(
            char_keys=pl.when(has_trailing_space)
            .then(pl.col('char_keys').list.concat(-1))
            .otherwise(pl.col('char_keys')),
            tokens_rec=pl.when(has_trailing_space)
            .then(pl.col('tokens').add(' '))
            .otherwise(pl.col('tokens')),
        )
    )

    per_document = (
        per_token
        .group_by('document', maintain_order=True)
        .agg(
            tokens=pl.col('tokens'),
            token_key=pl.col('char_keys').flatten(),
            tokens_rec=pl.col('tokens_rec'),
        )
        .with_columns(
            token_rec=pl.col('tokens_rec').list.join(separator=''),
        )
    )

    if not train:
        return per_document
    # Labels are not touched by the reshaping above; attach them per document.
    return per_document.join(raw.select(['document', 'labels']), on='document')

In [53]:
 # Lazily build the per-document training frame; train=True keeps the labels.
 temp = load_data(
     file_path='C:/Users/prl90/PycharmProjects/kaggle_lstm/data/train.json',
     train=True,
 )
    

In [71]:

# Tokenizers already loaded by `tokenize`, keyed by model directory, so the
# files are read from disk once instead of once per document.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_directory):
    """Return the tokenizer for ``model_directory``, loading it at most once."""
    tokenizer = _TOKENIZER_CACHE.get(model_directory)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_directory)
        _TOKENIZER_CACHE[model_directory] = tokenizer
    return tokenizer


def tokenize(r, model_directory=model_directory, stride=20, max_length=450):
    """Tokenize one document into overlapping, padded windows.

    Parameters
    ----------
    r : mapping
        Row struct; only ``r['token_rec']`` (the document text) is read.
    model_directory : str
        Directory given to ``AutoTokenizer.from_pretrained``.
    stride : int
        Forwarded to the tokenizer as ``stride`` (used together with
        ``return_overflowing_tokens=True``).
    max_length : int
        Forwarded to the tokenizer as ``max_length``; with ``truncation=True``
        longer texts overflow into additional windows.

    Returns
    -------
    dict
        Four parallel lists with one entry per window:
        ``txt`` (token strings), ``attention_mask``, ``input_ids`` and
        ``offset`` (the tokenizer's ``offset_mapping``). ``padding=True`` pads
        every window of the document to the same length.
    """
    tokenizer = _get_tokenizer(model_directory)

    encoded = tokenizer(
        r['token_rec'],
        return_tensors="pt",
        padding=True,
        truncation=True,
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
        stride=stride,
        add_special_tokens=False,
        max_length=max_length,
    )

    input_ids = encoded['input_ids'].tolist()

    return {
        # Convert from the plain id lists: squeezing a length-1 window would
        # give a 0-d tensor, which cannot be iterated.
        'txt': [tokenizer.convert_ids_to_tokens(window) for window in input_ids],
        'attention_mask': encoded['attention_mask'].tolist(),
        'input_ids': input_ids,
        'offset': encoded['offset_mapping'].tolist(),
    }
    
    

In [72]:
%%time

 # Tokenize a 200-row sample: `tokenize` returns a struct of four list columns
 # per document, which is unnested and then exploded to one row per window.
 train = (
     temp
     .drop('tokens_rec')
     .fetch(200)
     .with_columns(ll=pl.struct('token_rec').map_elements(tokenize))
     .unnest('ll')
     .drop('token_rec')
     .explode(['txt', 'attention_mask', 'input_ids', 'offset'])
 )

CPU times: total: 14.3 s
Wall time: 34.3 s


In [73]:
# import joblib
# Show the tokenized sample. A `document` id can appear on several rows
# because the list columns returned by `tokenize` were exploded, giving one
# row per tokenizer window.
train 

document,tokens,token_key,labels,txt,attention_mask,input_ids,offset
i64,list[str],list[i64],list[str],list[str],list[i64],list[i64],list[list[i64]]
7,"[""Design"", ""Thinking"", … "" ""]","[0, 0, … 752]","[""O"", ""O"", … ""O""]","[""design"", ""thinking"", … ""/""]","[1, 1, … 1]","[2640, 3241, … 1013]","[[0, 6], [7, 15], … [2171, 2172]]"
7,"[""Design"", ""Thinking"", … "" ""]","[0, 0, … 752]","[""O"", ""O"", … ""O""]","[""way"", "","", … ""[PAD]""]","[1, 1, … 0]","[2126, 1010, … 0]","[[2078, 2081], [2081, 2082], … [0, 0]]"
10,"[""Diego"", ""Estrada"", … "" ""]","[0, 0, … 562]","[""B-NAME_STUDENT"", ""I-NAME_STUDENT"", … ""O""]","[""diego"", ""estrada"", … ""on""]","[1, 1, … 1]","[5277, 26482, … 2006]","[[0, 5], [6, 13], … [2423, 2425]]"
10,"[""Diego"", ""Estrada"", … "" ""]","[0, 0, … 562]","[""B-NAME_STUDENT"", ""I-NAME_STUDENT"", … ""O""]","[""visual"", ""##ization"", … ""[PAD]""]","[1, 1, … 0]","[5107, 3989, … 0]","[[2310, 2316], [2316, 2323], … [0, 0]]"
16,"[""Reporting"", ""process"", … "" ""]","[0, 0, … 728]","[""O"", ""O"", … ""O""]","[""reporting"", ""process"", … ""the""]","[1, 1, … 1]","[7316, 2832, … 1996]","[[0, 9], [10, 17], … [2446, 2449]]"
…,…,…,…,…,…,…,…
4501,"[""Free"", ""time"", … "" ""]","[0, 0, … 568]","[""O"", ""O"", … ""O""]","[""effect"", ""of"", … ""[PAD]""]","[1, 1, … 0]","[3466, 1997, … 0]","[[2260, 2266], [2267, 2269], … [0, 0]]"
4509,"[""Virat"", ""Patel"", … "" ""]","[0, 0, … 638]","[""B-NAME_STUDENT"", ""I-NAME_STUDENT"", … ""O""]","[""vi"", ""##rat"", … ""customers""]","[1, 1, … 1]","[6819, 8609, … 6304]","[[0, 2], [2, 5], … [2485, 2494]]"
4509,"[""Virat"", ""Patel"", … "" ""]","[0, 0, … 638]","[""B-NAME_STUDENT"", ""I-NAME_STUDENT"", … ""O""]","[""design"", ""thinking"", … ""[PAD]""]","[1, 1, … 0]","[2640, 3241, … 0]","[[2360, 2366], [2367, 2375], … [0, 0]]"
4521,"[""Gabriel"", ""Lara"", … "" ""]","[0, 0, … 912]","[""B-NAME_STUDENT"", ""I-NAME_STUDENT"", … ""O""]","[""gabriel"", ""lara"", … ""auditor""]","[1, 1, … 1]","[6127, 13679, … 20964]","[[0, 7], [8, 12], … [2572, 2579]]"


In [68]:
# The notebook's import cell does not import joblib (the only import of it is
# a commented-out line), so this cell fails with NameError on a fresh kernel.
# Import it here so the cell is runnable on its own.
import joblib

# Serialize the tokenized frame to a file named 'train' in the working directory.
joblib.dump(train, 'train')

['train']