# Remove Impossible Tokens

- Different tokenizers split the same word into different tokens, so a labelled span can start or end in the middle of a token. To help the model learn, we remove the parts of the target span that are impossible to predict, so that the model is not penalized for them during loss calculation.

In [1]:
import transformers
import tokenizers
import pandas as pd
import numpy as np
import re

# Preprocess and Postprocess

In [16]:
def preprocess_im(df):
    """Repair mis-encoded apostrophes in "I'm" / "I'd" style contractions.

    For each garbled spelling listed below, every row whose ``text`` contains
    it has that spelling replaced in ``selected_text`` and then in ``text``.

    Args:
        df: DataFrame with string columns ``text`` and ``selected_text``.

    Returns:
        The same DataFrame object, modified in place.
    """
    contraction_fixes = (
        ('iï¿½m', "i'm"),
        ('Iï¿½m', "I'm"),
        ('Iï¿½M', "I'M"),
        ('Iï¿½d', "I'd"),
        ('Iï¿½D', "I'D"),
    )

    for garbled, repaired in contraction_fixes:
        # Row selection is always driven by `text`, so it is evaluated once,
        # before either column is rewritten.
        rows = df['text'].str.contains(garbled)
        for column in ('selected_text', 'text'):
            df.loc[rows, column] = df.loc[rows, column].apply(
                lambda value: re.sub(garbled, repaired, value)
            )

    return df

def preprocess_all(df):
    """Lower-case the text columns and repair mis-encoded contractions.

    Steps, applied to both ``text`` and ``selected_text``:

    1. Lower-case every value.
    2. For each garbled pattern in ``proc_dict``, rows whose ``text`` matches
       the pattern get it replaced in ``selected_text`` and then in ``text``.
    3. Any remaining ``ï``, ``¿`` or ``½`` character is deleted; here each
       column is checked against its own contents.

    Args:
        df: DataFrame with string columns ``text`` and ``selected_text``.

    Returns:
        The same DataFrame object, modified in place.
    """
    for column in ('text', 'selected_text'):
        df.loc[:, column] = df.loc[:, column].apply(str.lower)

    # Keys are regular expressions; the last one is a raw string so that the
    # backslashes reach the regex engine without relying on Python keeping
    # unknown escape sequences intact.
    proc_dict = {
        'ï¿½s': "'s",
        'nï¿½t': "n't",
        'ï¿½ve': "'ve",
        'ï¿½ll': "'ll",
        'ï¿½re': "'re",
        "inï¿½": "ing",
        r"n`\*\*\*\*": "n't",
    }
    for key, item in proc_dict.items():
        # The mask is based on `text` only and is taken before `text` itself
        # is rewritten, so both columns are updated for the same rows.
        rows = df['text'].str.contains(key)
        for column in ('selected_text', 'text'):
            df.loc[rows, column] = df.loc[rows, column].apply(
                lambda x: re.sub(key, item, x)
            )

    # A character class (no capture group) keeps pandas from warning about
    # match groups inside `str.contains`.
    leftover = r"[ï¿½]"
    for column in ('selected_text', 'text'):
        rows = df[column].str.contains(leftover)
        df.loc[rows, column] = df.loc[rows, column].apply(
            lambda x: re.sub(leftover, "", x)
        )

    return df

def preprocess_repeat(df):
    """Insert a space before each repeated ``.``, ``!`` or ``?``.

    A punctuation character that directly follows the same character gets a
    space put in front of it, so ``"..."`` becomes ``". . ."`` and ``"!!"``
    becomes ``"! !"``.  Rows are selected by matching ``text``; the rewrite is
    applied to ``selected_text`` first and then to ``text``.

    Args:
        df: DataFrame with string columns ``text`` and ``selected_text``.

    Returns:
        The same DataFrame object, modified in place.
    """
    # Raw strings avoid invalid-escape warnings.  The patterns carry no
    # capture group (pandas warns about groups in `str.contains`), so the
    # replacement refers to the whole match with \g<0> instead of \1.
    repeat_patterns = (
        r'(?<=\.)\.(?<!\w)',
        r'(?<=\!)\!(?<!\w)',
        r'(?<=\?)\?(?<!\w)',
    )

    for pattern in repeat_patterns:
        # Evaluated on `text` before `text` is rewritten for this pattern.
        rows = df['text'].str.contains(pattern)
        for column in ('selected_text', 'text'):
            df.loc[rows, column] = df.loc[rows, column].apply(
                lambda x: re.sub(pattern, r' \g<0>', x)
            )

    return df

In [3]:
def token_encode(x):
    """Encode ``x`` with the module-level ``TOKENIZER`` and return its offsets.

    The result is the ``offsets`` attribute of the encoding; callers in this
    notebook unpack each entry as a ``(start, end)`` pair indexing into ``x``.
    """
    # An earlier variant skipped offsets whose token id was 47341; that filter
    # is disabled, so every offset is returned.
    return TOKENIZER.encode(x).offsets

In [18]:
# Settings kept next to the tokenizer; the visible cells below do not read them.
MAX_LEN = 192
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
EPOCHS = 5

# Directory expected to contain vocab.json and merges.txt.
ROBERTA_PATH = "./roberta-base-squad2/"

# The vocabulary and merges paths are passed positionally: the keyword names
# `vocab_file` / `merges_file` were renamed in later `tokenizers` releases,
# while the positional order (vocab first, merges second) is accepted by both.
TOKENIZER = tokenizers.ByteLevelBPETokenizer(
    f"{ROBERTA_PATH}/vocab.json",
    f"{ROBERTA_PATH}/merges.txt",
    lowercase=True,
    add_prefix_space=True,
)

# Check Start token problems

In [2]:
def check_start(offsets, span_start, text):
    """Report whether ``span_start`` lines up with the start of a token.

    Tokens are visited in the order given.  For each ``(start, end)`` pair the
    start is also advanced past any spaces in ``text`` (never beyond the last
    character index).

    Returns:
        ``0`` when a token start, raw or space-skipped, equals ``span_start``;
        ``end - span_start`` for the first token whose space-skipped start
        lies beyond ``span_start``; ``None`` when no token triggers either.
    """
    last_index = len(text) - 1

    for tok_start, tok_end in offsets:
        if tok_start == span_start:
            return 0

        # Skip the spaces that the token's offset range begins with.
        cursor = tok_start
        while cursor < last_index and text[cursor] == ' ':
            cursor += 1

        if cursor == span_start:
            return 0
        if cursor > span_start:
            return tok_end - span_start

    return None

In [12]:
# Load the training folds.
dat = pd.read_csv('train_folds_42.csv')
# dat = dat[dat['sentiment']=='neutral'].reset_index(drop=True)

# Trim both columns and collapse every whitespace run to a single space.
# (Regex literals are raw strings so `\s` is not an invalid string escape.)
dat['text'] = dat['text'].str.strip()
dat['selected_text'] = dat['selected_text'].str.strip()
dat['text'] = dat['text'].map(lambda x: re.sub(r'\s+',' ',x))
dat['selected_text'] = dat['selected_text'].map(lambda x: re.sub(r'\s+',' ',x))

# Optional clean-up passes, currently disabled.
# dat = preprocess_im(dat)
# dat = preprocess_all(dat)
# dat = preprocess_repeat(dat)

# Token offsets of the full text and the (start, end) character span of the
# first occurrence of selected_text inside it.
dat['offsets'] = dat['text'].map(
    lambda x: token_encode(x)
)
dat['span'] = dat.apply(
    lambda x: re.search(re.escape(x['selected_text']), x['text']).span() , axis=1
)
# A positive value means the span start did not coincide with a token start.
dat['is_start_problem'] = dat.apply(lambda x: check_start(x['offsets'], x['span'][0], x['text']), axis=1)
print(dat['is_start_problem'].value_counts())

# For flagged rows, drop the first whitespace-separated word of the selection.
dat['selected_2'] = dat['selected_text']
dat.loc[dat['is_start_problem']>0,'selected_2'] = dat.loc[dat['is_start_problem']>0,'selected_text'].map(
    lambda x: ' '.join(x.split()[1:])
).str.strip()
dat['text'] = dat['text'].map(lambda x: re.sub(r'\s+',' ',x))
dat['offsets'] = dat['text'].map(
    lambda x: token_encode(x)
)
dat['selected_2'] = dat['selected_2'].map(lambda x: re.sub(r'\s+',' ',x))
# Selections that became empty are turned into NaN and removed by dropna(),
# which also removes rows with NaN in any other column.
dat['selected_2'] = dat['selected_2'].map(lambda x: re.sub(r'^\s+$','',x)).replace('',np.nan)
dat = dat.dropna().reset_index(drop=True)
# NOTE(review): empty selected_2 values were already dropped above, so this
# fallback to selected_text appears to match no rows - confirm intent.
dat.loc[dat['selected_2'] == '','selected_2'] = dat.loc[dat['selected_2'] == '','selected_text']

# Re-check the trimmed selection.
dat['span'] = dat.apply(
    lambda x: re.search(re.escape(x['selected_2']), x['text']).span(), axis=1
)
dat['is_start_problem'] = dat.apply(lambda x: check_start(x['offsets'], x['span'][0], x['text']), axis=1)
# Keep only rows whose original selected_text has at least two characters.
dat = dat.loc[dat['selected_text'].map(len)>=2].reset_index(drop=True)
# dat.loc[dat['selected_text'].map(len)<=1,'selected_2'] = dat.loc[dat['selected_text'].map(len)<=1,'selected_text']
print(dat['is_start_problem'].value_counts())

0     26736
6       140
7       120
5        98
8        74
4        71
3        65
2        57
9        55
10       26
11       22
12        7
13        6
15        2
14        1
Name: is_start_problem, dtype: int64
0    27464
7        1
Name: is_start_problem, dtype: int64


# Check end token problems

In [13]:
def check_end(offsets, span_end, text):
    """Report whether ``span_end`` lines up with the end of a token.

    Tokens are visited in the order given.  Each token end is advanced past
    any '、' characters that follow it in ``text`` (never beyond the last
    character index) before being compared with ``span_end``.

    Returns:
        ``0`` when an advanced token end equals ``span_end``;
        ``end - span_end`` (using the un-advanced end) for the first token
        whose advanced end lies beyond ``span_end``; ``None`` otherwise.
    """
    last_index = len(text) - 1

    for _, tok_end in offsets:
        cursor = tok_end
        while cursor < last_index and text[cursor] == '、':
            cursor += 1

        if cursor == span_end:
            return 0
        if cursor > span_end:
            return tok_end - span_end

    return None

In [14]:
# A positive value means the span end did not coincide with a token end.
dat['is_end_problem'] = dat.apply(lambda x: check_end(x['offsets'], x['span'][1], x['text']), axis=1)
print(dat['is_end_problem'].value_counts())

# For flagged rows, drop the last whitespace-separated word of the selection.
dat['selected_3'] = dat['selected_2']
dat.loc[dat['is_end_problem']>0,'selected_3'] = dat.loc[dat['is_end_problem']>0,'selected_2'].map(
    lambda x: ' '.join(x.split()[:-1])
).str.strip()
# Regex literals are raw strings so `\s` is not an invalid string escape.
dat['text'] = dat['text'].map(lambda x: re.sub(r'\s+',' ',x))
dat['offsets'] = dat['text'].map(
    lambda x: token_encode(x)
)
dat['selected_3'] = dat['selected_3'].map(lambda x: re.sub(r'\s+',' ',x))
# Selections that became empty are turned into NaN and removed by dropna(),
# which also removes rows with NaN in any other column.
dat['selected_3'] = dat['selected_3'].map(lambda x: re.sub(r'^\s+$','',x)).replace('',np.nan)
dat = dat.dropna().reset_index(drop=True)
# dat.loc[dat['selected_3'] == '','selected_3'] = dat.loc[dat['selected_3'] == '','selected_2']
# Selections of at most two characters are restored to the original text.
dat.loc[dat['selected_text'].map(len)<=2,'selected_3'] = dat.loc[dat['selected_text'].map(len)<=2,'selected_text']

# Re-check the trimmed selection.
dat['span'] = dat.apply(
    lambda x: re.search(re.escape(x['selected_3']), x['text']).span(), axis=1
)
dat['is_end_problem'] = dat.apply(lambda x: check_end(x['offsets'], x['span'][1], x['text']), axis=1)
print(dat['is_end_problem'].value_counts())
# dat = dat.loc[dat['is_end_problem']==0]

0    26803
1      372
2      154
5       45
3       45
4       33
6        9
8        2
7        2
Name: is_end_problem, dtype: int64
0    27372
1        5
2        3
Name: is_end_problem, dtype: int64


In [15]:
# dat = dat.loc[dat['is_end_problem']==0].reset_index(drop=True)
# Promote the trimmed selection to be the training target.
dat['selected_text'] = dat['selected_3']
# Remove the working columns. `columns=` is used because passing the axis
# positionally (`drop(labels, 1)`) is no longer accepted by pandas 2.x.
dat = dat.drop(columns=['offsets','span','is_start_problem','selected_2','is_end_problem','selected_3'])
dat.to_pickle('train_folds_42_clean.pkl')