In [1]:
from os.path import join
import os
import pickle
from collections import Counter
from glob import glob
import json
import pandas as pd
import re
from enum import Enum, auto

from preproc import CleanSentence 

In [2]:
# Language code for this run. It selects the vocabulary directory and the
# parallel toxic/neutral TSV that get loaded, the value written to the `lang`
# column for the eval scripts, and the `preds/<lang>/` output directory.
lang = 'en'

In [3]:
# load vocabulary (word -> id)
vocab_path = join('data', 'lang_corpus', lang, 'vocab.json')
# The context manager closes the handle deterministically, and the explicit
# encoding makes non-ASCII words decode the same way on every platform.
with open(vocab_path, 'r', encoding='utf8') as vocab_file:
    vocab = json.load(vocab_file)
print('vocab size:', len(vocab))
# inverse mapping (id -> word), used to turn predicted ids back into text
reverse_vocab = {v: k for k, v in vocab.items()}

vocab size: 1911276


In [4]:
# load the parallel toxic / neutral sentence pairs for the selected language
pan_path = join('data', 'pan', f'{lang}.tsv')
df = pd.read_csv(pan_path, sep='\t', encoding='utf8')
# quick look at the first rows, then the overall (rows, columns) shape
for preview in (df.head(), df.shape):
    print(preview)


                                      toxic_sentence  \
0  then all of a sudden i see her , shes now got ...   
1  My page should be protected first so that wort...   
2                        You made a mistake you ass.   
3  you know more than these idiots , stay the cou...   
4     piss me off , fuckin jerk , get on my nerves .   

                                    neutral_sentence  
0    All of a sudden i see her, she is all grown up.  
1  My page should be protected first so that unpl...  
2                                You made a mistake.  
3  you know more than these people , stay the cou...  
4                                   get on my nerves  
(400, 2)


In [5]:
# load the toxic word list for the selected language
# NOTE(review): the file name is now derived from `lang` (it was hard-coded to
# 'en', so any other language would silently have been filtered with English
# toxic words). This assumes other languages follow the same
# '<lang>_toxic_vocab.json' naming - confirm before switching `lang`.
toxic_path = join('data', f'{lang}_toxic_vocab.json')
# context manager so the file handle is closed as soon as the JSON is parsed
with open(toxic_path, 'r', encoding='utf8') as toxic_file:
    toxic_words = set(json.load(toxic_file))
print('toxic words:', len(toxic_words))

toxic words: 3007


In [6]:
def Query(gram_counts:dict, gram_ids:tuple) -> tuple:
    """Predict the most frequent n-gram that shares the leading ids of `gram_ids`.

    The last id of `gram_ids` is discarded and every id in the global `vocab`
    is tried in its place; the completed n-gram with the highest count in
    `gram_counts` wins. On a tie, the candidate whose id comes first in
    `vocab` iteration order is kept.

    Args:
        gram_counts: mapping from n-gram id tuples to their frequency.
        gram_ids: sequence of ids; all but the last are used as context.

    Returns:
        The winning n-gram as a tuple of ids, or `(-1,)` when no completion
        of the context occurs in `gram_counts`.
    """
    prefix = tuple(gram_ids[:-1])

    best_gram = None
    best_count = None
    for word_id in vocab.values():
        candidate = prefix + (word_id,)
        if candidate not in gram_counts:
            continue
        count = gram_counts[candidate]
        # strict '>' keeps the earliest candidate when counts are equal
        if best_gram is None or count > best_count:
            best_gram, best_count = candidate, count

    if best_gram is None:
        return (-1,)  # empty space, which will result in deleting the gram
    return best_gram
        

In [7]:
# Ids of every vocabulary entry whose word is NOT in the toxic word list.
# QueryGeneral draws replacement candidates from this list only.
clean_vocab_ids = list(
    filter(lambda vid: reverse_vocab[vid] not in toxic_words, vocab.values())
)

def QueryGeneral(gram_counts: dict, gram_ids: list) -> tuple:
    """Fill the wildcard slot (-2) of an n-gram with the most frequent clean word.

    Every id in the global `clean_vocab_ids` is substituted for the first
    `-2` in `gram_ids`; the resulting n-gram with the highest count in
    `gram_counts` is returned. On a tie, the candidate whose id comes first
    in `clean_vocab_ids` is kept. Only one wildcard is handled.

    Args:
        gram_counts: mapping from n-gram id tuples to their frequency.
        gram_ids: sequence of ids containing a single `-2` wildcard.

    Returns:
        The best completed n-gram as a tuple, or a tuple of `-1` with the
        same length as `gram_ids` when there is no wildcard or no completed
        n-gram occurs in `gram_counts`.
    """
    nothing_found = tuple([-1] * len(gram_ids))
    if -2 not in gram_ids:
        return nothing_found

    # split around the (first) wildcard so each candidate is a cheap concat
    slot = gram_ids.index(-2)
    head = tuple(gram_ids[:slot])
    tail = tuple(gram_ids[slot + 1:])

    best_gram = None
    best_count = None
    for vid in clean_vocab_ids:  # non-toxic ids only, so the result is never toxic
        filled = head + (vid,) + tail
        if filled not in gram_counts:
            continue
        count = gram_counts[filled]
        # strict '>' keeps the earliest candidate when counts are equal
        if best_gram is None or count > best_count:
            best_gram, best_count = filled, count

    return nothing_found if best_gram is None else best_gram

In [8]:
def Tokenize(s:str, n:int) -> list[int]:
    """Convert a sentence into a boundary-padded list of vocabulary ids.

    The sentence is passed through `CleanSentence` (per the original note,
    the same cleaning used by the training script), split on whitespace, and
    each word found in the global `vocab` is mapped to its id. Words missing
    from `vocab` are dropped.

    Args:
        s: raw sentence.
        n: n-gram order; determines the amount of padding.

    Returns:
        `n - 1` copies of `-1`, then the word ids, then `n - 1` copies of
        `-1` (so n-grams can span the sentence boundaries). An empty list is
        returned when fewer than `n` words survive the vocabulary lookup.
    """
    cleaned = CleanSentence(s)

    word_ids = []
    for word in cleaned.split():
        if word in vocab:
            word_ids.append(vocab[word])

    if len(word_ids) < n:
        return []

    boundary = [-1] * (n - 1)
    return boundary + word_ids + boundary

In [9]:
def Untokenize(gram_ids:tuple):
    """Turn a sequence of vocabulary ids back into a space-separated string.

    Each id is looked up in the global `reverse_vocab`; an id that is not a
    key there (for example the `-1` padding id, unless the vocabulary defines
    it) raises `KeyError`.
    """
    words = [reverse_vocab[token_id] for token_id in gram_ids]
    return ' '.join(words)

In [17]:
class QueryStrategy(Enum):
    """Where the wildcard sits inside the n-gram window queried by RunModel."""
    # wildcard is the last id of the window (index n - 1)
    LOOKBACK = 1
    # wildcard is the first id of the window (index 0)
    LOOKFORWARD = 2
    # wildcard is at index n // 2 of the window
    CENTER = 3

def _ContextWindow(j, n, strategy):
    """Return `(start, wildcard_idx)` of the n-id window used to replace token `j`.

    The window is `toxic_ids[start : start + n]` and `wildcard_idx` is the
    position of token `j` inside it, so `start == j - wildcard_idx` for every
    strategy. Because the token list carries `n - 1` padding ids on each side
    and `j` is always a non-padding position, the window never leaves the list.
    """
    if strategy == QueryStrategy.LOOKBACK:
        wildcard_idx = n - 1      # context is the n-1 ids before j
    elif strategy == QueryStrategy.LOOKFORWARD:
        wildcard_idx = 0          # context is the n-1 ids after j
    elif strategy == QueryStrategy.CENTER:
        wildcard_idx = n // 2     # n//2 ids before j, the rest after
    else:
        raise ValueError(f'Unknown query strategy: {strategy!r}')
    return j - wildcard_idx, wildcard_idx


def RunModel(cp_df, n, model, strategy):
    """Detoxify every sentence of `cp_df` with one n-gram model and save the result.

    For each row, the toxic sentence is tokenized, every token whose word is
    in `toxic_words` is replaced by the `QueryGeneral` prediction for its
    context window (or deleted when the prediction is `-1`), and the
    sentence is rebuilt with its original trailing punctuation and a
    capitalised first character.

    Args:
        cp_df: DataFrame with `toxic_sentence` and `neutral_sentence`
            columns. Mutated in place: a prediction column named
            `'{n}-gram-{strategy.name}'` is added.
        n: n-gram order of `model`.
        model: mapping from n-gram id tuples to counts (passed to
            `QueryGeneral`).
        strategy: `QueryStrategy` member selecting the wildcard position.

    Side effects:
        Writes `{n}-gram_{strategy.name}_sub.tsv` (predictions) and
        `{n}-gram_{strategy.name}_ref.tsv` (references) under
        `preds/<lang>/`. Rows whose sentence could not be tokenized (fewer
        than `n` known words) get no prediction and are left out of BOTH
        files, so the two files stay row-aligned.

    Raises:
        ValueError: if `strategy` is not a known `QueryStrategy` member.
    """
    print(f'Processing {n}-gram model {strategy.name} strategy...')

    model_col = f'{n}-gram-{strategy.name}'
    # Start as missing (not ''), otherwise the notna() filter below can never
    # drop the sentences that were skipped.
    cp_df[model_col] = None

    # add n-grams prediction for each toxic sentence
    for i, row in cp_df.iterrows():
        toxic_sent = row['toxic_sentence']
        toxic_ids = Tokenize(toxic_sent, n)
        if len(toxic_ids) == 0: continue

        # Visit real tokens only; the n-1 padding ids on either side are skipped.
        for j in range(n - 1, len(toxic_ids) - n + 1):
            if reverse_vocab[toxic_ids[j]] not in toxic_words:
                continue

            start, wildcard_idx = _ContextWindow(j, n, strategy)
            context = toxic_ids[start : start + n]  # slice is a copy
            context[wildcard_idx] = -2              # inject the single wildcard
            replacement = QueryGeneral(model, context)
            # keep only the id predicted for the wildcard slot
            toxic_ids[j] = replacement[wildcard_idx]

        # remove all -1 tokens: padding plus words the model chose to delete
        toxic_ids = [t for t in toxic_ids if t != -1]

        # convert back to string
        pred_sent = Untokenize(toxic_ids)
        # add in original sentence ending punctuation
        ending = re.search(r'[^\w]+$', toxic_sent)
        if ending:
            pred_sent += ending.group(0)
        # Every token may have been deleted, leaving an empty string; guard the
        # capitalisation so that case does not raise IndexError.
        if pred_sent:
            pred_sent = pred_sent[0].upper() + pred_sent[1:]
        # save down
        cp_df.at[i, model_col] = pred_sent

        if i % 25 == 0:
            print(f'{i}...', end='\r')

    print('Saving dataframes...')
    # Save in the formats required by the eval scripts. Submission and
    # reference are selected with the same mask so row k of one file always
    # corresponds to row k of the other.
    has_pred = cp_df[model_col].notna()
    sub = cp_df.loc[has_pred, ['toxic_sentence', model_col]].copy()           #submission
    ref = cp_df.loc[has_pred, ['toxic_sentence', 'neutral_sentence']].copy()  #reference

    sub['lang'] = lang #specify for eval script
    ref['lang'] = lang #specify for eval script

    sub.columns = ['toxic_sentence', 'neutral_sentence', 'lang'] #specify for eval script
    ref.columns = ['toxic_sentence', 'neutral_sentence', 'lang'] #specify for eval script

    os.makedirs(join('preds', lang), exist_ok=True)
    sub.to_csv(join('preds', lang, f'{n}-gram_{strategy.name}_sub.tsv'), index=False, sep='\t', encoding='utf8')
    ref.to_csv(join('preds', lang, f'{n}-gram_{strategy.name}_ref.tsv'), index=False, sep='\t', encoding='utf8')

    print('Done')

In [18]:
# Run every pickled n-gram model found under models/ngrams over the dataset.
# glob returns paths in arbitrary (filesystem-dependent) order; sorting makes
# the run order, and therefore the log below, reproducible.
for model_path in sorted(glob(join('models', 'ngrams', '*.pkl'))):
    # the n-gram order is the part of the file name before the first '-'
    n = int(os.path.basename(model_path).split('-')[0])

    print(f'Loading {n}-gram model...')
    # NOTE(security): pickle.load can execute arbitrary code - only load
    # model files that were produced locally / come from a trusted source.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    # only the 4-gram model is evaluated with all three wildcard positions
    if n == 4:
        strategies = [QueryStrategy.LOOKBACK, QueryStrategy.LOOKFORWARD, QueryStrategy.CENTER]
    else:
        strategies = [QueryStrategy.LOOKBACK]

    for strategy in strategies:
        # fresh copy per run: the original df never grows and one strategy's
        # prediction column never leaks into the next run
        cp_df = df.copy()
        RunModel(cp_df, n, model, strategy)

Loading 2-gram model...
Processing 2-gram model LOOKBACK strategy...
Saving dataframes...
Done
Loading 3-gram model...
Processing 3-gram model LOOKBACK strategy...
Saving dataframes...
Done
Loading 4-gram model...
Processing 4-gram model LOOKBACK strategy...
Saving dataframes...
Done
Processing 4-gram model LOOKFORWARD strategy...
Saving dataframes...
Done
Processing 4-gram model CENTER strategy...
Saving dataframes...
Done


In [None]:
# Sanity check of the context-window arithmetic for each query strategy, run
# on a toy padded sequence (no model needed). For every non-padding position j
# the window must be exactly n ids long and the wildcard must land on token j.
for strategy in [QueryStrategy.LOOKBACK,QueryStrategy.LOOKFORWARD,QueryStrategy.CENTER]:
    n = 4
    toxic_ids = [-1] * (n - 1) + list(range(1,4)) + [-1] * (n - 1)
    print(toxic_ids)
    # range() instead of `for j, id in enumerate(...)`: same positions, but it
    # does not rebind the builtin `id` at notebook scope
    for j in range(n - 1, len(toxic_ids) - n + 1):
        if strategy == QueryStrategy.LOOKBACK:
            print(j, j-n+1)
            start = j - n + 1                # window ends on j
            wildcard_idx = n - 1             # wildcard at end
        elif strategy == QueryStrategy.LOOKFORWARD:
            print(j, j+n)
            start = j                        # window starts on j
            wildcard_idx = 0                 # wildcard at start
        elif strategy == QueryStrategy.CENTER:
            back = n // 2
            # The window starts `back` ids before j so that index `back` IS
            # token j. The previous slice started at j-back+1, which put the
            # wildcard on token j+1 and left token j inside the query.
            start = j - back
            print(start, start + n)
            wildcard_idx = back              # center slot

        context = toxic_ids[start : start + n]  # list of length n
        assert len(context) == n, (strategy, j, context)
        assert context[wildcard_idx] == toxic_ids[j], (strategy, j, context)

        # inject the single wildcard
        context[wildcard_idx] = -2
        print(strategy, wildcard_idx, context)

In [None]:
# quick test
# sent = (4, 320, 0)
# print(model[sent])
# pred = Query(model, sent)
# print(pred, Untokenize(pred))