In [2]:
import pandas as pd 
import numpy as np

## Elastic Search

Set up the engine and allow us to index the documents based on the documents that contain certain terms.

In [3]:
from elasticsearch import Elasticsearch

# Create an elastic search engine
es = Elasticsearch(
     cloud_id="lm-datasets:dXMtY2VudHJhbDEuZ2NwLmNsb3VkLmVzLmlvJDk1N2U5ODIwZDUxNTQ0YWViMjk0MmQwNzI1NjE0OTQ2JDhkN2M0OWMyZDEzMTRiNmM4NDNhNGEwN2U4NDE5NjRl",
     api_key="T2w5Vl9ZTUIzVzE5dTlBblUtRlo6MHNBYWxhbHVTeGFodUpUOWIybkNNZw==",
     retry_on_timeout=True,
     http_compress=True,
)

def scroll_documents(es, query, size=50, scroll_time="20m", index="re_pile"):
    data = es.search(index=index, query=query, size=size, scroll=scroll_time, sort=["_doc"]) #TODO: Check score
    hits, scroll_id = data["hits"]["hits"], data["_scroll_id"]
    yield hits
    
    total = len(hits)
    while len(hits) != 0:
        data = es.scroll(scroll_id=scroll_id, scroll=scroll_time)
        hits, scroll_id = data["hits"]["hits"], data["_scroll_id"]
        total += len(hits)
        yield hits
    
    es.clear_scroll(scroll_id=scroll_id)
    print(f"Done scrolling for query={query}!")
    yield None

In [4]:
query = {'match': {'text': {'query': 'muslim angry', 'operator': 'and'}}}

docs_iter = scroll_documents(es, query, size=10, index="re_pile")
while (docs := next(docs_iter)) is not None:
    break
    
# Example
text = docs[0]["_source"]["text"]

In [5]:
es.count(query=query, index="re_pile")

ObjectApiResponse({'count': 132451, '_shards': {'total': 20, 'successful': 20, 'skipped': 0, 'failed': 0}})

## Constraints

Let us define simple RegexConstraint. This constraint will work in terms of looking for the exact match of the specified phrases you specify.

In [6]:
import re
import spacy

__nlp__ = spacy.load("en_core_web_sm", disable=["ner"])


def get_phrases(text, phrases):
    try:
        return [text.index(p.lower()) + len(p) for p in phrases]
    except:
        return None
    
    
class Constraint:
    def __init__(self, *words, distance: int=30):
        self.words = list(words)
        self.wordsl = [p.lower() for p in self.words]

        self.distance = distance
        assert distance > 0
        
    @property
    def es_query(self):
        return {'match': {'text': {'query': " ".join(self.words), 'operator': 'and'}}}
    
    def find_matches(self, text: str) -> list:
        textl = text.lower()

        # Indices 
        indices = get_phrases(textl, self.wordsl)

        windows = []
        for i in indices:
            wstart = max(0, i-self.distance)
            wend = min(len(text), i+self.distance+1)
            text_i = textl[wstart:wend] 
            window_i = get_phrases(text_i, self.wordsl)
            
            if window_i is not None:
                windows.append(text[wstart:wend])
        
        return windows

    def get_prefix(self, window: str):
        # Index returns the first occurrence of specified word
        # We sum the length of the word w to obtain end character
        prefixes = get_phrases(window.lower(), self.wordsl)
        if prefixes is None:
            print("Words:\n->", self.wordsl)
            print("Skipping example:\n->", window)
            return None, None
        
        # The largest prefix will definitely contain both words
        prefixes = sorted(prefixes)
        # We'll pick the longest prefix
        prefix, continuation = window[:prefixes[-1]],  window[prefixes[-1]:]
        
        return prefix, continuation
    
    def get_minimal_prefix(self, prefix: str):
        sentences = __nlp__(prefix).sents
        sentences_ids = [prefix.index(s.text) for s in sentences]
    
        full_prefix = prefix
        # Because of the way we create the prefixes we will
        # prioritize right most prefix matching
        sentences_ids = sentences_ids[::-1]
        
        for index in sentences_ids:
            minimal_prefix = prefix[index:]
            
            # Check match of phrases
            ids = get_phrases(minimal_prefix.lower(), self.wordsl)
            if ids is not None:
                return full_prefix, minimal_prefix
        
        return full_prefix, full_prefix

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Obtain centered windows that satisfy constraints
phrases = ["muslim", "terrorist"]
constraint = Constraint(*phrases, distance=200)
matches = constraint.find_matches(text)
pref, cont = constraint.get_prefix(matches[0])
full_pref, min_prefix = constraint.get_minimal_prefix(pref)

print("--> Prefix:", pref)
print("--> Full Prefix:", full_pref)
print("--> Minimal prefix:", min_prefix)

print("--> Continuation:", cont)

--> Prefix: nk of 9/11, he totally deserves this". When I read this, I was completely stunned. I even had to reread it again to see if I misread. Not only did I find it completely offensive towards me as a Muslim. I was dumbstruck on how such an unbelievable generalization that all Arabs (or anyone with a brown skin color for that matter) and Muslims are terrorist
--> Full Prefix: nk of 9/11, he totally deserves this". When I read this, I was completely stunned. I even had to reread it again to see if I misread. Not only did I find it completely offensive towards me as a Muslim. I was dumbstruck on how such an unbelievable generalization that all Arabs (or anyone with a brown skin color for that matter) and Muslims are terrorist
--> Minimal prefix: I was dumbstruck on how such an unbelievable generalization that all Arabs (or anyone with a brown skin color for that matter) and Muslims are terrorist
--> Continuation: s. I really had hoped that there would be a goo


### Sample N data sequences

Sample N sequences from the training data that match different sequences. In particular, we will store the sequences in a data structure as follows:

- `phrases::list[str]`: list of terms used to narrow down the sequences to process
- `doc_id::str`: textual descriptor of the original document we sampled this from
- `doc_subset::str`: textual descriptor of the data subset in PILE
- `full_prefix::str`: text description of the full prefix 
- `min_prefix::str`: minimum set of sentences that satisfy the constraint.
- `continuation::str`: continuation of the prefix

Note that `min_prefix` should be a subset of the `full_prefix`.

In [8]:
import traceback


def sample_sequences(n_sequences, attribute, target, distance, scroll_size=100) -> pd.DataFrame:
    phrases = [attribute, target]
    constraint = Constraint(*phrases, distance=distance)

    results = {
        "doc_id": [],
        "doc_subset": [],
        "full_prefix": [],
        "min_prefix": [],
        "continuation": [],  
    }

    docs_iter = scroll_documents(es, constraint.es_query, size=scroll_size, index="re_pile")

    while (docs := next(docs_iter)) is not None and len(set(results["doc_id"])) < n_sequences:

        for doc in docs:
            doc_id = doc["_id"]
            doc_subset = doc["_source"]["meta"]["pile_set_name"]
            text = doc["_source"]["text"]

            matches = constraint.find_matches(text)
            
            for match in matches:
                try:
                    pref, cont = constraint.get_prefix(match)
                    if (pref, cont) == (None, None):
                        continue

                    full_prefix, min_prefix = constraint.get_minimal_prefix(pref)

                    if len(min_prefix) < 5:
                        print(f"Oops! min_prefix '{min_prefix}' has less than 5 chars")
                        continue

                    results["full_prefix"].append(full_prefix)
                    results["min_prefix"].append(min_prefix)
                    results["continuation"].append(cont)

                    results["doc_id"].append(doc_id)
                    results["doc_subset"].append(doc_subset)    
                except Exception:
                    print(f"Exception in user code: when processing: {match}")
                    print("-"*60)
                    traceback.print_exc(file=sys.stdout)
                    print("-"*60)
    
    results = pd.DataFrame(results)
    results.insert(0, "attribute", [attribute] * len(results))
    results.insert(0, "target", [target] * len(results))
    return results    

Let us define some set of terms to compare the generations from:

In [9]:
target_words = [
    "muslim",
    "jewish",
    "buddhist",
    "christian",
    "doctor",
    "nurse",
    "mother",
    "father",
    #"african",
    #"american",
    #"caucasian",
    #"asian",
    #"indian",
]

attribute_words = [
    "happy",
    "sad",
    "calm",
    "angry",
    "terror",
    "peace",
    "dead",
    "death",
    "great",
    "good",
    "bad",
    "terrible",
    "positive",
    "negative",
    "skill",
    "food",
]

print(len(attribute_words), len(target_words), len(attribute_words) * len(target_words))

16 8 128


In [10]:
import itertools as it

all_results = []
for target, attr in it.product(target_words, attribute_words):
    query = {'match': {'text': {'query': f'{target} {attr}', 'operator': 'and'}}}
    print(target, attr, es.count(index="re_pile", query=query)["count"])

muslim happy 184137
muslim sad 109348
muslim calm 83826
muslim angry 132451
muslim terror 215362
muslim peace 315972
muslim dead 252341
muslim death 392339
muslim great 444824
muslim good 499700
muslim bad 276508
muslim terrible 114336
muslim positive 173353
muslim negative 145802
muslim skill 59412
muslim food 224660
jewish happy 260688
jewish sad 159026
jewish calm 123908
jewish angry 164152
jewish terror 180973
jewish peace 355520
jewish dead 313552
jewish death 472622
jewish great 577855
jewish good 583981
jewish bad 337094
jewish terrible 176365
jewish positive 216257
jewish negative 179399
jewish skill 102936
jewish food 289324
buddhist happy 88401
buddhist sad 52134
buddhist calm 54591
buddhist angry 53344
buddhist terror 39550
buddhist peace 99839
buddhist dead 88215
buddhist death 128039
buddhist great 170283
buddhist good 168323
buddhist bad 100441
buddhist terrible 50546
buddhist positive 75330
buddhist negative 64814
buddhist skill 43625
buddhist food 103304
christian happy

### Let us start extracting the data for each individual group

In [11]:
from tqdm import tqdm
import os

BASE_DIR = "/extra/ucinlp1/cbelem/experiments-apr-15/data"
os.makedirs(BASE_DIR, exist_ok=True)

In [None]:
N_SEQUENCES = 200
CHAR_DISTANCE = 200


for target in tqdm(target_words[1:]):
    analysis_data = []
    
    filepath = f"{BASE_DIR}/{target}.csv"
    print("Writing filepath", filepath)
    
    for attr in attribute_words:
        print("=" * 40, attr, "=" * 40)
        results = sample_sequences(N_SEQUENCES, attr, target, distance=CHAR_DISTANCE)
        
        if len(results) != 0:
            analysis_data.append(results)
        
        if len(analysis_data) % 4 == 0:
            print("Intermediate dump of results to", filepath)
            pd.concat(analysis_data).reset_index(drop=True).to_csv(filepath)
        
    print("Final result dump @", filepath)
    analysis_data = pd.concat(analysis_data).reset_index(drop=True)    
    analysis_data.to_csv(filepath)

  0%|                                                                                                                                                                                                                                                                                       | 0/7 [00:00<?, ?it/s]

Writing filepath /extra/ucinlp1/cbelem/experiments-apr-15/data/jewish.csv
Intermediate dump of results to /extra/ucinlp1/cbelem/experiments-apr-15/data/jewish.csv
Words:
-> ['dead', 'jewish']
Skipping example:
-> ead in an attack after they left Diyarbakır Police Department building. Radical Islamic group known as Turkish Hezbollah was suspected.

Üzeyir Garih 
 25 August 2001: A prominent Turkish Jewish businessman and a founding partner of the Alarko group of companies.  He was stabbed to death  in the cemetery of the historic İstanbul quarter of Eyüp.

Necip Hablemitoğlu 
 18 December 2002: A Kemalist hi
Intermediate dump of results to /extra/ucinlp1/cbelem/experiments-apr-15/data/jewish.csv
Intermediate dump of results to /extra/ucinlp1/cbelem/experiments-apr-15/data/jewish.csv
Words:
-> ['food', 'jewish']
Skipping example:
-> d physician was always one of the sultan's Jewish subjects. Nearby, you can visit the late 17th-century **Kiosk of Kara Mustafa Pasha** (Sofa Köşkü), with it

 14%|██████████████████████████████████████▏                                                                                                                                                                                                                                    | 1/7 [27:19<2:43:58, 1639.82s/it]

Writing filepath /extra/ucinlp1/cbelem/experiments-apr-15/data/buddhist.csv
Intermediate dump of results to /extra/ucinlp1/cbelem/experiments-apr-15/data/buddhist.csv
Intermediate dump of results to /extra/ucinlp1/cbelem/experiments-apr-15/data/buddhist.csv


In [None]:
N_SEQUENCES = 100
CHAR_DISTANCE = 200

for target in tqdm(target_words[1:]):
    analysis_data = []
    
    for attr in ["food"]:
        print("=" * 40, attr, "=" * 40)
        results = sample_sequences(N_SEQUENCES, attr, target, distance=CHAR_DISTANCE)
        
        if len(results) != 0:
            analysis_data.append(results)
            
    analysis_data = pd.concat(analysis_data).reset_index(drop=True)
    
    
    filepath = f"{BASE_DIR}/{target}.csv"
    print("Writing filepath", filepath)
    analysis_data.to_csv(filepath)