In [None]:
! pip install nltk numpy scikit-learn sentence-transformers --no-input

In [None]:
! python -m nltk.downloader punkt

In [1]:
import json
import math
from pathlib import Path

from nltk.tokenize import sent_tokenize, RegexpTokenizer
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tiktoken

# tiktoken encoding used further down to count the tokens of each document text.
encoder = tiktoken.get_encoding('cl100k_base')

<br><br><br>
# Load dataset data (with GPT)

In [2]:
# Load the annotated dataset. The encoding is given explicitly so that reading does not
# depend on the platform's default locale (the train split is also opened as UTF-8).
rebel_data_path = Path('REBEL.json')
with rebel_data_path.open('r', encoding='utf-8') as f:
    rebel_data = json.load(f)

<br><br><br>
# Get counts data

In [3]:
def get_counts_dict(data):
    """Collect size statistics for every document in `data`.

    Each item of `data` is read as a mapping with:
      * ``['doc']['text']``      - the document text,
      * ``['entities']['gpt']``  / ``['entities']['gold']`` - entity annotations,
      * ``['triples']['gpt']``   / ``['triples']['gold']``  - triple annotations.

    Returns a dict keyed by the document's position in `data`. Each value has
    three sub-dicts: ``'text'`` (sentence and token counts), ``'entities'`` and
    ``'triples'``. In the latter two, ``'total'`` is the number of GPT items and
    ``'rebel count'`` is the number of distinct gold items by surface form.
    """
    counts_data = {}
    for doc_id, doc_data in enumerate(data):
        # --- Text centered stats
        text_stats = {
            'sentence count': len(sent_tokenize(doc_data['doc']['text'])),
            'token count': len(encoder.encode(doc_data['doc']['text'])),
        }

        # --- Entity centered stats (gold entities de-duplicated by surface form)
        gpt_entities = doc_data['entities']['gpt']
        unique_gold_entities = {entity['surfaceform'] for entity in doc_data['entities']['gold']}
        entity_stats = {
            'total': len(gpt_entities),
            'rebel count': len(unique_gold_entities),
        }

        # --- Triplet centered stats (gold triples de-duplicated by their stripped surface forms)
        gpt_triples = doc_data['triples']['gpt']
        unique_gold_triples = {
            '|'.join(triple[role]['surfaceform'].strip() for role in ('subject', 'predicate', 'object'))
            for triple in doc_data['triples']['gold']
        }
        triple_stats = {
            'total': len(gpt_triples),
            'rebel count': len(unique_gold_triples),
        }

        # --- Track metrics
        counts_data[doc_id] = {
            'text': text_stats,
            'entities': entity_stats,
            'triples': triple_stats,
        }

    return counts_data

In [4]:
# Per-document count statistics (text, entities, triples), keyed by document index.
m = get_counts_dict(rebel_data)

## Counts

In [5]:
# Report how many documents were loaded from the dataset file.
print(f'Number of documents: {len(rebel_data):,d}')

Number of documents: 148


In [6]:
# Total number of GPT entities, summed over all documents.
ce = sum(doc_counts['entities']['total'] for doc_counts in m.values())
print(f'Number of entities: {ce:,d}')

Number of entities: 2,845


In [7]:
# Total number of GPT triples, summed over all documents.
ct = sum(doc_counts['triples']['total'] for doc_counts in m.values())
print(f'Number of triples: {ct:,d}')

Number of triples: 2,024


In [8]:
# Total number of sentences across all document texts.
sentence_count = sum(doc_counts['text']['sentence count'] for doc_counts in m.values())
print(f'Sentence count: {sentence_count:,d}')

Sentence count: 2,803


In [9]:
# Token count of every document, then the mean rounded to the nearest integer.
token_count_list = [doc_counts['text']['token count'] for doc_counts in m.values()]
average_token_count = round(sum(token_count_list) / len(token_count_list))
print(f'Average token count: {average_token_count:,d}')

Average token count: 605


# Topical Similarity Score

Latent Dirichlet Allocation references:
[LDA scikit-learn official doc](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html);
[LDA scikit-learn other](https://machinelearninggeek.com/latent-dirichlet-allocation-using-scikit-learn/).

Settings

In [10]:
# Number of lines (documents) read from the original REBEL train split to fit TF-IDF and LDA.
N_LDA_TRAIN_DOCS = 10_000
# Candidate numbers of latent topics; one LDA model is trained per value.
N_TOPICS = [5, 10, 20, 30, 40, 50, 75, 100]

In [11]:
# Locate the original REBEL train split and fail early, with download hints, if it is missing.
original_rebel_path = Path('original')
rebel_train_path = original_rebel_path / 'en_train.jsonl'
if not rebel_train_path.exists():
    missing_dataset_message = (
        f'Original REBEL dataset not existing! Please, download it and place it into the "{original_rebel_path}" folder.'
        f' You can download it with the `rebel_download.sh` script or from "https://huggingface.co/datasets/Babelscape/rebel-dataset/tree/main".'
    )
    raise Exception(missing_dataset_message)

Loading train data

In [12]:
# Read at most N_LDA_TRAIN_DOCS JSON lines from the train split.
# Iterating over the file stops cleanly at end-of-file and blank lines are skipped,
# so a shorter file yields fewer documents instead of a JSON decoding error.
train_data = []
with rebel_train_path.open('r', encoding='utf-8') as f:
    for line in f:
        if len(train_data) >= N_LDA_TRAIN_DOCS:
            break
        line = line.strip()
        if line:
            train_data.append(json.loads(line))
lda_train_texts = [td['text'] for td in train_data]
print(f'Number of LDA train texts: {len(lda_train_texts):,}')

Number of LDA train texts: 10,000


Processing train data

In [13]:
# Initialize regex tokenizer
# Regex tokenizer that keeps runs of word characters (`\w+`); its `tokenize` method
# is handed to the TF-IDF vectorizer as the tokenizer.
tokenizer = RegexpTokenizer(r'\w+')

In [14]:
# Vectorize document using TF-IDF
# TF-IDF vectorizer over lowercased unigrams, tokenized with the regex tokenizer
# and filtered with the built-in English stop-word list.
tfidf = TfidfVectorizer(
    tokenizer=tokenizer.tokenize,
    ngram_range=(1, 1),
    stop_words='english',
    lowercase=True,
)

In [15]:
%%capture
# Fit the TF-IDF vectorizer on the LDA training texts and transform them into a
# document-term matrix. `%%capture` keeps this cell's output out of the notebook.
lda_train_texts_matrix = tfidf.fit_transform(lda_train_texts)

Modeling documents according to different numbers of latent topics

In [16]:
%%time
lda_models = dict()
for n in N_TOPICS:
    model = LatentDirichletAllocation(n_components=n)
    model.fit(lda_train_texts_matrix)
    lda_models[n] = model

CPU times: total: 32.6 s
Wall time: 1min 25s


Evaluation

*Note: re-running this evaluation may yield slightly different Topical Similarity scores, because LDA training is stochastic and the fitted models can vary between runs unless a fixed `random_state` is used.*

In [17]:
def topical_sim(document_lda, triples_lda) -> float:
    """Topical similarity between a document and its triples: ``e ** (-KL(document || triples))``.

    Both arguments must expose ``.tolist()`` returning a nested list whose first
    row is the topic distribution (e.g. a ``(1, n_topics)`` array).

    Returns a float in ``(0, 1]`` for well-formed distributions: 1.0 when the two
    distributions match, tending to 0.0 as they diverge.

    Edge cases follow the usual KL-divergence conventions:
      * a zero document probability contributes nothing (``0 * log 0 = 0``);
      * a zero triple probability where the document has mass makes the
        divergence infinite, so the similarity is 0.0.
    """
    document_lda = document_lda.tolist()[0]
    triples_lda = triples_lda.tolist()[0]

    # NOTE(review): the divergence is accumulated in bits (log2) while the final
    # exponent uses base e; kept as-is to preserve the reported metric - confirm intended.
    divergence = 0.0
    for i, p in enumerate(document_lda):
        q = triples_lda[i]
        if p == 0:
            continue
        if q == 0:
            return 0.0
        divergence += p * math.log2(p / q)
    return math.e ** (-divergence)

In [18]:
def gpt_t_str(doc_t):
    """Render a GPT triple as ``'<subject label>, <predicate label>, <object label>'``."""
    return '{}, {}, {}'.format(
        doc_t['subject label'],
        doc_t['predicate label'],
        doc_t['object label'],
    )

def rebel_t_str(doc_t):
    """Render a REBEL (gold) triple as ``'<subject>, <predicate>, <object>'`` using surface forms."""
    subject_form = doc_t['subject']['surfaceform']
    predicate_form = doc_t['predicate']['surfaceform']
    object_form = doc_t['object']['surfaceform']
    return '{}, {}, {}'.format(subject_form, predicate_form, object_form)

# Build three parallel lists, one entry per document: the raw text, and the
# tab-separated string forms of its GPT and REBEL (gold) triples.
# NOTE: the loop variable `d` remains bound to the last document after this loop.
texts_l, triples_gpt_l, triples_rebel_l = [], [], []
for d in rebel_data:
    texts_l.append(d['doc']['text'])
    gpt_strings = map(gpt_t_str, d['triples']['gpt'])
    triples_gpt_l.append('\t'.join(gpt_strings))
    rebel_strings = map(rebel_t_str, d['triples']['gold'])
    triples_rebel_l.append('\t'.join(rebel_strings))

In [19]:
# Vectorize the documents and their GPT / REBEL triple strings with the TF-IDF
# vectorizer fitted earlier on the LDA training texts (transform only, no re-fit).
texts_l_matrix = tfidf.transform(texts_l)
triples_gpt_l_matrix = tfidf.transform(triples_gpt_l)
triples_rebel_l_matrix = tfidf.transform(triples_rebel_l)

In [20]:
# For every number of topics, score each document against its GPT triples and
# against its REBEL triples. Result: {n_topics: {'gpt': [...], 'rebel': [...]}},
# with one score per document in dataset order.
topical_sim_scores = dict()
for n in N_TOPICS:
    lda_model = lda_models[n]
    gpt_scores, rebel_scores = [], []
    for i in range(len(texts_l)):
        # The document's topic distribution is shared by both comparisons: infer it once.
        document_topics = lda_model.transform(texts_l_matrix[i])
        gpt_scores.append(
            topical_sim(document_topics, lda_model.transform(triples_gpt_l_matrix[i]))
        )
        rebel_scores.append(
            topical_sim(document_topics, lda_model.transform(triples_rebel_l_matrix[i]))
        )
    topical_sim_scores[n] = {'gpt': gpt_scores, 'rebel': rebel_scores}

In [21]:
# Average topical similarity over all documents, per number of topics.
print('TOPICAL SIMILARITY SCORES')
for n in N_TOPICS:
    gpt_average = np.average(topical_sim_scores[n]['gpt'])
    rebel_average = np.average(topical_sim_scores[n]['rebel'])
    print(f'N topics: {n:4} - Score GPT: {gpt_average:4.3f} | Score REBEL: {rebel_average:4.3f}')

TOPICAL SIMILARITY SCORES
N topics:    5 - Score GPT: 0.724 | Score REBEL: 0.648
N topics:   10 - Score GPT: 0.704 | Score REBEL: 0.682
N topics:   20 - Score GPT: 0.568 | Score REBEL: 0.469
N topics:   30 - Score GPT: 0.510 | Score REBEL: 0.435
N topics:   40 - Score GPT: 0.406 | Score REBEL: 0.319
N topics:   50 - Score GPT: 0.413 | Score REBEL: 0.323
N topics:   75 - Score GPT: 0.311 | Score REBEL: 0.200
N topics:  100 - Score GPT: 0.241 | Score REBEL: 0.148


# Uniqueness Score

Embeddings references: [Sentence Embeddings](https://www.sbert.net/); [Models](https://www.sbert.net/docs/pretrained_models.html#model-overview).

Settings

In [22]:
# Cosine-similarity cut-offs from 0.70 to 0.95 in steps of 0.05. A pair of triples
# counts as distinct when its similarity is below the threshold.
THRESHOLDS = [v / 1000 for v in range(700, 1000, 50)]

Loading embedding model

In [23]:
%%time
%%capture
# Sentence-embedding model used to embed triples for the uniqueness score.
embedding_model = SentenceTransformer('all-mpnet-base-v2')

CPU times: total: 250 ms
Wall time: 4.97 s


Evaluation

In [24]:
def cos_sim(v1, v2) -> float:
    """Cosine similarity between two single vectors.

    Each vector is wrapped as a one-row matrix for `cosine_similarity`, and the
    only cell of the resulting 1x1 matrix is returned.
    """
    similarity_matrix = cosine_similarity([v1], [v2])
    first_row = similarity_matrix[0]
    return first_row[0]

def uniqueness(triples_list: list, threshold: float) -> float:
    """Fraction of ordered triple pairs whose embeddings are less similar than `threshold`.

    Args:
        triples_list: list of triples, each a list of exactly three strings
            (subject, predicate, object).
        threshold: cosine-similarity cut-off; a pair counts as distinct when its
            similarity is strictly below this value.

    Returns:
        A value in [0, 1]. With fewer than two triples there are no pairs to
        compare, so nothing can be a duplicate and 1.0 is returned (the
        original formula would divide by zero here).

    Raises:
        AssertionError: if any item is not a list of length 3.
    """
    assert all([isinstance(t, list) and len(t) == 3 for t in triples_list])
    n = len(triples_list)
    if n < 2:
        # No pairs exist; return before paying for any embedding.
        return 1.0
    # Each triple is embedded as the single string "subject, predicate, object".
    v = [embedding_model.encode(', '.join(t)) for t in triples_list]
    distinct_pairs = sum(
        int(cos_sim(v[i], v[j]) < threshold)
        for i in range(n)
        for j in range(n)
        if i != j
    )
    return distinct_pairs / (n * (n - 1))

In [25]:
def gpt_t_list(doc_t):
    """Return a GPT triple as ``[subject label, predicate label, object label]``."""
    return [
        doc_t['subject label'],
        doc_t['predicate label'],
        doc_t['object label'],
    ]

def rebel_t_list(doc_t):
    """Return a REBEL (gold) triple as the surface forms ``[subject, predicate, object]``."""
    surface_forms = []
    for role in ('subject', 'predicate', 'object'):
        surface_forms.append(doc_t[role]['surfaceform'])
    return surface_forms

In [26]:
%%time
# Not the smartest way of computing it, as encoding is repeated multiple times ...
uniqueness_scores = dict()
for th in THRESHOLDS:
    uniqueness_scores[th] = {'gpt': [], 'rebel': []}
    for i in range(len(texts_l)):
        gpt_triples = [gpt_t_list(doc_t) for doc_t in d['triples']['gpt']]
        uniqueness_scores[th]['gpt'].append(uniqueness(gpt_triples, th))
        
        rebel_triples = [rebel_t_list(doc_t) for doc_t in d['triples']['gold']]
        uniqueness_scores[th]['rebel'].append(uniqueness(rebel_triples, th))

CPU times: total: 6min 43s
Wall time: 12min 33s


In [27]:
# Average uniqueness over all documents, per similarity threshold.
print('UNIQUENESS SCORES')
for th in THRESHOLDS:
    gpt_average = np.average(uniqueness_scores[th]['gpt'])
    rebel_average = np.average(uniqueness_scores[th]['rebel'])
    print(f'Similarity threshold: {th:3.2f} - Score GPT: {gpt_average:4.3f} | Score REBEL: {rebel_average:4.3f}')

UNIQUENESS SCORES
Similarity threshold: 0.70 - Score GPT: 0.924 | Score REBEL: 1.000
Similarity threshold: 0.75 - Score GPT: 0.952 | Score REBEL: 1.000
Similarity threshold: 0.80 - Score GPT: 0.981 | Score REBEL: 1.000
Similarity threshold: 0.85 - Score GPT: 0.990 | Score REBEL: 1.000
Similarity threshold: 0.90 - Score GPT: 0.995 | Score REBEL: 1.000
Similarity threshold: 0.95 - Score GPT: 1.000 | Score REBEL: 1.000
