In [None]:
# Environment setup: install the packages listed in requirements.txt into the
# kernel's environment, then download the small English spaCy pipeline that a
# later cell loads with spacy.load("en_core_web_sm").
%pip install -r requirements.txt
!python -m spacy download en_core_web_sm

In [None]:
import os

import numpy as np
import pandas as pd
import spacy
from huggingface_hub import login
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

In [None]:
from import_data import import_parade, import_bbc, import_textbook
from prepare_data import prep_parade, prep_bbc, prep_textbook
from benchmark import aggregate_similarity, cluster_similarity

In [None]:
# Hugging Face access token, passed to import_bbc() and import_textbook().
# It is read from the environment so the secret never lives in the notebook;
# export HF_TOKEN before starting the kernel (the value is None when unset).
# NOTE(review): the token that used to be hardcoded here must be considered
# leaked and should be revoked in the Hugging Face account settings.
HF_TOKEN = os.environ.get("HF_TOKEN")

In [None]:
# Shared spaCy pipeline; individualize_sentences() uses it to split text into
# sentences via doc.sents.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Shared sentence-embedding model; every encode() call in this notebook goes
# through this single instance.
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

In [None]:
def get_sentence_embeddings(sentences):
    """Encode ``sentences`` with the notebook-wide ``embedding_model``.

    Args:
        sentences: the texts to embed, forwarded unchanged to
            ``embedding_model.encode``.

    Returns:
        Whatever ``embedding_model.encode`` returns for ``sentences``.
    """
    # SentenceTransformer.encode names this option ``show_progress_bar``;
    # ``progress_bar`` is not one of its parameters.
    return embedding_model.encode(
        sentences,
        show_progress_bar = True)

In [None]:
def clustering(sentences, sentence_embeddings, threshold_multiplier = 1):
  """Group sentences into clusters by complete-linkage agglomerative clustering.

  The cut-off distance is the mean of the full pairwise cosine-distance matrix
  of ``sentence_embeddings`` multiplied by ``threshold_multiplier``.

  Args:
    sentences: sequence of sentence strings.
    sentence_embeddings: one embedding per sentence, in the same order.
    threshold_multiplier: scale factor applied to the mean pairwise distance.

  Returns:
    A list of clusters, each a list of stripped sentence strings. The list is
    indexed by the cluster label assigned by the model.
  """
  # AgglomerativeClustering cannot be fitted on fewer than two samples, so
  # trivial inputs are answered directly: no sentences -> no clusters, one
  # sentence -> one single-sentence cluster.
  if len(sentences) < 2:
    return [[sentence.strip()] for sentence in sentences]

  distances = cosine_distances(sentence_embeddings)
  distance_threshold = np.mean(distances) * threshold_multiplier

  clustering_model = AgglomerativeClustering(
      n_clusters = None,
      metric = "precomputed",
      distance_threshold = distance_threshold,
      linkage = "complete"
  )

  clustering_model.fit(distances)

  labels = clustering_model.labels_

  number_of_labels = 1 + max(labels)

  clusters = [[] for _ in range(number_of_labels)]

  for sentence, label in zip(sentences, labels):
    # append, not extend: extending a list with a string adds its individual
    # characters instead of the sentence itself.
    clusters[label].append(sentence.strip())

  return clusters

In [None]:
def semantic_chunking(sentences, threshold_multiplier = 1):
  """Greedily group sentences into chunks of mutually similar sentences.

  The similarity threshold is the mean of the full pairwise cosine-similarity
  matrix of the sentence embeddings multiplied by ``threshold_multiplier``.
  Chunks are built in passes: the first sentence not yet assigned seeds a
  chunk, and every later unassigned sentence joins it when its cosine
  similarity to the chunk's mean embedding exceeds the threshold. Passes
  repeat until every sentence is assigned.

  Args:
    sentences: iterable of sentence strings.
    threshold_multiplier: scale factor applied to the mean pairwise similarity.

  Returns:
    A list of chunks, each a list of stripped sentences in input order.
    An empty input yields an empty list.
  """
  sentences = list(sentences)
  if not sentences:
    return []

  # Encode once; chunk centroids below are averaged from these rows instead of
  # re-running the model on the growing chunk for every candidate sentence.
  sentence_embeddings = np.asarray(embedding_model.encode(sentences))

  average_similarity = np.mean(cosine_similarity(sentence_embeddings))
  similarity_threshold = average_similarity * threshold_multiplier

  chunks = []
  unassigned = list(range(len(sentences)))

  # Each pass removes at least the seed sentence, so the loop terminates.
  while unassigned:
    member_indexes = [unassigned[0]]
    leftover = []

    for i in unassigned[1:]:
      centroid = sentence_embeddings[member_indexes].mean(axis = 0)
      similarity = cosine_similarity([centroid], [sentence_embeddings[i]])[0][0]

      if similarity > similarity_threshold:
        member_indexes.append(i)
      else:
        leftover.append(i)

    # Strip every member, including the seed, so chunks are formatted uniformly.
    chunks.append([sentences[i].strip() for i in member_indexes])
    unassigned = leftover

  return chunks

In [None]:
def semantic_chunking_sliding_window(sentences, threshold_multiplier = 1, window_size = 15):
  """Split sentences into consecutive chunks using similarity and a size cap.

  Sentences are visited in order. A sentence joins the current chunk when its
  cosine similarity to the chunk's mean embedding reaches the threshold and
  the chunk holds fewer than ``window_size`` sentences; otherwise the current
  chunk is closed and the sentence starts a new one. The threshold is the mean
  of the full pairwise cosine-similarity matrix of the sentence embeddings
  multiplied by ``threshold_multiplier``.

  Args:
    sentences: iterable of sentence strings.
    threshold_multiplier: scale factor applied to the mean pairwise similarity.
    window_size: maximum number of sentences per chunk.

  Returns:
    A list of chunks, each a list of consecutive sentences. Every input
    sentence appears in exactly one chunk; an empty input yields an empty list.
  """
  sentences = list(sentences)
  if not sentences:
    return []

  # Encode once and average the stored rows for chunk centroids.
  sentence_embeddings = np.asarray(embedding_model.encode(sentences))

  # The threshold must be independent of the similarity being tested;
  # deriving it from that same similarity makes the comparison meaningless.
  average_similarity = np.mean(cosine_similarity(sentence_embeddings))
  similarity_threshold = average_similarity * threshold_multiplier

  chunks = []
  current_indexes = []

  for i in range(len(sentences)):
    if current_indexes:
      window_full = len(current_indexes) >= window_size
      if not window_full:
        centroid = sentence_embeddings[current_indexes].mean(axis = 0)
        similarity = cosine_similarity([sentence_embeddings[i]], [centroid])[0][0]

      if window_full or similarity < similarity_threshold:
        chunks.append([sentences[k] for k in current_indexes])
        current_indexes = []

    # The sentence always lands somewhere: in the current chunk or as the
    # first member of a fresh one.
    current_indexes.append(i)

  # Flush the trailing chunk only after all sentences have been visited.
  chunks.append([sentences[k] for k in current_indexes])

  return chunks

In [None]:
def individualize_sentences(data) -> list:
    """Split text into individual sentences with the shared spaCy pipeline.

    Args:
        data: a single string, or a list/tuple of strings (e.g. paragraphs).

    Returns:
        A flat list with the text of every sentence spaCy finds, in input
        order.

    Raises:
        TypeError: if ``data`` is neither a string nor a list/tuple. Failing
            loudly avoids silently returning ``None`` from a function that is
            declared to return a list.
    """
    if isinstance(data, str):
        texts = [data]
    elif isinstance(data, (list, tuple)):
        texts = data
    else:
        raise TypeError(
            f"individualize_sentences expects a str or a list of str, got {type(data).__name__}")

    # One code path for both input shapes: a lone string is a one-item list.
    return [sent.text for text in texts for sent in nlp(text).sents]

In [None]:
# Parade benchmark: load the data, split it into sentences, embed them and
# build the coarse clusters that the next cell refines into chunks.
# import_parade / prep_parade come from import_data / prepare_data;
# presumably they fetch and clean the dataset - confirm in those modules.
import_parade()
parade = prep_parade()
parade_sentences = individualize_sentences(parade)
parade_embeddings = get_sentence_embeddings(parade_sentences)
# 1.35 is the threshold_multiplier applied to the mean pairwise cosine distance.
parade_clusters = clustering(parade_sentences, parade_embeddings, 1.35)

In [None]:
# Refine every coarse Parade cluster into semantic chunks.
parade_semantics = []
for cluster in parade_clusters:
    # Iterate the clusters directly: enumerate() would hand semantic_chunking
    # an (index, cluster) tuple instead of the list of sentences it expects.
    parade_semantics.extend(semantic_chunking(cluster, 1.3))

In [None]:
# Score the Parade chunks against the original texts.
# aggregate_similarity / cluster_similarity come from the benchmark module;
# their exact metric definitions are not visible here - see benchmark.py.
# The first call compares embeddings of the original items with embeddings of
# each chunk joined into one string.
parade_aggregate_score = aggregate_similarity(
    get_sentence_embeddings([chunks for chunks in parade]), 
    get_sentence_embeddings([" ".join(chunks) for chunks in parade_semantics]))
# The second call compares the per-item sentence lists with the chunk lists.
parade_cluster_scores = cluster_similarity(
    [individualize_sentences(chunks) for chunks in parade], 
    parade_semantics, 
    nmi_method="geometric", 
    v_beta = 0.85)

In [None]:
# Report the Parade benchmark metrics.
# Dictionary keys use single quotes inside the double-quoted f-strings:
# reusing the same quote character is a SyntaxError before Python 3.12.
print("Parade")
print(f"Aggregate Similarity: {parade_aggregate_score}")
print(f"ARI: {parade_cluster_scores['ari']}")
print(f"NMI: {parade_cluster_scores['nmi']}")
print(f"V_Measure: {parade_cluster_scores['v_measure']}")

In [None]:
# BBC benchmark: load the data, split it into sentences, embed them and build
# the coarse clusters that the next cell refines into chunks.
# import_bbc receives the Hugging Face token; presumably it downloads the
# dataset from the Hub - confirm in import_data.
import_bbc(HF_TOKEN)
bbc = prep_bbc()
bbc_sentences = individualize_sentences(bbc)
bbc_embeddings = get_sentence_embeddings(bbc_sentences)
# 1.35 is the threshold_multiplier applied to the mean pairwise cosine distance.
bbc_clusters = clustering(bbc_sentences, bbc_embeddings, 1.35)

In [None]:
# Refine every coarse BBC cluster into semantic chunks.
bbc_semantics = []
for cluster in bbc_clusters:
    # Iterate the clusters directly: enumerate() would hand semantic_chunking
    # an (index, cluster) tuple instead of the list of sentences it expects.
    bbc_semantics.extend(semantic_chunking(cluster, 1.3))

In [None]:
# Score the BBC chunks against the original texts.
# aggregate_similarity / cluster_similarity come from the benchmark module;
# their exact metric definitions are not visible here - see benchmark.py.
# The first call compares embeddings of the original items with embeddings of
# each chunk joined into one string.
bbc_aggregate_score = aggregate_similarity(
    get_sentence_embeddings([chunks for chunks in bbc]), 
    get_sentence_embeddings([" ".join(chunks) for chunks in bbc_semantics]))
# The second call compares the per-item sentence lists with the chunk lists.
bbc_cluster_scores = cluster_similarity(
    [individualize_sentences(chunks) for chunks in bbc], 
    bbc_semantics, 
    nmi_method="geometric", 
    v_beta = 0.85)

In [None]:
# Report the BBC benchmark metrics.
# Dictionary keys use single quotes inside the double-quoted f-strings:
# reusing the same quote character is a SyntaxError before Python 3.12.
print("BBC")
print(f"Aggregate Similarity: {bbc_aggregate_score}")
print(f"ARI: {bbc_cluster_scores['ari']}")
print(f"NMI: {bbc_cluster_scores['nmi']}")
print(f"V_Measure: {bbc_cluster_scores['v_measure']}")

In [None]:

# Textbook benchmark: load the data, split it into sentences, embed them and
# build the coarse clusters that the next cell refines into chunks.
# import_textbook receives the Hugging Face token; presumably it downloads the
# dataset from the Hub - confirm in import_data.
import_textbook(HF_TOKEN)
textbook = prep_textbook()
textbook_sentences = individualize_sentences(textbook)
textbook_embeddings = get_sentence_embeddings(textbook_sentences)
# 1.35 is the threshold_multiplier applied to the mean pairwise cosine distance.
textbook_clusters = clustering(textbook_sentences, textbook_embeddings, 1.35)

In [None]:
# Refine every coarse Textbook cluster into semantic chunks.
textbook_semantics = []
for cluster in textbook_clusters:
    # Iterate the clusters directly: enumerate() would hand semantic_chunking
    # an (index, cluster) tuple instead of the list of sentences it expects.
    textbook_semantics.extend(semantic_chunking(cluster, 1.3))

In [None]:
# Score the Textbook chunks against the original texts.
# aggregate_similarity / cluster_similarity come from the benchmark module;
# their exact metric definitions are not visible here - see benchmark.py.
# The first call compares embeddings of the original items with embeddings of
# each chunk joined into one string.
textbook_aggregate_score = aggregate_similarity(
    get_sentence_embeddings([chunks for chunks in textbook]), 
    get_sentence_embeddings([" ".join(chunks) for chunks in textbook_semantics]))
# The second call compares the per-item sentence lists with the chunk lists.
textbook_cluster_scores = cluster_similarity(
    [individualize_sentences(chunks) for chunks in textbook], 
    textbook_semantics, 
    nmi_method="geometric", 
    v_beta = 0.85)

In [None]:
# Report the Textbook benchmark metrics.
# The cluster metrics are read from textbook_cluster_scores; reading them from
# bbc_cluster_scores would repeat the BBC numbers under the Textbook heading.
# Dictionary keys use single quotes inside the double-quoted f-strings:
# reusing the same quote character is a SyntaxError before Python 3.12.
print("Textbook")
print(f"Aggregate Similarity: {textbook_aggregate_score}")
print(f"ARI: {textbook_cluster_scores['ari']}")
print(f"NMI: {textbook_cluster_scores['nmi']}")
print(f"V_Measure: {textbook_cluster_scores['v_measure']}")

In [None]:
sliding_parade_semantics = []
for chunk in enumerate(parade_clusters):
    parade_semantics.extend(semantic_chunking_sliding_window(chunk, 1.3))

sliding_bbc_semantics = []
for chunk in enumerate(bbc_clusters):
    bbc_semantics.extend(semantic_chunking_sliding_window(chunk, 1.3))

sliding_textbook_semantics = []
for chunk in enumerate(textbook_clusters):
    textbook_semantics.extend(semantic_chunking_sliding_window(chunk, 1.3))

In [None]:
sliding_parade_aggregate_score = aggregate_similarity(
    get_sentence_embeddings([chunks for chunks in parade]), 
    get_sentence_embeddings([" ".join(chunks) for chunks in parade_semantics]))
sliding_parade_cluster_scores = cluster_similarity(
    [individualize_sentences(chunks) for chunks in parade], 
    parade_semantics, 
    nmi_method="geometric", 
    v_beta = 0.85)

sliding_bbc_aggregate_score = aggregate_similarity(
    get_sentence_embeddings([chunks for chunks in bbc]), 
    get_sentence_embeddings([" ".join(chunks) for chunks in bbc_semantics]))
sliding_bbc_cluster_scores = cluster_similarity(
    [individualize_sentences(chunks) for chunks in bbc], 
    bbc_semantics, 
    nmi_method="geometric", 
    v_beta = 0.85)

sliding_textbook_aggregate_score = aggregate_similarity(
    get_sentence_embeddings([chunks for chunks in textbook]), 
    get_sentence_embeddings([" ".join(chunks) for chunks in textbook_semantics]))
sliding_textbook_cluster_scores = cluster_similarity(
    [individualize_sentences(chunks) for chunks in textbook], 
    textbook_semantics, 
    nmi_method="geometric", 
    v_beta = 0.85)

In [None]:
# Report the sliding-window metrics for all three datasets.
# Dictionary keys use single quotes inside the double-quoted f-strings:
# reusing the same quote character is a SyntaxError before Python 3.12.
print("Parade")
print(f"Aggregate Similarity: {sliding_parade_aggregate_score}")
print(f"ARI: {sliding_parade_cluster_scores['ari']}")
print(f"NMI: {sliding_parade_cluster_scores['nmi']}")
print(f"V_Measure: {sliding_parade_cluster_scores['v_measure']}")

print("BBC")
print(f"Aggregate Similarity: {sliding_bbc_aggregate_score}")
print(f"ARI: {sliding_bbc_cluster_scores['ari']}")
print(f"NMI: {sliding_bbc_cluster_scores['nmi']}")
print(f"V_Measure: {sliding_bbc_cluster_scores['v_measure']}")

# The Textbook section reads sliding_textbook_cluster_scores; reading the BBC
# dictionary here would repeat the BBC numbers under the Textbook heading.
print("Textbook")
print(f"Aggregate Similarity: {sliding_textbook_aggregate_score}")
print(f"ARI: {sliding_textbook_cluster_scores['ari']}")
print(f"NMI: {sliding_textbook_cluster_scores['nmi']}")
print(f"V_Measure: {sliding_textbook_cluster_scores['v_measure']}")