In [None]:
%pip install -r requirements.txt
%python -m spacy download en_core_web_sm

In [None]:
import os

import numpy as np
import pandas as pd
import spacy
from huggingface_hub import login
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

In [None]:
from import_data import import_parade, import_bbc, import_textbook
from prepare_data import prep_parade, prep_bbc, prep_textbook
from benchmark import aggregate_similarity, cluster_similarity

In [None]:
# Hugging Face access token, read from the HF_TOKEN environment variable so the
# secret never has to be typed into (and saved with) the notebook.
# Falls back to an empty string when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

In [None]:
# Run the project's import_data helpers for the three corpora. The BBC and
# textbook importers receive the Hugging Face token; the PARADE importer takes
# no arguments.
# NOTE(review): presumably these fetch/cache the raw data that the prep_*
# helpers read later — confirm against import_data.py.
import_parade()
import_bbc(HF_TOKEN)
import_textbook(HF_TOKEN)

In [None]:
# Build the three datasets with the project's prepare_data helpers.
# Each dataset is later iterated item by item and every item is handed to the
# spaCy pipeline, so the items are assumed to be text — TODO confirm against
# prepare_data.py.
parade = prep_parade()
bbc = prep_bbc()
textbook = prep_textbook()

In [None]:
# spaCy English pipeline; individualize_sentences uses its `doc.sents` to split
# text into sentences.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Shared sentence-embedding model; every encode() call in this notebook goes
# through this single instance.
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

In [None]:
def get_sentence_embeddings(sentences):
    """Encode texts with the shared embedding model, showing a progress bar.

    Args:
        sentences: The texts to embed; passed unchanged to
            ``embedding_model.encode``.

    Returns:
        Whatever ``embedding_model.encode`` returns for ``sentences``.
    """
    # The keyword accepted by SentenceTransformer.encode is
    # `show_progress_bar`; `progress_bar` is not one of its parameters.
    return embedding_model.encode(
        sentences,
        show_progress_bar = True)

In [None]:
def clustering(sentences, sentence_embeddings, threshold_multiplier = 0.8):
  """Group sentences by agglomerative clustering of their embeddings.

  Pairwise cosine distances are computed between all embeddings, and
  complete-linkage clustering stops merging once the linkage distance reaches
  ``mean(distances) * threshold_multiplier``.

  Args:
    sentences: Sequence of sentence strings, aligned index-for-index with
      ``sentence_embeddings``.
    sentence_embeddings: One embedding row per sentence.
    threshold_multiplier: Scales the mean pairwise distance to obtain the
      distance threshold; a smaller value allows fewer merges.

  Returns:
    A list of clusters, each a list of whitespace-stripped sentences. The
    position of a cluster in the list is the label scikit-learn assigned to it.
  """
  # AgglomerativeClustering needs at least two samples, so the trivial inputs
  # are answered directly: no sentences -> no clusters, one sentence -> one
  # single-element cluster.
  if len(sentences) < 2:
    return [[sentence.strip()] for sentence in sentences]

  distances = cosine_distances(sentence_embeddings)
  # The mean is taken over the full matrix, the zero diagonal included.
  distance_threshold = np.mean(distances) * threshold_multiplier

  clustering_model = AgglomerativeClustering(
      n_clusters = None,
      metric = "precomputed",
      distance_threshold = distance_threshold,
      linkage = "complete"
  )

  clustering_model.fit(distances)

  labels = clustering_model.labels_

  # Labels are consecutive integers starting at 0.
  number_of_labels = 1 + int(labels.max())

  clusters = [[] for _ in range(number_of_labels)]

  for index, label in enumerate(labels):
    clusters[label].append(sentences[index].strip())

  return clusters

In [None]:
def semantic_chunking(sentences, threshold_multiplier = 1.3):
  """Greedily group sentences into chunks of semantically similar sentences.

  Every sentence is embedded once. A chunk is seeded with the earliest sentence
  not yet assigned; each later unassigned sentence joins the chunk when its
  cosine similarity to the chunk's mean embedding exceeds
  ``mean(pairwise similarity) * threshold_multiplier``. Passes repeat until
  every sentence belongs to a chunk.

  Args:
    sentences: Sequence of sentence strings.
    threshold_multiplier: Scales the mean pairwise cosine similarity to obtain
      the similarity a sentence must exceed to join a chunk.

  Returns:
    A list of chunks, each a list of whitespace-stripped sentences. Chunks are
    ordered by their seed sentence and sentences keep their input order within
    a chunk. An empty input yields an empty list.
  """
  if len(sentences) == 0:
    return []

  sentence_embeddings = np.asarray(embedding_model.encode(sentences))

  # The mean runs over the full similarity matrix, self-similarities included.
  average_similarity = np.mean(cosine_similarity(sentence_embeddings))
  similarity_threshold = average_similarity * threshold_multiplier

  chunks = []
  unassigned = list(range(len(sentences)))

  while unassigned:
    # Seed a new chunk with the earliest sentence that is still unassigned.
    member_indexes = [unassigned[0]]
    leftover = []

    for index in unassigned[1:]:
      # The chunk centroid comes from the embeddings computed above; the chunk
      # text is never re-encoded.
      centroid = np.mean(sentence_embeddings[member_indexes], axis = 0)
      similarity = cosine_similarity([centroid], [sentence_embeddings[index]])[0][0]

      if similarity > similarity_threshold:
        member_indexes.append(index)
      else:
        leftover.append(index)

    # Every member, the seed included, is stripped the same way.
    chunks.append([sentences[index].strip() for index in member_indexes])
    unassigned = leftover

  return chunks

In [None]:
def individualize_sentences(data) -> list:
    """Split every text in ``data`` into sentences with the spaCy pipeline.

    Args:
        data: Iterable of texts; each item is passed to ``nlp``.

    Returns:
        One flat list holding the text of every detected sentence, in input
        order.
    """
    sentence_data = []
    for paragraph in data:
        doc = nlp(paragraph)
        sentence_data.extend(sent.text for sent in doc.sents)
    # Hand the collected sentences back to the caller; without this the
    # function would return None.
    return sentence_data

In [None]:
# Sentence-split each corpus; the results feed the embedding and clustering
# steps that follow.
parade_sentences = individualize_sentences(parade)
bbc_sentences = individualize_sentences(bbc)
textbook_sentences = individualize_sentences(textbook)

In [None]:
# Embed each corpus's sentences with the shared embedding model.
parade_embeddings = get_sentence_embeddings(parade_sentences)
bbc_embeddings = get_sentence_embeddings(bbc_sentences)
textbook_embeddings = get_sentence_embeddings(textbook_sentences)

In [None]:
# Cluster each corpus's sentences from their embeddings, using clustering()'s
# default threshold multiplier.
parade_clusters = clustering(parade_sentences, parade_embeddings)
bbc_clusters = clustering(bbc_sentences, bbc_embeddings)
textbook_clusters = clustering(textbook_sentences, textbook_embeddings)

In [None]:
# Run semantic chunking on the clustering output of each corpus.
# NOTE(review): clustering() returns a list of clusters (each a list of
# strings), while semantic_chunking() calls .strip() on the elements of its
# input, which only works for strings. Confirm whether chunking is meant to run
# per cluster or on a flat sentence list.
parade_semantics = semantic_chunking(parade_clusters)
bbc_semantics = semantic_chunking(bbc_clusters)
textbook_semantics = semantic_chunking(textbook_clusters)

In [None]:
# Score each corpus with the project's benchmark helpers:
#   - aggregate_similarity receives embeddings of the prepared data and of the
#     semantic chunks;
#   - cluster_similarity receives the prepared data and the chunks themselves,
#     with nmi_method="geometric" and v_beta=0.85.
# NOTE(review): semantic_chunking() returns a list of lists of strings, so
# get_sentence_embeddings() is handed nested lists for the *_semantics inputs.
# Confirm that is what the encoder should receive, or join each chunk into one
# string first.
# NOTE(review): `[chunks for chunks in X]` is a plain element-by-element copy
# of X.
parade_aggregate_score = aggregate_similarity(
    get_sentence_embeddings([chunks for chunks in parade]), 
    get_sentence_embeddings([chunks for chunks in parade_semantics]))
parade_cluster_scores = cluster_similarity(parade, parade_semantics, nmi_method="geometric", v_beta = 0.85)

bbc_aggregate_score = aggregate_similarity(
    get_sentence_embeddings([chunks for chunks in bbc]), 
    get_sentence_embeddings([chunks for chunks in bbc_semantics]))
bbc_cluster_scores = cluster_similarity(bbc, bbc_semantics, nmi_method="geometric", v_beta = 0.85)

textbook_aggregate_score = aggregate_similarity(
    get_sentence_embeddings([chunks for chunks in textbook]), 
    get_sentence_embeddings([chunks for chunks in textbook_semantics]))
textbook_cluster_scores = cluster_similarity(textbook, textbook_semantics, nmi_method="geometric", v_beta = 0.85)