In [1]:
from datasets import load_dataset

# Open the cleaned JSONL corpus as the "train" split. streaming=True is passed
# so the records are iterated rather than loaded up front (see the Hugging Face
# `datasets` streaming docs for the exact semantics).
dataset = load_dataset(
  "json",
  data_files="out/cleaned-formatted.jsonl",
  split="train",
  streaming=True,
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoModel, AutoTokenizer

# Checkpoint identifier and local cache directory shared by tokenizer and model.
PRETRAINED_NAME = "indobenchmark/indobert-base-p2"
HF_CACHE_DIR = "cache/"

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME, cache_dir=HF_CACHE_DIR)
model = AutoModel.from_pretrained(PRETRAINED_NAME, cache_dir=HF_CACHE_DIR)

# Switch to evaluation mode. Kept as the last expression of the cell so the
# returned module (and therefore its repr) is what the cell displays.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(50000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [3]:
import torch
def getEncodings(textArray, max_length=128):
  """Return one embedding vector per input text.

  The texts are tokenized with the notebook-level ``tokenizer`` (padded to the
  longest item in the batch and truncated to ``max_length`` tokens), run
  through the notebook-level ``model`` without gradient tracking, and the
  hidden state at sequence position 0 of the last layer is returned for each
  text.

  Args:
    textArray: Text or list of texts to embed, passed straight to the tokenizer.
    max_length: Maximum number of tokens kept per text. Defaults to 128, the
      value that was previously hard-coded.

  Returns:
    A NumPy array with one row per input text, taken from
    ``last_hidden_state[:, 0, :]``.
  """
  inputs = tokenizer(
    textArray,
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="pt",
  )
  # Put the input tensors on whatever device the model lives on; without this
  # the forward pass fails as soon as the model is moved off the CPU.
  inputs = inputs.to(model.device)
  with torch.no_grad():
    outputs = model(**inputs)
  # Position 0 along the sequence axis is used as the per-text representation.
  cls_embeddings = outputs.last_hidden_state[:, 0, :]
  return cls_embeddings.cpu().numpy()

In [4]:
def count_existing_lines(file_path):
    """Count the lines in ``file_path``; a missing file counts as zero lines.

    Args:
        file_path: Path of the text file to inspect.

    Returns:
        The number of lines in the file, or 0 when the file does not exist.
    """
    try:
        handle = open(file_path, "r")
    except FileNotFoundError:
        # Nothing has been written yet.
        return 0
    total = 0
    with handle:
        for _line in handle:
            total += 1
    return total

In [11]:
from tqdm import tqdm
import json
embedings = []  # not used in this cell; kept in case another cell refers to it
batch_documents = []  # read by the final cell of the notebook
batch = []
EMBED_BATCH_SIZE = 16
EMBED_OUTPUT_PATH = "out/cleaned-with-embed.jsonl"


def _append_embedded_batch(docs, output_path):
  """Embed the "content" of each doc and append the docs to a JSONL file.

  Each doc gets an "embeds" key holding its embedding as a plain list, then is
  written as one JSON object per line. The file is opened in append mode for
  every batch so finished batches are on disk before the next one starts,
  which is what the line-count based resume below relies on.
  """
  texts = [doc["content"] for doc in docs]
  embeded_docs = getEncodings(texts)
  with open(output_path, "a") as f:
    for doc, emb in zip(docs, embeded_docs):
      doc["embeds"] = emb.tolist()
      f.write(json.dumps(doc) + "\n")


print("Getting embeds")
# One output line is written per processed example, so the number of existing
# lines is the index of the first example that still needs an embedding.
processed_count = count_existing_lines(EMBED_OUTPUT_PATH)
print(f"Resuming from index {processed_count}")

for idx, example in enumerate(tqdm(dataset, desc="Resuming embedding")):
  if idx < processed_count:
    continue  # Already processed
  batch.append(example)
  if len(batch) == EMBED_BATCH_SIZE:
    _append_embedded_batch(batch, EMBED_OUTPUT_PATH)
    batch = []

# Flush the final partial batch exactly once. The previous version nested the
# file write inside a second loop over the batch, so the trailing documents
# were appended len(batch) times.
if batch:
  _append_embedded_batch(batch, EMBED_OUTPUT_PATH)
  batch = []

Getting embeds
Resuming from index 0


Resuming embedding: 201583it [1:07:00, 50.14it/s]


In [15]:
from sklearn.decomposition import PCA
import json

# Load every embedded document (one JSON object per line).
data = []
with open("out/cleaned-with-embed.jsonl", "r", encoding="utf-8") as f:
  for line in f:
    if not line.strip():
      continue  # tolerate blank lines instead of failing in json.loads
    doc = json.loads(line)
    data.append(doc)
embeds = [doc["embeds"] for doc in data]

# Project the embeddings down to 50 components. The seed makes the result
# repeatable when scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=50, random_state=42)
reduced_embeddings = pca.fit_transform(embeds)

with open("out/cleaned-formatted.json", "r", encoding="utf-8") as f:
  rawjson = json.load(f)

# The reduced vectors are attached to the raw documents purely by position, so
# a length mismatch means rows would be silently dropped or misaligned by zip().
if len(rawjson) != len(reduced_embeddings):
  print(
    f"WARNING: {len(rawjson)} raw documents but {len(reduced_embeddings)} "
    "embeddings; only the first min(...) pairs are written"
  )

# Write mode (not append): re-running this cell must replace the previous
# output rather than add a second copy of every row.
with open("out/cleaned-with-reduced-embed.jsonl", "w", encoding="utf-8") as fl:
  for obj, reduced_vec in zip(rawjson, reduced_embeddings):
    obj["embeds"] = reduced_vec.tolist()
    fl.write(json.dumps(obj) + "\n")
      

In [1]:
from sre_parse import Verbose
from sklearn.cluster import DBSCAN
import json
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np


# Cluster the reduced embeddings with DBSCAN and store each document's cluster
# id under metadata.bucket_label (scikit-learn uses -1 for noise points).
dbscan = DBSCAN(eps=10.0, min_samples=4)

with open("out/cleaned-with-reduced-embed.jsonl", "r", encoding="utf-8") as reduced_file:
  reduced_embed_documents = [json.loads(row) for row in reduced_file]

reduced_embeds = []
for entry in reduced_embed_documents:
  reduced_embeds.append(entry["embeds"])

cluster_labels = dbscan.fit_predict(reduced_embeds)

with open("out/cleaned-formatted.json", "r", encoding="utf-8") as source_file:
  base_document = json.load(source_file)

# Labels are matched to documents by position; zip stops at the shorter input.
for record, cluster_id in zip(base_document, cluster_labels):
  record["metadata"]["bucket_label"] = int(cluster_id)

# Group documents of the same cluster together in the output.
base_document = sorted(
  base_document,
  key=lambda record: record["metadata"]["bucket_label"],
)

with open("out/cleaned-with-labels.json", "w", encoding="utf-8") as labels_file:
  json.dump(base_document, labels_file, indent=2, ensure_ascii=False)


In [4]:
from sre_parse import Verbose
from sklearn.cluster import KMeans
import json
import matplotlib.pyplot as plt
import numpy as np


# Cluster the reduced embeddings into 5 groups with a fixed seed and store each
# document's cluster id under metadata.bucket_label.
# NOTE(review): the result is written to "out/cleaned-with-labels.json", the
# same path the DBSCAN cell of this notebook writes to, so the later run wins.
kmeans = KMeans(n_clusters=5, random_state=42)

reduced_embed_documents = []
with open("out/cleaned-with-reduced-embed.jsonl", "r", encoding="utf-8") as reduced_file:
  for raw_line in reduced_file:
    parsed = json.loads(raw_line)
    reduced_embed_documents.append(parsed)

reduced_embeds = [entry["embeds"] for entry in reduced_embed_documents]
cluster_labels = kmeans.fit_predict(reduced_embeds)

with open("out/cleaned-formatted.json", "r", encoding="utf-8") as source_file:
  base_document = json.load(source_file)

# Positional pairing of documents and labels; zip stops at the shorter input.
for record, cluster_id in zip(base_document, cluster_labels):
  record["metadata"]["bucket_label"] = int(cluster_id)

# Order the documents by cluster id before saving.
base_document = sorted(
  base_document,
  key=lambda record: record["metadata"]["bucket_label"],
)

with open("out/cleaned-with-labels.json", "w", encoding="utf-8") as labels_file:
  json.dump(base_document, labels_file, indent=2, ensure_ascii=False)


In [None]:
import json
# Serialize batch_documents as pretty-printed, non-ASCII-escaped JSON.
# NOTE(review): in the visible notebook batch_documents is created as an empty
# list in the embedding cell and nothing appends to it, so this writes "[]".
# Confirm whether this cell is still needed.
with open("out/embeded-data.json", "w") as out_file:
    out_file.write(json.dumps(batch_documents, ensure_ascii=False, indent=2))