In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# load_dotenv() returns a boolean; it is falsy when nothing was read from a
# .env file (e.g. the file is missing), so only claim success when it is truthy
# instead of printing the success message unconditionally.
if load_dotenv():
    print("Environment variables loaded successfully!")
else:
    print("No variables were loaded from a .env file; relying on the existing environment.")
# Report only whether the key is present; never print the key itself.
print(f"JINA_API_KEY loaded: {'✅' if os.getenv('JINA_API_KEY') else '❌'}")

In [None]:
import pdfplumber
import os
import json

def extract_text_from_pdfs(input_folder, output_file):
    """Extract text from every PDF in a folder and save the results to a JSON file.

    Args:
        input_folder: Directory that is scanned (non-recursively) for files
            whose name ends in ``.pdf`` (case-insensitive).
        output_file: Path of the JSON file to write. It receives a list of
            ``{"filename": ..., "text": ...}`` objects, one per PDF, ordered
            by filename.
    """
    extracted_data = []
    # os.listdir() returns entries in arbitrary order; sort so the output file
    # is reproducible between runs and machines.
    for filename in sorted(os.listdir(input_folder)):
        # Compare the extension case-insensitively so "SCAN.PDF" is not skipped.
        if not filename.lower().endswith('.pdf'):
            continue
        filepath = os.path.join(input_folder, filename)
        with pdfplumber.open(filepath) as pdf:
            # extract_text() may return None or "" for a page without a text
            # layer (e.g. a scanned image); substitute "" so the join below
            # cannot raise TypeError.
            page_texts = [page.extract_text() or '' for page in pdf.pages]
        # Join pages with a newline so the last word of one page is not fused
        # with the first word of the next.
        text = '\n'.join(page_texts)
        extracted_data.append({"filename": filename, "text": text})
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(extracted_data, f, ensure_ascii=False, indent=4)

# Example usage: read every PDF under ./docs and write the text to extracted.json
extract_text_from_pdfs(input_folder='./docs', output_file='extracted.json')

In [None]:
import requests
import json

def split_text_by_length(text, max_length):
    """Split text into consecutive chunks of at most ``max_length`` characters.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per chunk; must be positive.

    Returns:
        The chunks in order; concatenating them yields ``text`` again.
        An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``max_length`` is zero or negative. A negative step
            would otherwise produce an empty list and silently drop the text.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")
    return [text[start:start + max_length] for start in range(0, len(text), max_length)]

def chunk_documents(input_file, output_file, api_url, api_key,
                    max_api_length=64000, max_chunk_length=1000, timeout=60):
    """Chunk documents using the Jina Segment API and save the results to a JSON file.

    Args:
        input_file: JSON file holding a list of ``{"filename", "text"}`` objects.
        output_file: JSON file to write; receives a flat list of
            ``{"filename", "doc_part", "chunk_index", "chunk"}`` objects.
        api_url: Endpoint the segmentation requests are POSTed to.
        api_key: Token sent as ``Authorization: Bearer <api_key>``.
        max_api_length: Maximum number of characters sent per request; longer
            documents are split into several parts first. The default of 64000
            is presumably chosen to stay under the API's per-request limit —
            confirm against the current API documentation.
        max_chunk_length: Value sent as ``max_chunk_length`` in each request.
        timeout: Seconds to wait for each HTTP request before giving up.

    Documents without text are skipped. A part whose request fails (network
    error or non-200 status) is reported on stdout and skipped, so the chunks
    gathered from the other parts are still written.
    """
    # Load documents from input file
    with open(input_file, 'r', encoding='utf-8') as f:
        documents = json.load(f)

    chunked_data = []
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}'
    }

    for doc in documents:
        # Resolve the name once so log lines and stored records agree, and a
        # record without "filename" cannot raise KeyError after a successful call.
        filename = doc.get('filename', 'unknown')
        if not doc.get("text"):
            print(f"Skipping document {filename} due to missing or empty text.")
            continue

        text_chunks = split_text_by_length(doc["text"], max_api_length)
        for part_idx, text_part in enumerate(text_chunks):
            data = {
                "content": text_part,
                "tokenizer": "o200k_base",
                "return_tokens": True,
                "return_chunks": True,
                "max_chunk_length": max_chunk_length
            }
            try:
                # Without a timeout a stalled connection would hang the cell forever.
                response = requests.post(api_url, headers=headers, json=data, timeout=timeout)
            except requests.RequestException as exc:
                # Treat transport failures like HTTP errors: report and move on
                # instead of discarding everything collected so far.
                print(f"Error processing {filename} part {part_idx+1}: {exc}")
                continue
            if response.status_code != 200:
                print(f"Error processing {filename} part {part_idx+1}: {response.status_code} {response.text}")
                continue

            print(f"Successfully processed {filename} part {part_idx+1}/{len(text_chunks)}")
            # chunk_index restarts at 0 for every part; (doc_part, chunk_index)
            # together identify a chunk within a document.
            for idx, chunk in enumerate(response.json().get("chunks", [])):
                chunked_data.append({
                    "filename": filename,
                    "doc_part": part_idx,
                    "chunk_index": idx,
                    "chunk": chunk
                })

    # Save chunked data to output file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(chunked_data, f, ensure_ascii=False, indent=4)

# Example usage: segment the documents in extracted.json through the Jina
# Segment endpoint and store the resulting chunks in chunks.json.
chunk_documents(
    'extracted.json',
    'chunks.json',
    'https://api.jina.ai/v1/segment',
    os.getenv('JINA_API_KEY'),
)

Successfully processed Perda Kota Cimahi No. 8 Tahun 2014.pdf part 1/3
Successfully processed Perda Kota Cimahi No. 8 Tahun 2014.pdf part 2/3
Successfully processed Perda Kota Cimahi No. 8 Tahun 2014.pdf part 3/3
Successfully processed Perda No 8 2011 IMB.pdf part 1/2
Successfully processed Perda No 8 2011 IMB.pdf part 2/2
Successfully processed Perwal Cimahi 6 2024.pdf part 1/3
Successfully processed Perwal Cimahi 6 2024.pdf part 2/3
Successfully processed Perwal Cimahi 6 2024.pdf part 3/3


In [8]:
import json

def show_chunk_stats(chunk_file, n=5):
    """Print the number of chunks in a chunk file and preview the n longest ones.

    Args:
        chunk_file: JSON file containing a list of chunk records; each record
            is read for its 'chunk', 'filename' and 'chunk_index' keys and,
            optionally, 'doc_part' (shown as 0 when absent).
        n: How many of the longest chunks to preview.

    For each previewed chunk a header line and at most the first 500
    characters are printed, followed by '...' when the chunk is longer.
    """
    with open(chunk_file, 'r', encoding='utf-8') as handle:
        chunks = json.load(handle)
    print(f"Total chunks: {len(chunks)}\n")

    def _chunk_length(record):
        return len(record['chunk'])

    # Longest first; list.sort is stable, so equally long chunks keep file order.
    longest_first = list(chunks)
    longest_first.sort(key=_chunk_length, reverse=True)

    print(f"Top {n} longest chunks:")
    i = 0
    for chunk in longest_first[:n]:
        print(f"\nChunk #{i+1} (filename: {chunk['filename']}, part: {chunk.get('doc_part', 0)}, index: {chunk['chunk_index']}, length: {len(chunk['chunk'])}):\n")
        body = chunk['chunk']
        preview = body[:500]
        if len(body) > 500:
            preview = preview + '...'
        print(preview)
        i += 1

# Example usage: summarise chunks.json and preview its five longest chunks
show_chunk_stats(chunk_file='chunks.json', n=5)

Total chunks: 873

Top 5 longest chunks:

Chunk #1 (filename: Perwal Cimahi 6 2024.pdf, part: 0, index: 4, length: 997):

Badan adalah sekumpulan orang dan/atau modal yang merupakan kesatuan, baik yang melakukan usaha maupun yang tidak melakukan usaha yang meliputi perseroan terbatas, perseroan komanditer, perseroan lainnya, badan usaha milik negara, badan usaha milik daerah, atau badan usaha milik desa, dengan nama dan dalam bentuk apa pun, firma, kongsi, koperasi, dana pensiun, persekutuan, perkumpulan, yayasan, organisasi masa, organisasi sosial politik, atau organisasi lainnya, lembaga dan bentuk badan lainnya,...

Chunk #2 (filename: Perwal Cimahi 6 2024.pdf, part: 1, index: 120, length: 985):

n Pajak / Reklame Tahun (Rp/M) Pajak BILLBOARD 1 250,000 375,000 500,000 M2 50,000 1 Tahun / BANDO /3 Bln PAPAN MERK 2 (PMT) 100,000 150,000 200,000 M2 50,000 1 Tahun / MELEKAT/ /3 Bln DINDING DAN BANGUNAN NEON SIGN 3 M2 50,000 1 Tahun / NEON 150,000 200,000 250,000 /3 Bln BOX 4 BALIHO 50,0

In [None]:

import requests
import json
import time

def embed_chunks_batch(chunk_file, output_file, api_url, api_key, model="jina-embeddings-v3", batch_size=8, sleep_time=0.5, late_chunking=True, timeout=60):
    """
    Embed all chunks using Jina Embeddings API in batches and save the results to a JSON file.

    Args:
        chunk_file: JSON file with a list of chunk records; each record is read
            for 'chunk', 'filename', 'chunk_index' and optionally 'doc_part'.
        output_file: JSON file to write; receives a list of
            ``{"filename", "doc_part", "chunk_index", "embedding"}`` objects.
        api_url: Endpoint the embedding requests are POSTed to.
        api_key: Token sent as ``Authorization: Bearer <api_key>``.
        model: Value sent as ``model`` in each request.
        batch_size: Number of chunks per request; must be positive.
        sleep_time: Seconds to pause between consecutive requests.
        late_chunking: Value sent as ``late_chunking`` in each request.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        ValueError: If ``batch_size`` is zero or negative.

    A batch that fails (network error, non-200 status, or a response that does
    not contain one non-empty embedding per input) is reported on stdout and
    skipped, so the output may hold fewer records than there are chunks.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    with open(chunk_file, 'r', encoding='utf-8') as f:
        chunks = json.load(f)

    embeddings = []
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}'
    }
    total = len(chunks)

    for i in range(0, total, batch_size):
        batch = chunks[i:i+batch_size]
        batch_no = i // batch_size + 1
        # NOTE(review): batches are cut purely by position, so one request can
        # mix chunks from different files. If late chunking treats the inputs
        # of a request as one shared context, that mixing may be unintended —
        # confirm against the API documentation.
        data = {
            "model": model,
            "task": "retrieval.passage",
            "late_chunking": late_chunking,
            "input": [c["chunk"] for c in batch]
        }
        vectors, error = _request_batch_embeddings(api_url, headers, data, len(batch), timeout)
        if error is None:
            for c, vector in zip(batch, vectors):
                embeddings.append({
                    "filename": c["filename"],
                    "doc_part": c.get("doc_part", 0),
                    "chunk_index": c["chunk_index"],
                    "embedding": vector
                })
            print(f"Embedded batch {batch_no} ({i+1}-{i+len(batch)}) of {total}")
        else:
            print(f"Error embedding batch {batch_no}: {error}")
        # Pause only between requests; there is nothing to wait for after the last one.
        if i + batch_size < total:
            time.sleep(sleep_time)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(embeddings, f, ensure_ascii=False, indent=4)


def _request_batch_embeddings(api_url, headers, payload, expected_count, timeout):
    """POST one batch and return ``(vectors, None)`` on success or ``(None, message)`` on failure."""
    try:
        # Without a timeout a stalled connection would hang the cell forever.
        response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        return None, str(exc)
    if response.status_code != 200:
        return None, f"{response.status_code} {response.text}"
    # Presumably each returned item carries an "index" naming the input it
    # belongs to; order by it so vectors line up with the batch. The sort is
    # stable, so items without an index keep the response order.
    items = sorted(response.json().get("data", []), key=lambda item: item.get("index", 0))
    vectors = [item.get("embedding") for item in items]
    # Refuse short or incomplete responses: pairing them positionally would
    # attach vectors to the wrong chunks or store empty vectors.
    if len(vectors) != expected_count or not all(vectors):
        usable = sum(1 for vector in vectors if vector)
        return None, f"expected {expected_count} embeddings, got {usable}"
    return vectors, None

# Example usage: embed every chunk in chunks.json, eight per request, and
# store the vectors in embeddings.json.
embed_chunks_batch(
    'chunks.json',
    'embeddings.json',
    'https://api.jina.ai/v1/embeddings',
    os.getenv('JINA_API_KEY'),
    batch_size=8,
)


Embedded batch 1 (1-8) of 873)
Embedded batch 2 (9-16) of 873)
Embedded batch 3 (17-24) of 873)
Embedded batch 4 (25-32) of 873)
Embedded batch 5 (33-40) of 873)
Embedded batch 6 (41-48) of 873)
Embedded batch 7 (49-56) of 873)
Embedded batch 8 (57-64) of 873)
Embedded batch 9 (65-72) of 873)
Embedded batch 10 (73-80) of 873)
Embedded batch 11 (81-88) of 873)
Embedded batch 12 (89-96) of 873)
Embedded batch 13 (97-104) of 873)
Embedded batch 14 (105-112) of 873)
Embedded batch 15 (113-120) of 873)
Embedded batch 16 (121-128) of 873)
Embedded batch 17 (129-136) of 873)
Embedded batch 18 (137-144) of 873)
Embedded batch 19 (145-152) of 873)
Embedded batch 20 (153-160) of 873)
Embedded batch 21 (161-168) of 873)
Embedded batch 22 (169-176) of 873)
Embedded batch 23 (177-184) of 873)
Embedded batch 24 (185-192) of 873)
Embedded batch 25 (193-200) of 873)
Embedded batch 26 (201-208) of 873)
Embedded batch 27 (209-216) of 873)
Embedded batch 28 (217-224) of 873)
Embedded batch 29 (225-232) o

In [10]:
import json
import numpy as np
import faiss

# Read the stored embedding records back from embeddings.json
with open('embeddings.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One vector per record, in file order
vectors = [item['embedding'] for item in data]
# Matching metadata in the same order, so position i in the index
# corresponds to metadata[i]
metadata = [
    {key: item.get(key) for key in ('filename', 'doc_part', 'chunk_index')}
    for item in data
]

# Build a float32 matrix with one row per vector
vecs_np = np.asarray(vectors, dtype=np.float32)

# Flat FAISS index that compares vectors by L2 (Euclidean) distance
dim = vecs_np.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(vecs_np)

# Persist the index to disk
faiss.write_index(index, 'faiss_index')

# Persist the metadata next to the index for later lookups
with open('faiss_metadata.json', 'w', encoding='utf-8') as f:
    json.dump(metadata, f, ensure_ascii=False, indent=2)

print(f"FAISS index created with {index.ntotal} vectors and saved to 'faiss_index'.")

FAISS index created with 873 vectors and saved to 'faiss_index'.
