<a href="https://colab.research.google.com/github/RM-RAMASAMY/Clustering-Algorithms/blob/main/LLM_document_embeddings_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title Install Required Libraries:
!pip install transformers scikit-learn PyMuPDF



In [2]:
# @title Extract Text from PDFs:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        A single ``str`` made of each page's ``get_text()`` output, in page
        order, with nothing inserted between pages. A document without pages
        yields ``""``.
    """
    # The context manager closes the document (and its file handle) even when
    # extraction fails part-way through; previously it was never closed.
    with fitz.open(pdf_path) as doc:
        # join() assembles the result once instead of re-copying a growing
        # string for every page.
        return "".join(page.get_text() for page in doc)

# PDFs to cluster, given as bare file names (resolved against the working directory)
pdf_paths = [
    "butterflies.pdf",
    "docker.pdf",
    "flowers.pdf",
    "rockmusic.pdf",
]

# One extracted text per PDF, in the same order as pdf_paths
texts = list(map(extract_text_from_pdf, pdf_paths))

In [3]:
# @title Generate Embeddings:
from transformers import AutoTokenizer, AutoModel
import torch

# Checkpoint name shared by the tokenizer and the model loaded below
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Load the tokenizer first, then the model, both from the same checkpoint
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)

# Function to generate embeddings
def get_embeddings(texts):
    """Embed each text as the mean of its non-padding token vectors.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        texts: The texts to embed; the notebook passes a list of strings.

    Returns:
        A NumPy array obtained by averaging ``last_hidden_state`` over dim 1,
        counting only positions where the attention mask is 1. Presumably
        one row per input text -- confirm against the model's output shape.

    Note:
        ``truncation=True`` cuts every text to the tokenizer's maximum length,
        so only the beginning of a long document contributes to its embedding.
    """
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # padding=True pads shorter texts up to the longest one in the batch. A
    # plain mean over dim 1 would average those padding positions in as well,
    # so weight every position by the attention mask (1 = real token, 0 = pad).
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # clamp guards against division by zero for a row with no real tokens.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / counts).numpy()

# Embed all extracted PDF texts in a single call
embeddings = get_embeddings(texts=texts)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# @title Cluster Embeddings:
from sklearn.cluster import KMeans

# Number of clusters
num_clusters = 3

# Apply KMeans clustering.
# n_init is pinned explicitly: its default differs between scikit-learn
# releases (10 in older versions, 'auto' in newer ones, with a FutureWarning in
# between), so relying on the default makes the assignments version-dependent.
kmeans = KMeans(n_clusters=num_clusters, random_state=0, n_init=10)
clusters = kmeans.fit_predict(embeddings)

# Print cluster assignments, one line per PDF in pdf_paths order
for pdf_path, cluster in zip(pdf_paths, clusters):
    print(f"PDF: {pdf_path} -> Cluster: {cluster}")

PDF: butterflies.pdf -> Cluster: 0
PDF: docker.pdf -> Cluster: 2
PDF: flowers.pdf -> Cluster: 0
PDF: rockmusic.pdf -> Cluster: 1


In [5]:
# @title Prepare Data for Visualization: Save the clustering results to a JSON file that can be used by the D3.js script.
import json

# One {"pdf", "cluster"} record per document; int() converts each label to a
# built-in int before JSON serialisation
data = [
    {"pdf": path, "cluster": int(label)}
    for path, label in zip(pdf_paths, clusters)
]

# Serialise the records and write them to clustering_results.json
with open("clustering_results.json", "w") as f:
    f.write(json.dumps(data))

In [21]:
import numpy as np
import plotly.express as px
import pandas as pd

# Project the document embeddings onto their first three principal axes so the
# plotted positions reflect how similar the documents are. The coordinates
# were previously unseeded np.random.rand() values, unrelated to the documents
# and different on every run.
centered = np.asarray(embeddings, dtype=float)
centered = centered - centered.mean(axis=0)
u, s, vt = np.linalg.svd(centered, full_matrices=False)
coords = u[:, :3] * s[:3]
# Fewer than three components exist when there are fewer than three documents
# or embedding dimensions; pad with zeros so x, y and z are always defined.
if coords.shape[1] < 3:
    coords = np.pad(coords, ((0, 0), (0, 3 - coords.shape[1])))

# Prepare data for visualization: one record per PDF with its cluster label
# and projected coordinates
data = [
    {"pdf": pdf_path, "cluster": int(cluster), "x": float(x), "y": float(y), "z": float(z)}
    for pdf_path, cluster, (x, y, z) in zip(pdf_paths, clusters, coords)
]

# Convert data to a DataFrame
df = pd.DataFrame(data)

# Create a 3D scatter plot
fig = px.scatter_3d(df, x='x', y='y', z='z', color='cluster', hover_data=['pdf'], title='PDF Clustering Visualization')

# Use a uniform marker size for the marker traces
fig.update_traces(marker=dict(size=5),
                  selector=dict(mode='markers'))

# Show the plot
fig.show()