In [1]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
import os

# Cluster the text files in the dataset folder with agglomerative
# (hierarchical) clustering and save the cluster -> filenames mapping as JSON.
dataset_dir = "dataset/dataset"
output_path = "document_clusters1.json"

# Load documents from the "dataset" folder.
# os.listdir returns entries in arbitrary order, so sort them: the row order of
# the feature matrix then stays the same from run to run.
documents = {}
for filename in sorted(os.listdir(dataset_dir)):
    path = os.path.join(dataset_dir, filename)
    if not os.path.isfile(path):
        # A sub-directory cannot be opened as a text document; skip it.
        continue
    with open(path, "r", encoding="utf-8") as file:
        documents[filename] = file.read()

# Extract features using TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
X = vectorizer.fit_transform(documents.values())

# Cluster the documents using Agglomerative (hierarchical) clustering
n_clusters = 7  # Adjust based on your dataset
agg_clustering = AgglomerativeClustering(n_clusters=n_clusters)
# The TF-IDF matrix is converted to a dense array before clustering.
labels = agg_clustering.fit_predict(X.toarray())

# Output clustering result as JSON.
# `documents` preserves insertion order, so its keys line up with the rows of X.
clusters = {}
for doc_id, label in zip(documents.keys(), labels):
    # int(): use a built-in int so the label is a valid JSON object key.
    clusters.setdefault(int(label), []).append(doc_id)

with open(output_path, "w") as f:
    json.dump(clusters, f, indent=2)

# Report the path that was actually written (the message used to name a
# different file than the one created above).
print(f"Clustering complete. Results saved in {output_path}")

Clustering complete. Results saved in document_clusters.json


In [2]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import SpectralCoclustering
import os
import numpy as np

# Bicluster the text files in the dataset folder with Spectral Co-clustering
# and save the cluster -> filenames mapping as JSON.
dataset_dir = "dataset/dataset"
output_path = "document_clusters_biclustering.json"

# Load documents from the "dataset" folder.
# os.listdir returns entries in arbitrary order, so sort them: the row order of
# the feature matrix then stays the same from run to run.
documents = {}
for filename in sorted(os.listdir(dataset_dir)):
    path = os.path.join(dataset_dir, filename)
    if not os.path.isfile(path):
        # A sub-directory cannot be opened as a text document; skip it.
        continue
    with open(path, "r", encoding="utf-8") as file:
        documents[filename] = file.read()

# Extract features using TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
X = vectorizer.fit_transform(documents.values())

# Perform biclustering with Spectral Coclustering
n_clusters = 7  # Adjust based on your dataset
model = SpectralCoclustering(n_clusters=n_clusters, random_state=42)
# fit() accepts the sparse TF-IDF matrix directly, so the memory-hungry dense
# copy from X.toarray() is not needed.
model.fit(X)

# Each document is assigned to a cluster (stored in model.row_labels_)
doc_ids = list(documents.keys())
labels = model.row_labels_

# Group documents by their assigned cluster
clusters = {}
for doc_id, label in zip(doc_ids, labels):
    # int(): use a built-in int so the label is a valid JSON object key.
    clusters.setdefault(int(label), []).append(doc_id)

with open(output_path, "w") as f:
    json.dump(clusters, f, indent=2)

print(f"Biclustering complete. Results saved in {output_path}")

Biclustering complete. Results saved in document_clusters_biclustering.json




In [3]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import os

# Cluster the text files in the dataset folder with KMeans and save the
# cluster -> filenames mapping as JSON.
dataset_dir = "dataset/dataset"
output_path = "document_clusters_kmeans.json"

# Load documents from the "dataset" folder.
# os.listdir returns entries in arbitrary order, so sort them: the row order of
# the feature matrix then stays the same from run to run.
documents = {}
for filename in sorted(os.listdir(dataset_dir)):
    path = os.path.join(dataset_dir, filename)
    if not os.path.isfile(path):
        # A sub-directory cannot be opened as a text document; skip it.
        continue
    with open(path, "r", encoding="utf-8") as file:
        documents[filename] = file.read()

# Extract features using TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
X = vectorizer.fit_transform(documents.values())

# Perform clustering with KMeans
n_clusters = 7  # Adjust the number of clusters as needed
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, so relying on it makes the result depend on the installed version
# even with a fixed random_state.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
labels = kmeans.fit_predict(X)

# Each document is assigned a cluster label; group documents by cluster
doc_ids = list(documents.keys())
clusters = {}
for doc_id, label in zip(doc_ids, labels):
    # int(): use a built-in int so the label is a valid JSON object key.
    clusters.setdefault(int(label), []).append(doc_id)

with open(output_path, "w") as f:
    json.dump(clusters, f, indent=2)

print(f"KMeans clustering complete. Results saved in {output_path}")

KMeans clustering complete. Results saved in document_clusters_kmeans.json
