In [52]:
import nltk
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.cluster import rand_score
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Fetch the Reuters corpus through NLTK (reports "already up-to-date" when it is present)
nltk.download('reuters')

# Target topics; a document's integer label is the position of its topic in this list
relevant_classes = ['crude', 'interest', 'grain']

# Keep only the files whose every category is one of the target topics
documents = [
    fid
    for fid in reuters.fileids()
    if all(category in relevant_classes for category in reuters.categories(fid))
]

# Rebuild each document as one space-joined token string, then weight terms
# with TF-IDF (English stop words removed)
corpus = [' '.join(tokens) for tokens in map(reuters.words, documents)]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# Single-link agglomerative clustering; SciPy's linkage needs a dense array
linkage_matrix = linkage(X.toarray(), 'single')

# Flatten the hierarchy into at most k clusters
k = 3
clusters = fcluster(linkage_matrix, t=k, criterion='maxclust')

# Ground truth: index of the first category listed for each document.
# The Rand index compares pair groupings, so cluster ids need not match class ids.
true_labels = [
    relevant_classes.index(categories[0])
    for categories in map(reuters.categories, documents)
]
rand_index = rand_score(labels_true=true_labels, labels_pred=clusters)

print(f'Rand Index: {rand_index}')


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


Rand Index: 0.4472750593003992


In [53]:
import nltk
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.cluster import rand_score
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Fetch the Reuters corpus through NLTK (reports "already up-to-date" when it is present)
nltk.download('reuters')

# Target topics; a document's integer label is the position of its topic in this list
relevant_classes = ['crude', 'interest', 'grain']

# Keep only the files whose every category is one of the target topics
documents = [
    fid
    for fid in reuters.fileids()
    if all(category in relevant_classes for category in reuters.categories(fid))
]

# Rebuild each document as one space-joined token string, then weight terms
# with TF-IDF (English stop words removed)
corpus = [' '.join(tokens) for tokens in map(reuters.words, documents)]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# Complete-link agglomerative clustering; SciPy's linkage needs a dense array
linkage_matrix = linkage(X.toarray(), 'complete')

# Flatten the hierarchy into at most k clusters
k = 3
clusters = fcluster(linkage_matrix, t=k, criterion='maxclust')

# Ground truth: index of the first category listed for each document.
# The Rand index compares pair groupings, so cluster ids need not match class ids.
true_labels = [
    relevant_classes.index(categories[0])
    for categories in map(reuters.categories, documents)
]
rand_index = rand_score(labels_true=true_labels, labels_pred=clusters)

print(f'Rand Index: {rand_index}')


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


Rand Index: 0.5387436126173162


In [54]:
import nltk
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.cluster import rand_score
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Fetch the Reuters corpus through NLTK (reports "already up-to-date" when it is present)
nltk.download('reuters')

# Target topics; a document's integer label is the position of its topic in this list
relevant_classes = ['crude', 'interest', 'grain']

# Keep only the files whose every category is one of the target topics
documents = [
    fid
    for fid in reuters.fileids()
    if all(category in relevant_classes for category in reuters.categories(fid))
]

# Rebuild each document as one space-joined token string, then weight terms
# with TF-IDF (English stop words removed)
corpus = [' '.join(tokens) for tokens in map(reuters.words, documents)]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# Average-link agglomerative clustering; SciPy's linkage needs a dense array
linkage_matrix = linkage(X.toarray(), 'average')

# Flatten the hierarchy into at most k clusters
k = 3
clusters = fcluster(linkage_matrix, t=k, criterion='maxclust')

# Ground truth: index of the first category listed for each document.
# The Rand index compares pair groupings, so cluster ids need not match class ids.
true_labels = [
    relevant_classes.index(categories[0])
    for categories in map(reuters.categories, documents)
]
rand_index = rand_score(labels_true=true_labels, labels_pred=clusters)

print(f'Rand Index: {rand_index}')


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


Rand Index: 0.44947852647243813


In [55]:
import nltk
from nltk.corpus import reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.cluster import rand_score
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Fetch the Reuters corpus through NLTK (reports "already up-to-date" when it is present)
nltk.download('reuters')

# Target topics; a document's integer label is the position of its topic in this list
relevant_classes = ['crude', 'interest', 'grain']

# Keep only the files whose every category is one of the target topics
documents = [
    fid
    for fid in reuters.fileids()
    if all(category in relevant_classes for category in reuters.categories(fid))
]

# Rebuild each document as one space-joined token string, then weight terms
# with TF-IDF (English stop words removed)
corpus = [' '.join(tokens) for tokens in map(reuters.words, documents)]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# Ward (minimum-variance) agglomerative clustering; SciPy's linkage needs a dense array.
# NOTE(review): SciPy also offers a separate method='centroid'; Ward is a different
# criterion. Confirm which of the two this analysis is meant to report.
linkage_matrix = linkage(X.toarray(), 'ward')

# Flatten the hierarchy into at most k clusters
k = 3
clusters = fcluster(linkage_matrix, t=k, criterion='maxclust')

# Ground truth: index of the first category listed for each document.
# The Rand index compares pair groupings, so cluster ids need not match class ids.
true_labels = [
    relevant_classes.index(categories[0])
    for categories in map(reuters.categories, documents)
]
rand_index = rand_score(labels_true=true_labels, labels_pred=clusters)

print(f'Rand Index: {rand_index}')


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


Rand Index: 0.7297875051900696
