In [1]:
import nltk
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.cluster import rand_score
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Download Reuters dataset
nltk.download('reuters')

# Select relevant classes (crude, interest, and grain)
relevant_classes = ['crude', 'interest', 'grain']

# Filter documents belonging to the relevant classes
documents = [file_id for file_id in reuters.fileids() if set(reuters.categories(file_id)).issubset(relevant_classes)]

# Tokenize and vectorize documents
corpus = [' '.join(reuters.words(file_id)) for file_id in documents]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# Compute single-link clustering
linkage_matrix = linkage(X.toarray(), method='single')

# Cut the dendrogram at the second branch from the top to obtain K=3 clusters
k = 3
clusters = fcluster(linkage_matrix, k, criterion='maxclust')

# Compute Rand index
true_labels = [relevant_classes.index(reuters.categories(file_id)[0]) for file_id in documents]
rand_index_single = rand_score(true_labels, clusters)

print(f'Rand Index: {rand_index_single}')


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Ritik\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


Rand Index: 0.4472750593003992


In [2]:
import nltk
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.cluster import rand_score
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Download Reuters dataset
nltk.download('reuters')

# Select relevant classes (crude, interest, and grain)
relevant_classes = ['crude', 'interest', 'grain']

# Filter documents belonging to the relevant classes
documents = [file_id for file_id in reuters.fileids() if set(reuters.categories(file_id)).issubset(relevant_classes)]

# Tokenize and vectorize documents
corpus = [' '.join(reuters.words(file_id)) for file_id in documents]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# Compute complete-link clustering
linkage_matrix = linkage(X.toarray(), method='complete')

# Cut the dendrogram at the second branch from the top to obtain K=3 clusters
k = 3
clusters = fcluster(linkage_matrix, k, criterion='maxclust')

# Compute Rand index
true_labels = [relevant_classes.index(reuters.categories(file_id)[0]) for file_id in documents]
rand_index_complete = rand_score(true_labels, clusters)

print(f'Rand Index: {rand_index_complete}')


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Ritik\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


Rand Index: 0.5387436126173162


In [3]:
import nltk
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.cluster import rand_score
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Download Reuters dataset
nltk.download('reuters')

# Select relevant classes (crude, interest, and grain)
relevant_classes = ['crude', 'interest', 'grain']

# Filter documents belonging to the relevant classes
documents = [file_id for file_id in reuters.fileids() if set(reuters.categories(file_id)).issubset(relevant_classes)]

# Tokenize and vectorize documents
corpus = [' '.join(reuters.words(file_id)) for file_id in documents]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# Compute average-link clustering
linkage_matrix = linkage(X.toarray(), method='average')

# Cut the dendrogram at the second branch from the top to obtain K=3 clusters
k = 3
clusters = fcluster(linkage_matrix, k, criterion='maxclust')

# Compute Rand index
true_labels = [relevant_classes.index(reuters.categories(file_id)[0]) for file_id in documents]
rand_index_average = rand_score(true_labels, clusters)

print(f'Rand Index: {rand_index_average}')


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Ritik\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


Rand Index: 0.44947852647243813


In [5]:
import nltk
from nltk.corpus import reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.cluster import rand_score
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Download Reuters dataset
nltk.download('reuters')

# Select relevant classes (crude, interest, and grain)
relevant_classes = ['crude', 'interest', 'grain']

# Filter documents belonging to the relevant classes
documents = [file_id for file_id in reuters.fileids() if set(reuters.categories(file_id)).issubset(relevant_classes)]

# Tokenize and vectorize documents
corpus = [' '.join(reuters.words(file_id)) for file_id in documents]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# Compute centroid (ward) clustering
linkage_matrix = linkage(X.toarray(), method='ward')

# Cut the dendrogram at the second branch from the top to obtain K=3 clusters
k = 3
clusters = fcluster(linkage_matrix, k, criterion='maxclust')

# Compute Rand index
true_labels = [relevant_classes.index(reuters.categories(file_id)[0]) for file_id in documents]
rand_index_centroid = rand_score(true_labels, clusters)

print(f'Rand Index: {rand_index_centroid}')


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Ritik\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


Rand Index: 0.7297875051900696


In [6]:
from prettytable import PrettyTable
table = PrettyTable()
table.field_names = ['Clustering', 'Rand Index']
table.add_row(['Single_link', rand_index_single])
table.add_row(['Complete_link', rand_index_complete])
table.add_row(['GAAC', rand_index_average])
table.add_row(['Centroid', rand_index_centroid])
print(table)

+---------------+---------------------+
|   Clustering  |      Rand Index     |
+---------------+---------------------+
|  Single_link  |  0.4472750593003992 |
| Complete_link |  0.5387436126173162 |
|      GAAC     | 0.44947852647243813 |
|    Centroid   |  0.7297875051900696 |
+---------------+---------------------+
