In [2]:
import numpy as np
from sklearn.cluster import KMeans
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string

In [3]:
# Compute inertia and assign labels to closest centroid
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances

def compute_inertia(centroids, data, threshold=100000, distance_metric='manhattan'):
    # Choose distance metric
    if distance_metric == 'euclidean':
        distances = euclidean_distances(data, centroids)
    elif distance_metric == 'manhattan':
        distances = manhattan_distances(data, centroids)
    else:
        raise ValueError("Unsupported distance metric. Choose 'euclidean' or 'manhattan'.")

    labels = np.argmin(distances, axis=1)
    min_distances = np.min(distances, axis=1)

    # Assign -1 for distances greater than threshold
    labels[min_distances > threshold] = -1

    # Compute inertia for assigned data points
    assigned_data_points = data[labels != -1]
    if len(assigned_data_points) > 0:
        assigned_labels = labels[labels != -1]
        inertia = np.sum((assigned_data_points - centroids[assigned_labels]) ** 2)
    else:
        inertia = 0

    # Calculate additional metrics if needed
    num_outliers = np.sum(labels == -1)
    average_distance = np.mean(min_distances[labels != -1]) if len(assigned_data_points) > 0 else 0

    return inertia

In [4]:
class Firefly:
    def __init__(self, data, n_clusters):
        self.position = data[np.random.choice(data.shape[0], n_clusters, replace=False), :]
        self.fitness = compute_inertia(self.position, data)


In [5]:
def firefly_kmeans(data, n_clusters, n_fireflies=10, max_iter=100, alpha=0.5, gamma=1.0):
    fireflies = [Firefly(data, n_clusters) for _ in range(n_fireflies)]

    for _ in range(max_iter):
        fireflies.sort(key=lambda x: x.fitness)
        for i in range(n_fireflies):
            for j in range(n_fireflies):
                if fireflies[j].fitness < fireflies[i].fitness:  # Move i towards j
                    r = np.linalg.norm(fireflies[i].position - fireflies[j].position)
                    beta0 = 1
                    beta = beta0 * np.exp(-gamma * r**2)
                    fireflies[i].position += beta * (fireflies[j].position - fireflies[i].position) + alpha * (np.random.rand(*fireflies[i].position.shape) - 0.5)
                    fireflies[i].fitness = compute_inertia(fireflies[i].position, data)

    best_firefly = fireflies[0]
    final_kmeans = KMeans(n_clusters=n_clusters, init=np.array(best_firefly.position), n_init=1, max_iter=300)
    final_kmeans.fit(data)
    return final_kmeans

In [6]:
newsgroups_dataset = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'), random_state=42)

In [7]:
nltk.download('stopwords')
nltk.download('punkt')

# Initialize stopwords, stemmer, and punctuation set
stop_words = set(stopwords.words('english'))
punctuation_set = set(string.punctuation)
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to C:\Users\Devendra
[nltk_data]     Nemade\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Devendra
[nltk_data]     Nemade\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text)
    # Convert to lowercase, remove punctuation, remove stopwords, and stem
    processed_tokens = [stemmer.stem(word.lower()) for word in tokens if word.lower() not in stop_words and word not in punctuation_set]
    # Re-join processed tokens into a single string
    return ' '.join(processed_tokens)

In [9]:
processed_data = [preprocess_text(doc) for doc in newsgroups_dataset.data]
# Use TfidfVectorizer to convert the raw text into TF-IDF features
print("Vectorizing the text data...")
import gensim.downloader as api
from sklearn.decomposition import PCA
# Load Word2Vec model trained on Google News dataset
print("Loading Word2Vec model...")
word2vec_model = api.load('word2vec-google-news-300')

# Define a function to get the vector representation of a document
def document_vector(doc):
    # Remove punctuation and tokenize the document
    tokens = [word.lower() for word in word_tokenize(doc) if word.lower() not in punctuation_set]
    # Filter out tokens that are not in the Word2Vec model's vocabulary
    tokens = [word for word in tokens if word in word2vec_model.key_to_index]
    if len(tokens) == 0:
        return np.zeros(word2vec_model.vector_size)
    # Calculate the mean of word vectors for tokens in the document
    return np.mean(word2vec_model[tokens], axis=0)

# Obtain Word2Vec embeddings for each document
print("Obtaining Word2Vec embeddings for each document...")
word2vec_embeddings = np.array([document_vector(doc) for doc in processed_data])
n_components = 100
svd = TruncatedSVD(n_components=n_components, random_state=42)
tfidf_matrix_reduced = svd.fit_transform(word2vec_embeddings)


Vectorizing the text data...
Loading Word2Vec model...
Obtaining Word2Vec embeddings for each document...


In [10]:
n_clusters = 20
kmeans = firefly_kmeans(tfidf_matrix_reduced, n_clusters)
print("Cluster centers:", kmeans.cluster_centers_)
print("Labels:", kmeans.labels_)
#Firefly-Kmeans on 20newsgroups


Cluster centers: [[ 7.48137066e-01 -1.01397252e-01 -8.03535080e-02 ... -1.37926776e-03
  -2.91331663e-03  1.85280105e-03]
 [ 7.68187778e-01  1.42020798e-01 -2.51274254e-01 ...  1.45760558e-03
  -5.18606089e-04 -5.11972286e-04]
 [ 7.63019607e-01 -1.19693480e-01  9.67469290e-02 ...  2.79883823e-03
   2.92514697e-03  2.22010558e-03]
 ...
 [ 8.08995502e-01 -1.03511840e-01 -3.15730925e-02 ... -1.13072955e-03
  -1.67079916e-05 -1.25393916e-03]
 [ 8.59194332e-01  3.03169800e-02  1.33908126e-01 ... -1.95825012e-03
   1.36219289e-04  1.94808343e-03]
 [ 8.18640129e-01  1.66322391e-02  2.52665707e-01 ...  3.78496752e-04
  -3.93213646e-03 -2.38146909e-03]]
Labels: [ 3 12 15 ...  1  2 14]


In [11]:
silhouette = silhouette_score(tfidf_matrix_reduced, kmeans.labels_)
davies_bouldin = davies_bouldin_score(tfidf_matrix_reduced, kmeans.labels_)
calinski_harabasz = calinski_harabasz_score(tfidf_matrix_reduced, kmeans.labels_)

print("Cluster centers:", kmeans.cluster_centers_)
print("Labels:", kmeans.labels_)
print(f"Silhouette Score: {silhouette}")
print(f"Davies-Bouldin Index: {davies_bouldin}")
print(f"Calinski-Harabasz Index: {calinski_harabasz}")

Cluster centers: [[ 7.48137066e-01 -1.01397252e-01 -8.03535080e-02 ... -1.37926776e-03
  -2.91331663e-03  1.85280105e-03]
 [ 7.68187778e-01  1.42020798e-01 -2.51274254e-01 ...  1.45760558e-03
  -5.18606089e-04 -5.11972286e-04]
 [ 7.63019607e-01 -1.19693480e-01  9.67469290e-02 ...  2.79883823e-03
   2.92514697e-03  2.22010558e-03]
 ...
 [ 8.08995502e-01 -1.03511840e-01 -3.15730925e-02 ... -1.13072955e-03
  -1.67079916e-05 -1.25393916e-03]
 [ 8.59194332e-01  3.03169800e-02  1.33908126e-01 ... -1.95825012e-03
   1.36219289e-04  1.94808343e-03]
 [ 8.18640129e-01  1.66322391e-02  2.52665707e-01 ...  3.78496752e-04
  -3.93213646e-03 -2.38146909e-03]]
Labels: [ 3 12 15 ...  1  2 14]
Silhouette Score: 0.03811588021246144
Davies-Bouldin Index: 2.7280705464101653
Calinski-Harabasz Index: 380.81844351933836


In [12]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
silhouette_scores = []
davies_bouldin_scores = []
calinski_harabasz_scores = []
Labels_save=[];
for n_clusters in range(2, 21):
    # Run PSO-KMeans with the current number of clusters
    kmeans =  firefly_kmeans(tfidf_matrix_reduced, n_clusters)
    
    # Calculate metrics
    silhouette = silhouette_score(tfidf_matrix_reduced, kmeans.labels_)
    davies_bouldin = davies_bouldin_score(tfidf_matrix_reduced, kmeans.labels_)
    calinski_harabasz = calinski_harabasz_score(tfidf_matrix_reduced, kmeans.labels_)
    
    # Store metrics
    silhouette_scores.append(silhouette)
    davies_bouldin_scores.append(davies_bouldin)
    calinski_harabasz_scores.append(calinski_harabasz)
    

In [13]:
import matplotlib.pyplot as plt
# Plotting the metrics
fig, axs = plt.subplots(3, 1, figsize=(10, 15))

Silhouette Score
axs[0].plot(range(2, 21), silhouette_scores, marker='o', linestyle='-', color='blue')
axs[0].set_title('Silhouette Score')
axs[0].set_xlabel('Number of Clusters')
axs[0].set_ylabel('Silhouette Score')

# Davies-Bouldin Score
axs[1].plot(range(2, 21), davies_bouldin_scores, marker='o', linestyle='-', color='red')
axs[1].set_title('Davies-Bouldin Score')
axs[1].set_xlabel('Number of Clusters')
axs[1].set_ylabel('Davies-Bouldin Score')
plt.tight_layout()
plt.savefig("Davies-Bouldin Score_News_word2vec_firefly-Kmeans.png")
# Calinski-Harabasz Score
axs[2].plot(range(2, 21), calinski_harabasz_scores, marker='o', linestyle='-', color='green')
axs[2].set_title('Calinski-Harabasz Score')
axs[2].set_xlabel('Number of Clusters')
axs[2].set_ylabel('Calinski-Harabasz Score')
plt.tight_layout()
plt.savefig("Calinski-Harabasz Score_News_word2vec_firefly-Kmeans.png")
plt.tight_layout()
plt.show()

SyntaxError: invalid syntax (2870759568.py, line 5)

In [14]:
import pandas as pd
data = {'Silhouette Score':silhouette_scores,'davies_bouldin_scores': davies_bouldin_scores,
        'calinski_harabasz_scores': calinski_harabasz_scores}
df=pd.DataFrame(data,index=range(2, 21))
df

Unnamed: 0,Silhouette Score,davies_bouldin_scores,calinski_harabasz_scores
2,0.10269,3.469627,1140.288308
3,0.058308,3.278812,1011.011282
4,0.089144,2.571697,1015.983662
5,0.072582,2.791839,915.089955
6,0.064052,2.958459,811.491039
7,0.061607,2.653671,735.711188
8,0.057261,2.916292,681.697051
9,0.060824,2.798546,636.332599
10,0.063984,2.769854,600.55484
11,0.049412,2.909121,566.461097


In [15]:
df.to_csv('News_word2vec_FIREFLY-Kmeans.csv')