In [6]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string

# Compute inertia and assign labels to closest centroid
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances

In [8]:
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances

def compute_inertia(centroids, data, threshold=100000, distance_metric='manhattan'):
    # Ensure centroids is a 2D array: if it's 1D, reshape it to 2D (1, number of features)
    if centroids.ndim == 1:
        centroids = centroids.reshape(1, -1)
    
    # Choose distance metric
    if distance_metric == 'euclidean':
        distances = euclidean_distances(data, centroids)
    elif distance_metric == 'manhattan':
        distances = manhattan_distances(data, centroids)
    else:
        raise ValueError("Unsupported distance metric. Choose 'euclidean' or 'manhattan'.")

    labels = np.argmin(distances, axis=1)
    min_distances = np.min(distances, axis=1)

    # Assign -1 for distances greater than threshold
    labels[min_distances > threshold] = -1

    # Compute inertia for assigned data points
    assigned_data_points = data[labels != -1]
    if len(assigned_data_points) > 0:
        assigned_labels = labels[labels != -1]
        inertia = np.sum((assigned_data_points - centroids[assigned_labels]) ** 2)
    else:
        inertia = 0

    return inertia


In [12]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

def initialize_centroids_with_kmeans(data, n_clusters):
    """Initialize centroids using KMeans."""
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(data)
    return kmeans.cluster_centers_
class Firefly:
    def __init__(self, position):
        self.position = position
        self.fitness = float('inf')
class Cuckoo:
    def __init__(self, position, data):
        self.position = position
        self.fitness= compute_inertia_and_labels(position, data)

def firefly_algorithm_update(centroids, data, n_fireflies=10, alpha=0.5, gamma=1.0):
    """Simplified Firefly algorithm to update centroids."""
    # Initialize fireflies based on the current centroids
    if len(centroids) < n_fireflies:
        # If there are fewer centroids than fireflies, replicate centroids
        additional_fireflies = n_fireflies - len(centroids)
        extra_centroids = np.tile(centroids, (additional_fireflies, 1))
        fireflies = [Firefly(centroid) for centroid in np.vstack((centroids, extra_centroids))[:n_fireflies]]
    else:
        fireflies = [Firefly(centroid) for centroid in centroids[:n_fireflies]]
    
    # Evaluate initial fitness
    for firefly in fireflies:
        firefly.fitness = compute_inertia(firefly.position, data)
    
    # Firefly algorithm optimization loop
    for i in range(n_fireflies):
        for j in range(n_fireflies):
            if fireflies[j].fitness < fireflies[i].fitness:  # Move i towards j
                r = np.linalg.norm(fireflies[i].position - fireflies[j].position)
                beta0 = 1
                beta = beta0 * np.exp(-gamma * r**2)
                fireflies[i].position += beta * (fireflies[j].position - fireflies[i].position) + alpha * (np.random.rand(*fireflies[i].position.shape) - 0.5)
                fireflies[i].fitness = compute_inertia(fireflies[i].position, data)
    
    # Update centroids based on the best fireflies
    updated_centroids = np.array([firefly.position for firefly in fireflies])
    
    return updated_centroids


def cuckoo_search_update(centroids, data, n_nests=10, pa=0.25):
    """Simplified Cuckoo Search algorithm to update centroids."""
    cuckoos = [Cuckoo(centroid, data) for centroid in centroids]
    
    for _ in range(n_nests):
        for cuckoo in cuckoos:
            # Generate new solution (new position for centroids)
            new_position = cuckoo.position + np.random.uniform(-1, 1, cuckoo.position.shape)
            new_fitness, _ = compute_inertia_and_labels(new_position, data)

            # Randomly choose another cuckoo (nest)
            random_cuckoo = np.random.choice(cuckoos)

            # Replace the position of the randomly chosen cuckoo if the new solution is better (has lower inertia)
            if new_fitness < random_cuckoo.fitness:
                random_cuckoo.position = new_position
                random_cuckoo.fitness = new_fitness

    # After iterating, select the best solutions
    cuckoos.sort(key=lambda x: x.fitness)
    
    # Replace a fraction of the worst nests with new random solutions
    for i in range(int(len(cuckoos) * pa), len(cuckoos)):
        new_position = np.random.rand(*centroids.shape)  # Generate completely new solutions
        cuckoos[i] = Cuckoo(new_position, data)

    # Extract updated centroids from cuckoos
    updated_centroids = np.array([cuckoo.position for cuckoo in cuckoos])
    
    return updated_centroids




def firefly_cuckoo_kmeans(data, n_clusters, max_iter=100):
    """Hybrid Firefly-Cuckoo-KMeans algorithm."""
    centroids = initialize_centroids_with_kmeans(data, n_clusters)
    for iteration in range(max_iter):
        centroids = firefly_algorithm_update(centroids, data)
        centroids = cuckoo_search_update(centroids, data)
        # Optionally, you can intersperse KMeans steps to refine centroids
        
    inertia, labels = compute_inertia_and_labels(centroids, data)
    return centroids, labels, inertia

# Example usage
data = np.random.rand(100, 2)  # Example data
n_clusters = 3
centroids, labels, inertia = firefly_cuckoo_kmeans(data, n_clusters)
print("Centroids:", centroids)
print("Labels:", labels)
print("Inertia:", inertia)


silhouette = silhouette_score(data, labels)
davies_bouldin = davies_bouldin_score(data, labels)
calinski_harabasz = calinski_harabasz_score(data, labels)

print(f"Silhouette Score: {silhouette}")
print(f"Davies-Bouldin Index: {davies_bouldin}")
print(f"Calinski-Harabasz Index: {calinski_harabasz}")


  super()._check_params_vs_input(X, default_n_init=10)


ValueError: Expected 2D array, got 1D array instead:
array=[0.27137297 0.29232688].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.