## Comparative Clustering Analysis: K-Means vs AffinityPropagation

## K-means Clustering

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Processing and ML Libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Additional Preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class KMeansClustering:
    def __init__(self, file_path):
        """
        Initialize the K-Means clustering analysis
        
        Parameters:
        file_path (str): Path to the Excel file
        """
        # Download NLTK resources
        # nltk.download('punkt', quiet=True)
        # nltk.download('stopwords', quiet=True)
        
        # Load and preprocess data
        self.load_and_preprocess_data(file_path)
    
    def load_and_preprocess_data(self, file_path):
        """
        Load and preprocess data from Excel file
        """
        # Read Excel sheets
        ib_data = pd.read_excel(file_path, sheet_name='DATA IB')
        ik_data = pd.read_excel(file_path, sheet_name='DATA IK')
        
        # Combine purposes
        ib_data['source'] = 'IB'
        ik_data['source'] = 'IK'
        ib_data['purpose'] = ib_data['desc']
        ik_data['purpose'] = ik_data['tujuan']
        
        # Merge datasets
        self.combined_data = pd.concat([
            ib_data[['purpose', 'source']], 
            ik_data[['purpose', 'source']]
        ], ignore_index=True)
        
        # Remove NaN values
        self.combined_data = self.combined_data.dropna(subset=['purpose'])

        # Preprocess purposes
        self.combined_data['purpose'] = self.combined_data['purpose'].apply(self.advanced_text_preprocessing)

        self.combined_data = self.combined_data.drop_duplicates(subset=['purpose'])
    
    def advanced_text_preprocessing(self, text):
        """
        Advanced text preprocessing
        """
        # Convert to lowercase
        text = str(text).lower()

        # Remove unwanted characters such as `_x000D_` and non-alphanumeric characters
        text = re.sub(r'_x[0-9a-fA-F]{4}_', '', text) # Remove `_x000D_` or similar

        text = re.sub(r'[^a-zA-Z\s]', '', text) # Keep only letters and spaces

        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Tokenization
        try:
            tokens = word_tokenize(text)
        except:
            tokens = text.split()
        
        # Remove stopwords
        try:
            stop_words = set(stopwords.words('indonesian'))
        except:
            stop_words = set()
        
        tokens = [token for token in tokens if token not in stop_words]
        
        return ' '.join(tokens)
    
    def feature_extraction(self):
        """
        Extract features from text data
        
        Returns:
        numpy array: Feature matrix
        list: Feature names
        """
        # Preprocess text
        self.combined_data['processed_purpose'] = self.combined_data['purpose'].apply(self.advanced_text_preprocessing)
        
        # Feature extraction using TF-IDF
        vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
        feature_matrix = vectorizer.fit_transform(self.combined_data['processed_purpose'])
        
        # Store feature names for interpretation
        self.feature_names = vectorizer.get_feature_names_out()
        
        return feature_matrix.toarray(), self.feature_names
    
    def perform_kmeans_clustering(self, features, feature_names):
        """
        Perform K-Means clustering with optimization
        
        Parameters:
        features (numpy array): Feature matrix
        feature_names (list): Feature names
        
        Returns:
        dict: Clustering results and insights
        """
        # Dimensionality reduction for visualization
        pca = PCA(n_components=2)
        reduced_features = pca.fit_transform(features)
        
        # Evaluate different numbers of clusters
        max_clusters = min(10, len(features) // 2)  # Limit max clusters
        silhouette_scores = []
        wcss = []
        
        # Reduce computation time by using a smaller range
        cluster_range = range(2, max_clusters + 1)
        
        for n_clusters in cluster_range:
            try:
                kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
                cluster_labels = kmeans.fit_predict(features)
                
                # Calculate silhouette score
                silhouette = silhouette_score(features, cluster_labels)
                silhouette_scores.append(silhouette)
                
                # Calculate WCSS
                wcss.append(kmeans.inertia_)
            except Exception as e:
                print(f"Error with {n_clusters} clusters: {e}")
                silhouette_scores.append(-1)
                wcss.append(None)
        
        # Visualize Silhouette Scores
        plt.figure(figsize=(15, 5))
        
        # Subplot 1: Silhouette Scores
        plt.subplot(121)
        plt.plot(list(cluster_range), silhouette_scores, marker='o', label='Silhouette Score')
        plt.title('K-Means Silhouette Scores')
        plt.xlabel('Number of Clusters')
        plt.ylabel('Silhouette Score')
        plt.legend()
        
        # Subplot 2: Elbow Method (WCSS)
        plt.subplot(122)
        plt.plot(list(cluster_range), wcss, marker='o', label='WCSS')
        plt.title('Elbow Method for Optimal K')
        plt.xlabel('Number of Clusters')
        plt.ylabel('WCSS')
        plt.legend()
        
        plt.tight_layout()
        plt.savefig('kmeans_evaluation/kmeans_evaluation.png')
        plt.close()
        
        # Select best number of clusters (from Silhouette Score)
        best_n_clusters = list(cluster_range)[np.argmax(silhouette_scores)]
        print(f"Best number of clusters (Silhouette Score): {best_n_clusters}")
        
        # Perform clustering with best number of clusters
        kmeans = KMeans(n_clusters=best_n_clusters, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(features)
        
        # Visualization
        plt.figure(figsize=(12, 5))
        
        # Scatter plot of clusters
        plt.subplot(121)
        scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1], 
                              c=cluster_labels, cmap='viridis')
        plt.title('K-Means Clustering')
        plt.colorbar(scatter)
        
        # Cluster distribution
        plt.subplot(122)
        cluster_counts = pd.Series(cluster_labels).value_counts()
        cluster_counts.plot(kind='bar')
        plt.title('Cluster Distribution')
        plt.xlabel('Cluster')
        plt.ylabel('Number of Samples')
        plt.tight_layout()
        plt.savefig('kmeans_visualization/kmeans_clustering_visualization.png')
        plt.close()
        
        # Generate business insights
        return self.generate_business_insights(
            features, cluster_labels, feature_names
        )

    
    def generate_business_insights(self, features, cluster_labels, feature_names):
        """
        Generate business insights from clustered data
        """
        # Add cluster labels to dataframe
        self.combined_data['cluster'] = cluster_labels
        
        # Analyze clusters
        cluster_insights = {}
        
        for cluster in np.unique(cluster_labels):
            cluster_data = self.combined_data[self.combined_data['cluster'] == cluster]
            
            # Find top keywords for the cluster
            cluster_features = features[cluster_labels == cluster]
            top_keyword_indices = cluster_features.sum(axis=0).argsort()[::-1][:5]
            top_keywords = [feature_names[i] for i in top_keyword_indices]
            
            cluster_insights[int(cluster)] = {
                'keywords': top_keywords,
                'sample_purposes': cluster_data['purpose'].sample(min(3, len(cluster_data))).tolist(),
                'business_opportunities': self.map_keywords_to_opportunities(top_keywords)
            }
        
        return cluster_insights
    
    def map_keywords_to_opportunities(self, keywords):
        """
        Map keywords to potential business opportunities
        """
        opportunity_mappings = {
            'laptop': ['Computer Repair Shop', 'Laptop Sales and Service', 'Tech Accessory Store'],
            'market': ['Market Research Consultancy', 'Local Business Consulting', 'Research Services'],
            'ibadah': ['Event Management', 'Religious Event Planning', 'Community Event Services'],
            'dinas': ['Government Liaison Services', 'Permit and Documentation Assistance'],
            # 'tua': ['Family Support Services', 'Travel Assistance', 'Family Reunion Planning'],
            'orang tua': ['Family Support Services', 'Travel Assistance', 'Personal Assistance'],
            'rumah': ['Home Services', 'Family Support', 'Travel Accommodation'],
            'berobat': ['Healthcare Consultation', 'Medical Travel Services', 'Health Tourism'],
            'tugas': ['Project Management', 'Consulting Services', 'Academic Support'],
            'proyek': ['Project Management', 'Consulting Services', 'Academic Support'],
            'libur': ['Travel Services', 'Vacation Planning', 'Leisure Consulting'],
            'covid': ['Health Safety Consulting', 'Remote Work Solutions', 'Telemedicine Services'],
            'magang': ['Internship Placement', 'Career Development', 'Professional Training'],
            'gigi' : ['Klinik Gigi Spesialis Ortodonti', 'Produk Perawatan Gigi Tambahan', 'Layanan Konsultasi Online']
        }
        
        opportunities = []
        for keyword in keywords:
            if keyword in opportunity_mappings:
                opportunities.extend(opportunity_mappings[keyword])
            if keyword not in opportunity_mappings:
                opportunities.append("General Business Opportunities")
        
        return list(set(opportunities))

def main():
    # File path
    file_path = 'dataset.xlsx'
    
    # Initialize K-Means clustering analysis
    analysis = KMeansClustering(file_path)
    
    # Feature Extraction
    features, feature_names = analysis.feature_extraction()
    
    # Perform K-Means Clustering
    kmeans_results = analysis.perform_kmeans_clustering(features, feature_names)
    
    # Print Business Insights
    print("\n--- K-Means Clustering Business Insights ---")
    for cluster, cluster_info in kmeans_results.items():
        print(f"\nCluster {cluster}:")
        print(f"Top Keywords: {cluster_info['keywords']}")
        print("Sample Purposes:")
        for purpose in cluster_info['sample_purposes']:
            print(f"- {purpose}")
        print("Potential Business Opportunities:")
        for opportunity in cluster_info['business_opportunities']:
            print(f"- {opportunity}")
    
    # Optional: Save results to a file
    import json
    with open('kmeans_evaluation/kmeans_business_insights.json', 'w') as f:
        json.dump(kmeans_results, f, indent=2)

if __name__ == "__main__":
    main()

Best number of clusters (Silhouette Score): 9

--- K-Means Clustering Business Insights ---

Cluster 0:
Top Keywords: ['memperbaiki', 'laptop', 'ib', 'mengurus', 'balige']
Sample Purposes:
- mengikuti acara adat saur matua oppung
- melayat ketempat aldi simanjuntak
- pergi kantor dinas lingkungan hidup menjumpai pegawai terkait wawancara pengumpalan data permohonan izin pelaksanaan kegiatan pkmki merangkap pelengkapan mata kuliah kreativitas inovasi
Potential Business Opportunities:
- Laptop Sales and Service
- Computer Repair Shop
- Tech Accessory Store
- General Business Opportunities

Cluster 1:
Top Keywords: ['libur', 'semester', 'natal', 'raya', 'idul']
Sample Purposes:
- libur natal pjj semester genap
- libur semester genap
- libur nasional
Potential Business Opportunities:
- Travel Services
- Leisure Consulting
- Vacation Planning
- General Business Opportunities

Cluster 2:
Top Keywords: ['pulang', 'kerumah', 'ib', 'pemilu', 'teman']
Sample Purposes:
- pulang ib karna jam koson

## AffinityPropagation Clustering

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler

import re
import nltk
import ssl
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# NLTK SSL Handling
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


class AffinityPropagationClustering:
    def __init__(self, file_path):
        self.load_and_preprocess_data(file_path)

    def load_and_preprocess_data(self, file_path):
        """
        Load and preprocess data from Excel file
        """
        # Read Excel sheets
        ib_data = pd.read_excel(file_path, sheet_name='DATA IB')
        ik_data = pd.read_excel(file_path, sheet_name='DATA IK')
        
        # Combine purposes
        ib_data['source'] = 'IB'
        ik_data['source'] = 'IK'
        ib_data['purpose'] = ib_data['desc']
        ik_data['purpose'] = ik_data['tujuan']
        
        # Merge datasets
        self.combined_data = pd.concat([
            ib_data[['purpose', 'source']], 
            ik_data[['purpose', 'source']]
        ], ignore_index=True)
        
        # Remove NaN values
        self.combined_data = self.combined_data.dropna(subset=['purpose'])

        # Preprocess purposes
        self.combined_data['purpose'] = self.combined_data['purpose'].apply(self.advanced_text_preprocessing)

        self.combined_data = self.combined_data.drop_duplicates(subset=['purpose'])
    
    def advanced_text_preprocessing(self, text):
        """
        Advanced text preprocessing
        """
        # Convert to lowercase
        text = str(text).lower()

        # Remove unwanted characters such as `_x000D_` and non-alphanumeric characters
        text = re.sub(r'_x[0-9a-fA-F]{4}_', '', text) # Remove `_x000D_` or similar

        text = re.sub(r'[^a-zA-Z\s]', '', text) # Keep only letters and spaces

        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Tokenization
        try:
            tokens = word_tokenize(text)
        except:
            tokens = text.split()
        
        # Remove stopwords
        try:
            stop_words = set(stopwords.words('indonesian'))
        except:
            stop_words = set()
        
        tokens = [token for token in tokens if token not in stop_words]
        
        return ' '.join(tokens)

    def feature_extraction(self):
        self.combined_data['processed_purpose'] = self.combined_data['purpose'].apply(self.advanced_text_preprocessing)

        vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
        feature_matrix = vectorizer.fit_transform(self.combined_data['processed_purpose'])
        self.feature_names = list(vectorizer.get_feature_names_out())
        return feature_matrix

    def perform_affinity_propagation_clustering(self, features):
        # Scale the features
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(features.toarray())
    
        # Try different damping parameters and preferences
        damping_values = [0.5, 0.6, 0.7, 0.8, 0.9, 1]
        preference_values = [None, 'median', 'mean']
    
        evaluation_results = []
        plt.figure(figsize=(20, 15))
    
        for i, damping in enumerate(damping_values):
            for j, pref_type in enumerate(preference_values):
                try:
                    # Calculate preference
                    if pref_type == 'median':
                        preference = np.median(cosine_similarity(scaled_features))
                    elif pref_type == 'mean':
                        preference = np.mean(cosine_similarity(scaled_features))
                    else:
                        preference = None
    
                    # Perform Affinity Propagation
                    similarity_matrix = cosine_similarity(scaled_features)
                    similarity_matrix = np.clip(similarity_matrix, 0, 1)
    
                    affinity_prop = AffinityPropagation(
                        damping=damping,
                        preference=preference,
                        affinity='precomputed',
                        random_state=42
                    )
                    cluster_labels = affinity_prop.fit_predict(similarity_matrix)
    
                    # Skip single cluster scenarios
                    unique_clusters = np.unique(cluster_labels)
                    if len(unique_clusters) <= 1:
                        continue
    
                    # Compute evaluation metrics
                    silhouette = silhouette_score(scaled_features, cluster_labels)
                    calinski = calinski_harabasz_score(scaled_features, cluster_labels)
                    davies = davies_bouldin_score(scaled_features, cluster_labels)
    
                    evaluation_results.append({
                        'Damping': damping,
                        'Preference': pref_type,
                        'Num Clusters': len(unique_clusters),
                        'Silhouette Score': silhouette,
                        'Calinski-Harabasz Score': calinski,
                        'Davies-Bouldin Score': davies
                    })
    
                    # Visualization subplot for the evaluation
                    plt.subplot(len(damping_values), len(preference_values),
                                i * len(preference_values) + j + 1)
    
                    # PCA for 2D visualization
                    pca = PCA(n_components=2)
                    reduced_features = pca.fit_transform(scaled_features)
    
                    scatter = plt.scatter(
                        reduced_features[:, 0],
                        reduced_features[:, 1],
                        c=cluster_labels,
                        cmap='viridis'
                    )
                    plt.title(f'Damping: {damping}, Pref: {pref_type}\nClusters: {len(unique_clusters)}')
                    plt.colorbar(scatter)
    
                    # Additional visualization for this configuration
                    plt.figure(figsize=(15, 5))
    
                    # Plot 1: Scatterplot of clusters
                    plt.subplot(131)
                    scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=cluster_labels, cmap='viridis')
                    plt.title('Affinity Propagation Clustering')
                    plt.colorbar(scatter)
    
                    # Plot 2: Cluster distribution as bar chart
                    plt.subplot(132)
                    cluster_counts = pd.Series(cluster_labels).value_counts()
                    cluster_counts.plot(kind='bar')
                    plt.title('Cluster Distribution')
                    plt.xlabel('Cluster')
                    plt.ylabel('Number of Samples')
    
                    # Plot 3: Scatterplot with exemplar points
                    plt.subplot(133)
                    exemplar_indices = affinity_prop.cluster_centers_indices_
                    plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=cluster_labels, cmap='viridis', alpha=0.7)
                    plt.scatter(reduced_features[exemplar_indices, 0], reduced_features[exemplar_indices, 1],
                                c='red', s=200, marker='x')
                    plt.title('Exemplar Points')
    
                    plt.tight_layout()
                    plt.savefig('af_visualization/affinity_propagation_visualization_{}_{}.png'.format(damping, pref_type))
                    plt.close()
    
                except Exception as e:
                    print(f"Error with damping {damping}, preference {pref_type}: {e}")
    
        plt.tight_layout()
        plt.savefig('af_evaluation/affinity_propagation_evaluation.png')
        plt.close()
    
        # Create and save evaluation results
        evaluation_df = pd.DataFrame(evaluation_results)
        evaluation_df.to_csv('af_evaluation/affinity_propagation_metrics.csv', index=False)
        print("\nEvaluation Metrics:")
        print(evaluation_df)
    
        # Select best configuration based on metrics
        if evaluation_results:
            best_config = max(evaluation_results, key=lambda x: x['Silhouette Score'])
            print("\nBest Configuration:")
            print(best_config)
    
            # Rerun clustering with best configuration
            best_damping = best_config['Damping']
            best_pref_type = best_config['Preference']
    
            # Recalculate preference based on best type
            if best_pref_type == 'median':
                best_preference = np.median(cosine_similarity(scaled_features))
            elif best_pref_type == 'mean':
                best_preference = np.mean(cosine_similarity(scaled_features))
            else:
                best_preference = None
    
            final_affinity_prop = AffinityPropagation(
                damping=best_damping,
                preference=best_preference,
                affinity='precomputed',
                random_state=42
            )
            final_cluster_labels = final_affinity_prop.fit_predict(similarity_matrix)
    
            # Continue with your existing clustering and business insights generation
            return self.generate_business_insights(features, final_cluster_labels, final_affinity_prop.cluster_centers_indices_)
    
        # Fallback to original method if no valid configuration found
        return self.generate_business_insights(features, cluster_labels, affinity_prop.cluster_centers_indices_)


    def generate_business_insights(self, features, cluster_labels, exemplar_indices):
        self.combined_data['cluster'] = cluster_labels
        cluster_insights = {}

        for cluster in np.unique(cluster_labels):
            if cluster == -1:
                continue

            cluster_data = self.combined_data[self.combined_data['cluster'] == cluster]
            cluster_mask = cluster_labels == cluster
            cluster_features = features[cluster_mask]
            top_keyword_indices = np.array(cluster_features.sum(axis=0)).ravel().argsort()[::-1][:5]
            top_keywords = [self.feature_names[i] for i in top_keyword_indices]

            exemplar_index = exemplar_indices[np.where(np.unique(cluster_labels) == cluster)[0][0]]
            exemplar_purpose = self.combined_data.iloc[exemplar_index]['purpose']

            cluster_insights[int(cluster)] = {
                'keywords': top_keywords,
                'sample_purposes': cluster_data['purpose'].sample(min(3, len(cluster_data))).tolist(),
                'business_opportunities': self.map_keywords_to_opportunities(top_keywords),
                'exemplar_purpose': exemplar_purpose,
                'cluster_size': len(cluster_data)
            }

        return cluster_insights

    def map_keywords_to_opportunities(self, keywords):
        opportunity_mappings = {
            'laptop': ['Computer Repair Shop', 'Laptop Sales and Service', 'Tech Accessory Store'],
            'market': ['Market Research Consultancy', 'Local Business Consulting', 'Research Services'],
            'ibadah': ['Event Management', 'Religious Event Planning', 'Community Event Services'],
            'dinas': ['Government Liaison Services', 'Permit and Documentation Assistance'],
            # 'tua': ['Family Support Services', 'Travel Assistance', 'Family Reunion Planning'],
            'orang tua': ['Family Support Services', 'Travel Assistance', 'Personal Assistance'],
            'rumah': ['Home Services', 'Family Support', 'Travel Accommodation'],
            'berobat': ['Healthcare Consultation', 'Medical Travel Services', 'Health Tourism'],
            'tugas': ['Project Management', 'Consulting Services', 'Academic Support'],
            'proyek': ['Project Management', 'Consulting Services', 'Academic Support'],
            'libur': ['Travel Services', 'Vacation Planning', 'Leisure Consulting'],
            'covid': ['Health Safety Consulting', 'Remote Work Solutions', 'Telemedicine Services'],
            'magang': ['Internship Placement', 'Career Development', 'Professional Training'],
            'gigi' : ['Klinik Gigi Spesialis Ortodonti', 'Produk Perawatan Gigi Tambahan', 'Layanan Konsultasi Online']
        }

        opportunities = []
        for keyword in keywords:
            if keyword in opportunity_mappings:
                opportunities.extend(opportunity_mappings[keyword])
            if keyword not in opportunity_mappings:
                opportunities.append("General Business Opportunities")


        return list(set(opportunities))


def main():
    file_path = 'dataset.xlsx'
    try:
        analysis = AffinityPropagationClustering(file_path)
        features = analysis.feature_extraction()
        affinity_results = analysis.perform_affinity_propagation_clustering(features)

        print("\n--- Affinity Propagation Clustering Business Insights ---")
        for cluster, cluster_info in affinity_results.items():
            print(f"\nCluster {cluster} (Size: {cluster_info.get('cluster_size', 'N/A')}):")
            print("Top Keywords:", ", ".join(cluster_info['keywords']))
            print("Sample Purposes:")
            for purpose in cluster_info['sample_purposes']:
                print(f"- {purpose}")
            print("Potential Business Opportunities:")
            for opportunity in cluster_info['business_opportunities']:
                print(f"- {opportunity}")

        with open('af_evaluation/affinity_propagation_business_insights.json', 'w') as f:
            import json
            json.dump(affinity_results, f, indent=2)

    except Exception as e:
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()




Error with damping 1, preference None: The 'damping' parameter of AffinityPropagation must be a float in the range [0.5, 1.0). Got 1 instead.
Error with damping 1, preference median: The 'damping' parameter of AffinityPropagation must be a float in the range [0.5, 1.0). Got 1 instead.
Error with damping 1, preference mean: The 'damping' parameter of AffinityPropagation must be a float in the range [0.5, 1.0). Got 1 instead.

Evaluation Metrics:
    Damping Preference  Num Clusters  Silhouette Score  \
0       0.5       None           636          0.201675   
1       0.5     median           635          0.199685   
2       0.5       mean           625          0.202981   
3       0.6       None           609          0.209196   
4       0.6     median           606          0.206062   
5       0.6       mean           609          0.209196   
6       0.7       None           581          0.226484   
7       0.7     median           576          0.221958   
8       0.7       mean       