## Comparative Clustering Analysis: K-Means vs DBSCAN

Key Differences Between K-Means and DBSCAN:

K-Means: Requires a predefined number of clusters

DBSCAN: Discovers clusters based on density, without a preset cluster count, and labels sparse points as noise

K-Means: Assumes spherical cluster shapes

DBSCAN: Can find arbitrarily shaped clusters

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Processing and ML Libraries
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Clustering Models
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Additional Preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class ComparativeClustering:
    """
    Compare K-Means and DBSCAN clusterings of free-text "purpose" entries.

    The text is read from two sheets of an Excel workbook, cleaned, turned
    into TF-IDF features, clustered with both algorithms, plotted, and
    summarised per cluster as keywords, sample texts and mapped business
    opportunities.
    """

    # Keyword -> candidate business opportunities, used by
    # map_keywords_to_opportunities().
    OPPORTUNITY_MAPPINGS = {
        'laptop': ['Computer Repair Shop', 'Laptop Sales and Service', 'Tech Accessory Store'],
        'orang': ['Family Support Services', 'Travel Assistance', 'Family Reunion Planning'],
        'market': ['Market Research Consultancy', 'Local Business Consulting', 'Research Services'],
        'ibadah': ['Event Management', 'Religious Event Planning', 'Community Event Services'],
        'dinas': ['Government Liaison Services', 'Permit and Documentation Assistance']
    }

    # Lazily filled by _indonesian_stop_words(); shared by all instances.
    _stop_words_cache = None

    def __init__(self, file_path):
        """
        Initialize the comparative clustering analysis

        Downloads the NLTK resources needed for tokenization and stop-word
        removal, then loads the workbook into ``self.combined_data``.

        Parameters:
        file_path (str): Path to the Excel file
        """
        # Download NLTK resources. Newer NLTK releases load the tokenizer
        # from 'punkt_tab' instead of 'punkt'; requesting both keeps
        # word_tokenize usable across versions. A failed download is not
        # raised here (nltk.download reports failure via its return value).
        nltk.download('punkt', quiet=True)
        nltk.download('punkt_tab', quiet=True)
        nltk.download('stopwords', quiet=True)

        # Load and preprocess data
        self.load_and_preprocess_data(file_path)

    def load_and_preprocess_data(self, file_path):
        """
        Load and preprocess data from Excel file

        Reads the 'DATA IB' sheet (text column 'desc') and the 'DATA IK'
        sheet (text column 'tujuan'), stacks them into
        ``self.combined_data`` with columns 'purpose' and 'source', and
        drops rows whose purpose is missing.

        Parameters:
        file_path (str): Path to the Excel file
        """
        # Read Excel sheets
        ib_data = pd.read_excel(file_path, sheet_name='DATA IB')
        ik_data = pd.read_excel(file_path, sheet_name='DATA IK')

        # Tag each row with its origin and expose the text under one name
        ib_data['source'] = 'IB'
        ik_data['source'] = 'IK'
        ib_data['purpose'] = ib_data['desc']
        ik_data['purpose'] = ik_data['tujuan']

        # Merge datasets
        combined = pd.concat([
            ib_data[['purpose', 'source']],
            ik_data[['purpose', 'source']]
        ], ignore_index=True)

        # Remove NaN values; re-number the rows so the index stays aligned
        # with the positional rows of the feature matrix built later.
        self.combined_data = combined.dropna(subset=['purpose']).reset_index(drop=True)

    @classmethod
    def _indonesian_stop_words(cls):
        """Return the Indonesian NLTK stop-word set, built once and cached."""
        if cls._stop_words_cache is None:
            cls._stop_words_cache = frozenset(stopwords.words('indonesian'))
        return cls._stop_words_cache

    def advanced_text_preprocessing(self, text):
        """
        Advanced text preprocessing

        Lowercases the text, removes every character that is not an ASCII
        letter or whitespace, tokenizes it and drops Indonesian stop words.

        Parameters:
        text: Raw value; it is converted with ``str()`` first

        Returns:
        str: Remaining tokens joined by single spaces
        """
        # Convert to lowercase
        text = str(text).lower()

        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenization
        tokens = word_tokenize(text)

        # Remove stopwords (the set is cached instead of rebuilt per document)
        stop_words = ComparativeClustering._indonesian_stop_words()
        return ' '.join(token for token in tokens if token not in stop_words)

    def feature_extraction(self):
        """
        Extract features from text data

        Adds a 'processed_purpose' column to ``self.combined_data`` and
        stores the vocabulary in ``self.feature_names``.

        Returns:
        numpy array: Dense TF-IDF feature matrix, one row per document
        numpy array: Feature names (vocabulary terms), one per column
        """
        # Preprocess text
        self.combined_data['processed_purpose'] = (
            self.combined_data['purpose'].apply(self.advanced_text_preprocessing)
        )

        # Feature extraction using TF-IDF. Indonesian stop words were already
        # removed in preprocessing; the vectorizer additionally drops English ones.
        vectorizer = TfidfVectorizer(stop_words='english')
        feature_matrix = vectorizer.fit_transform(self.combined_data['processed_purpose'])

        # Store feature names for interpretation
        self.feature_names = vectorizer.get_feature_names_out()

        return feature_matrix.toarray(), self.feature_names

    def perform_clustering(self, features, feature_names):
        """
        Perform comparative clustering analysis

        For K-Means the number of clusters is chosen by the best silhouette
        score over 2..10. For DBSCAN a small eps/min_samples grid is
        searched; if no grid point yields a noise-free result with at least
        two clusters, the default DBSCAN configuration is used instead.
        Each result is plotted and summarised.

        Parameters:
        features (numpy array): Feature matrix
        feature_names (list): Feature names

        Returns:
        dict: Clustering results for K-Means and DBSCAN
        """
        # Dimensionality reduction for visualization only; the clustering
        # itself runs on the full feature matrix.
        pca = PCA(n_components=2)
        reduced_features = pca.fit_transform(features)

        # Clustering methods to compare
        clustering_methods = {
            'K-Means': {
                'algorithm': KMeans(n_clusters=5, random_state=42, n_init=10),
                'params': {'n_clusters': range(2, 11)}
            },
            'DBSCAN': {
                'algorithm': DBSCAN(eps=0.5, min_samples=3),
                'params': {
                    'eps': [0.1, 0.3, 0.5, 0.7, 1.0],
                    'min_samples': [2, 3, 5]
                }
            }
        }

        # Results storage
        clustering_results = {}

        # Comparative analysis
        for method_name, method_config in clustering_methods.items():
            print(f"\nAnalyzing {method_name} Clustering:")
            params = method_config['params']

            if method_name == 'K-Means':
                cluster_labels = self._fit_best_kmeans(
                    features, params['n_clusters'], method_name
                )
            else:  # DBSCAN
                cluster_labels, best_params = self._fit_best_dbscan(
                    features, params['eps'], params['min_samples'],
                    method_config['algorithm']
                )
                print(f"Best DBSCAN Parameters: {best_params}")

            self._plot_clusters(method_name, reduced_features, cluster_labels)

            # Generate business insights
            clustering_results[method_name] = self.generate_business_insights(
                features, cluster_labels, feature_names
            )

        return clustering_results

    def _fit_best_kmeans(self, features, candidate_counts, method_name='K-Means'):
        """Fit K-Means for each candidate k, plot the silhouette curve, return the best labels."""
        n_samples = len(features)
        # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1,
        # so candidates outside that range are skipped instead of crashing.
        candidates = [k for k in candidate_counts if 2 <= k <= n_samples - 1]
        if not candidates:
            raise ValueError(
                f"K-Means evaluation needs at least 3 samples; got {n_samples}"
            )

        silhouette_scores = []
        labels_per_candidate = []
        for n_clusters in candidates:
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            labels = kmeans.fit_predict(features)
            silhouette_scores.append(silhouette_score(features, labels))
            labels_per_candidate.append(labels)

        # Visualize K-Means cluster evaluation
        plt.figure(figsize=(10, 5))
        plt.plot(candidates, silhouette_scores, marker='o')
        plt.title(f'{method_name} Silhouette Scores')
        plt.xlabel('Number of Clusters')
        plt.ylabel('Silhouette Score')
        plt.show()

        # The fit is seeded (random_state=42), so the labels computed during
        # the sweep are reused rather than refitting the best model.
        return labels_per_candidate[int(np.argmax(silhouette_scores))]

    def _fit_best_dbscan(self, features, eps_values, min_samples_values, fallback):
        """Grid-search DBSCAN by silhouette score; return (labels, params), using *fallback* if nothing qualifies."""
        best_silhouette = -1
        best_params = {}
        best_cluster_labels = None

        for eps in eps_values:
            for min_samples in min_samples_values:
                labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(features)

                # Only noise-free results with more than one cluster are scored
                found = np.unique(labels)
                if len(found) < 2 or -1 in found:
                    continue

                try:
                    silhouette = silhouette_score(features, labels)
                except ValueError:
                    # Label count not valid for the silhouette score
                    continue

                if silhouette > best_silhouette:
                    best_silhouette = silhouette
                    best_params = {'eps': eps, 'min_samples': min_samples}
                    best_cluster_labels = labels

        if best_cluster_labels is None:
            # Without this, the labels would stay None and the plotting and
            # insight steps would fail.
            print("No noise-free DBSCAN configuration with at least two clusters "
                  "was found; using the default DBSCAN settings instead.")
            best_cluster_labels = fallback.fit_predict(features)
            best_params = {'eps': fallback.eps, 'min_samples': fallback.min_samples}

        return best_cluster_labels, best_params

    def _plot_clusters(self, method_name, reduced_features, cluster_labels):
        """Show the 2-D scatter of the clustering next to its cluster-size bar chart."""
        plt.figure(figsize=(12, 5))
        plt.subplot(121)
        scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1],
                              c=cluster_labels, cmap='viridis')
        plt.title(f'{method_name} Clustering')
        plt.colorbar(scatter)

        # Cluster distribution
        plt.subplot(122)
        cluster_counts = pd.Series(cluster_labels).value_counts()
        cluster_counts.plot(kind='bar')
        plt.title(f'{method_name} Cluster Distribution')
        plt.xlabel('Cluster')
        plt.ylabel('Number of Samples')
        plt.tight_layout()
        plt.show()

    def generate_business_insights(self, features, cluster_labels, feature_names):
        """
        Generate business insights from clustered data

        Overwrites the 'cluster' column of ``self.combined_data`` with the
        given labels. The noise label -1 is skipped.

        Parameters:
        features (array-like): Dense feature matrix, one row per document
        cluster_labels (array-like): Cluster label per document
        feature_names (sequence): Term for each feature column

        Returns:
        dict: Cluster id -> {'keywords', 'sample_purposes',
              'business_opportunities'}
        """
        # Coerce to arrays so that plain lists are masked element-wise
        features = np.asarray(features)
        labels = np.asarray(cluster_labels)

        # Add cluster labels to dataframe
        self.combined_data['cluster'] = labels

        # Analyze clusters
        cluster_insights = {}
        for cluster in np.unique(labels):
            if cluster == -1:  # Skip noise points in DBSCAN
                continue

            members = labels == cluster
            cluster_data = self.combined_data.loc[members]

            # Find top keywords for the cluster: the (up to) five terms with
            # the largest summed weight. Terms with zero weight do not occur
            # in the cluster at all, so they are left out.
            term_weights = features[members].sum(axis=0)
            ranked = term_weights.argsort()[::-1][:5]
            top_keywords = [feature_names[i] for i in ranked if term_weights[i] > 0]

            sample_size = min(3, len(cluster_data))
            cluster_insights[int(cluster)] = {
                'keywords': top_keywords,
                # Seeded so that repeated runs print the same examples
                'sample_purposes': cluster_data['purpose'].sample(
                    n=sample_size, random_state=42
                ).tolist(),
                'business_opportunities': self.map_keywords_to_opportunities(top_keywords)
            }

        return cluster_insights

    def map_keywords_to_opportunities(self, keywords):
        """
        Map keywords to potential business opportunities

        Parameters:
        keywords (iterable of str): Keywords to look up

        Returns:
        list: Unique opportunities in first-seen order; keywords without a
              mapping contribute nothing
        """
        opportunities = []
        for keyword in keywords:
            opportunities.extend(self.OPPORTUNITY_MAPPINGS.get(keyword, []))

        # dict.fromkeys de-duplicates while keeping a stable order, unlike set()
        return list(dict.fromkeys(opportunities))

def main():
    """
    Run the full comparison on 'dataset.xlsx' and print, for every
    clustering method and cluster, the top keywords, sample purposes and
    mapped business opportunities.
    """
    workbook = 'dataset.xlsx'

    # Load the data, build the features and cluster them
    pipeline = ComparativeClustering(workbook)
    matrix, vocabulary = pipeline.feature_extraction()
    results_by_method = pipeline.perform_clustering(matrix, vocabulary)

    # Report the insights, one section per method
    for method_label in results_by_method:
        print(f"\n--- {method_label} Clustering Business Insights ---")
        per_cluster = results_by_method[method_label]
        for cluster_id in per_cluster:
            details = per_cluster[cluster_id]
            print(f"\nCluster {cluster_id}:")
            print(f"Top Keywords: {details['keywords']}")
            # Heading followed by one "- item" line per entry
            print("Sample Purposes:",
                  *(f"- {text}" for text in details['sample_purposes']),
                  sep="\n")
            print("Potential Business Opportunities:",
                  *(f"- {idea}" for idea in details['business_opportunities']),
                  sep="\n")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()