In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Processing and ML Libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Additional Preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class KMeansClustering:
    """
    TF-IDF + K-Means analysis of the free-text "purpose" entries in an Excel workbook.

    The 'DATA IB' and 'DATA IK' sheets are merged into ``self.combined_data``;
    the remaining methods turn the text into TF-IDF features, choose a cluster
    count by silhouette score, and summarize every cluster.
    """

    def __init__(self, file_path):
        """
        Initialize the K-Means clustering analysis

        Parameters:
        file_path (str): Path to the Excel file
        """
        # Resources used by advanced_text_preprocessing, which falls back to
        # simpler behaviour when they are not available.
        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)

        # Load and preprocess data
        self.load_and_preprocess_data(file_path)

    def load_and_preprocess_data(self, file_path):
        """
        Load both sheets and merge them into ``self.combined_data``

        Parameters:
        file_path (str): Path to the Excel file containing the 'DATA IB' and
            'DATA IK' sheets

        The result has two columns: 'purpose' (taken from 'desc' for IB and
        from 'tujuan' for IK) and 'source' ('IB' or 'IK'). Rows without a
        purpose are dropped.
        """
        # (sheet name, column holding the purpose text, source tag)
        sheet_specs = (
            ('DATA IB', 'desc', 'IB'),
            ('DATA IK', 'tujuan', 'IK'),
        )

        frames = []
        for sheet_name, text_column, source in sheet_specs:
            sheet = pd.read_excel(file_path, sheet_name=sheet_name)
            frames.append(
                sheet[[text_column]]
                .rename(columns={text_column: 'purpose'})
                .assign(source=source)
            )

        combined = pd.concat(frames, ignore_index=True)

        # Remove NaN values
        self.combined_data = combined.dropna(subset=['purpose'])

    def _get_stop_words(self):
        """
        Return the Indonesian stop-word set, loading it from NLTK only once

        Returns:
        frozenset: Stop words, or an empty set when the corpus is unavailable
        """
        cached = getattr(self, '_stop_words', None)
        if cached is None:
            try:
                cached = frozenset(stopwords.words('indonesian'))
            except (LookupError, OSError):
                # Corpus not installed or word list unreadable: keep every token.
                cached = frozenset()
            self._stop_words = cached
        return cached

    def advanced_text_preprocessing(self, text):
        """
        Normalize one purpose string for vectorization

        Parameters:
        text: Raw value; it is converted with ``str()`` first

        Returns:
        str: Lowercased text with everything except ASCII letters and
            whitespace removed and Indonesian stop words dropped, joined by
            single spaces
        """
        # Convert to lowercase
        text = str(text).lower()

        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenization; NLTK raises LookupError when the tokenizer data is
        # missing, in which case plain whitespace splitting is used instead.
        try:
            tokens = word_tokenize(text)
        except LookupError:
            tokens = text.split()

        # Remove stopwords (the set is cached, not re-read for every row)
        stop_words = self._get_stop_words()
        tokens = [token for token in tokens if token not in stop_words]

        return ' '.join(tokens)

    def feature_extraction(self):
        """
        Extract features from text data

        Adds a 'processed_purpose' column to ``self.combined_data`` and stores
        the vocabulary in ``self.feature_names``.

        Returns:
        numpy array: Dense TF-IDF feature matrix (one row per purpose)
        list: Feature names
        """
        # Preprocess text
        self.combined_data['processed_purpose'] = (
            self.combined_data['purpose'].apply(self.advanced_text_preprocessing)
        )

        # Feature extraction using TF-IDF.
        # NOTE(review): Indonesian stop words are already removed during
        # preprocessing; stop_words='english' only filters English tokens.
        vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
        feature_matrix = vectorizer.fit_transform(self.combined_data['processed_purpose'])

        # Store feature names for interpretation
        self.feature_names = vectorizer.get_feature_names_out()

        return feature_matrix.toarray(), self.feature_names

    def perform_kmeans_clustering(self, features, feature_names):
        """
        Perform K-Means clustering with optimization

        Tries every cluster count from 2 up to ``min(10, len(features) // 2)``,
        keeps the one with the highest silhouette score, and writes
        'kmeans_silhouette_scores.png' and
        'kmeans_clustering_visualization.png' to the working directory.

        Parameters:
        features (numpy array): Feature matrix
        feature_names (list): Feature names

        Returns:
        dict: Clustering results and insights (see generate_business_insights)

        Raises:
        ValueError: If there are fewer than 4 samples, so that no cluster
            count can be evaluated
        """
        # Evaluate different numbers of clusters
        max_clusters = min(10, len(features) // 2)  # Limit max clusters
        if max_clusters < 2:
            # Without this guard the candidate range is empty and the failure
            # only shows up later as an opaque argmax error.
            raise ValueError(
                "At least 4 samples are required to evaluate K-Means cluster "
                f"counts; got {len(features)}."
            )

        # Dimensionality reduction for visualization
        pca = PCA(n_components=2)
        reduced_features = pca.fit_transform(features)

        cluster_range = range(2, max_clusters + 1)
        silhouette_scores = []

        for n_clusters in cluster_range:
            try:
                kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
                cluster_labels = kmeans.fit_predict(features)

                # Calculate silhouette score
                silhouette = silhouette_score(features, cluster_labels)
                silhouette_scores.append(silhouette)
            except Exception as e:
                # A failing candidate is recorded with the worst possible score
                # so the remaining candidates can still be compared.
                print(f"Error with {n_clusters} clusters: {e}")
                silhouette_scores.append(-1)

        # Visualize Silhouette Scores
        plt.figure(figsize=(10, 5))
        plt.plot(list(cluster_range), silhouette_scores, marker='o')
        plt.title('K-Means Silhouette Scores')
        plt.xlabel('Number of Clusters')
        plt.ylabel('Silhouette Score')
        plt.tight_layout()
        plt.savefig('kmeans_silhouette_scores.png')
        plt.close()

        # Select best number of clusters
        best_n_clusters = cluster_range[int(np.argmax(silhouette_scores))]
        print(f"Best number of clusters: {best_n_clusters}")

        # Perform clustering with best number of clusters
        kmeans = KMeans(n_clusters=best_n_clusters, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(features)

        # Visualization
        plt.figure(figsize=(12, 5))

        # Scatter plot of clusters
        plt.subplot(121)
        scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1],
                              c=cluster_labels, cmap='viridis')
        plt.title('K-Means Clustering')
        plt.colorbar(scatter)

        # Cluster distribution
        plt.subplot(122)
        cluster_counts = pd.Series(cluster_labels).value_counts()
        cluster_counts.plot(kind='bar')
        plt.title('Cluster Distribution')
        plt.xlabel('Cluster')
        plt.ylabel('Number of Samples')
        plt.tight_layout()
        plt.savefig('kmeans_clustering_visualization.png')
        plt.close()

        # Generate business insights
        return self.generate_business_insights(
            features, cluster_labels, feature_names
        )

    def generate_business_insights(self, features, cluster_labels, feature_names):
        """
        Generate business insights from clustered data

        Adds a 'cluster' column to ``self.combined_data``.

        Parameters:
        features (numpy array): Feature matrix, one row per row of
            ``self.combined_data``
        cluster_labels (array-like): Cluster label of every row
        feature_names (list): Name of every feature column

        Returns:
        dict: Maps each cluster label (as a plain int) to a dict with
            'keywords' (up to 5 terms with the highest summed weight),
            'sample_purposes' (up to 3 raw purposes) and
            'business_opportunities'
        """
        cluster_labels = np.asarray(cluster_labels)

        # Add cluster labels to dataframe
        self.combined_data['cluster'] = cluster_labels

        # Analyze clusters
        cluster_insights = {}

        for cluster in np.unique(cluster_labels):
            cluster_data = self.combined_data[self.combined_data['cluster'] == cluster]

            # Rank terms by their summed weight inside the cluster. Terms with
            # zero weight never occur in the cluster, so they are not keywords.
            term_weights = features[cluster_labels == cluster].sum(axis=0)
            ranked_indices = term_weights.argsort()[::-1][:5]
            top_keywords = [
                feature_names[i] for i in ranked_indices if term_weights[i] > 0
            ]

            # Plain int keys keep the result JSON-serializable.
            cluster_insights[int(cluster)] = {
                'keywords': top_keywords,
                # Seeded like the KMeans runs so repeated runs show the same samples.
                'sample_purposes': cluster_data['purpose'].sample(
                    min(3, len(cluster_data)), random_state=42
                ).tolist(),
                'business_opportunities': self.map_keywords_to_opportunities(top_keywords)
            }

        return cluster_insights

    def map_keywords_to_opportunities(self, keywords):
        """
        Map keywords to potential business opportunities

        Parameters:
        keywords (list): Keywords to look up; unknown keywords are ignored

        Returns:
        list: Unique opportunities in the order they are first encountered
        """
        opportunity_mappings = {
            'laptop': ['Computer Repair Shop', 'Laptop Sales and Service', 'Tech Accessory Store'],
            'orang': ['Family Support Services', 'Travel Assistance', 'Family Reunion Planning'],
            'market': ['Market Research Consultancy', 'Local Business Consulting', 'Research Services'],
            'ibadah': ['Event Management', 'Religious Event Planning', 'Community Event Services'],
            'dinas': ['Government Liaison Services', 'Permit and Documentation Assistance']
        }

        opportunities = []
        for keyword in keywords:
            opportunities.extend(opportunity_mappings.get(keyword, []))

        # dict.fromkeys de-duplicates while keeping a stable, repeatable order
        # (a set would reorder the entries between runs).
        return list(dict.fromkeys(opportunities))

def main():
    """
    Run the whole analysis on 'dataset.xlsx'

    Builds the features, clusters them, prints the insights of every cluster
    and stores the same insights in 'kmeans_business_insights.json'.
    """
    import json

    dataset_path = 'dataset.xlsx'

    # Load the workbook, vectorize the text and cluster it
    analysis = KMeansClustering(dataset_path)
    features, feature_names = analysis.feature_extraction()
    insights = analysis.perform_kmeans_clustering(features, feature_names)

    # Console report: one section per cluster
    bullet_sections = (
        ("Sample Purposes:", 'sample_purposes'),
        ("Potential Business Opportunities:", 'business_opportunities'),
    )

    print("\n--- K-Means Clustering Business Insights ---")
    for cluster_id in insights:
        details = insights[cluster_id]
        print(f"\nCluster {cluster_id}:")
        print(f"Top Keywords: {details['keywords']}")
        for heading, field in bullet_sections:
            print(heading)
            for entry in details[field]:
                print(f"- {entry}")

    # Persist the insights; cluster ids are cast to int so they are valid JSON keys
    serializable = {}
    for cluster_id, details in insights.items():
        serializable[int(cluster_id)] = details

    with open('kmeans_business_insights.json', 'w') as out_file:
        json.dump(serializable, out_file, indent=2)

# Run the analysis only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Best number of clusters: 10

--- K-Means Clustering Business Insights ---

Cluster 0:
Top Keywords: ['keluarga', 'bertemu', 'acara', 'berkunjung', 'rumah']
Sample Purposes:
- Ingin bertemu dengan keluarga dan ingin libur UAS
- Rumah keluarga
- Berkunjung ke tempat keluarga
Potential Business Opportunities:

Cluster 1:
Top Keywords: ['kerumah', 'pulang', 'keluarga', 'orangtua', 'kakak']
Sample Purposes:
- pulang kerumah
- Balik kerumah
- kembali kerumah
Potential Business Opportunities:

Cluster 2:
Top Keywords: ['pulang', 'rumah', 'keluarga', 'saudara', 'libur']
Sample Purposes:
- Pulang ke rumah
- Pulang ke rumah
- Pulang ke rumah
Potential Business Opportunities:

Cluster 3:
Top Keywords: ['lebaran', 'libur', 'liburan', 'keluarga', 'bertemu']
Sample Purposes:
- Libur lebaran dan bertemu keluarga
- Libur Lebaran
- Libur lebaran
Potential Business Opportunities:

Cluster 4:
Top Keywords: ['paskah', 'libur', 'merayakan', 'keluarga', 'perayaan']
Sample Purposes:
- Merayakan paskah
- Pask