In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

# Data Processing and ML Libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Additional Preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Debugging Utilities
import psutil
import gc

# Ensure NLTK downloads are available.
# 'punkt_tab' is the tokenizer resource used by word_tokenize on newer NLTK
# releases; 'punkt' is kept for older ones. Requesting both is harmless:
# with quiet=True a failed or unnecessary download does not raise.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource, quiet=True)


def log_memory(stage):
    """Print the current memory utilisation for *stage*, then run a GC pass.

    The percentage comes from ``psutil.virtual_memory()``; the garbage
    collection is triggered after the figure has been printed.
    """
    used_percent = psutil.virtual_memory().percent
    print(f"{stage}: Memory usage: {used_percent}%")
    gc.collect()


class DBSCANClustering:
    """Cluster free-text "purpose" descriptions with DBSCAN and summarise them.

    Pipeline: load two Excel sheets into one ``purpose``/``source`` frame,
    clean the text, vectorise it with TF-IDF, grid-search DBSCAN parameters by
    silhouette score, then describe every cluster by its top keywords.
    """

    # Keyword -> candidate business opportunities. Defined once at class level
    # instead of being rebuilt on every lookup.
    _OPPORTUNITY_MAPPINGS = {
        'laptop': ['Computer Repair Shop', 'Laptop Sales and Service', 'Tech Accessory Store'],
        'orang': ['Family Support Services', 'Travel Assistance', 'Family Reunion Planning'],
        'market': ['Market Research Consultancy', 'Local Business Consulting', 'Research Services'],
        'ibadah': ['Event Management', 'Religious Event Planning', 'Community Event Services'],
        'dinas': ['Government Liaison Services', 'Permit and Documentation Assistance']
    }

    # Lazily filled cache of the Indonesian stop-word set (see
    # _indonesian_stop_words); avoids re-reading the corpus for every row.
    _stop_words = None

    def __init__(self, file_path):
        """Remember *file_path* and immediately load the data from it."""
        log_memory("Initialization")
        self.file_path = file_path
        self.load_and_preprocess_data()

    def load_and_preprocess_data(self):
        """Load data from Excel and preprocess columns.

        Reads the sheets 'DATA IB' (text column ``desc``) and 'DATA IK' (text
        column ``tujuan``), tags each row with its source and stores the
        combined, NaN-free result in ``self.combined_data``.

        If the workbook cannot be read, the error is printed and
        ``self.combined_data`` is left as an empty frame with the expected
        columns, so later stages fail with a clear message instead of an
        ``AttributeError``.
        """
        print(f"Reading file: {self.file_path}")
        # Define the attribute up front so it exists even when loading fails.
        self.combined_data = pd.DataFrame(columns=['purpose', 'source'])
        try:
            ib_data = pd.read_excel(self.file_path, sheet_name='DATA IB')
            ik_data = pd.read_excel(self.file_path, sheet_name='DATA IK')
        except Exception as e:
            print(f"Error loading data: {e}")
            return

        ib_data['source'] = 'IB'
        ik_data['source'] = 'IK'
        # Both sheets describe the same concept under different column names.
        ib_data['purpose'] = ib_data['desc']
        ik_data['purpose'] = ik_data['tujuan']
        combined = pd.concat(
            [ib_data[['purpose', 'source']], ik_data[['purpose', 'source']]],
            ignore_index=True
        )
        self.combined_data = combined.dropna(subset=['purpose'])
        print("Data loaded and combined successfully.")

    @classmethod
    def _indonesian_stop_words(cls):
        """Return the Indonesian stop-word set, loading it on first use."""
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words('indonesian'))
        return cls._stop_words

    def advanced_text_preprocessing(self, text):
        """Clean and preprocess text for analysis.

        Lower-cases *text*, strips every character that is not an ASCII letter
        or whitespace, tokenises it and drops Indonesian stop words.

        Returns:
            The remaining tokens joined by single spaces.
        """
        stop_words = self._indonesian_stop_words()
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        tokens = word_tokenize(text)
        return ' '.join(token for token in tokens if token not in stop_words)

    def feature_extraction(self):
        """Extract features using TF-IDF vectorization.

        Adds a ``processed_purpose`` column to ``self.combined_data``.

        Returns:
            A ``(features, feature_names)`` tuple: the dense TF-IDF matrix
            (at most 1000 columns) and the matching vocabulary terms.

        Raises:
            ValueError: if no data was loaded, so there is nothing to vectorise.
        """
        if self.combined_data.empty:
            raise ValueError(
                "No data available for feature extraction; "
                "check that the input file was loaded correctly."
            )
        print("Starting feature extraction...")
        self.combined_data['processed_purpose'] = self.combined_data['purpose'].apply(self.advanced_text_preprocessing)
        vectorizer = TfidfVectorizer(max_features=1000)
        features = vectorizer.fit_transform(self.combined_data['processed_purpose']).toarray()
        print("Feature extraction completed.")
        return features, vectorizer.get_feature_names_out()

    def perform_dbscan_clustering_with_optimization(self, features, feature_names):
        """Perform DBSCAN clustering and optimize parameters.

        Tries every combination of 10 ``eps`` values in [0.5, 2.0] and
        ``min_samples`` in 3..9 on the full feature matrix and keeps the
        labelling with the highest silhouette score. The 2-D PCA projection is
        used for the saved scatter plot only, not for clustering.

        Returns:
            The insights dict from ``generate_business_insights``, or ``None``
            when no parameter combination yields at least two real clusters.
        """
        print("Performing DBSCAN clustering...")
        pca = PCA(n_components=2)
        reduced_features = pca.fit_transform(features)

        eps_range = np.linspace(0.5, 2.0, 10)
        min_samples_range = range(3, 10)

        best_silhouette = -1
        best_params = None
        best_labels = None

        for eps in eps_range:
            for min_samples in min_samples_range:
                # Best effort per combination: a failure here is reported and
                # the search moves on to the next parameters.
                try:
                    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                    labels = dbscan.fit_predict(features)
                    # Require at least two clusters besides the noise label -1.
                    if len(set(labels) - {-1}) < 2:
                        continue

                    # NOTE: noise points (label -1) take part in the score as
                    # if they formed a cluster of their own.
                    silhouette = silhouette_score(features, labels)
                    if silhouette > best_silhouette:
                        best_silhouette = silhouette
                        best_params = (eps, min_samples)
                        best_labels = labels
                except Exception as e:
                    print(f"Error with eps={eps}, min_samples={min_samples}: {e}")

        if best_params is None:
            print("No valid clustering found.")
            return None

        print(f"Best Parameters: eps={best_params[0]}, min_samples={best_params[1]}")
        print(f"Best Silhouette Score: {best_silhouette}")

        self._save_cluster_plot(reduced_features, best_labels)

        return self.generate_business_insights(features, best_labels, feature_names)

    def _save_cluster_plot(self, reduced_features, labels):
        """Save the scatter plot and cluster-size bar chart as a PNG file."""
        plt.figure(figsize=(12, 5))
        try:
            plt.subplot(121)
            scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=labels, cmap='viridis', s=15)
            plt.title("DBSCAN Clustering with Best Parameters")
            plt.colorbar(scatter)

            plt.subplot(122)
            cluster_counts = pd.Series(labels).value_counts()
            # Noise is not a cluster, so leave it out of the size chart.
            cluster_counts = cluster_counts[cluster_counts.index != -1]
            cluster_counts.plot(kind='bar', color='blue')
            plt.title("Cluster Distribution")
            plt.xlabel("Cluster")
            plt.ylabel("Number of Samples")
            plt.tight_layout()
            plt.savefig('dbscan_optimized_visualization.png')
        finally:
            # Release the figure even when plotting or saving fails.
            plt.close()

    def generate_business_insights(self, features, labels, feature_names):
        """Generate business insights based on clustering.

        Args:
            features: 2-D array with one row per row of ``self.combined_data``.
            labels: cluster label per row; -1 marks noise and is skipped.
            feature_names: term for each feature column.

        Returns:
            Dict keyed by cluster id (plain ``int``, so the result is JSON
            serialisable) with the up-to-five highest-weighted keywords, up to
            three randomly sampled purposes and the mapped opportunities.
        """
        labels = np.asarray(labels)
        insights = {}
        for cluster in np.unique(labels):
            if cluster == -1:
                continue
            cluster_indices = np.where(labels == cluster)[0]
            cluster_features = features[cluster_indices]
            # Columns with the largest summed weight, strongest first.
            top_keyword_indices = cluster_features.sum(axis=0).argsort()[-5:][::-1]
            top_keywords = [str(feature_names[i]) for i in top_keyword_indices]
            # int(...) turns the numpy integer into a key json.dump accepts.
            insights[int(cluster)] = {
                'keywords': top_keywords,
                'sample_purposes': self.combined_data.iloc[cluster_indices]['purpose'].sample(min(3, len(cluster_indices))).tolist(),
                'business_opportunities': self.map_keywords_to_opportunities(top_keywords)
            }
        return insights

    def map_keywords_to_opportunities(self, keywords):
        """Map keywords to business opportunities.

        Returns:
            The opportunities of every known keyword, without duplicates and
            in first-seen order so the output is the same on every run.
        """
        opportunities = []
        for keyword in keywords:
            opportunities.extend(self._OPPORTUNITY_MAPPINGS.get(keyword, []))
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(opportunities))


def main():
    """Run the clustering pipeline on 'dataset.xlsx' and report the insights.

    Prints every cluster's keywords, sample purposes and business
    opportunities, then writes them to 'dbscan_business_insights.json'.
    Nothing is printed or written when clustering produces no result.
    """
    file_path = 'dataset.xlsx'
    analysis = DBSCANClustering(file_path)
    features, feature_names = analysis.feature_extraction()
    results = analysis.perform_dbscan_clustering_with_optimization(features, feature_names)

    if not results:
        return

    print("\n--- DBSCAN Clustering Business Insights ---")
    for cluster, cluster_info in results.items():
        print(f"\nCluster {cluster}:")
        print(f"Top Keywords: {cluster_info['keywords']}")
        print("Sample Purposes:")
        for purpose in cluster_info['sample_purposes']:
            print(f"- {purpose}")
        print("Potential Business Opportunities:")
        for opportunity in cluster_info['business_opportunities']:
            print(f"- {opportunity}")

    # Cluster ids may be numpy integers, which json.dump rejects as dict keys;
    # convert them to plain ints before serialising.
    serializable = {int(cluster): info for cluster, info in results.items()}
    with open('dbscan_business_insights.json', 'w') as f:
        json.dump(serializable, f, indent=2)
    print("Results saved to 'dbscan_business_insights.json'.")


# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()


Initialization: Memory usage: 43.9%
Reading file: dataset.xlsx
Data loaded and combined successfully.
Starting feature extraction...
Feature extraction completed.
Performing DBSCAN clustering...
