In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, silhouette_score

# Keras/TensorFlow imports
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical

In [None]:
class AdvancedMLProject:
    """Text-mining pipeline over the "DATA IB" and "DATA IK" sheets of an Excel workbook.

    The free-text purpose column of each sheet ("desc" for IB, "tujuan" for IK)
    is merged into a single "keperluan" column, cleaned, and then fed to
    TF-IDF + clustering, classical classifiers and an LSTM classifier that all
    try to relate the text to the permit type ("IB" / "IK").
    """

    # Seed shared by every stochastic step so repeated runs are comparable.
    RANDOM_STATE = 42

    def __init__(self, file_path):
        """Load both sheets from ``file_path`` and build ``self.combined_data``.

        Raises:
            Exception: whatever ``pd.read_excel`` / ``prepare_data`` raised; the
                error is printed first and then re-raised unchanged.
        """
        # Load data with error handling
        try:
            self.ib_data = pd.read_excel(file_path, sheet_name="DATA IB")
            self.ik_data = pd.read_excel(file_path, sheet_name="DATA IK")
        except Exception as e:
            print(f"Error loading Excel file: {e}")
            raise

        # Preprocess data
        self.prepare_data()

    @staticmethod
    def _clean_text(series):
        """Return ``series`` as lower-cased, punctuation-free, whitespace-normalised strings.

        Missing values become empty strings and non-string values are
        stringified, so downstream vectorizers/tokenizers never receive NaN.
        """
        return (
            series.astype(object)
            .fillna("")
            .astype(str)
            .str.lower()
            .str.replace(r"[^\w\s]", "", regex=True)  # Remove punctuation
            .str.replace(r"\s+", " ", regex=True)  # Collapse runs of whitespace
            .str.strip()  # Strip last: punctuation removal can expose edge whitespace
        )

    def _silhouette(self, matrix, labels, sample_size):
        """Silhouette score of ``labels`` on ``matrix``, sub-sampled to ``sample_size`` rows.

        The exact score needs all pairwise distances; sampling with a fixed seed
        gives a reproducible estimate. ``sample_size=None`` (or a value not
        smaller than the number of rows) computes the exact score.
        """
        if sample_size is not None and sample_size >= len(labels):
            sample_size = None
        return silhouette_score(
            matrix, labels, sample_size=sample_size, random_state=self.RANDOM_STATE
        )

    def prepare_data(self):
        """Merge both sheets into ``self.combined_data`` and add derived columns.

        Adds ``keperluan_cleaned`` (cleaned text) and ``izin_encoded`` (integer
        permit label). The fitted encoder is kept on ``self.label_encoder`` so
        encoded labels can be mapped back to "IB"/"IK".

        Raises:
            Exception: any error is printed and re-raised unchanged.
        """
        # Combine and preprocess data with error handling
        try:
            # Bring both sheets to the same single-column layout and tag the origin
            ib_data = (
                self.ib_data[["desc"]]
                .rename(columns={"desc": "keperluan"})
                .assign(izin="IB")
            )
            ik_data = (
                self.ik_data[["tujuan"]]
                .rename(columns={"tujuan": "keperluan"})
                .assign(izin="IK")
            )

            # Combine datasets
            self.combined_data = pd.concat([ib_data, ik_data], ignore_index=True)

            # Text preprocessing with additional cleaning
            self.combined_data['keperluan_cleaned'] = self._clean_text(
                self.combined_data['keperluan']
            )

            # Encode permit type
            self.label_encoder = LabelEncoder()
            self.combined_data['izin_encoded'] = self.label_encoder.fit_transform(
                self.combined_data['izin']
            )

            print(f"Total combined data: {len(self.combined_data)}")
        except Exception as e:
            print(f"Error in data preparation: {e}")
            raise

    def feature_extraction(self, max_features=500, stop_words='english'):
        """Vectorize the cleaned text with TF-IDF over unigrams and bigrams.

        Args:
            max_features: vocabulary size cap passed to ``TfidfVectorizer``.
            stop_words: stop-word setting passed to ``TfidfVectorizer``; pass a
                list of Indonesian stop words (or ``None``) if 'english' does
                not suit the data.

        Returns:
            ``(tfidf_matrix, vectorizer)`` where ``tfidf_matrix`` is a dense
            array (the result of ``.toarray()``).

        Raises:
            Exception: any error is printed and re-raised unchanged.
        """
        try:
            vectorizer = TfidfVectorizer(
                max_features=max_features,
                stop_words=stop_words,
                ngram_range=(1, 2)  # Consider unigrams and bigrams
            )
            tfidf_matrix = vectorizer.fit_transform(
                self.combined_data['keperluan_cleaned']
            ).toarray()

            print(f"TF-IDF Matrix shape: {tfidf_matrix.shape}")
            return tfidf_matrix, vectorizer
        except Exception as e:
            print(f"Error in feature extraction: {e}")
            raise

    def multiple_clustering_approaches(self, tfidf_matrix, n_clusters=5,
                                       silhouette_sample_size=10000):
        """Cluster the TF-IDF matrix with K-means and DBSCAN.

        Args:
            tfidf_matrix: dense feature matrix, one row per document.
            n_clusters: number of K-means clusters.
            silhouette_sample_size: number of rows used to estimate each
                silhouette score; ``None`` computes the exact score.

        Returns:
            Dict keyed by algorithm name ('KMeans', 'DBSCAN'), each value a dict
            with 'labels' and 'silhouette_score'. An algorithm that fails or
            yields fewer than two clusters is left out; errors are printed and
            not raised.
        """
        results = {}

        try:
            # K-means Clustering with multiple initializations
            kmeans = KMeans(
                n_clusters=n_clusters,
                n_init=10,  # Multiple initializations to find best centroid
                random_state=self.RANDOM_STATE
            )
            kmeans_labels = kmeans.fit_predict(tfidf_matrix)
            results['KMeans'] = {
                'labels': kmeans_labels,
                'silhouette_score': self._silhouette(
                    tfidf_matrix, kmeans_labels, silhouette_sample_size
                )
            }

            # DBSCAN with adaptive parameters
            try:
                scaled_matrix = StandardScaler().fit_transform(tfidf_matrix)
                # min_samples must stay >= 1 even when there are fewer than 10 rows
                min_samples = max(1, min(5, len(tfidf_matrix) // 10))
                dbscan = DBSCAN(eps=0.5, min_samples=min_samples)
                dbscan_labels = dbscan.fit_predict(scaled_matrix)

                # Label -1 marks noise, not a cluster: leave it out of both the
                # cluster count and the silhouette computation.
                clustered = dbscan_labels != -1
                n_found = np.unique(dbscan_labels[clustered]).size
                if n_found > 1:
                    results['DBSCAN'] = {
                        'labels': dbscan_labels,
                        # Score in the same (scaled) space the clustering used
                        'silhouette_score': self._silhouette(
                            scaled_matrix[clustered],
                            dbscan_labels[clustered],
                            silhouette_sample_size
                        )
                    }
                else:
                    print(f"DBSCAN found {n_found} cluster(s); skipping silhouette score")
            except Exception as e:
                print("DBSCAN encountered an issue:", e)

        except Exception as e:
            print(f"Clustering error: {e}")

        return results

    def classification_models(self, tfidf_matrix):
        """Train and evaluate an SVM and a random forest on the permit label.

        Scaling is part of each model's pipeline, so during cross-validation the
        scaler is fitted on the training folds only.

        Args:
            tfidf_matrix: dense feature matrix aligned with ``self.combined_data``.

        Returns:
            Dict keyed by model name with 'cross_val_scores', 'mean_cv_score'
            and 'classification_report' (hold-out test set). A model that fails
            is reported via print and left out.
        """
        # Local import: not part of the notebook's import cell
        from sklearn.pipeline import make_pipeline

        X = tfidf_matrix
        y = self.combined_data['izin_encoded']

        # Split the data with stratification
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=self.RANDOM_STATE, stratify=y
        )

        # Multiple classification models
        models = {
            'SVM': SVC(kernel='linear', class_weight='balanced'),
            'Random Forest': RandomForestClassifier(
                n_estimators=100,
                class_weight='balanced',
                random_state=self.RANDOM_STATE
            )
        }

        results = {}
        for name, model in models.items():
            try:
                # Scale features inside the pipeline to avoid leaking fold statistics
                pipeline = make_pipeline(StandardScaler(), model)

                # Cross-validation
                cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)

                # Fit on the full training split, then predict the hold-out split
                pipeline.fit(X_train, y_train)
                y_pred = pipeline.predict(X_test)

                results[name] = {
                    'cross_val_scores': cv_scores,
                    'mean_cv_score': cv_scores.mean(),
                    'classification_report': classification_report(y_test, y_pred)
                }
            except Exception as e:
                print(f"Error in {name} classification: {e}")

        return results

    def deep_learning_approach(self, vocab_size=1000, max_len=50):
        """Train a two-layer LSTM classifier on the cleaned text.

        Args:
            vocab_size: tokenizer ``num_words`` and embedding input dimension.
            max_len: length every token sequence is padded/truncated to.

        Returns:
            Dict with 'model', 'loss', 'accuracy' (both on the hold-out split)
            and 'training_history'.

        Raises:
            Exception: any error is printed and re-raised unchanged.
        """
        try:
            texts = self.combined_data['keperluan_cleaned']

            # Tokenization with more robust parameters
            tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
            tokenizer.fit_on_texts(texts)

            # Sequence preparation
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(
                sequences, maxlen=max_len, padding='post', truncating='post'
            )

            # Prepare labels (convert to categorical)
            y = to_categorical(self.combined_data['izin_encoded'])

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                padded_sequences, y, test_size=0.2,
                random_state=self.RANDOM_STATE, stratify=y
            )

            # More complex LSTM Model
            model = Sequential([
                Embedding(vocab_size, 64, input_length=max_len),
                LSTM(128, return_sequences=True),
                LSTM(64),
                Dense(32, activation='relu'),
                Dense(y.shape[1], activation='softmax')
            ])

            model.compile(
                optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy']
            )

            # Stop once validation loss stops improving and keep the best weights
            from tensorflow.keras.callbacks import EarlyStopping
            early_stopping = EarlyStopping(
                monitor='val_loss',
                patience=3,
                restore_best_weights=True
            )

            history = model.fit(
                X_train, y_train,
                epochs=20,
                batch_size=32,
                validation_split=0.2,
                callbacks=[early_stopping],
                verbose=1  # Show per-epoch progress
            )

            # Evaluate
            loss, accuracy = model.evaluate(X_test, y_test, verbose=0)

            return {
                'model': model,
                'loss': loss,
                'accuracy': accuracy,
                'training_history': history
            }
        except Exception as e:
            print(f"Deep Learning approach error: {e}")
            raise

    def visualize_results(self, clustering_results, classification_results):
        """Draw side-by-side bar charts of silhouette scores and mean CV scores.

        Args:
            clustering_results: output of ``multiple_clustering_approaches``.
            classification_results: output of ``classification_models``.

        Plotting errors are printed and not raised.
        """
        try:
            fig, (ax_cluster, ax_clf) = plt.subplots(1, 2, figsize=(15, 6))

            # Silhouette Scores
            cluster_names = list(clustering_results.keys())
            silhouette_data = [
                entry.get('silhouette_score', 0) for entry in clustering_results.values()
            ]
            ax_cluster.bar(cluster_names, silhouette_data)
            ax_cluster.set_title('Clustering Silhouette Scores')
            ax_cluster.set_ylabel('Silhouette Score')
            # Silhouette scores range over [-1, 1]; keep negative bars visible
            ax_cluster.set_ylim(-1, 1)
            ax_cluster.axhline(0, color='black', linewidth=0.8)

            # Classification Accuracy
            model_names = list(classification_results.keys())
            cv_scores = [
                entry.get('mean_cv_score', 0) for entry in classification_results.values()
            ]
            ax_clf.bar(model_names, cv_scores)
            ax_clf.set_title('Classification Cross-Validation Scores')
            ax_clf.set_ylabel('Mean CV Score')
            ax_clf.set_ylim(0, 1)

            fig.tight_layout()
            plt.show()
        except Exception as e:
            print(f"Visualization error: {e}")

    def identify_business_opportunities(self, top_n=10):
        """Print and return the ``top_n`` most frequent cleaned purposes.

        Empty purposes (missing or punctuation-only text) are ignored.

        Returns:
            A Series of counts indexed by purpose, or ``None`` if the analysis
            failed (the error is printed).
        """
        try:
            cleaned = self.combined_data['keperluan_cleaned']

            # Group by purpose and count, skipping rows with no usable text
            purpose_counts = cleaned[cleaned != ""].value_counts()

            # Top purposes
            top_purposes = purpose_counts.head(top_n)

            print("\nTop Business Opportunity Suggestions:")
            for purpose, count in top_purposes.items():
                print(f"Purpose: {purpose}, Frequency: {count}")

            return top_purposes
        except Exception as e:
            print(f"Business opportunity analysis error: {e}")
            return None

    def run_project(self):
        """Run every stage in order and collect the results.

        Returns:
            Dict with keys 'clustering', 'classification', 'deep_learning' and
            'business_opportunities', or ``None`` if a stage raised (the error
            is printed).
        """
        try:
            # Feature Extraction
            tfidf_matrix, vectorizer = self.feature_extraction()

            # Clustering Approaches
            clustering_results = self.multiple_clustering_approaches(tfidf_matrix)

            # Classification Models
            classification_results = self.classification_models(tfidf_matrix)

            # Deep Learning Approach
            deep_learning_results = self.deep_learning_approach()

            # Visualize Results
            self.visualize_results(clustering_results, classification_results)

            # Identify Business Opportunities
            business_opportunities = self.identify_business_opportunities()

            return {
                'clustering': clustering_results,
                'classification': classification_results,
                'deep_learning': deep_learning_results,
                'business_opportunities': business_opportunities
            }
        except Exception as e:
            print(f"Project execution error: {e}")
            return None

# Jupyter Notebook Execution
def main(file_path="dataset.xlsx"):
    """Run the whole project on ``file_path`` and print a summary of each stage.

    Nothing is printed here when ``run_project`` returns ``None``; in that case
    the failure has already been reported by the project itself.
    """
    project = AdvancedMLProject(file_path)
    results = project.run_project()

    if not results:
        return

    # Detailed Results Printing
    print("\n--- Detailed Project Results ---")

    print("\nClustering Results:")
    for algo, result in results['clustering'].items():
        print(f"{algo} - Silhouette Score: {result.get('silhouette_score', 'N/A')}")

    print("\nClassification Results:")
    for model, result in results['classification'].items():
        print(f"{model}:")
        print(f"  Mean CV Score: {result['mean_cv_score']}")
        print("  Classification Report:")
        print(result['classification_report'])

    print("\nDeep Learning Results:")
    print(f"Accuracy: {results['deep_learning'].get('accuracy', 'N/A')}")
    print(f"Loss: {results['deep_learning'].get('loss', 'N/A')}")

    print("\nBusiness Opportunity Suggestions:")
    # The key is always present but holds None when the analysis failed, so a
    # dict.get default would never apply; test for None explicitly.
    opportunities = results.get('business_opportunities')
    if opportunities is None:
        print('No opportunities found')
    else:
        print(opportunities)

# Entry point: run the full pipeline via main() only when this code executes
# as the top-level module (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Total combined data: 80366
TF-IDF Matrix shape: (80366, 500)
