In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Models
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Model Evaluation
from sklearn.metrics import (
    silhouette_score, 
    adjusted_rand_score, 
    confusion_matrix, 
    classification_report
)

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Deep Learning (Optional)
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
class AdvancedModelEvaluation:
    """Compare clustering and classification models on permit-purpose text.

    The workbook must contain the sheets ``"DATA IB"`` (with a ``desc``
    column) and ``"DATA IK"`` (with a ``tujuan`` column). Both text columns
    are merged into one ``keperluan`` column and every row is tagged with its
    sheet of origin (``"IB"`` or ``"IK"``) in the ``izin`` column.
    """

    def __init__(self, file_path):
        """Load both sheets from ``file_path`` and build ``self.combined_data``.

        Args:
            file_path: Path to the Excel workbook.
        """
        self.ib_data = pd.read_excel(file_path, sheet_name="DATA IB")
        self.ik_data = pd.read_excel(file_path, sheet_name="DATA IK")

        self.prepare_data()

    def prepare_data(self):
        """Build ``self.combined_data`` from ``self.ib_data`` and ``self.ik_data``.

        The result has the columns ``keperluan`` (raw text), ``izin``
        (``"IB"``/``"IK"``) and ``keperluan_cleaned`` (lower-cased, stripped
        text). Missing text cells become empty strings so that the
        vectorizers and the tokenizer never receive NaN documents.
        """
        ib_data = (
            self.ib_data[["desc"]]
            .rename(columns={"desc": "keperluan"})
            .assign(izin="IB")
        )
        ik_data = (
            self.ik_data[["tujuan"]]
            .rename(columns={"tujuan": "keperluan"})
            .assign(izin="IK")
        )

        self.combined_data = pd.concat([ib_data, ik_data], ignore_index=True)

        # Normalise per value instead of via ``.str``: that accessor leaves
        # NaN in place (and fails outright on non-string columns).
        self.combined_data['keperluan_cleaned'] = self.combined_data['keperluan'].map(
            lambda value: '' if pd.isna(value) else str(value).lower().strip()
        )

    def feature_extraction(self, method='tfidf', stop_words='english', max_features=100):
        """Vectorise ``keperluan_cleaned`` with uni- and bi-grams.

        Args:
            method: ``'tfidf'`` for TF-IDF weights or ``'count'`` for raw counts.
            stop_words: Forwarded to the vectorizer. The default keeps the
                original English list; pass a list of Indonesian stop words
                (or ``None``) when the text is not English.
            max_features: Maximum vocabulary size.

        Returns:
            tuple: ``(features, vectorizer)`` - the sparse document-term
            matrix and the fitted vectorizer.

        Raises:
            ValueError: If ``method`` is neither ``'tfidf'`` nor ``'count'``.
        """
        # Validate first so an unknown method fails with a clear message
        # rather than an UnboundLocalError at the return statement.
        if method not in ('tfidf', 'count'):
            raise ValueError(
                f"Unknown feature extraction method {method!r}; "
                "expected 'tfidf' or 'count'"
            )

        vectorizer_cls = TfidfVectorizer if method == 'tfidf' else CountVectorizer
        vectorizer = vectorizer_cls(
            stop_words=stop_words,
            max_features=max_features,
            ngram_range=(1, 2)
        )
        features = vectorizer.fit_transform(self.combined_data['keperluan_cleaned'])

        return features, vectorizer

    def clustering_models(self, features):
        """Fit K-means, DBSCAN and a Gaussian mixture on ``features``.

        Args:
            features: Document-term matrix (sparse or dense).

        Returns:
            dict: Maps ``'kmeans'``, ``'dbscan'`` (only when it succeeded) and
            ``'gmm'`` to a dict with ``'model'``, ``'labels'`` and
            ``'silhouette_score'``.
        """
        results = {}

        # K-means Clustering
        kmeans = KMeans(n_clusters=5, random_state=42)
        kmeans_labels = kmeans.fit_predict(features)
        results['kmeans'] = {
            'model': kmeans,
            'labels': kmeans_labels,
            'silhouette_score': silhouette_score(features, kmeans_labels)
        }

        # DBSCAN is best-effort: with these fixed parameters it may yield a
        # single label, for which the silhouette score is undefined.
        dbscan = DBSCAN(eps=0.5, min_samples=5)
        try:
            dbscan_labels = dbscan.fit_predict(features)
            results['dbscan'] = {
                'model': dbscan,
                'labels': dbscan_labels,
                'silhouette_score': silhouette_score(features, dbscan_labels)
            }
        except Exception as exc:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C still works.
            print(f"DBSCAN clustering failed: {exc}")

        # GaussianMixture only accepts dense input, while the vectorizers
        # return sparse matrices.
        dense_features = features.toarray() if hasattr(features, 'toarray') else features
        gmm = GaussianMixture(n_components=5, random_state=42)
        gmm_labels = gmm.fit_predict(dense_features)
        results['gmm'] = {
            'model': gmm,
            'labels': gmm_labels,
            'silhouette_score': silhouette_score(features, gmm_labels)
        }

        return results

    def classification_models(self, features):
        """Grid-search an SVM and a random forest that predict ``izin``.

        Prints a classification report for each tuned model on a held-out
        20% test split.

        Args:
            features: Document-term matrix aligned with ``self.combined_data``.

        Returns:
            dict: ``{'svm': GridSearchCV, 'random_forest': GridSearchCV}``,
            both already fitted.
        """
        # Encode labels
        le = LabelEncoder()
        y = le.fit_transform(self.combined_data['izin'])

        # Stratify so both permit types keep their proportions in each split.
        X_train, X_test, y_train, y_test = train_test_split(
            features, y, test_size=0.2, random_state=42, stratify=y
        )

        # Support Vector Machine
        svm_params = {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
        svm_grid = GridSearchCV(SVC(random_state=42), svm_params, cv=3)
        svm_grid.fit(X_train, y_train)

        # Random Forest
        rf_params = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20]
        }
        rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=3)
        rf_grid.fit(X_train, y_train)

        # Evaluation on the held-out split
        print("\nSVM Classification Report:")
        print(classification_report(y_test, svm_grid.predict(X_test)))

        print("\nRandom Forest Classification Report:")
        print(classification_report(y_test, rf_grid.predict(X_test)))

        return {
            'svm': svm_grid,
            'random_forest': rf_grid
        }

    def deep_learning_approach(self):
        """Train a small LSTM that predicts ``izin`` from the cleaned text.

        Prints the accuracy on a held-out 20% test split.

        Returns:
            The trained Keras ``Sequential`` model.
        """
        vocab_size = 100  # shared by the tokenizer and the embedding layer
        max_len = 20      # every sequence is padded/truncated to this length

        # Tokenization
        tokenizer = Tokenizer(num_words=vocab_size)
        tokenizer.fit_on_texts(self.combined_data['keperluan_cleaned'])
        sequences = tokenizer.texts_to_sequences(self.combined_data['keperluan_cleaned'])

        # Pad sequences
        padded_sequences = pad_sequences(sequences, maxlen=max_len)

        # Encode labels
        le = LabelEncoder()
        y = le.fit_transform(self.combined_data['izin'])

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            padded_sequences, y, test_size=0.2, random_state=42
        )

        # Build LSTM Model (single sigmoid unit: the target is binary)
        model = Sequential([
            Embedding(vocab_size, 32, input_length=max_len),
            LSTM(64),
            Dense(16, activation='relu'),
            Dense(1, activation='sigmoid')
        ])

        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Train model
        model.fit(
            X_train, y_train,
            epochs=10,
            validation_split=0.2,
            batch_size=32
        )

        # Evaluate
        test_loss, test_acc = model.evaluate(X_test, y_test)
        print(f"\nDeep Learning Model Accuracy: {test_acc}")

        return model

    def visualization(self, features):
        """Scatter-plot ``features`` projected onto two principal components.

        Args:
            features: Document-term matrix (sparse or dense).
        """
        # PCA needs dense input; accept either representation.
        dense_features = features.toarray() if hasattr(features, 'toarray') else features
        pca = PCA(n_components=2)
        pca_features = pca.fit_transform(dense_features)

        plt.figure(figsize=(10, 8))
        plt.scatter(pca_features[:, 0], pca_features[:, 1])
        plt.title('PCA Visualization of Features')
        plt.xlabel('First Principal Component')
        plt.ylabel('Second Principal Component')
        plt.show()

    def run_evaluation(self):
        """Run feature extraction, the PCA plot, clustering and classification.

        Returns:
            dict: ``{'clustering': ..., 'classification': ...}`` holding the
            results of ``clustering_models`` and ``classification_models``.
        """
        # Feature Extraction
        tfidf_features, tfidf_vectorizer = self.feature_extraction(method='tfidf')

        # Visualization
        self.visualization(tfidf_features)

        # Clustering Models
        print("\nClustering Model Evaluation:")
        clustering_results = self.clustering_models(tfidf_features)
        for name, result in clustering_results.items():
            print(f"{name.upper()} Silhouette Score: {result['silhouette_score']}")

        # Classification Models
        print("\nClassification Model Evaluation:")
        classification_results = self.classification_models(tfidf_features)

        # The LSTM model is optional; call self.deep_learning_approach() to run it.

        return {
            'clustering': clustering_results,
            'classification': classification_results
        }

# Main Execution
def main():
    """Build the evaluator for ``dataset.xlsx`` and run the full evaluation."""
    AdvancedModelEvaluation("dataset.xlsx").run_evaluation()


if __name__ == "__main__":
    main()

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, silhouette_score

# Keras/TensorFlow imports
# import tensorflow as tf
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense
# from tensorflow.keras.utils import to_categorical

In [2]:
# Common Indonesian function words that are dropped during text cleaning and
# passed to the TF-IDF vectorizer as its stop-word list.
indonesian_stopwords = {
    # prepositions and conjunctions
    'yang', 'di', 'ke', 'dari', 'pada', 'dalam', 'untuk', 'dengan', 'dan',
    'atau', 'tentang', 'jika', 'karena', 'maka', 'kepada', 'setelah',
    'sebagai', 'seperti', 'namun', 'jadi', 'melalui', 'apabila', 'sampai',
    'selain',
    # pronouns and determiners
    'sebuah', 'seorang', 'ini', 'itu', 'saya', 'anda', 'dia', 'mereka',
    'kita',
    # auxiliaries, adverbs and other frequent words
    'ada', 'tidak', 'akan', 'dapat', 'sudah', 'adalah', 'bisa', 'saat',
    'hal', 'mata', 'masih', 'sangat', 'telah', 'lebih',
}

def load_and_preprocess_data(file_path):
    """
    Load both permit sheets from an Excel file and preprocess their text

    The ``desc`` column of sheet "DATA IB" and the ``tujuan`` column of sheet
    "DATA IK" are merged into one ``keperluan`` column. The returned frame
    also carries ``izin`` ("IB"/"IK"), ``keperluan_cleaned`` (lower-cased,
    letters and whitespace only, Indonesian stopwords removed) and
    ``izin_encoded`` (integer label from ``LabelEncoder``).

    Args:
        file_path (str): Path to the Excel file

    Returns:
        pd.DataFrame: Preprocessed combined data

    Raises:
        Exception: Any loading or preprocessing error is reported on stdout
            and then re-raised unchanged.
    """
    def clean_text(text):
        # Missing cells must become empty documents; str(NaN) would otherwise
        # inject the literal token "nan" into every downstream feature.
        if pd.isna(text):
            return ''

        # Convert to lowercase
        text = str(text).lower()

        # Remove punctuation and numbers
        text = ''.join(char for char in text if char.isalpha() or char.isspace())

        # Remove stopwords
        return ' '.join(
            word for word in text.split() if word not in indonesian_stopwords
        )

    try:
        # Load data
        ib_data = pd.read_excel(file_path, sheet_name="DATA IB")
        ik_data = pd.read_excel(file_path, sheet_name="DATA IK")

        # Keep only the purpose text, under a shared column name
        ib_data = ib_data[["desc"]].rename(columns={"desc": "keperluan"})
        ik_data = ik_data[["tujuan"]].rename(columns={"tujuan": "keperluan"})

        ib_data["izin"] = "IB"
        ik_data["izin"] = "IK"

        # Combine datasets
        combined_data = pd.concat([ib_data, ik_data], ignore_index=True)

        # Text preprocessing with Indonesian-specific cleaning
        combined_data['keperluan_cleaned'] = combined_data['keperluan'].apply(clean_text)

        # Encode permit type
        le = LabelEncoder()
        combined_data['izin_encoded'] = le.fit_transform(combined_data['izin'])

        print(f"Total combined data: {len(combined_data)}")
        print(f"Unique permit types: {combined_data['izin'].unique()}")

        return combined_data

    except Exception as e:
        print(f"Error in data loading and preprocessing: {e}")
        raise

In [5]:
# Load Data: read both permit sheets from the workbook and build the cleaned,
# label-encoded frame that the following cells consume.
file_path = "dataset.xlsx"
combined_data = load_and_preprocess_data(file_path)

Total combined data: 80366
Unique permit types: ['IB' 'IK']


In [3]:
def extract_features(combined_data, max_features=500):
    """
    Turn the cleaned purpose text into a dense TF-IDF matrix

    Unigrams and bigrams are used, and the module-level Indonesian stopword
    set is passed to the vectorizer. Any failure is reported on stdout and
    re-raised.

    Args:
        combined_data (pd.DataFrame): Frame with a 'keperluan_cleaned' column
        max_features (int): Upper bound on the vocabulary size

    Returns:
        tuple: (dense TF-IDF array, fitted TfidfVectorizer)
    """
    try:
        tfidf = TfidfVectorizer(
            ngram_range=(1, 2),
            stop_words=list(indonesian_stopwords),
            max_features=max_features,
        )
        sparse_scores = tfidf.fit_transform(combined_data['keperluan_cleaned'])
        # Later steps (scaling, clustering) work on a dense array.
        dense_scores = sparse_scores.toarray()
        print(f"TF-IDF Matrix shape: {dense_scores.shape}")
    except Exception as e:
        print(f"Error in feature extraction: {e}")
        raise

    return dense_scores, tfidf

In [7]:
# Feature Extraction: dense TF-IDF matrix (default max_features=500) plus the
# fitted vectorizer.
tfidf_matrix, vectorizer = extract_features(combined_data)

TF-IDF Matrix shape: (80366, 500)


In [4]:
def perform_clustering(tfidf_matrix, n_clusters=5, eps=0.5, min_samples=5,
                       random_state=42, silhouette_sample_size=None):
    """
    Perform multiple clustering approaches

    K-means is fitted on the raw TF-IDF matrix. DBSCAN is fitted on a
    standardised copy and is best-effort: it is only reported when it finds
    at least two real clusters. Errors are printed and whatever results were
    collected so far are returned.

    Args:
        tfidf_matrix (np.ndarray): TF-IDF feature matrix
        n_clusters (int): Number of K-means clusters
        eps (float): DBSCAN neighbourhood radius
        min_samples (int): Upper bound for DBSCAN ``min_samples``; the value
            actually used is lowered to a tenth of the row count for small
            inputs, but never below 1
        random_state (int): Seed for K-means and for silhouette subsampling
        silhouette_sample_size (int | None): If given, silhouette scores are
            estimated on a random subset of this size (the exact score is
            quadratic in the number of rows). ``None`` scores every row.

    Returns:
        dict: Maps 'KMeans' and, when available, 'DBSCAN' to a dict with
        'labels' and 'silhouette_score'
    """
    results = {}

    def score(matrix, labels):
        return silhouette_score(
            matrix, labels,
            sample_size=silhouette_sample_size,
            random_state=random_state,
        )

    try:
        # K-means Clustering
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
        kmeans_labels = kmeans.fit_predict(tfidf_matrix)
        results['KMeans'] = {
            'labels': kmeans_labels,
            'silhouette_score': score(tfidf_matrix, kmeans_labels)
        }

        # DBSCAN with adaptive parameters
        try:
            scaled_matrix = StandardScaler().fit_transform(tfidf_matrix)
            # Shrink min_samples for tiny inputs, but keep it >= 1: DBSCAN
            # rejects 0, which the plain n // 10 produced for fewer than 10 rows.
            effective_min_samples = max(1, min(min_samples, len(tfidf_matrix) // 10))
            dbscan = DBSCAN(eps=eps, min_samples=effective_min_samples)
            dbscan_labels = dbscan.fit_predict(scaled_matrix)

            # Label -1 marks noise, not a cluster; scoring it as one would
            # distort the silhouette, so only clustered points are scored.
            clustered = dbscan_labels != -1
            if len(set(dbscan_labels[clustered])) > 1:
                results['DBSCAN'] = {
                    'labels': dbscan_labels,
                    'silhouette_score': score(
                        tfidf_matrix[clustered], dbscan_labels[clustered]
                    )
                }
            else:
                print("DBSCAN found fewer than two clusters; silhouette score skipped")
        except Exception as e:
            print("DBSCAN encountered an issue:", e)

        return results

    except Exception as e:
        print(f"Clustering error: {e}")
        return results

In [10]:
# Clustering: maps each successful model name to its labels and silhouette score.
clustering_results = perform_clustering(tfidf_matrix)

In [5]:
def perform_classification(tfidf_matrix, combined_data):
    """
    Perform classification using multiple models

    A linear SVM and a random forest are trained to predict the encoded
    permit type. Each model is cross-validated (5 folds) on the training
    split and then evaluated on a stratified 20% hold-out split. A model that
    raises is reported on stdout and left out of the result.

    Args:
        tfidf_matrix (np.ndarray): TF-IDF feature matrix
        combined_data (pd.DataFrame): Preprocessed data with an
            'izin_encoded' column aligned row-by-row with ``tfidf_matrix``

    Returns:
        dict: Maps 'SVM' / 'Random Forest' to a dict with 'cross_val_scores',
        'mean_cv_score' and 'classification_report'
    """
    # Imported locally so the block does not depend on the notebook's import cell.
    from sklearn.pipeline import make_pipeline

    X = tfidf_matrix
    y = combined_data['izin_encoded']

    # Split the data with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Classification models
    models = {
        'SVM': SVC(kernel='linear', class_weight='balanced'),
        'Random Forest': RandomForestClassifier(
            n_estimators=100,
            class_weight='balanced',
            random_state=42
        )
    }

    results = {}
    for name, model in models.items():
        try:
            # Scaling lives inside the pipeline so that each CV fold fits the
            # scaler on its own training part only; scaling the whole training
            # set up front leaks validation-fold statistics into the scores.
            pipeline = make_pipeline(StandardScaler(), model)

            # Cross-validation
            cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)

            # Fit on the full training split (scaler + model)
            pipeline.fit(X_train, y_train)

            # Predictions on the held-out split
            y_pred = pipeline.predict(X_test)

            results[name] = {
                'cross_val_scores': cv_scores,
                'mean_cv_score': cv_scores.mean(),
                'classification_report': classification_report(y_test, y_pred)
            }
        except Exception as e:
            print(f"Error in {name} classification: {e}")

    return results

In [17]:
# Classification: call the function. Assigning the bare name only bound the
# function object, so no model was trained and no results dict was produced.
classification_results = perform_classification(tfidf_matrix, combined_data)

In [9]:
def identify_business_opportunities(combined_data):
    """
    List the most frequent cleaned purposes as candidate opportunities

    The 15 most common values of 'keperluan_cleaned' are printed with their
    frequencies and returned. On any error a message is printed and None is
    returned instead.

    Args:
        combined_data (pd.DataFrame): Frame with a 'keperluan_cleaned' column

    Returns:
        pd.Series: Frequency per purpose, most frequent first (None on error)
    """
    try:
        # value_counts already sorts by descending frequency
        ranking = combined_data['keperluan_cleaned'].value_counts().head(15)

        print("\nTop Business Opportunity Suggestions:")
        for text, occurrences in ranking.items():
            print(f"Purpose: {text}, Frequency: {occurrences}")

        return ranking
    except Exception as e:
        print(f"Business opportunity analysis error: {e}")
        return None

def visualize_results(clustering_results, classification_results):
    """
    Visualize clustering and classification results

    Draws two bar charts side by side: the silhouette score per clustering
    model and the mean cross-validation score per classifier. Entries that
    lack the expected key are plotted as 0. Any plotting error is printed
    rather than raised.

    Args:
        clustering_results (dict): Model name -> dict with 'silhouette_score'
        classification_results (dict): Model name -> dict with 'mean_cv_score'
    """
    try:
        plt.figure(figsize=(15, 6))

        # Silhouette Scores
        plt.subplot(1, 2, 1)
        # Materialise names as a list: bar() expects a sequence, not a dict view.
        cluster_names = list(clustering_results)
        silhouette_data = [
            clustering_results[name].get('silhouette_score', 0)
            for name in cluster_names
        ]
        plt.bar(cluster_names, silhouette_data)
        plt.title('Clustering Silhouette Scores')
        plt.ylabel('Silhouette Score')
        # Silhouette scores range over [-1, 1]; widen the axis when a score is
        # negative so the bar is not clipped away.
        lower_bound = -1 if any(value < 0 for value in silhouette_data) else 0
        plt.ylim(lower_bound, 1)

        # Classification Accuracy
        plt.subplot(1, 2, 2)
        classifier_names = list(classification_results)
        cv_scores = [
            classification_results[name].get('mean_cv_score', 0)
            for name in classifier_names
        ]
        plt.bar(classifier_names, cv_scores)
        plt.title('Classification Cross-Validation Scores')
        plt.ylabel('Mean CV Score')
        plt.ylim(0, 1)

        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"Visualization error: {e}")

# Main execution function
def run_complete_analysis(file_path):
    """
    Run the whole pipeline end to end on one workbook

    Steps, in order: load and preprocess, TF-IDF feature extraction,
    clustering, classification, business-opportunity ranking, and plotting.
    The deep-learning step is currently disabled. If any step raises, the
    error is printed and None is returned.

    Args:
        file_path (str): Path to the Excel file

    Returns:
        dict: Keys 'combined_data', 'tfidf_matrix', 'vectorizer',
        'clustering', 'classification' and 'business_opportunities'
        (None on error)
    """
    try:
        data = load_and_preprocess_data(file_path)
        matrix, fitted_vectorizer = extract_features(data)
        clusters = perform_clustering(matrix)
        classifiers = perform_classification(matrix, data)
        opportunities = identify_business_opportunities(data)
        visualize_results(clusters, classifiers)
    except Exception as e:
        print(f"Complete analysis error: {e}")
        return None

    return {
        'combined_data': data,
        'tfidf_matrix': matrix,
        'vectorizer': fitted_vectorizer,
        'clustering': clusters,
        'classification': classifiers,
        'business_opportunities': opportunities,
    }