# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    silhouette_score
)

# Machine Learning Models
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Deep Learning and Advanced Techniques
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# In [None]:
class AdvancedMLProject:
    """Compare clustering, classical and LSTM models on permit-request texts.

    Two sheets of an Excel workbook are merged into ``self.combined_data``
    with the columns ``keperluan`` (raw text), ``izin`` ("IB" or "IK"),
    ``keperluan_cleaned`` (normalised text) and ``izin_encoded`` (integer
    label). The fitted label encoder is kept as ``self.label_encoder`` so the
    integer codes can be mapped back to permit names.
    """

    def __init__(self, file_path, ib_sheet="DATA IB", ik_sheet="DATA IK"):
        """Load both sheets from *file_path* and build the combined table.

        Args:
            file_path: Path of the Excel workbook.
            ib_sheet: Name of the sheet providing the "IB" rows.
            ik_sheet: Name of the sheet providing the "IK" rows.
        """
        self.ib_data = pd.read_excel(file_path, sheet_name=ib_sheet)
        self.ik_data = pd.read_excel(file_path, sheet_name=ik_sheet)

        self.prepare_data()

    @staticmethod
    def _clean_text(series):
        """Return *series* as lower-cased, stripped strings; missing -> ""."""
        # Missing cells and non-string cells would otherwise turn into NaN
        # under the ``.str`` accessor, and NaN documents make both the TF-IDF
        # vectorizer and the Keras tokenizer raise.
        filled = series.where(series.notna(), "")
        return filled.astype(str).str.lower().str.strip()

    @staticmethod
    def _split(features, labels):
        """Split 80/20 with seed 42, stratified by label when possible."""
        class_counts = pd.Series(labels).value_counts()
        # Stratification needs at least two rows of every class; otherwise
        # fall back to the plain random split.
        can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
        return train_test_split(
            features,
            labels,
            test_size=0.2,
            random_state=42,
            stratify=labels if can_stratify else None,
        )

    def prepare_data(self):
        """Build ``self.combined_data`` from ``self.ib_data`` / ``self.ik_data``.

        Reads the ``desc`` column of the IB sheet and the ``tujuan`` column of
        the IK sheet, stacks them under the common name ``keperluan``, tags
        each row with its permit type and adds the cleaned text and the
        integer-encoded label.
        """
        ib_data = (
            self.ib_data[["desc"]]
            .rename(columns={"desc": "keperluan"})
            .assign(izin="IB")
        )
        ik_data = (
            self.ik_data[["tujuan"]]
            .rename(columns={"tujuan": "keperluan"})
            .assign(izin="IK")
        )

        self.combined_data = pd.concat([ib_data, ik_data], ignore_index=True)

        self.combined_data['keperluan_cleaned'] = self._clean_text(
            self.combined_data['keperluan']
        )

        # Keep the encoder so predictions can be decoded with
        # ``self.label_encoder.inverse_transform``.
        self.label_encoder = LabelEncoder()
        self.combined_data['izin_encoded'] = self.label_encoder.fit_transform(
            self.combined_data['izin']
        )

    def feature_extraction(self, max_features=100):
        """Vectorise the cleaned text with TF-IDF.

        Args:
            max_features: Upper bound on the vocabulary size.

        Returns:
            A ``(tfidf_matrix, vectorizer)`` tuple: the dense TF-IDF array and
            the fitted ``TfidfVectorizer``.
        """
        vectorizer = TfidfVectorizer(max_features=max_features)
        tfidf_matrix = vectorizer.fit_transform(
            self.combined_data['keperluan_cleaned']
        ).toarray()

        return tfidf_matrix, vectorizer

    def multiple_clustering_approaches(self, tfidf_matrix, n_clusters=5):
        """Cluster *tfidf_matrix* with K-means and DBSCAN.

        Args:
            tfidf_matrix: Feature matrix to cluster.
            n_clusters: Number of K-means clusters.

        Returns:
            Dict keyed by algorithm name; each value holds ``'labels'`` and
            ``'silhouette_score'``. The ``'DBSCAN'`` entry is omitted when
            DBSCAN or its scoring fails (a message is printed instead).
        """
        results = {}

        # n_init is pinned because its default differs between scikit-learn
        # releases, which would change the clustering for the same seed.
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        kmeans_labels = kmeans.fit_predict(tfidf_matrix)
        results['KMeans'] = {
            'labels': kmeans_labels,
            'silhouette_score': silhouette_score(tfidf_matrix, kmeans_labels)
        }

        # DBSCAN is best-effort: silhouette_score raises when every point
        # lands in a single label, which is common with a fixed eps.
        try:
            dbscan = DBSCAN(eps=0.5, min_samples=3)
            dbscan_labels = dbscan.fit_predict(tfidf_matrix)
            results['DBSCAN'] = {
                'labels': dbscan_labels,
                'silhouette_score': silhouette_score(tfidf_matrix, dbscan_labels)
            }
        except Exception as e:
            print("DBSCAN encountered an issue:", e)

        return results

    def classification_models(self, tfidf_matrix):
        """Train and evaluate a linear SVM and a random forest on the permit label.

        Args:
            tfidf_matrix: Feature matrix aligned row-for-row with
                ``self.combined_data``.

        Returns:
            Dict keyed by model name; each value holds ``'cross_val_scores'``
            (5-fold scores on the training split), ``'mean_cv_score'`` and
            ``'classification_report'`` (text report on the test split).
        """
        X = tfidf_matrix
        y = self.combined_data['izin_encoded']

        X_train, X_test, y_train, y_test = self._split(X, y)

        models = {
            'SVM': SVC(kernel='linear'),
            # Seeded so repeated runs give the same forest.
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
        }

        results = {}
        for name, model in models.items():
            cv_scores = cross_val_score(model, X_train, y_train, cv=5)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            results[name] = {
                'cross_val_scores': cv_scores,
                'mean_cv_score': cv_scores.mean(),
                'classification_report': classification_report(y_test, y_pred)
            }

        return results

    def deep_learning_approach(self, vocab_size=100, max_len=20, epochs=10):
        """Train an Embedding + LSTM binary classifier on the cleaned text.

        Args:
            vocab_size: Tokenizer ``num_words`` and embedding input dimension.
            max_len: Length every token sequence is padded/truncated to.
            epochs: Number of training epochs.

        Returns:
            Dict with the trained ``'model'``, the test ``'loss'`` and
            ``'accuracy'``, and the Keras ``'training_history'`` object.
        """
        texts = self.combined_data['keperluan_cleaned']

        tokenizer = Tokenizer(num_words=vocab_size)
        tokenizer.fit_on_texts(texts)

        sequences = tokenizer.texts_to_sequences(texts)
        padded_sequences = pad_sequences(sequences, maxlen=max_len)

        # Plain NumPy labels avoid index-alignment surprises inside Keras.
        y = self.combined_data['izin_encoded'].to_numpy()

        X_train, X_test, y_train, y_test = self._split(padded_sequences, y)

        # Single sigmoid unit: the label has exactly two values (IB / IK).
        model = Sequential([
            Embedding(vocab_size, 32, input_length=max_len),
            LSTM(64),
            Dense(16, activation='relu'),
            Dense(1, activation='sigmoid')
        ])

        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        history = model.fit(
            X_train, y_train,
            epochs=epochs,
            validation_split=0.2,
            verbose=0
        )

        loss, accuracy = model.evaluate(X_test, y_test)

        return {
            'model': model,
            'loss': loss,
            'accuracy': accuracy,
            'training_history': history
        }

    def visualize_results(self, clustering_results, classification_results):
        """Show side-by-side bar charts of silhouette and mean CV scores.

        Args:
            clustering_results: Output of ``multiple_clustering_approaches``.
            classification_results: Output of ``classification_models``.
        """
        plt.figure(figsize=(12, 6))

        # Left panel: one bar per clustering algorithm.
        plt.subplot(1, 2, 1)
        silhouette_data = [
            entry['silhouette_score'] for entry in clustering_results.values()
        ]
        # Explicit lists: bar() is handed concrete sequences, not dict views.
        plt.bar(list(clustering_results), silhouette_data)
        plt.title('Clustering Silhouette Scores')
        plt.ylabel('Silhouette Score')

        # Right panel: one bar per classifier.
        plt.subplot(1, 2, 2)
        cv_scores = [
            entry['mean_cv_score'] for entry in classification_results.values()
        ]
        plt.bar(list(classification_results), cv_scores)
        plt.title('Classification Cross-Validation Scores')
        plt.ylabel('Mean CV Score')

        plt.tight_layout()
        plt.show()

    def run_project(self):
        """Run feature extraction, clustering, classification, LSTM and plotting.

        Returns:
            Dict with the keys ``'clustering'``, ``'classification'`` and
            ``'deep_learning'`` holding each stage's result dict.
        """
        tfidf_matrix, vectorizer = self.feature_extraction()

        clustering_results = self.multiple_clustering_approaches(tfidf_matrix)
        classification_results = self.classification_models(tfidf_matrix)
        deep_learning_results = self.deep_learning_approach()

        self.visualize_results(clustering_results, classification_results)

        return {
            'clustering': clustering_results,
            'classification': classification_results,
            'deep_learning': deep_learning_results
        }

# Main Execution
def main():
    """Run the whole pipeline on ``dataset.xlsx`` and print each stage's metrics."""
    outcome = AdvancedMLProject("dataset.xlsx").run_project()

    # One silhouette score per clustering algorithm that produced a result.
    print("\nClustering Results:")
    for algorithm_name, cluster_info in outcome['clustering'].items():
        print(f"{algorithm_name} - Silhouette Score: {cluster_info['silhouette_score']}")

    # Mean cross-validation score plus the held-out report for each classifier.
    print("\nClassification Results:")
    for model_name, model_info in outcome['classification'].items():
        print(f"{model_name}:")
        print(f"  Mean CV Score: {model_info['mean_cv_score']}")
        print("  Classification Report:")
        print(model_info['classification_report'])

    # Test-set metrics of the LSTM model.
    print("\nDeep Learning Results:")
    lstm_metrics = outcome['deep_learning']
    print(f"Accuracy: {lstm_metrics['accuracy']}")
    print(f"Loss: {lstm_metrics['loss']}")

# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()