# In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, silhouette_score, confusion_matrix
from sklearn.cluster import KMeans, AgglomerativeClustering
import umap.umap_ as umap
from scipy.cluster.hierarchy import dendrogram, linkage
import pickle
import numpy as np

# Load datasets
# Each CSV is read in full; the 'name' column is then removed when it exists
# (errors='ignore' turns the drop into a no-op when it does not).
dataset1 = pd.read_csv('dataset_to_447109517.csv')
dataset1 = dataset1.drop(columns=['name'], errors='ignore')
dataset2 = pd.read_csv('kaggle_Int_755833365.csv')
dataset2 = dataset2.drop(columns=['name'], errors='ignore')

# Data Preprocessing Functions
def drop_high_missing_columns(data, threshold=0.5):
    """
    Return ``data`` without the columns that are mostly missing.

    The missing fraction of a column is the mean of ``data.isnull()`` for that
    column. A column is removed only when its fraction is strictly greater
    than ``threshold``. The removed column names are printed before the drop.
    The input frame itself is not modified.
    """
    null_fraction = data.isnull().mean()
    columns_to_drop = null_fraction.loc[null_fraction > threshold].index
    print(f"Dropping columns with more than {threshold*100}% missing values: {columns_to_drop.tolist()}")
    return data.drop(columns=columns_to_drop)

def encode_non_numeric(data):
    """
    Return a copy of ``data`` with every object-dtype column integer-encoded.

    Each object column is cast to ``str`` and its distinct strings are numbered
    0..k-1 in sorted order, which is the numbering
    ``LabelEncoder.fit_transform`` produces for the same strings. Codes are
    computed independently per column and stored as int64.

    The input frame is left untouched; callers must use the returned frame.

    Note: the ``str`` cast happens before encoding, so a missing value that the
    cast turns into a string (e.g. ``'nan'``) is encoded as an ordinary category.
    """
    # Copy first so the caller's DataFrame is never modified in place.
    encoded = data.copy()
    for col in encoded.select_dtypes(include=['object']).columns:
        # sort=True numbers the unique strings in sorted order.
        codes, _ = pd.factorize(encoded[col].astype(str), sort=True)
        # Force int64 so later select_dtypes(['float64', 'int64']) picks the column up.
        encoded[col] = codes.astype('int64')
    return encoded

def preprocess_data(data, columns_to_scale=None, categorical_columns=None, scaling_method='standard'):
    """
    Encode, impute and scale a DataFrame, returning a new frame.

    Steps, in order:
      1. Object-dtype columns are integer-encoded via ``encode_non_numeric``.
      2. Columns of ``columns_to_scale`` that exist in the frame get their
         missing values replaced by the column mean.
      3. Columns of ``categorical_columns`` that exist in the frame get their
         missing values replaced by the column mode (skipped for a column
         that has no mode, i.e. no non-missing value).
      4. The existing ``columns_to_scale`` are scaled with the chosen scaler.

    Columns with many missing values are NOT dropped here; call
    ``drop_high_missing_columns`` beforehand if that is wanted.

    Parameters:
        data: DataFrame to preprocess. It is copied, not modified.
        columns_to_scale: column names to mean-impute and scale; names not
            present in the frame are ignored.
        categorical_columns: column names to mode-impute; names not present
            in the frame are ignored.
        scaling_method: 'standard', 'minmax' or 'robust'.

    Returns:
        The preprocessed DataFrame.

    Raises:
        ValueError: if ``columns_to_scale`` is given and ``scaling_method`` is
            not one of the three supported names.
    """
    scalers = {
        'standard': StandardScaler,
        'minmax': MinMaxScaler,
        'robust': RobustScaler,
    }
    # As before, the method is only validated when scaling was requested.
    if columns_to_scale and scaling_method not in scalers:
        raise ValueError("Invalid scaling method. Choose 'standard', 'minmax', or 'robust'.")

    # Work on a copy so the caller's frame is never modified in place.
    data = encode_non_numeric(data.copy())

    # Resolve the requested columns once; unknown names are ignored.
    valid_columns = []
    if columns_to_scale:
        valid_columns = [col for col in columns_to_scale if col in data.columns]

    # Fill missing values for numeric columns
    if valid_columns:
        data[valid_columns] = data[valid_columns].fillna(data[valid_columns].mean())

    # Fill missing values for categorical columns
    if categorical_columns:
        for col in categorical_columns:
            if col not in data.columns:
                continue
            modes = data[col].mode()
            # mode() is empty when the column has no non-missing value.
            if not modes.empty:
                data[col] = data[col].fillna(modes.iloc[0])

    # Scale numeric columns; skipped when none of the requested columns exist,
    # because the scalers reject a frame with zero columns.
    if valid_columns:
        scaler = scalers[scaling_method]()
        data[valid_columns] = scaler.fit_transform(data[valid_columns])

    return data


# Preprocess both datasets
def _numeric_column_names(frame):
    """List the names of the float64 / int64 columns of ``frame``."""
    return frame.select_dtypes(include=['float64', 'int64']).columns.tolist()


# The columns handed to the scaler are those that are numeric BEFORE encoding;
# the list is refreshed afterwards so it also covers the encoded columns.
# NOTE(review): a target column that is already numeric is in this list and
# will be scaled too - confirm that is intended.
columns_to_scale1 = _numeric_column_names(dataset1)
dataset1 = preprocess_data(dataset1, columns_to_scale=columns_to_scale1)
columns_to_scale1 = _numeric_column_names(dataset1)  # Recalculate after preprocessing

columns_to_scale2 = _numeric_column_names(dataset2)
dataset2 = preprocess_data(dataset2, columns_to_scale=columns_to_scale2)
columns_to_scale2 = _numeric_column_names(dataset2)  # Recalculate after preprocessing

# PCA
# Fit a full PCA on the numeric feature columns of each dataset, keep the
# projected data under the dataset's name, and plot the explained variance.
pca_target_columns = {'Dataset 1': 'PROFILE', 'Dataset 2': 'group'}
pca_results = {}
for dataset, name in zip([dataset1, dataset2], ['Dataset 1', 'Dataset 2']):
    # Leave the label out of the projection: the components are used later as
    # classifier inputs, and including the target would leak it into them.
    feature_frame = dataset.drop(columns=[pca_target_columns[name]], errors='ignore')
    numeric_data = feature_frame.select_dtypes(include=['float64', 'int64']).dropna()
    if numeric_data.empty:
        print(f"Skipping PCA for {name} as no numeric data is available.")
        continue
    if len(numeric_data) != len(dataset):
        # Downstream code pairs these rows with the dataset's rows, so make
        # any silent row loss visible.
        print(f"Warning: {len(dataset) - len(numeric_data)} rows with missing values were dropped before PCA for {name}.")

    pca = PCA()
    pca_result = pca.fit_transform(StandardScaler().fit_transform(numeric_data))

    explained_variance_ratio = pca.explained_variance_ratio_
    cumulative_variance = explained_variance_ratio.cumsum()

    pca_results[name] = pca_result

    # Bars: variance share of each component; step line: running total.
    plt.figure()
    plt.bar(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, alpha=0.5, label='Individual Variance')
    plt.step(range(1, len(cumulative_variance) + 1), cumulative_variance, where='mid', label='Cumulative Variance')
    plt.title(f"{name} - PCA Explained Variance")
    plt.xlabel('Principal Components')
    plt.ylabel('Variance Ratio')
    plt.legend()
    plt.show()

# UMAP
# Embed the numeric feature columns of each dataset in 2D, plot the embedding,
# and keep it under the dataset's name.
umap_target_columns = {'Dataset 1': 'PROFILE', 'Dataset 2': 'group'}
umap_results = {}
for dataset, name in zip([dataset1, dataset2], ['Dataset 1', 'Dataset 2']):
    # Leave the label out of the embedding: it is used later as classifier
    # input, and including the target would leak it into the features.
    feature_frame = dataset.drop(columns=[umap_target_columns[name]], errors='ignore')
    numeric_data = feature_frame.select_dtypes(include=['float64', 'int64']).dropna()
    if numeric_data.empty:
        print(f"Skipping UMAP for {name} as no numeric data is available.")
        continue
    if len(numeric_data) != len(dataset):
        # Downstream code pairs these rows with the dataset's rows, so make
        # any silent row loss visible.
        print(f"Warning: {len(dataset) - len(numeric_data)} rows with missing values were dropped before UMAP for {name}.")

    umap_model = umap.UMAP(n_components=2, random_state=42)
    umap_result = umap_model.fit_transform(StandardScaler().fit_transform(numeric_data))

    plt.figure()
    plt.scatter(umap_result[:, 0], umap_result[:, 1], s=10, alpha=0.7)
    plt.title(f"{name} - UMAP 2D")
    plt.show()

    umap_results[name] = umap_result

# Elbow and Silhouette Scores with Dendrograms
# For each dataset, three representations (original numeric columns, PCA
# embedding, UMAP embedding) are standardized and explored with K-Means for
# k = 2..10, a Ward-linkage dendrogram, and a K-Means scatter at optimal_k.
optimal_k = 4  # Replace with the determined optimal number of clusters
for dataset, name in zip([dataset1, dataset2], ['Dataset 1', 'Dataset 2']):
    # Look embeddings up by dataset name. pca_results / umap_results have no
    # entry for a dataset whose reduction was skipped, so pairing them by
    # position could attach another dataset's embedding to this one.
    representations = [
        ('Original', dataset.select_dtypes(include=['float64', 'int64'])),
        ('PCA-Reduced', pca_results.get(name)),
        ('UMAP-Reduced', umap_results.get(name)),
    ]
    for data_name, data in representations:
        if data is None or data.shape[0] == 0 or data.shape[1] == 0:
            print(f"Skipping clustering for {name} - {data_name} Data as no data is available.")
            continue

        scaled_data = StandardScaler().fit_transform(data)

        # Elbow Method
        cluster_counts = range(2, 11)
        inertias = []
        silhouettes = []
        for n_clusters in cluster_counts:
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            labels = kmeans.fit_predict(scaled_data)
            inertias.append(kmeans.inertia_)
            silhouettes.append(silhouette_score(scaled_data, labels))

        plt.figure()
        plt.plot(cluster_counts, inertias, marker='o', label='Inertia')
        plt.title(f"{name} - {data_name} Data - Elbow Plot")
        plt.xlabel('Number of Clusters')
        plt.ylabel('Inertia')
        plt.legend()
        plt.show()

        plt.figure()
        plt.plot(cluster_counts, silhouettes, marker='o', label='Silhouette Score')
        plt.title(f"{name} - {data_name} Data - Silhouette Scores")
        plt.xlabel('Number of Clusters')
        plt.ylabel('Silhouette Score')
        plt.legend()
        plt.show()

        # Dendrogram for Hierarchical Clustering
        linkage_matrix = linkage(scaled_data, method='ward')
        plt.figure(figsize=(10, 7))
        dendrogram(linkage_matrix)
        plt.title(f"{name} - {data_name} Data - Hierarchical Clustering Dendrogram")
        plt.xlabel('Samples')
        plt.ylabel('Distance')
        plt.show()

        # K-Means Clusters Plot
        kmeans = KMeans(n_clusters=optimal_k, random_state=42)
        labels = kmeans.fit_predict(scaled_data)

        # The scatter uses the first two columns, so it needs at least two.
        if scaled_data.shape[1] >= 2:
            plt.figure()
            plt.scatter(scaled_data[:, 0], scaled_data[:, 1], c=labels, cmap='viridis', s=10)
            plt.title(f"{name} - {data_name} Data - K-Means Clustering")
            plt.xlabel('Feature 1')
            plt.ylabel('Feature 2')
            plt.colorbar(label='Cluster')
            plt.show()
        else:
            print(f"Insufficient dimensions for K-Means plot: {name} - {data_name} Data.")
# Evaluation for Original, PCA, and UMAP Data
# For each dataset that has its target column, AdaBoost and Random Forest are
# trained on three feature sets (original columns, PCA embedding, UMAP
# embedding) and scored on a 30% hold-out. The hold-out features are then
# clustered (K-Means, Ward dendrogram, elbow / silhouette curves).
for dataset, name, target_column in zip([dataset1, dataset2], ['Dataset 1', 'Dataset 2'], ['PROFILE', 'group']):
    if target_column not in dataset.columns:
        print(f"Skipping evaluation for {name}: target column '{target_column}' not found.")
        continue

    X = dataset.drop(columns=[target_column])
    y = dataset[target_column]

    # Look embeddings up by dataset name. pca_results / umap_results have no
    # entry for a dataset whose reduction was skipped, so pairing them by
    # position could attach another dataset's embedding to this target.
    # NOTE(review): the embeddings are used as-is; confirm they were fitted
    # without the target column, otherwise the reduced-data scores are optimistic.
    feature_sets = [
        ('Original', X),
        ('PCA-Reduced', pca_results.get(name)),
        ('UMAP-Reduced', umap_results.get(name)),
    ]
    for data_name, data in feature_sets:
        if data is None:
            print(f"Skipping {name} - {data_name} Data: no reduced data available.")
            continue
        if len(data) != len(y):
            # Rows dropped during reduction would pair features with the wrong labels.
            print(f"Skipping {name} - {data_name} Data: {len(data)} feature rows vs {len(y)} target rows.")
            continue

        X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3, random_state=42)

        ada_model = AdaBoostClassifier(n_estimators=100, random_state=42, algorithm="SAMME")
        rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

        # Train and evaluate AdaBoost
        ada_model.fit(X_train, y_train)
        y_test_pred_ada = ada_model.predict(X_test)
        # Column 1 only exists when the model saw at least two classes.
        proba_ada = ada_model.predict_proba(X_test)
        y_test_proba_ada = proba_ada[:, 1] if proba_ada.shape[1] > 1 else None

        # Train and evaluate Random Forest
        rf_model.fit(X_train, y_train)
        y_test_pred_rf = rf_model.predict(X_test)
        proba_rf = rf_model.predict_proba(X_test)
        y_test_proba_rf = proba_rf[:, 1] if proba_rf.shape[1] > 1 else None

        predictions = [("AdaBoost", y_test_pred_ada), ("Random Forest", y_test_pred_rf)]

        # Metrics
        metrics = pd.DataFrame(
            [
                [
                    model_name,
                    data_name,
                    accuracy_score(y_test, y_pred),
                    f1_score(y_test, y_pred, average='weighted'),
                    precision_score(y_test, y_pred, average='weighted'),
                    recall_score(y_test, y_pred, average='weighted'),
                ]
                for model_name, y_pred in predictions
            ],
            columns=["Model", "Data", "Accuracy", "F1-Score", "Precision", "Recall"],
        )

        print(f"\nMetrics for {name} - {data_name} Data:")
        print(metrics)

        # Confusion Matrices
        for model_name, y_pred in predictions:
            cm = confusion_matrix(y_test, y_pred)
            plt.figure()
            sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
            plt.title(f"{model_name} - Confusion Matrix ({data_name})")
            plt.xlabel("Predicted")
            plt.ylabel("Actual")
            plt.show()

        # ROC Curve
        if len(y.unique()) == 2:  # Binary classification
            plt.figure()
            for label, y_proba in [("AdaBoost", y_test_proba_ada), ("Random Forest", y_test_proba_rf)]:
                if y_proba is not None:
                    fpr, tpr, _ = roc_curve(y_test, y_proba)
                    auc_score = roc_auc_score(y_test, y_proba)
                    plt.plot(fpr, tpr, label=f"{label} (AUC = {auc_score:.2f})")
            plt.plot([0, 1], [0, 1], 'k--', lw=1)
            plt.title(f"ROC Curve ({data_name})")
            plt.xlabel("False Positive Rate")
            plt.ylabel("True Positive Rate")
            plt.legend()
            plt.show()

        # Ensure compatibility for plotting
        if isinstance(X_test, pd.DataFrame):
            X_test_array = X_test.values  # Convert to NumPy array
        else:
            X_test_array = X_test

        # K-Means Clusters Plot
        optimal_k = 4  # Replace with the determined optimal number of clusters
        kmeans = KMeans(n_clusters=optimal_k, random_state=42)
        labels = kmeans.fit_predict(X_test_array)

        if X_test_array.shape[1] >= 2:  # Check for at least two dimensions
            plt.figure()
            plt.scatter(X_test_array[:, 0], X_test_array[:, 1], c=labels, cmap='viridis', s=10)
            plt.title(f"{name} - {data_name} Data - K-Means Clustering")
            plt.xlabel('Feature 1')
            plt.ylabel('Feature 2')
            plt.colorbar(label='Cluster')
            plt.show()
        else:
            print(f"Insufficient dimensions for K-Means plot: {name} - {data_name} Data.")

        # Dendrogram for Hierarchical Clustering
        linkage_matrix = linkage(X_test_array, method='ward')
        plt.figure(figsize=(10, 7))
        dendrogram(linkage_matrix)
        plt.title(f"{name} - {data_name} Data - Hierarchical Clustering Dendrogram")
        plt.xlabel("Sample Index")
        plt.ylabel("Distance")
        plt.show()

        # Silhouette Score and Elbow Plot
        cluster_counts = range(2, 11)
        inertias = []
        silhouettes = []
        for n_clusters in cluster_counts:
            kmeans_model = KMeans(n_clusters=n_clusters, random_state=42)
            cluster_labels = kmeans_model.fit_predict(X_test_array)
            inertias.append(kmeans_model.inertia_)
            silhouettes.append(silhouette_score(X_test_array, cluster_labels))

        plt.figure()
        plt.plot(cluster_counts, inertias, marker='o', label='Inertia')
        plt.title(f"{name} - {data_name} Data - Elbow Plot")
        plt.xlabel("Number of Clusters")
        plt.ylabel("Inertia")
        plt.legend()
        plt.show()

        plt.figure()
        plt.plot(cluster_counts, silhouettes, marker='o', label='Silhouette Score')
        plt.title(f"{name} - {data_name} Data - Silhouette Scores")
        plt.xlabel("Number of Clusters")
        plt.ylabel("Silhouette Score")
        plt.legend()
        plt.show()




# Notebook cell output (captured from a run; not part of the script):
#   warn(
#   warn(
#
# Analysis completed. Check the output_plots directory for results.
