In [10]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# PCA + DBSCAN pipeline for the table stored in `file_path`.
#
# Stages: load the CSV, project every column after the first onto two
# principal components, cluster the projection with DBSCAN, compute the mean
# position of each cluster, export core / border / noise rows to CSV, and
# plot the result.
#
# Error handling: each stage prints a short message and then re-raises, so a
# failure stops the cell with a traceback and later stages never run on
# missing variables. (Calling exit() inside a notebook acts on the kernel
# session rather than on the current cell, so it is not used here.)
file_path = r'normalized_data.csv'

# Step 2: Check if the file exists
print(f"Checking file at: {file_path}")

if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
else:
    print(f"File found: {file_path}")

    # Step 3: Load the data
    try:
        data = pd.read_csv(file_path)
        print("Data successfully loaded!")
    except Exception as e:
        print(f"Error loading data: {e}")
        raise

    # Step 5: Apply PCA
    try:
        pca = PCA(n_components=2)  # Reduce to 2 dimensions for visualization
        pca_data = pca.fit_transform(data.iloc[:, 1:])  # Assuming first column is non-numerical
        print("PCA applied successfully!")
        print(f"PCA Data Shape: {pca_data.shape}")
    except Exception as e:
        print(f"Error during PCA: {e}")
        raise

    # Step 6: Perform DBSCAN clustering
    try:
        # The projection must be finite before distances are computed on it;
        # np.isfinite rejects both NaN and +/-Inf.
        if not np.isfinite(pca_data).all():
            raise ValueError("PCA Data contains NaN or Inf values. Please check your input.")

        # Adjust DBSCAN parameters as needed
        dbscan = DBSCAN(eps=0.5, min_samples=5)  # Modify eps and min_samples for your dataset
        labels = dbscan.fit_predict(pca_data)

        # Debugging: Output cluster labels
        print(f"DBSCAN Labels: {labels}")

        # Add the cluster labels to the dataset so they are exported with each row
        data['Cluster'] = labels
        print("DBSCAN clustering completed!")
    except Exception as e:
        print(f"Error during clustering: {e}")
        raise

    # Step 7: Find the approximate cluster centers (mean of each cluster in PCA space)
    cluster_centers = {}
    try:
        for cluster_label in np.unique(labels):
            if cluster_label != -1:  # Ignore noise points (-1)
                cluster_points = pca_data[labels == cluster_label]
                cluster_centers[cluster_label] = np.mean(cluster_points, axis=0)

        # Debugging: Output the cluster centers
        print(f"Cluster centers: {cluster_centers}")
    except Exception as e:
        print(f"Error during cluster center calculation: {e}")
        raise

    # Step 8: Save core, border and noise rows to separate CSV files
    try:
        # Core samples are the ones DBSCAN itself reports; cluster size alone
        # does not tell whether an individual point is a core sample.
        core_mask = np.zeros(len(labels), dtype=bool)
        core_mask[dbscan.core_sample_indices_] = True

        # Noise is labelled -1; border points belong to a cluster without
        # being core samples.
        noise_mask = labels == -1
        border_mask = ~core_mask & ~noise_mask

        # Boolean-mask selection replaces the row-by-row concat, which was
        # quadratic in the number of rows.
        core_points = data[core_mask].reset_index(drop=True)
        border_points = data[border_mask].reset_index(drop=True)
        noise_points = data[noise_mask]

        # NOTE(review): machine-specific absolute path — change if necessary.
        output_dir = r'C:\Users\Mili\Desktop\Project'  # Change path if necessary
        os.makedirs(output_dir, exist_ok=True)

        # Save each category of points to CSV files
        core_file = os.path.join(output_dir, 'core_points.csv')
        border_file = os.path.join(output_dir, 'border_points.csv')
        noise_file = os.path.join(output_dir, 'noise_points.csv')

        core_points.to_csv(core_file, index=False)
        border_points.to_csv(border_file, index=False)
        noise_points.to_csv(noise_file, index=False)

        print(f"Core points saved to: {core_file}")
        print(f"Border points saved to: {border_file}")
        print(f"Noise points saved to: {noise_file}")
    except Exception as e:
        print(f"Error saving clustered data with centers: {e}")
        raise

    # Step 9: Plot DBSCAN results (best effort: a plotting failure is reported, not raised)
    try:
        plt.figure(figsize=(10, 6))

        # Plot clusters using the PCA-reduced data
        plt.scatter(pca_data[:, 0], pca_data[:, 1], c=labels, cmap='viridis', marker='o', s=50, alpha=0.6)

        # Highlight noise points (label = -1); a separate mask name keeps the
        # `noise_points` DataFrame from step 8 intact.
        noise_mask = (labels == -1)
        plt.scatter(pca_data[noise_mask, 0], pca_data[noise_mask, 1], c='red', marker='x', label='Noise', s=100)

        # Plot all cluster centers in one call so the legend gets a single
        # 'Cluster Center' entry instead of one per cluster.
        if cluster_centers:
            centers = np.vstack(list(cluster_centers.values()))
            plt.scatter(centers[:, 0], centers[:, 1], c='black', marker='x', s=200, label='Cluster Center')

        plt.title('DBSCAN Clustering Results with PCA')
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')
        plt.legend()
        plt.grid(True)
        plt.show()
    except Exception as e:
        print(f"Error during plotting: {e}")


Checking file at: normalized_data.csv
File found: normalized_data.csv
Data successfully loaded!
PCA applied successfully!
PCA Data Shape: (54675, 2)


In [13]:
import os

# Diagnostic output: report the kernel's working directory and the entries
# it contains.
current_dir = os.getcwd()
dir_entries = os.listdir()
print(f"Current working directory: {current_dir}")
print(f"Files in the current directory: {dir_entries}")

Current working directory: c:\Users\Mili\Desktop\Project
Files in the current directory: ['Main.ipynb', 'normalized_data.csv.csv']
