<a href="https://colab.research.google.com/github/Propa-Punam/Wifi-RSS-Crowdsensing/blob/main/cluster/spectral_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Install required libraries
!pip install pandas numpy scikit-learn hdbscan pyclustering  # pyclustering for X-Means

# Import libraries
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN, OPTICS, MeanShift, AffinityPropagation, AgglomerativeClustering, SpectralClustering, Birch
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
import hdbscan  # For HDBSCAN
from pyclustering.cluster.xmeans import xmeans  # For X-Means
from pyclustering.utils import read_sample
from google.colab import drive
import warnings # Import the warnings module




In [1]:
# Install required libraries
!pip install pandas numpy scikit-learn

# Import libraries
import pandas as pd
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.metrics import accuracy_score
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Specify the path to your CSV file in Google Drive
file_path = '/content/drive/My Drive/student_vectors.csv'  # Adjust path if needed

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Extract RSS data (columns 3 to last)
rss_data = df.iloc[:, 2:].values  # RSS columns only
true_labels = df['room'].values   # Room labels for accuracy
student_ids = df['StudentID'].values  # Student IDs for output
rss_columns = df.columns[2:].tolist()  # Column names of RSS values

# Apply Spectral Clustering
# Note: scikit-learn requires n_clusters; using 5 as a fallback since automatic detection isn't built-in
n_clusters = 5  # Set to 5 based on your expectation; no native auto-detection in scikit-learn
clustering = SpectralClustering(n_clusters=n_clusters, affinity='rbf', random_state=42)
cluster_labels = clustering.fit_predict(rss_data)

# Number of clusters (fixed by n_clusters parameter)
print(f"Number of clusters found: {len(set(cluster_labels))}")

# Create a DataFrame for easier sorting and display
results_df = pd.DataFrame({
    'StudentID': student_ids,
    'Room': true_labels,
    'Cluster': cluster_labels
})

# Add RSS columns to results_df
for i, col in enumerate(rss_columns):
    results_df[col] = rss_data[:, i]

# Sort by cluster number
results_df = results_df.sort_values(by='Cluster')

# Print cluster assignments sorted by cluster with room distribution and RSS values
print("\nCluster assignments (sorted by cluster):")
for cluster in sorted(set(cluster_labels)):
    print(f"\nCluster {cluster}:")
    cluster_data = results_df[results_df['Cluster'] == cluster]

    # Count number of people from each room in this cluster
    room_counts = cluster_data['Room'].value_counts().to_dict()
    room_summary = {
        '203': room_counts.get('203', 0),
        '204': room_counts.get('204', 0),
        'l1': room_counts.get('l1', 0),
        'l2': room_counts.get('l2', 0),
        'l3': room_counts.get('l3', 0)
    }
    print(f"Room distribution - 203: {room_summary['203']}, 204: {room_summary['204']}, l1: {room_summary['l1']}, l2: {room_summary['l2']}, l3: {room_summary['l3']}")

    # List students in this cluster with RSS values
    for _, row in cluster_data.iterrows():
        rss_values = [f"{col}: {row[col]}" for col in rss_columns]
        print(f"StudentID: {row['StudentID']}, Room: {row['Room']}, RSS: {{{', '.join(rss_values)}}}")

# Map clusters to room labels via majority voting
cluster_to_room = {}
for cluster in set(cluster_labels):
    cluster_indices = np.where(cluster_labels == cluster)[0]
    cluster_rooms = true_labels[cluster_indices]
    most_common_room = pd.Series(cluster_rooms).mode()[0]  # Most frequent room in cluster
    cluster_to_room[cluster] = most_common_room

# Predict room labels based on clusters
predicted_labels = [cluster_to_room[label] for label in cluster_labels]

# Calculate accuracy
room_mapping = {'203': 0, '204': 1, 'l1': 2, 'l2': 3, 'l3': 4}
true_numeric = [room_mapping[label] for label in true_labels]
pred_numeric = [room_mapping[label] for label in predicted_labels]

accuracy = accuracy_score(true_numeric, pred_numeric)
print(f"\nAccuracy (students in same room grouped correctly): {accuracy * 100:.2f}%")

Mounted at /content/drive
Number of clusters found: 2

Cluster assignments (sorted by cluster):

Cluster 0:
Room distribution - 203: 23, 204: 21, l1: 1, l2: 1, l3: 1
StudentID: 2005045, Room: 203, RSS: {CSE-306: -72, CSE-304: -67, CSE-401: -77, CSE-104: -70, CSE-205: -60, CSE-G04: -73, CSE-204: -57, CSE-206: -69, CSE-303: -52, CSE-214: -100, DataLab@BUET: -66}
StudentID: 2005074, Room: 204, RSS: {CSE-306: -100, CSE-304: -61, CSE-401: -100, CSE-104: -82, CSE-205: -71, CSE-G04: -100, CSE-204: -69, CSE-206: -74, CSE-303: -64, CSE-214: -100, DataLab@BUET: -79}
StudentID: 2005094, Room: 204, RSS: {CSE-306: -100, CSE-304: -71, CSE-401: -100, CSE-104: -83, CSE-205: -72, CSE-G04: -74, CSE-204: -63, CSE-206: -100, CSE-303: -67, CSE-214: -100, DataLab@BUET: -77}
StudentID: 2005084, Room: 204, RSS: {CSE-306: -94, CSE-304: -58, CSE-401: -87, CSE-104: -80, CSE-205: -74, CSE-G04: -77, CSE-204: -54, CSE-206: -74, CSE-303: -63, CSE-214: -100, DataLab@BUET: -71}
StudentID: 2005090, Room: 204, RSS: {CSE

  return fit_method(estimator, *args, **kwargs)
