<a href="https://colab.research.google.com/github/Propa-Punam/Wifi-RSS-Crowdsensing/blob/main/cluster/meanshift.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Install required libraries (usually pre-installed in Colab)
!pip install pandas numpy scikit-learn

# Import libraries
import pandas as pd
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.metrics import accuracy_score
from google.colab import drive

# Expose Google Drive under /content/drive so the CSV can be read like a local file
drive.mount('/content/drive')

# Location of the input CSV inside Google Drive (adjust if the file lives elsewhere)
file_path = '/content/drive/My Drive/student_vectors.csv'

# Read the whole CSV into a DataFrame
df = pd.read_csv(file_path)

# Identifier and ground-truth columns are looked up by name
student_ids = df['StudentID'].values  # Student IDs, used only for reporting
true_labels = df['room'].values       # Room labels, used only for evaluation

# Everything from the third column onward is treated as RSS readings
rss_frame = df.iloc[:, 2:]
rss_columns = list(rss_frame.columns)  # Column names of the RSS values
rss_data = rss_frame.values            # Plain array handed to the clustering code

# Number of clusters the bandwidth sweep below is aiming for
TARGET_CLUSTERS = 5

# Estimate initial bandwidth (quantile controls the scale of the estimate)
initial_bandwidth = estimate_bandwidth(rss_data, quantile=0.2)
print(f"Estimated bandwidth: {initial_bandwidth}")

# Fail early with a clear message instead of an error from inside MeanShift
if initial_bandwidth <= 0:
    raise ValueError(
        "estimate_bandwidth returned a non-positive bandwidth; "
        "MeanShift needs bandwidth > 0 (try a larger quantile)."
    )

# Candidate bandwidths: a small range around the initial estimate
bandwidth_values = [initial_bandwidth * factor for factor in [0.5, 0.75, 1.0, 1.25, 1.5]]

print(f"Testing different bandwidth values to find ~{TARGET_CLUSTERS} clusters:")
sweep_results = []  # (bandwidth, number of clusters, fitted model, labels) per candidate
for bandwidth in bandwidth_values:
    candidate = MeanShift(bandwidth=bandwidth)
    candidate_labels = candidate.fit_predict(rss_data)
    candidate_n_clusters = len(set(candidate_labels))
    print(f"bandwidth={bandwidth:.2f}: Number of clusters = {candidate_n_clusters}")
    sweep_results.append((bandwidth, candidate_n_clusters, candidate, candidate_labels))

# Pick the candidate whose cluster count is closest to the target, so the
# notebook reproduces its result on a fresh run without a hand-edited value.
# min() keeps the first minimum, so ties go to the smaller bandwidth.
# The selected candidate's fitted model and labels are reused rather than refit.
chosen_bandwidth, n_clusters, clustering, cluster_labels = min(
    sweep_results, key=lambda result: abs(result[1] - TARGET_CLUSTERS)
)

# Number of clusters
print(f"\nChosen bandwidth={chosen_bandwidth:.2f}, Number of clusters found: {n_clusters}")

# Assemble one table holding the identifiers, the ground-truth room, the assigned
# cluster, and one column per RSS reading (in the same order as rss_columns)
results_df = pd.DataFrame({
    'StudentID': student_ids,
    'Room': true_labels,
    'Cluster': cluster_labels,
    **{col: rss_data[:, i] for i, col in enumerate(rss_columns)},
})

# Order the rows by cluster number so each cluster's members sit together
results_df = results_df.sort_values(by='Cluster')

# Report every cluster in ascending order: first the per-room head count,
# then one line per student with all of that student's RSS values
print("\nCluster assignments (sorted by cluster):")
for cluster in sorted(set(cluster_labels)):
    print(f"\nCluster {cluster}:")
    in_cluster = results_df['Cluster'] == cluster
    cluster_data = results_df[in_cluster]

    # How many members of this cluster come from each of the known rooms
    room_counts = cluster_data['Room'].value_counts().to_dict()
    room_summary = {
        room: room_counts.get(room, 0)
        for room in ('203', '204', 'l1', 'l2', 'l3')
    }
    distribution = ', '.join(f"{room}: {count}" for room, count in room_summary.items())
    print(f"Room distribution - {distribution}")

    # One line per student in this cluster, RSS values shown as "column: value"
    for _, row in cluster_data.iterrows():
        rss_values = [f"{col}: {row[col]}" for col in rss_columns]
        rss_text = ', '.join(rss_values)
        print(f"StudentID: {row['StudentID']}, Room: {row['Room']}, RSS: {{{rss_text}}}")

# Map clusters to room labels via majority voting.
# mode() returns its results sorted, so a tie resolves to the first room in sort order.
cluster_to_room = {}
for cluster in set(cluster_labels):
    cluster_indices = np.where(cluster_labels == cluster)[0]
    cluster_rooms = true_labels[cluster_indices]
    most_common_room = pd.Series(cluster_rooms).mode()[0]  # Most frequent room in cluster
    cluster_to_room[cluster] = most_common_room

# Predict room labels based on clusters
predicted_labels = [cluster_to_room[label] for label in cluster_labels]

# Calculate accuracy.
# The room -> integer mapping is derived from the labels present in the data, so a
# room outside a fixed list no longer raises KeyError. Predicted labels are always
# drawn from true_labels, so the mapping covers them too.
room_mapping = {room: code for code, room in enumerate(sorted(set(true_labels), key=str))}
true_numeric = [room_mapping[label] for label in true_labels]
pred_numeric = [room_mapping[label] for label in predicted_labels]

# NOTE: the cluster -> room mapping above is built from the same true labels it is
# scored against, so this figure is an optimistic, purity-style measure.
accuracy = accuracy_score(true_numeric, pred_numeric)
print(f"\nAccuracy (students in same room grouped correctly): {accuracy * 100:.2f}%")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Estimated bandwidth: 31.89982600525316
Testing different bandwidth values to find ~5 clusters:
bandwidth=15.95: Number of clusters = 25
bandwidth=23.92: Number of clusters = 15
bandwidth=31.90: Number of clusters = 4
bandwidth=39.87: Number of clusters = 3
bandwidth=47.85: Number of clusters = 1

Chosen bandwidth=31.90, Number of clusters found: 4

Cluster assignments (sorted by cluster):

Cluster 0:
Room distribution - 203: 21, 204: 19, l1: 1, l2: 1, l3: 1
StudentID: 2005045, Room: 203, RSS: {CSE-306: -72, CSE-304: -67, CSE-401: -77, CSE-104: -70, CSE-205: -60, CSE-G04: -73, CSE-204: -57, CSE-206: -69, CSE-303: -52, CSE-214: -100, DataLab@BUET: -66}
StudentID: 2005102, Room: 204, RSS: {CSE-306: -72, CSE-304: -60, CSE-401: -81, CSE-104: -63, CSE-205: -59, CSE-G04: -80, CSE-204: -69, CSE-206: -71, CSE-303: -55, CSE-214: -100, DataLab@BUET: -66}
StudentID: 2005