In [5]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset from the URL
data = pd.read_csv('https://drive.google.com/file/d/1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX/view')

# Omit the last attribute
data = data.iloc[:, :-1]  # This removes the last column (attribute)

# Standardize the dataset (optional but often recommended)
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Determine the optimal number of clusters using the Elbow Method
# We will plot the sum of squared distances for different values of k
wcss = []  # Within-cluster sum of squares (WCSS)
for k in range(1, 11):  # Check for cluster sizes from 1 to 10
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42)
    kmeans.fit(data_scaled)
    wcss.append(kmeans.inertia_)  # Inertia is the WCSS for the current k

# Plot the Elbow Method graph
plt.figure(figsize=(8, 6))
plt.plot(range(1, 11), wcss, marker='o')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Apply K-Means with the optimal number of clusters (e.g., from the elbow plot)
# Let's assume the elbow plot suggests k=3 clusters (this should be based on the plot you see)
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=42)
kmeans.fit(data_scaled)

# Step 6: Assign the clusters to the dataset
data['Cluster'] = kmeans.labels_

# Step 7: View the cluster centers and labels
print("Cluster Centers:")
print(kmeans.cluster_centers_)

# View the first few rows of the data with the assigned clusters
print("\nData with assigned clusters:")
print(data.head())

# Optional: Plot the clusters (if 2D data for visualization)
if data_scaled.shape[1] == 2:
    plt.figure(figsize=(8, 6))
    plt.scatter(data_scaled[:, 0], data_scaled[:, 1], c=kmeans.labels_, cmap='viridis')
    plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red', marker='x')
    plt.title('K-Means Clustering')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()


ParserError: Error tokenizing data. C error: Expected 1 fields in line 3, saw 2314
