In [6]:
import numpy as np

class KMeans:
    """Minimal k-means clustering (Lloyd's algorithm) on NumPy arrays.

    Parameters
    ----------
    k : int, default 8
        Number of clusters / centroids.
    max_iterations : int, default 500
        Upper bound on assignment/update rounds performed by ``fit``.
    random_state : int or None, default None
        Seed for the centroid initialisation. ``None`` draws from NumPy's
        global RNG (so ``np.random.seed`` still controls the result); an
        integer gives a private, reproducible stream.

    Attributes
    ----------
    centroids : numpy.ndarray of shape (k, n_features), float
        Set by ``fit``.
    """

    def __init__(self, k=8, max_iterations=500, random_state=None):
        self.k = k
        self.max_iterations = max_iterations
        self.random_state = random_state

    def _nearest(self, X):
        """Return, for each row of ``X``, the index of the closest centroid."""
        # (k, 1, d) broadcast against (n, d) -> (k, n, d); reduce over features.
        distances = np.linalg.norm(X - self.centroids[:, np.newaxis], axis=2)
        return np.argmin(distances, axis=0)

    def fit(self, X):
        """Learn ``self.centroids`` from the rows of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Raises
        ------
        ValueError
            If ``k`` is not between 1 and ``n_samples``.
        """
        # Work in float: with an integer X the centroid array would also be
        # integer and every mean written into it would be truncated.
        X = np.asarray(X, dtype=float)
        n_samples = len(X)
        if not 1 <= self.k <= n_samples:
            raise ValueError(
                f"k must be between 1 and the number of samples ({n_samples}), got {self.k}"
            )

        if self.random_state is None:
            rng = np.random  # module-level functions share the global RNG state
        else:
            rng = np.random.RandomState(self.random_state)
        # Fancy indexing copies, so updating centroids never mutates X.
        self.centroids = X[rng.choice(n_samples, self.k, replace=False)]

        for _ in range(self.max_iterations):
            labels = self._nearest(X)

            prev_centroids = self.centroids.copy()
            for j in range(self.k):
                members = X[labels == j]
                # An empty cluster keeps its previous centroid.
                if len(members) > 0:
                    self.centroids[j] = members.mean(axis=0)

            # Converged once no centroid moved (within np.allclose tolerance).
            if np.allclose(prev_centroids, self.centroids):
                break

    def predict(self, X):
        """Return the index of the nearest fitted centroid for each row of ``X``."""
        return self._nearest(np.asarray(X, dtype=float))


In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global RNG: KMeans.fit picks its initial centroids with
# np.random.choice, so without a seed the labels change on every run.
SEED = 42
np.random.seed(SEED)

# Load only the coordinate columns. Keep the raw frame in `data` so the
# cluster labels can be attached to the original (unscaled) coordinates.
data = pd.read_csv('ufo_fullset.csv', usecols=['latitude', 'longitude'])

# Scale the features using StandardScaler (returns a NumPy array)
scaler = StandardScaler()
X = scaler.fit_transform(data)

# Create a KMeans instance with the desired number of clusters
kmeans = KMeans(k=10)

# Fit the model to the data
kmeans.fit(X)

# Predict the cluster assignments of the data
y_pred = kmeans.predict(X)

# Get the centroid points for each cluster.
# These are in standardized units because the model was fit on scaled X.
centroids = kmeans.centroids

# Print the centroid points
print(centroids)


# Add the predicted cluster assignments to the dataframe
data['cluster'] = y_pred

# Write the dataframe to a new CSV file
data.to_csv('ufoclustered.csv', index=False)


[[-1.78259144e+00 -1.75002423e+00]
 [ 1.39249358e+00  2.28970355e+00]
 [-2.13211888e-01 -5.27230825e-03]
 [-3.88019169e-01 -7.07578715e-01]
 [-6.79454993e+00  5.05287672e+00]
 [ 4.00945519e-01  1.73133301e-01]
 [-9.39109727e-01  1.41398586e-03]
 [ 1.32120923e-01 -7.49035542e-01]
 [ 1.05126200e+00 -8.84812617e-01]
 [-1.63928071e+00  4.47331299e+00]]


In [8]:
# Read 'ufoclustered.csv' back from disk and print it as a check on what
# was written (pandas prints a truncated head/tail view for long frames).
Y = pd.read_csv('ufoclustered.csv')
print(Y)

        latitude   longitude  cluster
0      47.329444 -122.578889        3
1      52.664913   -1.034894        2
2      38.951667  -92.333889        6
3      41.496944  -71.367778        9
4      47.606389 -122.330833        3
...          ...         ...      ...
17995  42.033333  -87.733333        6
17996  43.004444  -71.348889        9
17997  36.866389  -83.888889        1
17998  35.385833  -94.398333        1
17999  29.883056  -97.941111        7

[18000 rows x 3 columns]
