In [6]:
import numpy as np

class KMeans:
    """Minimal k-means clustering (Lloyd-style assign/update loop) on NumPy arrays.

    Parameters
    ----------
    k : int, default 8
        Number of clusters, i.e. number of centroids that ``fit`` produces.
    max_iterations : int, default 500
        Upper bound on the number of assign/update rounds run by ``fit``.
    random_state : int or None, default None
        Seed used to pick the initial centroids. ``None`` draws from NumPy's
        global random state (so ``np.random.seed`` still controls the result).

    Attributes
    ----------
    centroids : ndarray of shape (k, n_features)
        Cluster centres; set by ``fit``.
    """

    def __init__(self, k=8, max_iterations=500, random_state=None):
        self.k = k
        self.max_iterations = max_iterations
        self.random_state = random_state

    def _nearest_centroid(self, X):
        """Return, for each row of the 2-D array ``X``, the index of the closest centroid."""
        # Broadcast to (n_samples, k, n_features); this trades memory for speed
        # compared with looping over the samples in Python.
        deltas = X[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]
        distances = np.linalg.norm(deltas, axis=2)
        # argmin returns the first minimum, so ties go to the lowest centroid index.
        return np.argmin(distances, axis=1)

    def fit(self, X):
        """Estimate ``self.centroids`` from the samples in ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples. ``X`` itself is never modified.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or holds fewer than ``k`` samples.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got {X.ndim} dimension(s)"
            )
        # Centroids take the dtype of X. With integer input every mean written
        # back into the centroid array would be truncated, so work in floating
        # point; float32/float64 input keeps its dtype.
        if not np.issubdtype(X.dtype, np.floating):
            X = X.astype(float)

        n_samples = len(X)
        if self.k > n_samples:
            raise ValueError(
                f"k={self.k} clusters requested but X has only {n_samples} samples"
            )

        # Initial centroids are k distinct samples chosen at random.
        if self.random_state is None:
            seed_rows = np.random.choice(n_samples, self.k, replace=False)
        else:
            rng = np.random.default_rng(self.random_state)
            seed_rows = rng.choice(n_samples, self.k, replace=False)
        # Fancy indexing copies, so updating centroids cannot write into X.
        self.centroids = X[seed_rows]

        for _ in range(self.max_iterations):
            labels = self._nearest_centroid(X)

            prev_centroids = self.centroids.copy()
            for j in range(self.k):
                members = X[labels == j]
                # A cluster that attracted no points keeps its previous centroid.
                if len(members) > 0:
                    self.centroids[j] = members.mean(axis=0)

            # Stop as soon as an update round no longer moves the centroids.
            if np.allclose(prev_centroids, self.centroids):
                break

    def predict(self, X):
        """Return the index of the nearest centroid for each sample in ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or (n_features,)
            A single 1-D sample is treated as one row.

        Returns
        -------
        ndarray of shape (n_samples,)
            Integer cluster indices into ``self.centroids``.
        """
        X = np.atleast_2d(np.asarray(X))
        return self._nearest_centroid(X)


In [27]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load only the coordinate columns of the CSV file into a Pandas dataframe
data = pd.read_csv('ufo_fullset.csv', usecols=['latitude', 'longitude'])

print(f"The data\n {data.head()}")
print(data.info())

# Handle missing values if any: a row without both coordinates cannot be
# clustered (one NaN propagates into every distance and centroid mean), so
# show the offending rows and drop them before fitting.
missing_values = data.isnull().values.any()
print(f'Are there any missing values? {missing_values}')
if missing_values:
    print(data[data.isnull().any(axis=1)])
    # .copy() gives an independent frame so the column added below is safe.
    data = data.dropna().copy()

# KMeans picks its starting centroids at random from NumPy's global random
# state; seed it so the clusters are reproducible on Restart & Run All.
np.random.seed(42)

# Create a KMeans instance with 10 locations
kmeans = KMeans(k=10)

data_train = data.values.astype('float32')
# Fit the model to the data
kmeans.fit(data_train)

# Predict the cluster assignments of the data
y_pred = kmeans.predict(data_train)

# Get the centroid points for each cluster
centroids = kmeans.centroids

# Print the centroid points
print(f'The final locations of the UFO sites are as follows \n {centroids}')


# Add the predicted cluster assignments to the dataframe
# (positional assignment: y_pred has one label per remaining row)
data['cluster'] = y_pred

# Write the dataframe to a new CSV file
data.to_csv('ufoclustered.csv', index=False)


The data
     latitude   longitude
0  47.329444 -122.578889
1  52.664913   -1.034894
2  38.951667  -92.333889
3  41.496944  -71.367778
4  47.606389 -122.330833
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   latitude   18000 non-null  float64
 1   longitude  18000 non-null  float64
dtypes: float64(2)
memory usage: 281.4 KB
None
Are there any missing values? False
The final locations of the UFO sites are as follows 
 [[  32.432354  -95.53003 ]
 [  39.60291  -106.66493 ]
 [  35.303215 -117.87051 ]
 [  31.495398   40.178288]
 [  26.050009  -80.66622 ]
 [  41.9247    -90.50136 ]
 [  46.81402  -121.39599 ]
 [  41.55192   -74.30462 ]
 [  42.989964 -153.59532 ]
 [  37.42338   -82.76806 ]]


In [8]:
# Read 'ufoclustered.csv' back from disk and print it, so the coordinates and
# the cluster column can be inspected as they were actually written.
Y = pd.read_csv(filepath_or_buffer='ufoclustered.csv')
print(Y)

        latitude   longitude  cluster
0      47.329444 -122.578889        3
1      52.664913   -1.034894        2
2      38.951667  -92.333889        6
3      41.496944  -71.367778        9
4      47.606389 -122.330833        3
...          ...         ...      ...
17995  42.033333  -87.733333        6
17996  43.004444  -71.348889        9
17997  36.866389  -83.888889        1
17998  35.385833  -94.398333        1
17999  29.883056  -97.941111        7

[18000 rows x 3 columns]
