In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Mount Google Drive at /content/drive (Colab-only); the CSV files loaded below
# are read from this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/emnist-letters-train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/emnist-letters-test.csv')

In [None]:
y_train = train_df["23"]

In [None]:
X_train = train_df.drop(train_df.columns[0], axis=1)

In [None]:
y_test = test_df["1"]

In [None]:
X_test = test_df.drop(test_df.columns[0], axis=1)

In [None]:
class KMeans:
    """Minimal k-means clustering (Lloyd's algorithm) using Euclidean distance.

    Parameters
    ----------
    k : int
        Number of clusters.
    tol : float
        Convergence threshold, in percent. Training stops once, for every
        centroid, the summed absolute per-coordinate change relative to the
        previous centroid is at most ``tol``.
    max_iter : int
        Upper bound on the number of assignment/update rounds.
    verbose : bool
        When True, print the percent change of every centroid that has not
        converged yet.

    Attributes
    ----------
    centroids : dict[int, np.ndarray]
        Cluster index -> centroid vector (set by ``fit``).
    classifications : dict[int, list[np.ndarray]]
        Cluster index -> training samples assigned to it in the last round.
    """

    def __init__(self, k=2, tol=0.001, max_iter=300, verbose=False):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter
        self.verbose = verbose

    def _nearest(self, points):
        """Return the index of the closest centroid for every row of ``points``."""
        # One distance column per centroid keeps the temporary arrays at the
        # size of ``points`` instead of n_samples * k * n_features.
        distances = np.stack(
            [np.linalg.norm(points - self.centroids[c], axis=1) for c in self.centroids],
            axis=1,
        )
        # argmin returns the first minimum, so ties go to the lowest index.
        return np.argmin(distances, axis=1)

    def fit(self, data):
        """Cluster ``data`` and store the result in ``self.centroids``.

        Parameters
        ----------
        data : array-like, shape (n_samples, n_features)
            Training samples. A 1-D input is treated as ``n_samples`` samples
            with a single feature.

        Raises
        ------
        ValueError
            If fewer than ``k`` samples are supplied.
        """
        data = np.asarray(data, dtype=float)
        if data.ndim == 1:
            data = data[:, None]
        if len(data) < self.k:
            raise ValueError(
                "need at least k=%d samples, got %d" % (self.k, len(data))
            )

        # The first k samples seed the centroids (copied so later updates never
        # alias the caller's array).
        self.centroids = {i: data[i].copy() for i in range(self.k)}
        self.classifications = {i: [] for i in range(self.k)}

        for _ in range(self.max_iter):
            labels = self._nearest(data)
            prev_centroids = dict(self.centroids)

            for i in range(self.k):
                members = data[labels == i]
                self.classifications[i] = list(members)
                # An empty cluster keeps its previous centroid; averaging zero
                # samples would yield NaN and poison every later distance.
                if len(members):
                    self.centroids[i] = members.mean(axis=0)

            optimized = True
            for i in range(self.k):
                original_centroid = prev_centroids[i]
                current_centroid = self.centroids[i]
                # Absolute values stop positive and negative shifts from
                # cancelling out; zero coordinates fall back to a denominator
                # of 1 so the ratio never divides by zero.
                denom = np.where(original_centroid == 0, 1.0, np.abs(original_centroid))
                change = np.sum(np.abs(current_centroid - original_centroid) / denom * 100.0)
                if change > self.tol:
                    if self.verbose:
                        print(change)
                    optimized = False

            if optimized:
                break

    def predict(self, data):
        """Return the index of the nearest centroid.

        Parameters
        ----------
        data : array-like
            A single sample (scalar or 1-D vector) or a 2-D array with one
            sample per row.

        Returns
        -------
        int or np.ndarray
            A plain ``int`` for a single sample, otherwise an integer array
            holding one cluster index per row.
        """
        points = np.asarray(data, dtype=float)
        if points.ndim <= 1:
            return int(self._nearest(points.reshape(1, -1))[0])
        return self._nearest(points)

    

In [None]:
# Divide every feature value by 255 (presumably 8-bit pixel intensities, so the
# result lies in [0, 1] - TODO confirm against the dataset).
X = np.array(X_train/255.0)
# NOTE(review): zeros are overwritten with 1, presumably so that a relative
# (divide-by-previous-centroid) convergence check never divides by zero. It also
# makes an originally-zero feature indistinguishable from a full-scale one;
# revisit if the clustering code tolerates zeros.
X[X==0] = 1
y = np.array(y_train)
X_t = np.array(X_test/255.0)
# Must be an assignment: the previous `X_t[X_t==0]==1` was a discarded
# comparison, leaving the test set preprocessed differently from the train set.
X_t[X_t==0] = 1
y_t = np.array(y_test)

In [None]:
# Fit k-means on the scaled training features; the hyperparameters are written
# out explicitly (they equal the constructor defaults).
# NOTE(review): k=2 clusters for a letters dataset - confirm this is intended.
model = KMeans(k=2, tol=0.001, max_iter=300)
model.fit(data=X)

366564.36134420225
185896.07361262894
976.0631170199995
450.85281279684085
722.7517735722294
315.45381436771476
565.1270513801801
272.28009611232386
484.4708997955159
231.3690641246829
437.24019399748784
181.4785532739334
409.4858983881069
133.71590416476295
389.64133624523436
80.30676790729272
350.74423394284474
26.339814770163436
271.69749338939016
186.35007401550362
119.34822513651308
70.4548085392923
43.34251842860368
24.347653735235184
14.148816942984482
9.9450078031186
5.022447844200903
2.8326197222116853
1.901473876415516
2.4100717245791534
1.4477328803562894
2.0752399079948334
0.4574484220835218
0.15986461267166577
0.2361902330123199
0.16845840010203386
0.1298469971080065
0.41857052118505855
0.2344078075995588
0.16451908818194116
0.2675316332848465


In [None]:
# Label each test row individually so the result holds one cluster index per
# sample, then show only the first few instead of dumping the whole array.
y_pred = np.array([model.predict(sample) for sample in X_t])
y_pred[:20]

0