### KNN:

* Supervised learning: Requires labeled data, where each data point has a corresponding class label.
* Goal: Classifies a new data point based on the labels of its nearest neighbors in the training data.
* Applications: Spam filtering, handwriting recognition, image classification.
* Algorithm: Finds the k nearest neighbors (data points) in the training set for a new data point and assigns the most frequent class label among those neighbors.
* Output: Class label for a new, unseen data point.

In [23]:
import numpy as np
from collections import Counter

class KNN:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    A new point receives the most frequent label among the ``k`` training
    points closest to it.
    """

    def __init__(self, k):
        """Create a classifier that votes among ``k`` neighbours.

        Args:
            k: Number of neighbours consulted per prediction; must be >= 1.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Store training data.

        Args:
            X: Training points, one per entry (array or nested sequence).
            y: Labels, indexable by position, one per training point.

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        # Cast to float so that unsigned-integer features (e.g. uint8 pixels)
        # cannot wrap around when two points are subtracted.
        X = np.asarray(X, dtype=float)
        if len(X) == 0:
            raise ValueError("X must contain at least one training point")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X_test):
        """Predict classes for test points.

        Args:
            X_test: Points to classify, one per entry.

        Returns:
            A list with one predicted label per test point. If ``k`` exceeds
            the number of training points, every training point votes.
        """
        # Re-cast in case X_train was assigned directly rather than via fit().
        X_train = np.asarray(self.X_train, dtype=float)
        n_train = len(X_train)

        predictions = []
        for x_test in np.asarray(X_test, dtype=float):
            # 1. Euclidean distance from the new point to every training point,
            #    computed in one vectorised call (features flattened per point).
            diffs = (X_train - x_test).reshape(n_train, -1)
            distances = np.linalg.norm(diffs, axis=1)
            # 2. Indices of the k nearest points. A stable sort keeps
            #    equidistant neighbours in training order, so ties are
            #    resolved deterministically.
            k_indices = np.argsort(distances, kind="stable")[: self.k]
            # 3. Labels of those neighbours, nearest first.
            k_nearest_labels = [self.y_train[i] for i in k_indices]
            # 4. Most common label among the k nearest neighbours.
            most_common_label = Counter(k_nearest_labels).most_common(1)[0][0]
            predictions.append(most_common_label)
        return predictions


In [24]:
import numpy as np
from collections import Counter

# Toy training set: six 2-D points, each tagged with a binary label (0 or 1).
X = np.array([[2, 3], [5, 4], [9, 6], [4, 7], [8, 1], [7, 2]])
y = np.array([0, 0, 1, 1, 0, 0])

# Query points whose classes we want to predict.
test_points = np.array([[3, 4.5], [6, 5]])

# Build a 3-nearest-neighbour classifier and hand it the training set.
clf = KNN(k=3)
clf.fit(X, y)

# Classify the query points and show the predicted labels.
predictions = clf.predict(test_points)
print(predictions)


[0, 1]


### Clustering:

* Unsupervised learning: Deals with unlabeled data, where data points don't have predefined categories.
* Goal: Groups similar data points together based on their features or characteristics.
* Applications: Customer segmentation, anomaly detection, image compression.
* Common algorithms: K-Means, Hierarchical clustering, DBSCAN.
* Output: Clusters (groups) of data points with similar features.



In [20]:
import random

class KMeans2:
    """Minimal K-Means clustering (Lloyd's algorithm) with Euclidean distance.

    Centroids start as randomly chosen data points and are then alternately
    used to assign points to clusters and recomputed as the cluster means.
    """

    def __init__(self, n_clusters, n_epochs=20, random_state=None) -> None:
        """Configure the model.

        Args:
            n_clusters: Number of clusters (centroids) to find.
            n_epochs: Maximum number of assign/update iterations.
            random_state: Optional seed for centroid initialisation. When
                ``None`` the global ``random`` module state is used.
        """
        self.n_clusters = n_clusters
        self.iters = n_epochs
        self.random_state = random_state

    def __initialize_centroids(self, data):
        """Pick ``n_clusters`` distinct rows of ``data`` as starting centroids."""
        if self.random_state is None:
            sampler = random
        else:
            sampler = random.Random(self.random_state)
        chosen = sampler.sample(range(len(data)), self.n_clusters)
        return np.array([data[i] for i in chosen], dtype=float)

    def __calculate_distance(self, data, centroids):
        """Return Euclidean distances with shape (n_centroids, n_points)."""
        data = np.asarray(data, dtype=float)
        # centroids[:, np.newaxis] has shape (n_centroids, 1, n_features), so
        # broadcasting against the data yields one difference vector per
        # (centroid, point) pair.
        diffs = data - centroids[:, np.newaxis]
        return np.sqrt((diffs ** 2).sum(axis=2))

    def _find_closest_centroids(self, X, centroids):
        """Return, for each point in ``X``, the index of its nearest centroid.

        The distance matrix has one row per centroid and one column per point:
        ``distances[c][p]`` is the distance between centroid ``c`` and point
        ``p``. Taking ``argmin`` along axis 0 therefore picks, per point, the
        row (centroid) with the smallest distance.
        """
        distances = self.__calculate_distance(X, centroids)
        return np.argmin(distances, axis=0)

    def _update_centroids(self, labels, X):
        """Compute new centroid positions as the mean of points within a cluster.

        A cluster that received no points keeps its previous centroid (taken
        from ``self.centroids`` when available) instead of becoming NaN; if no
        previous centroid exists it is left at the origin.
        """
        X = np.asarray(X, dtype=float)
        labels = np.asarray(labels)
        previous = getattr(self, "centroids", None)
        centroids = np.zeros((self.n_clusters, X.shape[1]))

        for k in range(self.n_clusters):
            members = X[labels == k]
            if len(members):
                centroids[k, :] = members.mean(axis=0)
            elif previous is not None:
                # Empty cluster: the mean of zero points is undefined.
                centroids[k, :] = previous[k]

        return centroids

    def fit(self, X):
        """Run K-Means on ``X`` and store the result in ``self.centroids``.

        Args:
            X: 2-D array-like of shape (n_points, n_features).

        Raises:
            ValueError: If ``X`` is not 2-D or ``n_clusters`` is not between
                1 and the number of points.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D, got array with shape {X.shape}")
        if not 1 <= self.n_clusters <= len(X):
            raise ValueError(
                f"n_clusters must be between 1 and {len(X)}, got {self.n_clusters}"
            )

        self.centroids = self.__initialize_centroids(X)

        for _ in range(self.iters):
            previous_centroids = self.centroids
            labels = self._find_closest_centroids(X, self.centroids)
            self.centroids = self._update_centroids(labels, X)

            # Converged: the update step no longer moves any centroid.
            if np.array_equal(self.centroids, previous_centroids):
                break

    def predict(self, test_point):
        """Return a string naming the nearest-centroid index for each given point."""
        return f"Label : {self._find_closest_centroids(test_point, self.centroids)}"
        



# 1. We need data.
# 2. A class that takes the number of clusters (k) and the number of iterations as input.
## 2.a A function that initializes the centroids by picking them from the data points.
## 2.b A function that calculates the distance between every data point and every centroid;
##     each point is grouped with its closest centroid.
## 2.c A function that runs all of the steps above --> fit

# Toy data: six unlabeled 2-D points.
X = np.array([[1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6], [9, 11]])

# Unseen point whose cluster we want to look up after training.
new_point = [10, 9]

# Fit a two-cluster K-Means model on the toy data.
kmeans = KMeans2(n_clusters=2)
kmeans.fit(X)

# predict() expects a collection of points, hence the wrapping list.
cluster_label = kmeans.predict([new_point])
print(cluster_label)

Label : [1]
