In [10]:
import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split

In [11]:
class KModes:
    def __init__(self, k, max_iter=100):
        self.k = k
        self.max_iter = max_iter

    def initialize_centroids(self, data):
        n_samples, n_features = data.shape
        centroids = np.zeros((self.k, n_features), dtype=object)  # Initialize centroids array with dtype=object

        for i in range(self.k):
            centroid = []
            for j in range(n_features):
                unique_values = np.unique(data.iloc[:, j].astype(str))  # Convert column to string type
                centroid.append(np.random.choice(unique_values))
            centroids[i] = centroid

        return centroids

    def compute_distance(self, data, centroids):
        # Distance is a 2_D array that will contain a row for each of the data point and then array for each row will contain the distance from the k-modes.
        distance = np.zeros((data.shape[0], len(centroids)), dtype=int)
        # i and centroid will be used to iterate simultenously using the 2d-array, enumerate function will be used for that .
        for i, centroid in enumerate(centroids):
            for j in range(data.shape[0]):
                # we are iterating through each of the data point and then we compare the number of mismatches and then do the sum.
                distance[j, i] = np.sum(data.iloc[j] != centroid)
        return distance

    def assign_clusters(self, data, centroids):
        # calculate the distance for each of the data points.
        distances = self.compute_distance(data, centroids)
        # Then get the min index of the centroid of each of the data point and then return the array.
        clusters = np.argmin(distances, axis=1)
        return clusters

    def update_centroids(self, data, clusters):
        centroids = []
        # used for storing the updates centroids.
        for cluster_index in range(self.k):
            # cluster_index is used to access the individual clusters
            # we then filter out the data points and extract the points only belonging to that cluster.
            cluster_data = data[clusters == cluster_index]
            centroid = []
            # we will iterate through the columns of the filtered cluster and then get the mode of each of the feature and 
            # then add that as the feature of the centroid and then return the centroid.
            for feature_column in cluster_data.columns:
                mode = cluster_data[feature_column].mode()[0]
                centroid.append(mode)
            centroids.append(centroid)
        return np.array(centroids)

    def fit(self, data):
        self.centroids = self.initialize_centroids(data)
        # this is basically doing the iterations to find the best fit and then stopping if the clusters don't change.
        for _ in range(self.max_iter):
            old_centroids = self.centroids.copy()
            clusters = self.assign_clusters(data, self.centroids)
            self.centroids = self.update_centroids(data, clusters)
            if np.array_equal(old_centroids, self.centroids):
                break
        self.clusters = clusters


In [12]:
data = pd.read_csv("Combined.csv")

data.drop(columns=["Title"])
# 2. Preprocess the data if needed

# 3. Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)  # Adjust test_size as needed

# 4. Instantiate the KModes class
k_modes = KModes(k=3)  # Specify the number of clusters (k)
# 5. Fit the model using the train data
k_modes.fit(train_data)

# 6. Predict clusters for test data
test_clusters = k_modes.assign_clusters(test_data, k_modes.centroids)

# 7. Calculate the Silhouette Score
silhouette_avg = silhouette_score(test_data, test_clusters)
print("Silhouette Score:", silhouette_avg)

  data = pd.read_csv("Combined.csv")


ValueError: could not convert string to float: 'Breakfast Club, The (1985)'