<a href="https://colab.research.google.com/github/Rujan0833/DM_DW_LAB/blob/main/LAB4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#K-Means

In [1]:
import pandas as pd
import numpy as np

def euclidean_distance(point1, point2):

    return np.sqrt(np.sum((point1 - point2)**2))

def assign_to_clusters(data, centroids):

    assignments = np.zeros(data.shape[0], dtype=int)
    for i, point in enumerate(data):
        distances = [euclidean_distance(point, centroid) for centroid in centroids]
        assignments[i] = np.argmin(distances)
    return assignments

def update_centroids(data, assignments, k):

    new_centroids = np.zeros((k, data.shape[1]))
    for cluster_id in range(k):
        points_in_cluster = data[assignments == cluster_id]
        if len(points_in_cluster) > 0:
            new_centroids[cluster_id] = np.mean(points_in_cluster, axis=0)
    return new_centroids

def kmeans(data, k, max_iterations=100, random_state=None):

    if isinstance(data, pd.DataFrame):
        data_np = data.values
    else:
        data_np = data

    if random_state is not None:
        np.random.seed(random_state)

    # Step 1: Initialize centroids randomly
    # Select k random data points as initial centroids
    initial_centroid_indices = np.random.choice(data_np.shape[0], k, replace=False)
    centroids = data_np[initial_centroid_indices]

    for iteration in range(max_iterations):
        # Step 2: Assign data points to the closest centroid
        assignments = assign_to_clusters(data_np, centroids)

        # Step 3: Update centroids
        new_centroids = update_centroids(data_np, assignments, k)

        # Check for convergence (if centroids don't change significantly)
        if np.allclose(centroids, new_centroids):
            print(f"K-Means converged after {iteration + 1} iterations.")
            break
        centroids = new_centroids
    else:
        print(f"K-Means reached max iterations ({max_iterations}) without convergence.")

    return assignments, centroids

# --- Example Usage ---
if __name__ == "__main__":
    # Load the dataset
    try:
        df = pd.read_csv("data.csv")
        # Assuming the dataset has numerical features for clustering
        # For demonstration, let's use all columns if they are numerical
        # Or select specific columns if needed, e.g., df[['feature1', 'feature2']]
        data_for_clustering = df.select_dtypes(include=np.number)

        if data_for_clustering.empty:
            print("No numerical columns found in data.csv for clustering.")
        else:
            # Drop rows with any NaN values in the selected numerical columns
            data_for_clustering = data_for_clustering.dropna()

            if data_for_clustering.empty:
                print("No valid numerical data after dropping NaNs in data.csv.")
            else:
                k_value = 3  # Example: Number of clusters
                print(f"Running K-Means with k={k_value} on data.csv...")
                cluster_assignments, final_centroids = kmeans(data_for_clustering, k=k_value, random_state=42)

                print("\nFinal Cluster Assignments:")
                print(cluster_assignments)

                print("\nFinal Centroids:")
                print(final_centroids)

                # You can add the cluster assignments back to your original DataFrame
                df['KMeans_Cluster'] = pd.Series(cluster_assignments, index=data_for_clustering.index)
                print("\nDataFrame with K-Means Clusters:")
                print(df.head())

    except FileNotFoundError:
        print("Error: data.csv not found. Please make sure the file is in the same directory.")
    except Exception as e:
        print(f"An error occurred: {e}")


Running K-Means with k=3 on data.csv...
K-Means converged after 5 iterations.

Final Cluster Assignments:
[2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 2 0]

Final Centroids:
[[7.         1.        ]
 [5.88276264 5.7448722 ]
 [1.82512542 2.29187589]]

DataFrame with K-Means Clusters:
   Feature_1  Feature_2  KMeans_Cluster
0   2.248357   1.930868               2
1   2.323844   2.761515               2
2   1.882923   1.882932               2
3   2.789606   2.383717               2
4   1.765263   2.271280               2





# K-Means++ Clustering Algorithm



In [2]:
import pandas as pd
import numpy as np

def euclidean_distance(point1, point2):

    return np.sqrt(np.sum((point1 - point2)**2))

def assign_to_clusters(data, centroids):

    assignments = np.zeros(data.shape[0], dtype=int)
    for i, point in enumerate(data):
        distances = [euclidean_distance(point, centroid) for centroid in centroids]
        assignments[i] = np.argmin(distances)
    return assignments

def update_centroids(data, assignments, k):

    new_centroids = np.zeros((k, data.shape[1]))
    for cluster_id in range(k):
        points_in_cluster = data[assignments == cluster_id]
        if len(points_in_cluster) > 0:
            new_centroids[cluster_id] = np.mean(points_in_cluster, axis=0)
    return new_centroids

def kmeans_plusplus_init(data, k, random_state=None):

    if random_state is not None:
        np.random.seed(random_state)

    n_samples, n_features = data.shape
    centroids = np.zeros((k, n_features))

    # Step 1: Choose one center uniformly at random from the data points.
    first_centroid_idx = np.random.choice(n_samples)
    centroids[0] = data[first_centroid_idx]

    # Step 2: For each data point x, compute D(x), the distance between x and the nearest center that has already been chosen.
    # Step 3: Choose new center x' with probability proportional to D(x)^2.
    # Step 4: Repeat Steps 2 and 3 until k centers have been chosen.
    for i in range(1, k):
        distances_sq = np.array([min([euclidean_distance(point, c)**2 for c in centroids[:i]]) for point in data])
        probabilities = distances_sq / np.sum(distances_sq)
        next_centroid_idx = np.random.choice(n_samples, p=probabilities)
        centroids[i] = data[next_centroid_idx]
    return centroids

def kmeans_plusplus(data, k, max_iterations=100, random_state=None):

    if isinstance(data, pd.DataFrame):
        data_np = data.values
    else:
        data_np = data

    # Step 1: Initialize centroids using K-Means++ method
    centroids = kmeans_plusplus_init(data_np, k, random_state=random_state)

    for iteration in range(max_iterations):
        # Step 2: Assign data points to the closest centroid
        assignments = assign_to_clusters(data_np, centroids)

        # Step 3: Update centroids
        new_centroids = update_centroids(data_np, assignments, k)

        # Check for convergence (if centroids don't change significantly)
        if np.allclose(centroids, new_centroids):
            print(f"K-Means++ converged after {iteration + 1} iterations.")
            break
        centroids = new_centroids
    else:
        print(f"K-Means++ reached max iterations ({max_iterations}) without convergence.")

    return assignments, centroids

# --- Example Usage ---
if __name__ == "__main__":
    # Load the dataset
    try:
        df = pd.read_csv("data.csv")
        # Assuming the dataset has numerical features for clustering
        data_for_clustering = df.select_dtypes(include=np.number)

        if data_for_clustering.empty:
            print("No numerical columns found in data.csv for clustering.")
        else:
            # Drop rows with any NaN values in the selected numerical columns
            data_for_clustering = data_for_clustering.dropna()

            if data_for_clustering.empty:
                print("No valid numerical data after dropping NaNs in data.csv.")
            else:
                k_value = 3  # Example: Number of clusters
                print(f"Running K-Means++ with k={k_value} on data.csv...")
                cluster_assignments, final_centroids = kmeans_plusplus(data_for_clustering, k=k_value, random_state=42)

                print("\nFinal Cluster Assignments (K-Means++):")
                print(cluster_assignments)

                print("\nFinal Centroids (K-Means++):")
                print(final_centroids)

                # You can add the cluster assignments back to your original DataFrame
                df['KMeans++_Cluster'] = pd.Series(cluster_assignments, index=data_for_clustering.index)
                print("\nDataFrame with K-Means++ Clusters:")
                print(df.head())

    except FileNotFoundError:
        print("Error: data.csv not found. Please make sure the file is in the same directory.")
    except Exception as e:
        print(f"An error occurred: {e}")


Running K-Means++ with k=3 on data.csv...
K-Means++ converged after 4 iterations.

Final Cluster Assignments (K-Means++):
[0 0 0 0 0 0 0 0 0 0 2 1 2 2 2 2 2 1 1 2 0 1]

Final Centroids (K-Means++):
[[1.82512542 2.29187589]
 [6.44231392 3.94407916]
 [5.72262439 6.0960579 ]]

DataFrame with K-Means++ Clusters:
   Feature_1  Feature_2  KMeans++_Cluster
0   2.248357   1.930868                 0
1   2.323844   2.761515                 0
2   1.882923   1.882932                 0
3   2.789606   2.383717                 0
4   1.765263   2.271280                 0


#K-Medoids (PAM) Clustering Algorithm

In [3]:
import pandas as pd
import numpy as np

def manhattan_distance(point1, point2):
    return np.sum(np.abs(point1 - point2))

def calculate_cost(data, medoids, assignments):
    cost = 0.0
    for i, point in enumerate(data):
        medoid_idx = assignments[i]
        cost += manhattan_distance(point, medoids[medoid_idx])
    return cost

def kmedoids(data, k, max_iterations=100, random_state=None):
    if isinstance(data, pd.DataFrame):
        data_np = data.values
    else:
        data_np = data

    n_samples, n_features = data_np.shape

    if random_state is not None:
        np.random.seed(random_state)

    current_medoid_indices = np.random.choice(n_samples, k, replace=False)
    current_medoids = data_np[current_medoid_indices]

    for iteration in range(max_iterations):
        assignments = np.zeros(n_samples, dtype=int)
        for i, point in enumerate(data_np):
            distances = [manhattan_distance(point, medoid) for medoid in current_medoids]
            assignments[i] = np.argmin(distances)

        current_cost = calculate_cost(data_np, current_medoids, assignments)

        improved = False
        for m_idx in range(k):
            original_medoid = current_medoids[m_idx]
            potential_new_medoid_indices = [i for i in range(n_samples) if i not in current_medoid_indices]

            for p_idx in potential_new_medoid_indices:
                potential_new_medoid = data_np[p_idx]

                temp_medoids = np.copy(current_medoids)
                temp_medoids[m_idx] = potential_new_medoid

                temp_assignments = np.zeros(n_samples, dtype=int)
                for i, point in enumerate(data_np):
                    distances = [manhattan_distance(point, medoid) for medoid in temp_medoids]
                    temp_assignments[i] = np.argmin(distances)

                temp_cost = calculate_cost(data_np, temp_medoids, temp_assignments)

                if temp_cost < current_cost:
                    current_cost = temp_cost
                    current_medoids = temp_medoids
                    current_medoid_indices[m_idx] = p_idx
                    assignments = temp_assignments
                    improved = True
                    break
            if improved:
                break

        if not improved:
            print(f"K-Medoids converged after {iteration + 1} iterations.")
            break
    else:
        print(f"K-Medoids reached max iterations ({max_iterations}) without convergence.")

    return assignments, current_medoids

if __name__ == "__main__":
    try:
        df = pd.read_csv("data.csv")
        data_for_clustering = df.select_dtypes(include=np.number)

        if data_for_clustering.empty:
            print("No numerical columns found in data.csv for clustering.")
        else:
            data_for_clustering = data_for_clustering.dropna()

            if data_for_clustering.empty:
                print("No valid numerical data after dropping NaNs in data.csv.")
            else:
                k_value = 3
                print(f"Running K-Medoids with k={k_value} on data.csv...")
                cluster_assignments, final_medoids = kmedoids(data_for_clustering, k=k_value, random_state=42)

                print("\nFinal Cluster Assignments (K-Medoids):")
                print(cluster_assignments)

                print("\nFinal Medoids (K-Medoids):")
                print(final_medoids)

                df['KMedoids_Cluster'] = pd.Series(cluster_assignments, index=data_for_clustering.index)
                print("\nDataFrame with K-Medoids Clusters:")
                print(df.head())

    except FileNotFoundError:
        print("Error: data.csv not found. Please make sure the file is in the same directory.")
    except Exception as e:
        print(f"An error occurred: {e}")


Running K-Medoids with k=3 on data.csv...
K-Medoids converged after 7 iterations.

Final Cluster Assignments (K-Medoids):
[2 2 2 2 2 2 2 2 2 2 0 0 1 1 1 1 0 0 0 1 1 0]

Final Medoids (K-Medoids):
[[6.57578144 5.14540945]
 [5.61893209 6.07764581]
 [1.88292331 1.88293152]]

DataFrame with K-Medoids Clusters:
   Feature_1  Feature_2  KMedoids_Cluster
0   2.248357   1.930868                 2
1   2.323844   2.761515                 2
2   1.882923   1.882932                 2
3   2.789606   2.383717                 2
4   1.765263   2.271280                 2


#Agglomerative Hierarchical Clustering Algorithm

In [4]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform

def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((point1 - point2)**2))

def single_linkage(cluster1_points, cluster2_points):
    min_dist = float('inf')
    for p1 in cluster1_points:
        for p2 in cluster2_points:
            dist = euclidean_distance(p1, p2)
            if dist < min_dist:
                min_dist = dist
    return min_dist

def complete_linkage(cluster1_points, cluster2_points):
    max_dist = 0.0
    for p1 in cluster1_points:
        for p2 in cluster2_points:
            dist = euclidean_distance(p1, p2)
            if dist > max_dist:
                max_dist = dist
    return max_dist

def average_linkage(cluster1_points, cluster2_points):
    total_dist = 0.0
    count = 0
    for p1 in cluster1_points:
        for p2 in cluster2_points:
            total_dist += euclidean_distance(p1, p2)
            count += 1
    return total_dist / count if count > 0 else 0.0

def agglomerative_clustering(data, n_clusters, linkage_method='single'):
    if isinstance(data, pd.DataFrame):
        data_np = data.values
    else:
        data_np = data

    n_samples = data_np.shape[0]

    clusters = [[i] for i in range(n_samples)]

    if linkage_method == 'single':
        linkage_func = single_linkage
    elif linkage_method == 'complete':
        linkage_func = complete_linkage
    elif linkage_method == 'average':
        linkage_func = average_linkage
    else:
        raise ValueError("Invalid linkage_method. Choose from 'single', 'complete', 'average'.")

    while len(clusters) > n_clusters:
        min_dist = float('inf')
        merge_c1_idx, merge_c2_idx = -1, -1

        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                c1_points = data_np[clusters[i]]
                c2_points = data_np[clusters[j]]
                dist = linkage_func(c1_points, c2_points)

                if dist < min_dist:
                    min_dist = dist
                    merge_c1_idx = i
                    merge_c2_idx = j

        if merge_c1_idx != -1 and merge_c2_idx != -1:
            new_cluster = clusters[merge_c1_idx] + clusters[merge_c2_idx]
            if merge_c1_idx < merge_c2_idx:
                del clusters[merge_c2_idx]
                del clusters[merge_c1_idx]
            else:
                del clusters[merge_c1_idx]
                del clusters[merge_c2_idx]
            clusters.append(new_cluster)
        else:
            break

    assignments = np.zeros(n_samples, dtype=int)
    for cluster_id, point_indices in enumerate(clusters):
        for idx in point_indices:
            assignments[idx] = cluster_id

    return assignments

if __name__ == "__main__":
    try:
        df = pd.read_csv("data.csv")
        data_for_clustering = df.select_dtypes(include=np.number)

        if data_for_clustering.empty:
            print("No numerical columns found in data.csv for clustering.")
        else:
            data_for_clustering = data_for_clustering.dropna()

            if data_for_clustering.empty:
                print("No valid numerical data after dropping NaNs in data.csv.")
            else:
                n_clusters_value = 3
                print(f"Running Agglomerative Hierarchical Clustering (Single Linkage) with {n_clusters_value} clusters on data.csv...")
                cluster_assignments_single = agglomerative_clustering(data_for_clustering, n_clusters=n_clusters_value, linkage_method='single')

                print("\nFinal Cluster Assignments (Agglomerative - Single Linkage):")
                print(cluster_assignments_single)

                df['Agglomerative_Single_Cluster'] = pd.Series(cluster_assignments_single, index=data_for_clustering.index)
                print("\nDataFrame with Agglomerative (Single Linkage) Clusters:")
                print(df.head())

                print(f"\nRunning Agglomerative Hierarchical Clustering (Complete Linkage) with {n_clusters_value} clusters on data.csv...")
                cluster_assignments_complete = agglomerative_clustering(data_for_clustering, n_clusters=n_clusters_value, linkage_method='complete')

                print("\nFinal Cluster Assignments (Agglomerative - Complete Linkage):")
                print(cluster_assignments_complete)

                df['Agglomerative_Complete_Cluster'] = pd.Series(cluster_assignments_complete, index=data_for_clustering.index)
                print("\nDataFrame with Agglomerative (Complete Linkage) Clusters:")
                print(df.head())

                print(f"\nRunning Agglomerative Hierarchical Clustering (Average Linkage) with {n_clusters_value} clusters on data.csv...")
                cluster_assignments_average = agglomerative_clustering(data_for_clustering, n_clusters=n_clusters_value, linkage_method='average')

                print("\nFinal Cluster Assignments (Agglomerative - Average Linkage):")
                print(cluster_assignments_average)

                df['Agglomerative_Average_Cluster'] = pd.Series(cluster_assignments_average, index=data_for_clustering.index)
                print("\nDataFrame with Agglomerative (Average Linkage) Clusters:")
                print(df.head())

    except FileNotFoundError:
        print("Error: data.csv not found. Please make sure the file is in the same directory.")
    except Exception as e:
        print(f"An error occurred: {e}")


Running Agglomerative Hierarchical Clustering (Single Linkage) with 3 clusters on data.csv...

Final Cluster Assignments (Agglomerative - Single Linkage):
[2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 2 0]

DataFrame with Agglomerative (Single Linkage) Clusters:
   Feature_1  Feature_2  Agglomerative_Single_Cluster
0   2.248357   1.930868                             2
1   2.323844   2.761515                             2
2   1.882923   1.882932                             2
3   2.789606   2.383717                             2
4   1.765263   2.271280                             2

Running Agglomerative Hierarchical Clustering (Complete Linkage) with 3 clusters on data.csv...

Final Cluster Assignments (Agglomerative - Complete Linkage):
[2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 2 0]

DataFrame with Agglomerative (Complete Linkage) Clusters:
   Feature_1  Feature_2  Agglomerative_Single_Cluster  \
0   2.248357   1.930868                             2   
1   2.323844   2.761515                

#Divisive Hierarchical Clustering Algorithm


In [5]:
import pandas as pd
import numpy as np

def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((point1 - point2)**2))

def assign_to_clusters(data, centroids):
    assignments = np.zeros(data.shape[0], dtype=int)
    for i, point in enumerate(data):
        distances = [euclidean_distance(point, centroid) for centroid in centroids]
        assignments[i] = np.argmin(distances)
    return assignments

def update_centroids(data, assignments, k):
    new_centroids = np.zeros((k, data.shape[1]))
    for cluster_id in range(k):
        points_in_cluster = data[assignments == cluster_id]
        if len(points_in_cluster) > 0:
            new_centroids[cluster_id] = np.mean(points_in_cluster, axis=0)
    return new_centroids

def kmeans_simple(data, k, max_iterations=50, random_state=None):
    if random_state is not None:
        np.random.seed(random_state)

    n_samples = data.shape[0]
    if n_samples < k:
        return np.zeros(n_samples, dtype=int), data

    initial_centroid_indices = np.random.choice(n_samples, k, replace=False)
    centroids = data[initial_centroid_indices]

    for _ in range(max_iterations):
        assignments = assign_to_clusters(data, centroids)
        new_centroids = update_centroids(data, assignments, k)
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    return assignments, centroids


def divisive_clustering(data, n_clusters, random_state=None):
    if isinstance(data, pd.DataFrame):
        data_np = data.values
    else:
        data_np = data

    n_samples = data_np.shape[0]
    if n_samples == 0:
        return np.array([])

    if random_state is not None:
        np.random.seed(random_state)

    current_clusters = {0: list(range(n_samples))}
    cluster_id_counter = 1

    while len(current_clusters) < n_clusters:
        if not current_clusters:
            break

        largest_cluster_id = -1
        max_points = -1
        for c_id, indices in current_clusters.items():
            if len(indices) > max_points:
                max_points = len(indices)
                largest_cluster_id = c_id

        if largest_cluster_id == -1 or max_points < 2:
            break

        cluster_to_split_indices = current_clusters[largest_cluster_id]
        cluster_to_split_data = data_np[cluster_to_split_indices]

        if len(cluster_to_split_data) < 2:
            break

        sub_assignments, _ = kmeans_simple(cluster_to_split_data, k=2, random_state=random_state)

        new_cluster1_indices = [cluster_to_split_indices[i] for i, assign in enumerate(sub_assignments) if assign == 0]
        new_cluster2_indices = [cluster_to_split_indices[i] for i, assign in enumerate(sub_assignments) if assign == 1]

        del current_clusters[largest_cluster_id]
        if new_cluster1_indices:
            current_clusters[cluster_id_counter] = new_cluster1_indices
            cluster_id_counter += 1
        if new_cluster2_indices:
            current_clusters[cluster_id_counter] = new_cluster2_indices
            cluster_id_counter += 1

        if len(current_clusters) == n_clusters:
            break

    final_assignments = np.zeros(n_samples, dtype=int)
    for cluster_id, original_indices in current_clusters.items():
        for original_idx in original_indices:
            final_assignments[original_idx] = cluster_id

    return final_assignments

if __name__ == "__main__":
    try:
        df = pd.read_csv("data.csv")
        data_for_clustering = df.select_dtypes(include=np.number)

        if data_for_clustering.empty:
            print("No numerical columns found in data.csv for clustering.")
        else:
            data_for_clustering = data_for_clustering.dropna()

            if data_for_clustering.empty:
                print("No valid numerical data after dropping NaNs in data.csv.")
            else:
                n_clusters_value = 3
                print(f"Running Divisive Hierarchical Clustering with {n_clusters_value} clusters on data.csv...")
                cluster_assignments = divisive_clustering(data_for_clustering, n_clusters=n_clusters_value, random_state=42)

                print("\nFinal Cluster Assignments (Divisive Clustering):")
                print(cluster_assignments)

                df['Divisive_Cluster'] = pd.Series(cluster_assignments, index=data_for_clustering.index)
                print("\nDataFrame with Divisive Clusters:")
                print(df.head())

    except FileNotFoundError:
        print("Error: data.csv not found. Please make sure the file is in the same directory.")
    except Exception as e:
        print(f"An error occurred: {e}")


Running Divisive Hierarchical Clustering with 3 clusters on data.csv...

Final Cluster Assignments (Divisive Clustering):
[3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 4 2]

DataFrame with Divisive Clusters:
   Feature_1  Feature_2  Divisive_Cluster
0   2.248357   1.930868                 3
1   2.323844   2.761515                 3
2   1.882923   1.882932                 3
3   2.789606   2.383717                 3
4   1.765263   2.271280                 3


#DBSCAN Clustering Algorithm

In [6]:
import pandas as pd
import numpy as np

def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((point1 - point2)**2))

def get_neighbors(data, point_idx, eps):
    neighbors = []
    for i in range(data.shape[0]):
        if i == point_idx:
            continue
        if euclidean_distance(data[point_idx], data[i]) <= eps:
            neighbors.append(i)
    return neighbors

def dbscan(data, eps, min_samples):
    if isinstance(data, pd.DataFrame):
        data_np = data.values
    else:
        data_np = data

    n_samples = data_np.shape[0]
    labels = -2 * np.ones(n_samples, dtype=int)
    cluster_id = 0

    for i in range(n_samples):
        if labels[i] != -2:
            continue

        neighbors = get_neighbors(data_np, i, eps)

        if len(neighbors) < min_samples:
            labels[i] = -1
        else:
            labels[i] = cluster_id
            queue = list(neighbors)

            while queue:
                current_neighbor_idx = queue.pop(0)

                if labels[current_neighbor_idx] == -1:
                    labels[current_neighbor_idx] = cluster_id

                if labels[current_neighbor_idx] != -2:
                    continue

                labels[current_neighbor_idx] = cluster_id

                next_neighbors = get_neighbors(data_np, current_neighbor_idx, eps)

                if len(next_neighbors) >= min_samples:
                    for nn_idx in next_neighbors:
                        if labels[nn_idx] == -2 or labels[nn_idx] == -1:
                            queue.append(nn_idx)
            cluster_id += 1

    return labels

if __name__ == "__main__":
    try:
        df2 = pd.read_csv("data2.csv")
        data_for_dbscan = df2.select_dtypes(include=np.number)

        if data_for_dbscan.empty:
            print("No numerical columns found in data2.csv for clustering.")
        else:
            data_for_dbscan = data_for_dbscan.dropna()

            if data_for_dbscan.empty:
                print("No valid numerical data after dropping NaNs in data2.csv.")
            else:
                eps_value = 0.5
                min_samples_value = 5

                print(f"Running DBSCAN with eps={eps_value}, min_samples={min_samples_value} on data2.csv...")
                cluster_assignments = dbscan(data_for_dbscan, eps=eps_value, min_samples=min_samples_value)

                print("\nFinal Cluster Assignments (DBSCAN):")
                print(cluster_assignments)
                print("Note: -1 indicates noise points.")

                df2['DBSCAN_Cluster'] = pd.Series(cluster_assignments, index=data_for_dbscan.index)
                print("\nDataFrame with DBSCAN Clusters:")
                print(df2.head())

                unique_labels, counts = np.unique(cluster_assignments, return_counts=True)
                print("\nCluster distribution:")
                for label, count in zip(unique_labels, counts):
                    if label == -1:
                        print(f"Noise points: {count}")
                    else:
                        print(f"Cluster {label}: {count}")

    except FileNotFoundError:
        print("Error: data2.csv not found. Please make sure the file is in the same directory.")
    except Exception as e:
        print(f"An error occurred: {e}")


Running DBSCAN with eps=0.5, min_samples=5 on data2.csv...

Final Cluster Assignments (DBSCAN):
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0 -1 -1 -1]
Note: -1 indicates noise points.

DataFrame with DBSCAN Clusters:
   Feature_1  Feature_2  DBSCAN_Cluster
0  -1.049426   0.084443               0
1   0.922818   0.457489               0
2   0.656787   0.699597               0
3   1.188940  -0.386528               0
4   0.289265  -0.13

#Outlier Analysis using Z-score Method


In [7]:
import pandas as pd
import numpy as np

def zscore_outlier_detection(data, threshold=3.0):
    if isinstance(data, pd.DataFrame):
        outliers_df = pd.DataFrame(index=data.index, columns=data.columns, dtype=bool)
        zscores_df = pd.DataFrame(index=data.index, columns=data.columns, dtype=float)

        numerical_cols = data.select_dtypes(include=np.number).columns
        if numerical_cols.empty:
            print("Warning: No numerical columns found for Z-score outlier detection.")
            return outliers_df, zscores_df

        for col in numerical_cols:
            series = data[col].dropna()
            if series.empty:
                outliers_df[col] = False
                zscores_df[col] = np.nan
                continue

            mean_val = series.mean()
            std_dev = series.std()

            if std_dev == 0:
                zscores = pd.Series(0.0, index=series.index)
                is_outlier = pd.Series(False, index=series.index)
            else:
                zscores = (series - mean_val) / std_dev
                is_outlier = np.abs(zscores) > threshold

            outliers_df[col] = is_outlier.reindex(data.index, fill_value=False)
            zscores_df[col] = zscores.reindex(data.index, fill_value=np.nan)
        return outliers_df, zscores_df

    elif isinstance(data, pd.Series):
        series = data.dropna()
        if series.empty:
            return pd.Series(False, index=data.index), pd.Series(np.nan, index=data.index)

        mean_val = series.mean()
        std_dev = series.std()

        if std_dev == 0:
            zscores = pd.Series(0.0, index=series.index)
            is_outlier = pd.Series(False, index=series.index)
        else:
            zscores = (series - mean_val) / std_dev
            is_outlier = np.abs(zscores) > threshold

        return is_outlier.reindex(data.index, fill_value=False), zscores.reindex(data.index, fill_value=np.nan)

    elif isinstance(data, np.ndarray):
        if data.ndim > 1:
            print("Warning: Z-score detection for NumPy array assumes 1D data. Processing each column if 2D.")
            outliers_array = np.zeros_like(data, dtype=bool)
            zscores_array = np.zeros_like(data, dtype=float)
            for col_idx in range(data.shape[1]):
                col_data = data[:, col_idx]
                mean_val = np.nanmean(col_data)
                std_dev = np.nanstd(col_data)

                if std_dev == 0:
                    zscores_array[:, col_idx] = 0.0
                    outliers_array[:, col_idx] = False
                else:
                    zscores = (col_data - mean_val) / std_dev
                    outliers_array[:, col_idx] = np.abs(zscores) > threshold
                    zscores_array[:, col_idx] = zscores
            return outliers_array, zscores_array
        else:
            mean_val = np.nanmean(data)
            std_dev = np.nanstd(data)
            if std_dev == 0:
                return np.full_like(data, False, dtype=bool), np.full_like(data, 0.0, dtype=float)
            zscores = (data - mean_val) / std_dev
            is_outlier = np.abs(zscores) > threshold
            return is_outlier, zscores
    else:
        raise TypeError("Input data must be a pandas Series, DataFrame, or NumPy array.")


if __name__ == "__main__":
    try:
        df3 = pd.read_csv("data3.csv")
        print("Original data from data3.csv:")
        print(df3.head())

        numerical_data = df3.select_dtypes(include=np.number)

        if numerical_data.empty:
            print("No numerical columns found in data3.csv for Z-score outlier detection.")
        else:
            zscore_threshold = 3.0
            print(f"\nPerforming Z-score outlier detection with threshold={zscore_threshold}...")
            is_outlier_zscore, zscores = zscore_outlier_detection(numerical_data, threshold=zscore_threshold)

            print("\nIs Outlier (Z-score method):")
            print(is_outlier_zscore.head())

            print("\nCalculated Z-scores:")
            print(zscores.head())

            outlier_rows_zscore = df3[is_outlier_zscore.any(axis=1)]
            if not outlier_rows_zscore.empty:
                print("\nRows identified as outliers (Z-score method):")
                print(outlier_rows_zscore)
            else:
                print("\nNo outliers found using Z-score method with the given threshold.")

    except FileNotFoundError:
        print("Error: data3.csv not found. Please make sure the file is in the same directory.")
    except Exception as e:
        print(f"An error occurred: {e}")


Original data from data3.csv:
   Value
0    1.0
1    2.0
2    3.0
3    4.0
4   10.0

Performing Z-score outlier detection with threshold=3.0...

Is Outlier (Z-score method):
   Value
0  False
1  False
2  False
3  False
4  False

Calculated Z-scores:
      Value
0  0.045781
1  0.180167
2  0.314553
3  0.448939
4  1.255256

No outliers found using Z-score method with the given threshold.


#Outlier Detection using IQR Method

In [8]:
import pandas as pd
import numpy as np

def iqr_outlier_detection(data, k=1.5):
    if isinstance(data, pd.DataFrame):
        outliers_df = pd.DataFrame(index=data.index, columns=data.columns, dtype=bool)
        bounds_info = {}

        numerical_cols = data.select_dtypes(include=np.number).columns
        if numerical_cols.empty:
            print("Warning: No numerical columns found for IQR outlier detection.")
            return outliers_df, bounds_info

        for col in numerical_cols:
            series = data[col].dropna()
            if series.empty:
                outliers_df[col] = False
                bounds_info[col] = {'Q1': np.nan, 'Q3': np.nan, 'IQR': np.nan, 'Lower_Bound': np.nan, 'Upper_Bound': np.nan}
                continue

            Q1 = np.percentile(series, 25)
            Q3 = np.percentile(series, 75)
            IQR = Q3 - Q1

            lower_bound = Q1 - k * IQR
            upper_bound = Q3 + k * IQR

            is_outlier = (series < lower_bound) | (series > upper_bound)
            outliers_df[col] = is_outlier.reindex(data.index, fill_value=False)

            bounds_info[col] = {
                'Q1': Q1,
                'Q3': Q3,
                'IQR': IQR,
                'Lower_Bound': lower_bound,
                'Upper_Bound': upper_bound
            }
        return outliers_df, bounds_info

    elif isinstance(data, pd.Series):
        series = data.dropna()
        if series.empty:
            return pd.Series(False, index=data.index), {'Q1': np.nan, 'Q3': np.nan, 'IQR': np.nan, 'Lower_Bound': np.nan, 'Upper_Bound': np.nan}

        Q1 = np.percentile(series, 25)
        Q3 = np.percentile(series, 75)
        IQR = Q3 - Q1

        lower_bound = Q1 - k * IQR
        upper_bound = Q3 + k * IQR

        is_outlier = (series < lower_bound) | (series > upper_bound)
        bounds_info = {
            'Q1': Q1,
            'Q3': Q3,
            'IQR': IQR,
            'Lower_Bound': lower_bound,
            'Upper_Bound': upper_bound
        }
        return is_outlier.reindex(data.index, fill_value=False), bounds_info

    elif isinstance(data, np.ndarray):
        if data.ndim > 1:
            print("Warning: IQR detection for NumPy array assumes 1D data. Processing each column if 2D.")
            outliers_array = np.zeros_like(data, dtype=bool)
            bounds_info_array = {}
            for col_idx in range(data.shape[1]):
                col_data = data[:, col_idx]
                valid_data = col_data[~np.isnan(col_data)]
                if valid_data.size == 0:
                    outliers_array[:, col_idx] = False
                    bounds_info_array[f'col_{col_idx}'] = {'Q1': np.nan, 'Q3': np.nan, 'IQR': np.nan, 'Lower_Bound': np.nan, 'Upper_Bound': np.nan}
                    continue

                Q1 = np.percentile(valid_data, 25)
                Q3 = np.percentile(valid_data, 75)
                IQR = Q3 - Q1

                lower_bound = Q1 - k * IQR
                upper_bound = Q3 + k * IQR

                is_outlier = (col_data < lower_bound) | (col_data > upper_bound)
                outliers_array[:, col_idx] = is_outlier
                bounds_info_array[f'col_{col_idx}'] = {
                    'Q1': Q1, 'Q3': Q3, 'IQR': IQR,
                    'Lower_Bound': lower_bound, 'Upper_Bound': upper_bound
                }
            return outliers_array, bounds_info_array
        else:
            valid_data = data[~np.isnan(data)]
            if valid_data.size == 0:
                return np.full_like(data, False, dtype=bool), {'Q1': np.nan, 'Q3': np.nan, 'IQR': np.nan, 'Lower_Bound': np.nan, 'Upper_Bound': np.nan}

            Q1 = np.percentile(valid_data, 25)
            Q3 = np.percentile(valid_data, 75)
            IQR = Q3 - Q1

            lower_bound = Q1 - k * IQR
            upper_bound = Q3 + k * IQR

            is_outlier = (data < lower_bound) | (data > upper_bound)
            bounds_info = {
                'Q1': Q1,
                'Q3': Q3,
                'IQR': IQR,
                'Lower_Bound': lower_bound,
                'Upper_Bound': upper_bound
            }
            return is_outlier, bounds_info
    else:
        raise TypeError("Input data must be a pandas Series, DataFrame, or NumPy array.")


if __name__ == "__main__":
    try:
        df3 = pd.read_csv("data3.csv")
        print("Original data from data3.csv:")
        print(df3.head())

        numerical_data = df3.select_dtypes(include=np.number)

        if numerical_data.empty:
            print("No numerical columns found in data3.csv for IQR outlier detection.")
        else:
            iqr_k_value = 1.5
            print(f"\nPerforming IQR outlier detection with k={iqr_k_value}...")
            is_outlier_iqr, bounds_iqr = iqr_outlier_detection(numerical_data, k=iqr_k_value)

            print("\nIs Outlier (IQR method):")
            print(is_outlier_iqr.head())

            print("\nCalculated IQR Bounds per column:")
            for col, info in bounds_iqr.items():
                print(f"  Column '{col}':")
                for key, value in info.items():
                    print(f"    {key}: {value:.2f}")

            outlier_rows_iqr = df3[is_outlier_iqr.any(axis=1)]
            if not outlier_rows_iqr.empty:
                print("\nRows identified as outliers (IQR method):")
                print(outlier_rows_iqr)
            else:
                print("\nNo outliers found using IQR method with the given k value.")

    except FileNotFoundError:
        print("Error: data3.csv not found. Please make sure the file is in the same directory.")
    except Exception as e:
        print(f"An error occurred: {e}")


Original data from data3.csv:
   Value
0    1.0
1    2.0
2    3.0
3    4.0
4   10.0

Performing IQR outlier detection with k=1.5...

Is Outlier (IQR method):
   Value
0  False
1  False
2  False
3  False
4  False

Calculated IQR Bounds per column:
  Column 'Value':
    Q1: -0.76
    Q3: 4.25
    IQR: 5.00
    Lower_Bound: -8.26
    Upper_Bound: 11.76

Rows identified as outliers (IQR method):
    Value
6   -10.0
14  -20.0


#Mini-Batch K-Means Algorithm

In [9]:
import pandas as pd
import numpy as np

def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((point1 - point2)**2))

def assign_to_clusters(data, centroids):
    assignments = np.zeros(data.shape[0], dtype=int)
    for i, point in enumerate(data):
        distances = [euclidean_distance(point, centroid) for centroid in centroids]
        assignments[i] = np.argmin(distances)
    return assignments

def mini_batch_kmeans(data, k, batch_size, max_iterations=100, random_state=None):
    if isinstance(data, pd.DataFrame):
        data_np = data.values
    else:
        data_np = data

    n_samples, n_features = data_np.shape

    if random_state is not None:
        np.random.seed(random_state)

    initial_centroid_indices = np.random.choice(n_samples, k, replace=False)
    centroids = data_np[initial_centroid_indices].astype(float)

    centroid_counts = np.ones(k)

    for iteration in range(max_iterations):
        batch_indices = np.random.choice(n_samples, batch_size, replace=False)
        mini_batch = data_np[batch_indices]

        batch_assignments = assign_to_clusters(mini_batch, centroids)

        for i, point in enumerate(mini_batch):
            assigned_cluster_id = batch_assignments[i]
            centroids[assigned_cluster_id] = (centroids[assigned_cluster_id] * centroid_counts[assigned_cluster_id] + point) / (centroid_counts[assigned_cluster_id] + 1)
            centroid_counts[assigned_cluster_id] += 1

    final_assignments = assign_to_clusters(data_np, centroids)

    return final_assignments, centroids

if __name__ == "__main__":
    try:
        df4 = pd.read_csv("data4.csv")
        data_for_mini_batch_kmeans = df4.select_dtypes(include=np.number)

        if data_for_mini_batch_kmeans.empty:
            print("No numerical columns found in data4.csv for Mini-Batch K-Means.")
        else:
            data_for_mini_batch_kmeans = data_for_mini_batch_kmeans.dropna()

            if data_for_mini_batch_kmeans.empty:
                print("No valid numerical data after dropping NaNs in data4.csv.")
            else:
                k_value = 3
                batch_size_value = 50

                if batch_size_value > len(data_for_mini_batch_kmeans):
                    batch_size_value = len(data_for_mini_batch_kmeans)
                    print(f"Adjusted batch_size to {batch_size_value} as it was larger than the dataset size.")

                print(f"Running Mini-Batch K-Means with k={k_value}, batch_size={batch_size_value} on data4.csv...")
                cluster_assignments, final_centroids = mini_batch_kmeans(
                    data_for_mini_batch_kmeans,
                    k=k_value,
                    batch_size=batch_size_value,
                    random_state=42
                )

                print("\nFinal Cluster Assignments (Mini-Batch K-Means):")
                print(cluster_assignments)

                print("\nFinal Centroids (Mini-Batch K-Means):")
                print(final_centroids)

                df4['MiniBatchKMeans_Cluster'] = pd.Series(cluster_assignments, index=data_for_mini_batch_kmeans.index)
                print("\nDataFrame with Mini-Batch K-Means Clusters:")
                print(df4.head())

    except FileNotFoundError:
        print("Error: data4.csv not found. Please make sure the file is in the same directory.")
    except Exception as e:
        print(f"An error occurred: {e}")


Running Mini-Batch K-Means with k=3, batch_size=50 on data4.csv...

Final Cluster Assignments (Mini-Batch K-Means):
[2 2 1 1 0 0 2 0 2 0 2 2 2 2 2 1 2 0 2 1 2 2 1 2 1 2 2 0 2 2 2 2 0 1 2 2 1
 2 1 2 2 2 0 2 0 2 2 1 2 2 1 2 1 2 2 2 1 2 2 2 1 2 0 1 1 1 2 2 1 2 2 1 2 2
 2 2 2 2 2 2 2 2 2 1 0 1 1 2 2 0 2 2 2 2 2 1 2 2 2 2 1 0 2 1 2 2 0 2 2 1 2
 2 2 1 2 1 0 2 2 2 1 2 2 1 0 2 2 2 2 2 2 0 2 0 1 2 2 2 2 2 2 2 0 2 2 2 2 1
 0 2 2 1 2 0 2 0 1 2 2 2 2 2 2 0 2 2 2 2 1 2 1 2 2 1 2 2 2 2 2 2 2 2 1 0 0
 2 2 2 2 2 2 2 0 0 2 0 2 2 2 2]

Final Centroids (Mini-Batch K-Means):
[[-6.92499219 -6.06989889]
 [-6.61263356 -7.44574048]
 [ 0.97702332  5.5431696 ]]

DataFrame with Mini-Batch K-Means Clusters:
   Feature_1  Feature_2  MiniBatchKMeans_Cluster
0   6.505653   2.447003                        2
1  -5.128943   9.836189                        2
2  -6.891874  -7.777364                        1
3  -8.327712  -8.287573                        1
4  -7.468992  -6.030507                        0
