# Exploratory-Research-in-Neural-Networks-and-SVMs
### Author: Santiago Velasco (santidavid.velasco218@hotmail.com)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from scipy.cluster.hierarchy import dendrogram, linkage
from mpl_toolkits.mplot3d import Axes3D

## Data Preparation:

# Load both CSV files into DataFrames (paths are relative to the working directory).
data_1, data_2 = (pd.read_csv(path) for path in ("dataset1.csv", "dataset2.csv"))

# Work on the underlying NumPy arrays from here on.
x_1, x_2 = data_1.values, data_2.values

# Report the (rows, columns) shape of each array.
print("Features Shape Dataset 1:", x_1.shape)
print("Features Shape Dataset 2:", x_2.shape)

In [None]:
#First Dataset
# 2-D scatter plot of the first two columns of x_1.
plt.figure(figsize=(8, 6))
plt.scatter(x_1[:, 0], x_1[:, 1], c='blue', alpha=0.6, edgecolors='k')

plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
# Plain string: the title has no placeholders, so no f-prefix is needed.
plt.title("First Dataset 2D Scatterplot")
plt.show()

In [None]:
#Second Dataset
# 3-D scatter plot of the first three columns of x_2, seen from an
# oblique camera angle (elevation 45, azimuth 45).
fig = plt.figure(figsize=(8, 10))
ax = fig.add_subplot(projection='3d')

ax.scatter(x_2[:, 0], x_2[:, 1], x_2[:, 2], c='b', alpha=0.6, marker='.')

# Set axis labels and title in a single batched property call.
ax.set(
    xlabel='Feature 1',
    ylabel='Feature 2',
    zlabel='Feature 3',
    title='Second Dataset 3D Scatterplot',
)
ax.view_init(elev=45, azim=45)

plt.show()

In [None]:
# Same 3-D scatter of x_2's first three columns, this time with the
# camera at elevation 0 and azimuth 90.
fig = plt.figure(figsize=(8, 10))
ax = fig.add_subplot(projection='3d')

ax.scatter(x_2[:, 0], x_2[:, 1], x_2[:, 2], c='b', alpha=0.6, marker='.')

# Set axis labels and title in a single batched property call.
ax.set(
    xlabel='Feature 1',
    ylabel='Feature 2',
    zlabel='Feature 3',
    title='Second Dataset 3D Scatterplot',
)
ax.view_init(elev=0, azim=90)

plt.show()

## Lloyd's Algorithm:

In [None]:
def k_means(X, k, max_iters=100, tol=1e-4, method='++initialization', random_state=None):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's algorithm.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster, one sample per row.
    k : int
        Number of clusters; must satisfy ``1 <= k <= n_samples``.
    max_iters : int, default 100
        Maximum number of assignment/update iterations.
    tol : float, default 1e-4
        Iteration stops as soon as the norm of the centroid shift between
        two consecutive iterations is below this value.
    method : str, default '++initialization'
        ``'++initialization'`` seeds the centroids with k-means++ (each new
        centroid is drawn with probability proportional to the squared
        distance to the closest centroid chosen so far). Any other value
        picks ``k`` distinct rows uniformly at random.
    random_state : int or None, default None
        ``None`` draws from NumPy's global random state (so
        ``np.random.seed`` still controls the run); any other value seeds a
        dedicated ``np.random.RandomState`` for a reproducible call.

    Returns
    -------
    centroids : ndarray of shape (k, n_features)
        Final cluster centres.
    labels : ndarray of shape (n_samples,)
        Index of the nearest centroid for every row, taken from the last
        assignment step that was performed.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or ``k`` is outside ``[1, n_samples]``.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features).")

    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(f"k must be between 1 and n_samples={n_samples}, got {k}.")

    # Both the np.random module and RandomState expose the same `choice` API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    if method == '++initialization':
        # Choosing first centroid uniformly at random
        first_centroid = X[rng.choice(n_samples)]
        centroids = [first_centroid]

        # Squared distance from every sample to its closest chosen centroid
        distances = np.linalg.norm(X - first_centroid, axis=1) ** 2

        # Choosing the remaining centroids
        for _ in range(1, k):
            total = distances.sum()
            # When every sample coincides with an already chosen centroid the
            # weights are all zero; dividing would give NaN probabilities, so
            # fall back to a uniform draw (p=None).
            probabilities = distances / total if total > 0 else None
            new_centroid = X[rng.choice(n_samples, p=probabilities)]
            centroids.append(new_centroid)

            new_distances = np.linalg.norm(X - new_centroid, axis=1) ** 2
            distances = np.minimum(distances, new_distances)

        centroids = np.array(centroids)

    else:
        indices = rng.choice(n_samples, k, replace=False)
        centroids = X[indices]

    def assign(current_centroids):
        # Pairwise sample-to-centroid distances, shape (n_samples, k).
        gaps = np.linalg.norm(X[:, np.newaxis] - current_centroids, axis=2)
        return np.argmin(gaps, axis=1)

    labels = None
    for i in range(max_iters):
        labels = assign(centroids)

        # An empty cluster keeps its previous centroid instead of becoming NaN.
        new_centroids = np.array([
            X[labels == j].mean(axis=0) if np.any(labels == j) else centroids[j]
            for j in range(k)
        ])

        # Convergence checking
        shift = np.linalg.norm(new_centroids - centroids)
        centroids = new_centroids
        if shift < tol:
            print(f"Converged after {i+1} iterations.")
            break

    if labels is None:
        # max_iters <= 0: no iteration ran, so label against the initial centroids.
        labels = assign(centroids)

    return centroids, labels

## First Dataset Training

## K-means Training:

In [None]:
# Cluster counts to evaluate on the first dataset.
ks = [2, 4, 8, 12, 16, 20, 24, 28, 32]

# One [centroids, labels] entry per k, in the same order as `ks`.
centroids_labels_uri = []
centroids_labels_ppinitialization = []

# Run every k with the 'uri' method first, then every k with k-means++ seeding.
for _method, _results in (
    ('uri', centroids_labels_uri),
    ('++initialization', centroids_labels_ppinitialization),
):
    for k in ks:
        centroids, labels = k_means(x_1, k, method=_method)
        _results.append([centroids, labels])

In [None]:
# One figure per k for the 'uri' runs: samples coloured by cluster label,
# centroids overlaid in red.
for k, (centroids, labels) in zip(ks, centroids_labels_uri):
    plt.figure(figsize=(6, 5))
    plt.scatter(x_1[:, 0], x_1[:, 1], c=labels, cmap='viridis', alpha=0.6, edgecolors='k')
    plt.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='o', s=50, label="Centroids")

    plt.title(f"Clustering with k={k}")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.legend()
    plt.show()

In [None]:
# One figure per k for the '++initialization' runs: samples coloured by
# cluster label, centroids overlaid in red.
for k, (centroids, labels) in zip(ks, centroids_labels_ppinitialization):
    plt.figure(figsize=(6, 5))
    plt.scatter(x_1[:, 0], x_1[:, 1], c=labels, cmap='viridis', alpha=0.6, edgecolors='k')
    plt.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='o', s=50, label="Centroids")

    plt.title(f"Clustering with k={k}")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.legend()
    plt.show()

In [None]:
def compute_cost(x, centroids, labels):
    """Return (and print) the total within-cluster sum of squared errors.

    For each centroid index ``i`` the rows of ``x`` whose label equals ``i``
    are compared with ``centroids[i]``; the squared differences are summed
    over all clusters. The total is printed before it is returned.
    """
    total = 0
    for cluster_id, center in enumerate(centroids):
        members = x[labels == cluster_id]
        residuals = members - center
        total = total + np.square(residuals).sum()

    print(total)
    return total

def plot_cost_vs_k(centroids_labels_list, x, ks, title='Cost vs. Number of Clusters'):
    """Plot the clustering cost of each result against its number of clusters.

    ``centroids_labels_list`` holds one ``(centroids, labels)`` pair per entry
    of ``ks``; the cost of each pair is obtained from ``compute_cost`` on ``x``
    and drawn as a line plot titled ``title``.
    """
    costs = [
        compute_cost(x, centroids, labels)
        for centroids, labels in centroids_labels_list
    ]

    plt.figure(figsize=(8, 5))
    plt.plot(ks, costs, marker='o', linestyle='-', color='blue')
    plt.title(title)
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Cost (Sum of Squared Errors)')
    plt.grid(True)
    plt.show()

In [None]:
# Cost curve for the runs stored in centroids_labels_uri.
plot_cost_vs_k(
    centroids_labels_list=centroids_labels_uri,
    x=x_1,
    ks=ks,
    title='Cost vs. Number of Clusters with Uniform Random Initialization',
)

In [None]:
# Cost curve for the runs stored in centroids_labels_ppinitialization.
plot_cost_vs_k(
    centroids_labels_list=centroids_labels_ppinitialization,
    x=x_1,
    ks=ks,
    title='Cost vs. Number of Clusters with ++Initialization',
)

## Hierarchical Agglomerative Clustering

In [None]:
# Average linkage:
# Build the linkage matrix for x_1 with Euclidean distances.
Z_average = linkage(x_1, method='average', metric='euclidean')

# Draw the dendrogram truncated with truncate_mode='level', p=4.
plt.figure(figsize=(10, 5))
dendrogram(Z_average, truncate_mode='level', p=4)
plt.title("Dendrogram (Average Linkage) via SciPy")
plt.xlabel("Cluster Index or (Number of Points)")
plt.ylabel("Distance")
plt.show()

In [None]:
# Single linkage:
# Build the linkage matrix for x_1 with Euclidean distances.
Z_single = linkage(x_1, method='single', metric='euclidean')

# Draw the dendrogram truncated with truncate_mode='level', p=6.
plt.figure(figsize=(10, 5))
dendrogram(Z_single, truncate_mode='level', p=6)
plt.title("Dendrogram (Single Linkage) via SciPy")
plt.xlabel("Cluster Index or (Number of Points)")
plt.ylabel("Distance")
plt.show()
