## Initialize
Import packages

In [64]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.decomposition import PCA, NMF


Read in train dataset

In [65]:
# Load the comma-separated training file; the file has no header row
train_data = pd.read_csv(
    'train.dat',
    header=None,
    sep=',',
)

In [66]:
# Feature matrix as a NumPy array: all rows, every column except the last one
X_train = train_data.iloc[:, 0:-1].values

## Preprocessing

### Normalization

Standardize each feature to zero mean and unit variance

In [67]:
# Learn the per-feature scaling parameters from X_train, then apply them to
# the same data to obtain the standardized matrix
scaler = StandardScaler()
scaler.fit(X_train)
X_normal = scaler.transform(X_train)

### Feature Extraction

Reduce the standardized data to its first 10 principal components with PCA

In [68]:
# Project the standardized data onto its first 10 principal components.
# random_state is pinned so the projection is reproducible across runs when
# PCA selects a randomized or arpack SVD solver (it has no effect otherwise).
pca = PCA(n_components=10, random_state=0)
X_pca = pca.fit_transform(X_normal)

In [69]:
# Settings and initial state for bisecting K-means

# Work list of clusters: starts with a single cluster holding every point
clusters = [X_pca]

# Target number of clusters
k = 155

# Number of trial bisections attempted for each cluster that gets split
num_of_iterations = 10

In [70]:
# Bisecting K-means: repeatedly take the oldest cluster from the work list and
# split it in two with basic K-means, until k clusters exist in total.
#
# `clusters` is the work list of clusters that may still be bisected. Clusters
# that cannot be split (fewer than two points, or no trial produced two
# non-empty halves) are parked in `unsplittable` so that their points are kept
# instead of being dropped from the result.
unsplittable = []

# Repeat until k clusters exist in total or nothing is left to bisect
while clusters and len(clusters) + len(unsplittable) < k:

    # Select the oldest cluster from the work list (FIFO order)
    cluster = clusters.pop(0)

    # A cluster with fewer than two points cannot be bisected: keep it as is
    # (an empty cluster carries no points and is simply discarded)
    if len(cluster) < 2:
        if len(cluster) > 0:
            unsplittable.append(cluster)
        continue

    sse = float('inf')
    best_clusters = None

    # Try num_of_iterations bisections and keep the one with the lowest SSE
    for i in range(num_of_iterations):

        # Bisect the selected cluster using basic K-means. Seeding each trial
        # with its index keeps the trials different from one another while
        # making the whole run reproducible.
        kmeans = KMeans(n_clusters=2, random_state=i).fit(cluster)
        labels = kmeans.labels_

        # Split the cluster into two clusters based on the labels
        cluster1 = cluster[labels == 0]
        cluster2 = cluster[labels == 1]

        # A trial that leaves one side empty (e.g. all points identical) is
        # not a real bisection, so it is skipped
        if len(cluster1) == 0 or len(cluster2) == 0:
            continue

        # Calculate the SSE of the two new clusters
        sse1 = np.sum(np.square(cluster1 - kmeans.cluster_centers_[0]))
        sse2 = np.sum(np.square(cluster2 - kmeans.cluster_centers_[1]))
        total_sse = sse1 + sse2

        # If the total SSE is smaller than the current best, update the best
        if total_sse < sse:
            sse = total_sse
            best_clusters = [cluster1, cluster2]

    if best_clusters is None:
        # No usable bisection was found: keep the cluster whole
        unsplittable.append(cluster)
    else:
        # Add the two clusters from the bisection to the work list
        clusters += best_clusters

# Merge the parked clusters back so that `clusters` again holds every cluster
clusters += unsplittable



In [71]:
# Assign every point of X_pca the index of the cluster that contains it.
#
# Points are matched as whole rows: each row of X_pca is indexed by its
# coordinate tuple, and every cluster row is looked up in that index.
# (An element-wise np.isin test would also accept a row whose coordinates
# merely occur somewhere in the cluster, in any row or column.)
row_positions = {}
for row_idx, point in enumerate(X_pca):
    row_positions.setdefault(tuple(point), []).append(row_idx)

# Rows that are not found in any cluster keep the default label 0
cluster_labels = np.zeros(X_pca.shape[0], dtype=int)
for i, cluster in enumerate(clusters):
    for point in cluster:
        positions = row_positions.get(tuple(point))
        if positions is not None:
            # Identical rows of X_pca all receive the same cluster index
            cluster_labels[positions] = i

In [72]:
# Write the labels to 'output.dat': one label per line, without a header row
# or an index column.
# NOTE: this rebinds `clusters` from the list of cluster arrays to a DataFrame
# of labels.
clusters = pd.DataFrame(cluster_labels)
clusters.to_csv('output.dat', header=False, index=False)