In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from collections import Counter 

# Load the dataset.
# Both CSV files are read, the two free-text identifier columns are removed,
# and the frames are stacked row-wise. The original row labels of each file
# are kept by the concatenation (no ignore_index), so labels may repeat.
_text_columns = ['Artist Name', 'Track Name']

df1 = pd.read_csv('train.csv').drop(columns=_text_columns)
df2 = pd.read_csv('test.csv').drop(columns=_text_columns)

df = pd.concat([df1, df2])

# Column labels and their count are reused by the analysis steps below.
keys = df.keys()
numcols = len(keys)
df

# Check for missing values: print the percentage of NaN entries per column.
for k in keys:
    print(f'column {k} has {np.round(df[k].isna().sum()/len(df)*100, 2)}% missing values')

# Popularity has 2.55% missing values -> can be filled.
# key, instrumentalness and the target (Class) have very high missing-value
# rates -> rows missing any of them are removed instead of imputed, so as not
# to alter the distributions.
df = df.dropna(subset=['instrumentalness', 'key', 'Class'])

# Separate the target from the features. Reassigning (rather than dropping
# in place) avoids mutating a frame that was derived by filtering.
target = df['Class']
numTargets = len(set(target))
df = df.drop(columns=['Class'])
keys = df.keys()
numcols = len(keys)

target

# Fill missing Popularity values with the column median, because all
# popularities seem to be integers (a mean would introduce fractional values).
# Only the Popularity column is filled, so a NaN in any other column is left
# visible instead of silently receiving a Popularity value.
df = df.fillna({'Popularity': df['Popularity'].median()})
df

# Univariate analysis
# For every column collect the mean, the median and the population standard
# deviation (root of the mean squared deviation from the mean), keyed by
# column label.
means, medians, stdevs = {}, {}, {}
for k in keys:
    # Mean and median are taken over the non-missing entries only.
    present = df.loc[df[k].notna(), k]
    mean = np.mean(present)
    med = np.median(present)
    std = np.sqrt(np.mean((df[k] - mean) ** 2))
    means[k], medians[k], stdevs[k] = mean, med, std

means,medians,stdevs


# Visualise each distribution using a boxplot.
# The panels of an n x m grid are filled row by row, one column per panel;
# panels beyond the number of columns stay empty.
n = 3
m = 5
fig, axs = plt.subplots(n, m)

for panel, column in zip(axs.flat, keys[:numcols]):
    panel.boxplot(df[column])
    panel.title.set_text(column)

plt.tight_layout()


# Bivariate analysis
# Scatter plot of every pair of variables, with points coloured by target.
axes = pd.plotting.scatter_matrix(df, alpha=0.1, figsize=(15, 15), c=target)

# Rotate the axis labels so the column names stay readable in the grid.
for ax in axes.ravel():
    x_label, y_label = ax.xaxis.label, ax.yaxis.label
    x_label.set_rotation(90)
    y_label.set_rotation(0)
    y_label.set_ha('right')
    
    
# Normalisation
# No strong non-linear dependence can be seen in the plots above,
# so a simple min-max scaler onto the [0, 1] interval is used.
maxs = np.max(df, axis=0)
mins = np.min(df, axis=0)

# Rescale column by column: (x - min) / (max - min).
# NOTE: a constant column (max == min) divides zero by zero here and becomes NaN.
for k in keys:
    lo, hi = mins[k], maxs[k]
    df[k] = (df[k] - lo) / (hi - lo)

# Repeat the mean / median / standard deviation calculation on the rescaled data.
means, medians, stdevs = {}, {}, {}
for k in keys:
    present = df.loc[df[k].notna(), k]
    mean = np.mean(present)
    med = np.median(present)
    std = np.sqrt(np.mean((df[k] - mean) ** 2))
    means[k], medians[k], stdevs[k] = mean, med, std


df


# Calculate and visualise the correlation between each pair of variables.

correlations = df.corr()

# Show the correlation matrix as an image, with the column names as tick labels.
plt.matshow(correlations)
plt.colorbar()
plt.xticks(range(numcols), keys, rotation = 90);
plt.yticks(range(numcols), keys, rotation = 0);
correlations


# Only one highly correlated variable pair is obvious, "energy" and
# "loudness", a correlation that could have been expected.
# With a correlation of .795 this variable can be removed while barely losing
# any information.

df.drop(columns=['loudness'], inplace = True)
# Refresh both the column labels and their count so they stay in sync with df
# (the count was previously left at its pre-drop value).
keys = df.keys()
numcols = len(keys)
df


# Map each cluster label to the true label it corresponds to most strongly.
def mapPredAndTrue(tar, pred):
    """Translate predicted cluster labels into true-label indices.

    A confusion matrix of ``tar`` (rows) against ``pred`` (columns) is built
    with each row normalised to sum to one. For every column, the row holding
    the largest value is selected, and each entry of ``pred`` is replaced by
    the row index selected for its column.

    Parameters:
        tar: true labels, one per sample.
        pred: predicted cluster labels, one per sample; used directly as
            positions into the per-column lookup, so they must be
            non-negative integers.

    Returns:
        numpy array with one confusion-matrix row index per sample.

    NOTE(review): the returned values are row positions in the confusion
    matrix; they coincide with actual label values only if the combined set
    of labels is 0..n-1 -- confirm for the data at hand.
    """
    row_normalised = np.array(confusion_matrix(tar, pred, normalize='true'))

    # For each predicted label (column): the row with the highest share.
    best_row_per_cluster = row_normalised.argmax(axis=0)

    return best_row_per_cluster[pred]


from sklearn.cluster import AgglomerativeClustering

from sklearn.metrics import confusion_matrix 
from sklearn.metrics import silhouette_score
from sklearn.metrics import homogeneity_score

from tqdm import tqdm

# Settings for the clustering experiments.
# Cluster counts run from the number of target classes up to (but excluding)
# uppLimFak times that number.
uppLimFak = 3
nClustersToCheck = range(numTargets, uppLimFak * numTargets)
# Candidate distance metrics (not used in the code visible here --
# presumably meant for the agglomerative clustering; TODO confirm).
possMetrics = ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']

# Empty accumulators for scores and fitted clusterings.
silhouetteScores, homogeneityScores, clusterings = [], [], []


In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score, homogeneity_score

# Min-max normalisation of the features. The result is rebuilt as a DataFrame
# with the original column names and a fresh 0..n-1 row index.
scaler = MinMaxScaler()

df_scaled = scaler.fit_transform(df)
df_scaled = pd.DataFrame(df_scaled, columns=df.columns)


# Cluster counts to evaluate.
k_values = range(3, uppLimFak * numTargets)

sc_silhouette = []
sc_homogeneity = []

# Fit KMeans once per candidate k and record both quality scores.
for k in tqdm(k_values):

    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(df_scaled)

    sc_silhouette.append(silhouette_score(df_scaled, kmeans.labels_))
    # homogeneity_score expects (labels_true, labels_pred); with the arguments
    # the other way round it returns the completeness score instead.
    sc_homogeneity.append(homogeneity_score(target, mapPredAndTrue(target, kmeans.labels_)))


# Plot the silhouette and homogeneity scores against the number of clusters.
plt.plot(k_values, sc_silhouette, label='Silhouette Score')
plt.plot(k_values, sc_homogeneity, label='Homogeneity Score')
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.legend()
plt.show()


# Best number of clusters according to the homogeneity score
# (first maximum if several k tie).
best_k = k_values[np.argmax(sc_homogeneity)]


# Refit KMeans with the selected number of clusters.
best_kmeans = KMeans(n_clusters=best_k, random_state=42)
best_kmeans.fit(df_scaled)


# Cluster visualisation: pairwise scatter plots coloured by cluster label.
axes = pd.plotting.scatter_matrix(df_scaled, alpha=0.1, figsize=(15, 15), c=best_kmeans.labels_)
for ax in axes.flatten():
    ax.xaxis.label.set_rotation(90)
    ax.yaxis.label.set_rotation(0)
    ax.yaxis.label.set_ha('right')
plt.show()


print("KMeans Silhouette Score:", silhouette_score(df_scaled, best_kmeans.labels_))
# homogeneity_score expects (labels_true, labels_pred); with the arguments the
# other way round it returns the completeness score instead.
print("KMeans Homogeneity Score:", homogeneity_score(target, mapPredAndTrue(target, best_kmeans.labels_)))


# Outlier removal based on the distance of each point to its own centroid.
labels = best_kmeans.labels_
# transform() yields one row per sample and one column per cluster centre.
distances = best_kmeans.transform(df_scaled)

# Pick, for every sample, the distance to the centroid it was assigned to.
# (Taking the row maximum would measure the distance to the farthest
# centroid, which says nothing about how well a point fits its own cluster.)
own_dist = distances[np.arange(len(labels)), labels]

# Mean point-to-own-centroid distance over all samples.
mean_dist = np.mean(own_dist)

# Threshold for outliers: twice the mean distance.
outlier_th = 2 * mean_dist

# Positions of the samples lying farther from their centroid than the threshold.
outlier_ind = np.where(own_dist > outlier_th)[0]

# Remove the outliers from the features AND from the target, so that both
# stay aligned for scoring. df_scaled carries a 0..n-1 index, so the
# positions in outlier_ind are valid row labels for drop().
keep_mask = np.ones(len(df_scaled), dtype=bool)
keep_mask[outlier_ind] = False
df_dropped = df_scaled.drop(index=outlier_ind)
target_dropped = np.asarray(target)[keep_mask]


# Refit KMeans on the data without outliers.
bestkm_dropped = KMeans(n_clusters=best_k, random_state=42)
bestkm_dropped.fit(df_dropped)


# Cluster visualisation without outliers.
axes = pd.plotting.scatter_matrix(df_dropped, alpha=0.1, figsize=(15, 15), c=bestkm_dropped.labels_)
for ax in axes.flatten():
    ax.xaxis.label.set_rotation(90)
    ax.yaxis.label.set_rotation(0)
    ax.yaxis.label.set_ha('right')
plt.show()


silhouette_dropped = silhouette_score(df_dropped, bestkm_dropped.labels_)
# homogeneity_score expects (labels_true, labels_pred); the filtered target is
# used so that its length matches the labels of the reduced data set.
homogeneity_dropped = homogeneity_score(
    target_dropped, mapPredAndTrue(target_dropped, bestkm_dropped.labels_)
)


# Scores after the outliers have been removed.
print("Silhouette Score (Otliers Removed):", silhouette_dropped)
print("Homogeneity Score (Outliers Removed):", homogeneity_dropped)


# KMeans with explicit k-means++ initialisation.
# NOTE: 'k-means++' is already the default value of KMeans' init parameter, so
# with the same n_clusters and random_state this fit is expected to reproduce
# the earlier default-initialised model.
kmeans_init = KMeans(n_clusters=best_k, init='k-means++', random_state=42)
kmeans_init.fit(df_scaled)


# Cluster visualisation: pairwise scatter plots coloured by cluster label.
axes = pd.plotting.scatter_matrix(df_scaled, alpha=0.1, figsize=(15, 15), c=kmeans_init.labels_)
for ax in axes.flatten():
    ax.xaxis.label.set_rotation(90)
    ax.yaxis.label.set_rotation(0)
    ax.yaxis.label.set_ha('right')
plt.show()

# Scores with explicit initialisation.
silhouette_kminit = silhouette_score(df_scaled, kmeans_init.labels_)
# homogeneity_score expects (labels_true, labels_pred); with the arguments the
# other way round it returns the completeness score instead.
homogeneity_kminit = homogeneity_score(target, mapPredAndTrue(target, kmeans_init.labels_))

print("KMeans++ Silhouette Score:", silhouette_kminit)
print("KMeans++ Homogeneity Score:", homogeneity_kminit)
