In [None]:
!pip install yellowbrick

In [None]:
!pip install pycountry

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from yellowbrick.cluster import SilhouetteVisualizer
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist

In [None]:
# Read the cleaned finalists table (path is relative to the notebook's working directory).
df = pd.read_csv(
    filepath_or_buffer='Data/finalists_cleaned.csv',
)

In [None]:
# Keep only the columns used as clustering features; .copy() gives an
# independent frame so later changes to df_cluster do not touch df.
df_cluster = df.loc[
    :, ['year', 'country', 'style', 'final_total_points']
].copy()

In [None]:
# Expand the two categorical columns into one indicator column per category.
df_cluster = pd.get_dummies(
    data=df_cluster,
    columns=['country', 'style'],
)

In [None]:
# Summary statistics of the encoded feature matrix. The method must be called:
# the bare attribute only displays the bound-method repr, not the statistics.
df_cluster.describe()

In [None]:
# Standardise every feature column: learn the per-column statistics first,
# then apply them to the same frame.
scaler = StandardScaler().fit(df_cluster)
X_scaled = scaler.transform(df_cluster)

In [None]:
# Reduce the standardised feature matrix to two principal components; the
# resulting array X is the input to every K-Means fit further down.
pca = PCA(n_components=2)
X = pca.fit_transform(X_scaled)  # X_scaled is the StandardScaler output computed above


In [None]:
# Elbow search: for each candidate k, fit K-Means on the PCA projection and
# record the distortion, i.e. the mean Euclidean distance from each sample to
# its nearest cluster centre. `cdist` is already imported in the notebook's
# import cell, so it is not re-imported here.
distortions = []
K = range(2, 10)

for k in K:
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(X)
    # Distance from every sample to its closest centre, reduced with NumPy
    # instead of Python's built-in sum over an ndarray.
    nearest_dist = cdist(X, model.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_dist.mean())

print("Distortion values:", distortions)



In [None]:
# Elbow plot: distortion against the number of clusters (blue line, 'x' markers).
plt.plot(K, distortions, 'bx-')
plt.gca().set(
    title='Elbow Method for Optimal K',
    xlabel='K',
    ylabel='Distortion',
)
plt.show()

In [None]:
# Number of clusters for the final model.
n_clusters = 3
# n_init is set explicitly to the value used in the elbow search (10). Left
# unset it falls back to the installed scikit-learn's default, which differs
# between releases, so the result would be version-dependent.
model2 = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)

In [None]:
# Silhouette plot for model2 on the PCA projection: build the visualiser and
# fit it in one chained expression, then render the figure.
visualizer = SilhouetteVisualizer(model2, colors='yellowbrick').fit(X)
visualizer.show()

In [None]:
# Columns removed from df before the cluster labels are attached and saved.
to_drop = [
    'final_draw_position',
    'final_televote_points',
    'final_jury_points',
    'final_televote_votes',
    'final_jury_votes',
    'final_place'
]

# Rebind df instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
# NOTE: errors='ignore' also hides a misspelled column name; check the preview below.
df = df.drop(columns=to_drop, errors='ignore')

# Preview the remaining columns (rich display as the cell's last expression).
df.head()


In [None]:
# 1. Fit K-Means on the PCA projection and get one cluster label per row.
#    n_init is set explicitly to the value used in the elbow search (10). Left
#    unset it falls back to the installed scikit-learn's default, which differs
#    between releases.
model2 = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = model2.fit_predict(X)

# 2. Attach the labels to the original frame (assigned positionally, row by row).
df['Cluster'] = clusters

# 3. Preview the frame with the new Cluster column.
df.head()



In [None]:
# Distinct cluster labels, in order of first appearance in df.
unique_clusters = pd.unique(df['Cluster'])
print("Cluster labels:", unique_clusters)

In [None]:
# Write the labelled frame to disk without the row index.
df.to_csv(
    path_or_buf='Data/finalists_clustered.csv',
    index=False,
)