In [1]:
# Necessary Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.semi_supervised import LabelSpreading

In [2]:
# Loading the data: three input files from the Kaggle dataset directory.
# NOTE(review): adjacency is read with read_csv's default header handling,
# while seed is read with header=None. Confirm adjacency.csv really has a
# header row (and no extra index column); otherwise the matrix will not line
# up with the distance matrix built from `attributes`.
adjacency = pd.read_csv("/kaggle/input/da324dataminingproject2/adjacency.csv")
attributes = pd.read_excel("/kaggle/input/da324dataminingproject2/attributes.xlsx")
# header=None keeps the first spreadsheet row as data; the label-assignment
# cell later indexes seed by integer column and row position.
seed = pd.read_excel("/kaggle/input/da324dataminingproject2/seed.xlsx", header=None)

In [3]:
# Calculate pairwise Euclidean distances between every pair of rows in
# `attributes`; the result is a square (rows x rows) array.
# NOTE(review): every column of `attributes` enters the distance - confirm
# there is no ID / non-feature column in the sheet.
# NOTE(review): the dense result grows quadratically with the row count;
# presumably ~11952 rows here (see the labels cell) - check memory headroom.
pairwise_distances = cdist(attributes, attributes, metric='euclidean')

In [4]:
# Min-max rescale the distances into [1e-4, 1 + 1e-4]. The small additive
# offset keeps the smallest distance strictly positive so that the later
# reciprocal (distance -> edge weight) never divides by zero.
min_val = pairwise_distances.min()
max_val = pairwise_distances.max()

normalized_distances = (pairwise_distances - min_val) / (max_val - min_val) + 1e-4

In [5]:
# Turn distances into affinities: the element-wise reciprocal makes a short
# distance a large weight, i.e. a stronger edge.
edge_weights = np.reciprocal(normalized_distances)

In [7]:
# Final weighted adjacency: element-wise product of the distance-derived
# weights with the adjacency entries, so entries where adjacency is 0 stay 0.
weight_matrix = edge_weights * adjacency

In [8]:
# Initialize labels array with -1, for unknown labels.
# The length is taken from the matrix that is later passed to `fit` rather
# than hard-coded (previously the literal 11952), so the label vector always
# has one entry per row of `weight_matrix` even if the input data changes.
labels = np.full(weight_matrix.shape[0], -1)

In [9]:
# Fill in the known labels: row i of `seed` holds the node indices whose
# label is i. Rows are processed in order, one fancy-index assignment per
# row, so a node listed in several rows ends up with the last row's label.
for class_id, seed_nodes in enumerate(seed.to_numpy()):
    labels[seed_nodes] = class_id

In [10]:
# Initialise and fit the model with KNN kernel (200 neighbours).
# `labels` carries -1 for every node without a seed label.
# NOTE(review): with kernel='knn' the estimator presumably builds its own
# neighbour graph from the rows of `weight_matrix` treated as feature
# vectors, rather than using `weight_matrix` as a precomputed affinity graph.
# Confirm this is the intended use of the weighted adjacency.
label_spreading_knn = LabelSpreading(kernel='knn', n_neighbors=200)
label_spreading_knn.fit(weight_matrix, labels)

In [11]:
# Get predictions: read the per-sample labels that fitting stored on the
# KNN-kernel model (its `transduction_` attribute).
predictions_knn = label_spreading_knn.transduction_

In [12]:
# Build the submission table: an ID column (0-based row position) followed by
# the predicted LABEL, then preview the first rows.
df_knn = (
    pd.DataFrame({'LABEL': predictions_knn})
    .rename_axis('ID')
    .reset_index()
)
df_knn.head()

Unnamed: 0,ID,LABEL
0,0,0
1,1,9
2,2,8
3,3,4
4,4,4


In [13]:
# Write the KNN-kernel submission to disk; index=False keeps the frame's row
# index out of the file so only the ID and LABEL columns are written.
df_knn.to_csv('label_spreading_knn.csv', index=False)

In [14]:
# Initialise and fit the model with RBF kernel (all other settings left at
# the estimator's defaults). `labels` carries -1 for nodes without a seed.
# NOTE(review): the saved preview of these predictions shows label 0 for all
# five displayed rows. Possibly the RBF affinities collapse towards zero on
# the large-valued rows of `weight_matrix`, so labels never propagate -
# check the label distribution and consider tuning `gamma` or rescaling.
label_spreading_rbf = LabelSpreading(kernel='rbf')
label_spreading_rbf.fit(weight_matrix, labels)

In [15]:
# Get predictions: read the per-sample labels that fitting stored on the
# RBF-kernel model (its `transduction_` attribute).
predictions_rbf = label_spreading_rbf.transduction_

In [16]:
# Build the submission table: an ID column (0-based row position) followed by
# the predicted LABEL, then preview the first rows.
df_rbf = (
    pd.DataFrame({'LABEL': predictions_rbf})
    .rename_axis('ID')
    .reset_index()
)
df_rbf.head()

Unnamed: 0,ID,LABEL
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [17]:
# Write the RBF-kernel submission to disk; index=False keeps the frame's row
# index out of the file so only the ID and LABEL columns are written.
df_rbf.to_csv('label_spreading_rbf.csv', index=False)