In [3]:
import re

import numpy as np
import pandas as pd
import umap

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff

In [4]:
# Load the precomputed node embeddings. NA detection is switched off explicitly
# (the original passed `None`, which only worked because it is falsy) so that
# no cell is converted to NaN while parsing.
node_embedding = pd.read_csv ('../Data/Preprocessed/NODE_EMBEDDING.csv', na_filter=False)

In [5]:
# Preview the first five rows of the freshly loaded frame
node_embedding.head (n=5)

Unnamed: 0,nodeId,embedding,name
0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",whatsa
1,1,"[-1.0598999011790511e-08, -0.28232985734939575...",mowe
2,2,"[0.0700836032629013, 0.15865182876586914, 0.28...",ofada
3,3,"[-0.34612521529197693, 0.018053214997053146, -...",survey
4,4,"[-0.4766041040420532, 0.1477653980255127, 0.19...",sqm


In [6]:
def get_embeddings (x):
  """Parse the text form of a float list (e.g. ``'[0.1, -0.2]'``) into numbers.

  Parameters
  ----------
  x : str
    Bracketed, comma-separated floats as stored in the ``embedding`` column.

  Returns
  -------
  list of np.float64
    One value per component; an empty list when the brackets hold nothing.

  Raises
  ------
  ValueError
    If any component is not a valid float literal.
  """
  # Drop the enclosing brackets; the remainder is a plain comma-separated list
  body = re.sub (r'[\[\]]', '', x).strip ()
  if not body:
    # '[]' would otherwise yield [''] and crash on np.float64 ('')
    return []
  # Split on the bare comma and strip each token so '1,2' and '1 ,  2' both parse
  return [np.float64 (token.strip ()) for token in body.split (',')]

In [7]:
# Turn every stringified embedding into a list of floats (one inner list per node)
X = list (map (get_embeddings, node_embedding ['embedding']))

In [8]:
# Cluster the embeddings into three groups. The seed is fixed so the centroids,
# and therefore the cluster ids assigned to each node, are the same on every run.
kmeans = KMeans (n_clusters=3, max_iter=100, random_state=42).fit (X)

# Full centroid matrix, then only the first coordinate of each centroid
print (kmeans.cluster_centers_)
print (kmeans.cluster_centers_[:,0])

[[ 0.05298284  0.45107955 -0.00888769  0.07953862  0.21516809 -0.20517742
   0.4415045   0.02912974  0.10147334 -0.3303005   0.20126175  0.0118639
   0.43094503 -0.1372103  -0.10290787 -0.44963304]
 [ 0.09542665  0.69956608 -0.30053731  0.19703729  0.13957843 -0.03801095
   0.49591446 -0.00764693  0.55704728 -0.19377827  0.47081816 -0.20400849
   0.19982181 -0.26814897 -0.21614159 -0.5405545 ]
 [-0.02779446  0.06011547 -0.05954965  0.03272972 -0.0180691  -0.0008761
   0.05585639 -0.010253    0.09699018 -0.03890893  0.06432354 -0.01023728
  -0.00533657 -0.01296038 -0.03718658 -0.03370749]]
[ 0.05298284  0.09542665 -0.02779446]


In [9]:
# Attach each node's KMeans cluster id as a `labels` column
node_embedding = node_embedding.assign (labels=kmeans.labels_)

In [10]:
# Project the embeddings to 2-D for plotting. The seed is fixed so the layout is
# reproducible between runs.
# NOTE(review): scikit-learn 1.5 renamed `n_iter` to `max_iter`; confirm the
# installed version before switching the keyword.
tsne = TSNE(n_components=2, verbose=0, perplexity=40, n_iter=300, random_state=42)
tsne_results = tsne.fit_transform(X)



In [11]:
# Store the two t-SNE coordinates alongside each node
node_embedding = node_embedding.assign (
  first_component=tsne_results [:, 0],
  second_component=tsne_results [:, 1],
)

In [12]:
# Preview the first five rows, now including the cluster label and t-SNE columns
node_embedding.head (n=5)

Unnamed: 0,nodeId,embedding,name,labels,first_component,second_component
0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",whatsa,2,0.870044,-6.004674
1,1,"[-1.0598999011790511e-08, -0.28232985734939575...",mowe,2,0.068927,-4.285151
2,2,"[0.0700836032629013, 0.15865182876586914, 0.28...",ofada,1,-1.520575,0.54686
3,3,"[-0.34612521529197693, 0.018053214997053146, -...",survey,0,4.232285,-0.739078
4,4,"[-0.4766041040420532, 0.1477653980255127, 0.19...",sqm,0,6.21749,0.099292


In [13]:
# Inputs for the hover text: the cluster id and the word of every node
label = kmeans.labels_
words = node_embedding ['name']

In [23]:
# Hover text for every point: its cluster label and the word it stands for
hover_text = [f'label: {a}<br>words: {b}' for a, b in zip (label, words)]

# One marker per node, positioned by its t-SNE coordinates and coloured by cluster
scatter = go.Scatter (
  x=tsne_results [:, 0],
  y=tsne_results [:, 1],
  mode='markers',
  marker={'color': kmeans.labels_, 'colorscale': 'Tealgrn_r', 'opacity': .5},
  text=hover_text,
  hoverinfo='text',
)
data_ = [scatter]

layout = go.Layout (
  title='TSNE Visualisation',
  # width=1400,
  height=900,
  colorway=px.colors.sequential.Tealgrn_r,
  xaxis={'title': 'First Component'},
  yaxis={'title': 'Second Component'},
)

fig = go.Figure (data=data_, layout=layout)

# fig.update_layout (showlegend=True)
fig.show ()