In [None]:
import pandas as pd
import torch
import os
import numpy as np
from tqdm import tqdm
from torch_geometric.utils import to_dense_adj
from utils.getdata import getdata

from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import matplotlib.cm as cm

In [None]:
# Folders holding the raw attraction data and the stored clustering results.
dataroot = os.path.join("data", "attraction")
clusteringresult_dir = os.path.join("result", "attraction")

# Read the attraction table and keep its two text columns as plain Python lists.
legaldata_csv = os.path.join(dataroot, "legaldata.csv")
attraction = pd.read_csv(legaldata_csv)
name, toldscribe = (attraction[column].tolist() for column in ("Name", "Toldescribe"))

In [None]:
# Display the (rows, columns) shape of the loaded attraction table.
attraction.shape

### What the KNN results look like

In [None]:
# Load the stored ordering tensor and report its dimensions.
# presumably one row of neighbour indices per attraction — confirm against the preprocessing step
order_path = os.path.join(dataroot, "processed", "order.pt")
nlist = torch.load(order_path)
print(nlist.size())

In [None]:
def seeresult(target, nblist, name, numofn=6, out=False):
    """Return the label of ``target`` followed by the labels of its neighbours.

    Args:
        target: Row index into ``nblist`` (and position in ``name``) of the
            item to inspect.
        nblist: Indexable whose row ``target`` supports slicing and
            ``.tolist()``; each row lists indices into ``name``.
        name: Sequence of labels.
        numofn: Exclusive end of the slice taken from the row. Position 0 of
            the row is skipped (presumably the item itself — confirm), so at
            most ``numofn - 1`` neighbours are returned.
        out: When true, also print the target label followed by ``:`` and then
            one neighbour label per line.

    Returns:
        A list whose first element is ``name[target]`` and whose remaining
        elements are the neighbour labels in row order.
    """
    neighbour_ids = nblist[target][1:numofn].tolist()
    labels = [name[target]] + [name[j] for j in neighbour_ids]
    if out:
        print(f"{labels[0]}:")
        for label in labels[1:]:
            print(label)
    return labels

In [None]:
# Draw 10 attraction indices (independently, so repeats are possible) for a
# visual spot-check. The generator is seeded so that Restart & Run All always
# inspects the same sample.
testtexts = np.random.default_rng(0).integers(low=0, high=len(name), size=10)
print(testtexts)

In [None]:
# Print every sampled attraction followed by its neighbours, one block per sample.
for sample_idx in testtexts:
    r = seeresult(sample_idx, nblist=nlist, name=name, numofn=6, out=True)
    print("==============================")

In [None]:
# Build a table with one row per attraction: its own name plus the names of
# its `numofn` nearest neighbours, and write it to KNN_<numofn>.csv.
numofn = 5  # neighbours kept per attraction
col = ['target'] + [f"{k}nn" for k in range(numofn)]
print(col)
# seeresult slices positions 1..numofn-1 of the neighbour row, so it must be
# asked for numofn + 1 to return exactly `numofn` neighbours. Deriving the
# argument from `numofn` (instead of a hard-coded 6) keeps the row width in
# step with the column list when `numofn` is changed.
knn = [
    seeresult(target=i, nblist=nlist, name=name, numofn=numofn + 1)
    for i in range(len(name))
]
attractionknn = pd.DataFrame(knn, columns=col)
attractionknn.to_csv(os.path.join(dataroot, f"KNN_{numofn}.csv"), index=False, encoding='utf-8')

### Eigenvalues of the graph Laplacian

In [None]:
# Load the graph stored under dataset/attraction/K_5 via the project helper.
# NOTE(review): `nor=False` presumably disables some normalization — confirm in utils.getdata.
mygraph = getdata(datafolder=os.path.join("dataset", "attraction", "K_5"), nor=False)

In [None]:
# Dense adjacency matrix built from the edge list, converted to NumPy;
# index 0 drops the leading dimension of the to_dense_adj output.
adj = to_dense_adj(mygraph.edge_index)[0].numpy()

In [None]:
from scipy.sparse.csgraph import laplacian
import math

In [None]:
# Graph Laplacian of the dense adjacency matrix (scipy's default, i.e. not normalized).
L = laplacian(adj)

In [None]:
# Eigen-decomposition of the Laplacian: `vals` holds the eigenvalues and the
# columns of `vecs` the matching eigenvectors, in no guaranteed order.
# NOTE(review): np.linalg.eig is the general (non-symmetric) solver and may
# return complex values; if `L` is known to be symmetric, np.linalg.eigh would
# be the usual choice — confirm before switching.
vals, vecs = np.linalg.eig(L)

In [None]:
# Ascending sort of the negated eigenvalues, i.e. the eigenvalues from largest
# to smallest with their sign flipped.
vals_sorted = np.sort(-vals)
# Peek at the first five entries (the five largest eigenvalues, negated).
vals_sorted[:5]

In [None]:
# Plot the largest eigenvalues in descending order.
# `vals_sorted` holds the negated eigenvalues sorted ascending, so negating its
# leading entries yields the largest eigenvalues first.
# Clamp to the number of available eigenvalues so x and y always have equal length.
topN = min(15, vals_sorted.shape[0])
fig = plt.figure(figsize=(12,12))
plt.plot(range(topN), -vals_sorted[:topN])
plt.xlabel("rank (largest first)")
plt.ylabel("eigenvalue")
plt.show()


In [None]:
# Inspect ten raw eigenvalues.
# NOTE(review): this slices the unsorted `vals` (order as returned by eig),
# not `vals_sorted` — confirm this is intended.
vals[290:300]

### What the clustering results look like

#### t-SNE

In [None]:
# Load the description embeddings and project them to 2-D with t-SNE.
told = torch.load(os.path.join(dataroot,"ToldescribeEBD.pt"))
told = told.numpy()
# t-SNE is stochastic; a fixed random_state makes the layout (and the saved
# figure built from it) reproducible across Restart & Run All.
tsneTold = TSNE(
    n_components=2,
    learning_rate='auto',
    init='pca',
    random_state=0,
).fit_transform(told)
# Rescale each of the two output columns to unit L2 norm.
tsneTold = normalize(tsneTold, axis=0)

#### Load the clustering result

In [None]:
# Select which stored run to inspect: graph variant and number of groups.
g_num = 6
graph = "K_5"

# Load that run's cluster assignment as a NumPy array and show its shape.
cluster_path = os.path.join(clusteringresult_dir, f"{graph}_{g_num}", "cluster.pt")
clusteringresult = torch.load(cluster_path).numpy()
clusteringresult.shape

#### Check whether any orphans exist

In [None]:
# Count the rows of the clustering result whose entries sum to zero.
z = sum(1 for row in clusteringresult if row.sum() == 0)
print(z)

#### Write each group to a csv file

In [None]:
# Map every group id to the list of row indices that have a non-zero entry
# in that group's column.
clusters = {gid: [] for gid in range(g_num)}
for item_idx, membership in enumerate(clusteringresult):
    for gid in np.nonzero(membership)[0].tolist():
        clusters[gid].append(item_idx)

# Report the size of each group.
for gid in range(g_num):
    print(len(clusters[gid]))

In [None]:
# Directory that receives the CSV exports and the figure for this run.
outputdir = os.path.join(clusteringresult_dir,f"{graph}_{g_num}","clustering_result")
if not os.path.isdir(outputdir):
    # Announce the directory only when it is about to be created.
    print(outputdir)
# Create outputdir and its "eachC" sub-folder in one call. exist_ok=True also
# covers the case where outputdir already exists but "eachC" is missing,
# which previously left "eachC" uncreated and broke the per-group CSV export.
os.makedirs(os.path.join(outputdir, "eachC"), exist_ok=True)


In [None]:
# One row per attraction: name, description, then one column per group
# holding that attraction's entry from the clustering result.
columns = ['name', 'description'] + [str(g) for g in range(g_num)]
attrcl = [
    [name[idx], toldscribe[idx]] + attrgroup.tolist()
    for idx, attrgroup in enumerate(clusteringresult)
]
df = pd.DataFrame(attrcl, columns=columns)
df.to_csv(os.path.join(outputdir, "cluster.csv"), index=False, encoding='utf-8')

# Additionally write one CSV per group with the rows whose entry for that group equals 1.
eachc_dir = os.path.join(outputdir, "eachC")
for i in range(g_num):
    ci = df.loc[df[str(i)] == 1]
    ci.to_csv(os.path.join(eachc_dir, f"{g_num}-c{i}.csv"), index=False, encoding='utf-8')

#### Visualization

In [None]:
# Collect, for every group, the 2-D t-SNE points of the rows whose entry in
# that group's column equals 1.
# A boolean mask per group replaces the per-row `df.iloc[i][col]` lookup, which
# was a Python-level loop over every row for every group. Masking also yields
# an empty array that keeps its column dimension for a group with no members
# (the list-based version produced a 1-D empty array, on which `[:, 0]` fails).
# Rows of `df` and `tsneTold` are matched by position, as before.
each_g = []
for l in tqdm(range(g_num)):
    in_group = (df[str(l)] == 1).values
    each_g.append(tsneTold[in_group])

In [None]:
# Scatter every group's t-SNE points in its own rainbow colour and save the
# figure to vis.jpg inside the output directory.
glist = list(range(g_num))
colors = cm.rainbow(np.linspace(0, 1, len(glist)))
plt.figure(figsize=(12,12))
for y, c in tqdm(zip(glist, colors)):
    xs, ys = each_g[y][:, 0], each_g[y][:, 1]
    plt.scatter(xs, ys, color=c)
plt.savefig(os.path.join(outputdir,"vis.jpg"))
# Close the figure so it is not also rendered inline.
plt.close()