In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.cluster import SpectralClustering
from sklearn import metrics
from sklearn.model_selection import train_test_split # Import train_test_split function

In [2]:
# Per-node class labels stored as a NumPy array; later cells index this array
# by node id.
class_labels = np.load('D:/NLP Project/Data/class_lables.npy')

# Edge list of the URL graph: columns url1/url2 are the edge endpoints and
# edge_weight is attached to every edge as an attribute.
graph_df = pd.read_csv('D:/NLP Project/Data/edge_list.csv')
G = nx.from_pandas_edgelist(
    graph_df,
    source='url1',
    target='url2',
    edge_attr='edge_weight',
)

In [3]:
# Dense weighted adjacency matrix: entry (i, j) is the 'edge_weight' of the
# edge between the two nodes, and 0 where no edge exists.
#
# * nx.to_numpy_matrix was removed in NetworkX 3.0; nx.to_numpy_array is the
#   replacement and already returns a plain 2-D ndarray, so the former
#   np.asarray / np.squeeze round-trip is no longer needed.
# * nodelist fixes the row/column order to ascending node id. Without it the
#   rows follow the order in which nodes first appear in the edge list, while
#   later cells use the row position directly as a node id (G[idx],
#   class_labels[idx]).
#   NOTE(review): this assumes node ids are the integers 0..N-1 - confirm.
weighted_adj_mat = nx.to_numpy_array(
    G,
    nodelist=sorted(G.nodes()),
    weight='edge_weight',
)
weighted_adj_mat

array([[0., 7., 8., ..., 0., 0., 0.],
       [7., 0., 6., ..., 0., 0., 0.],
       [8., 6., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 2., 1.],
       [0., 0., 0., ..., 2., 0., 1.],
       [0., 0., 0., ..., 1., 1., 0.]])

In [4]:
# Dense unweighted (0/1) adjacency matrix. The default weight key 'weight' is
# not set on any edge (the graph only carries 'edge_weight'), so every existing
# edge contributes 1.
#
# * nx.to_numpy_matrix was removed in NetworkX 3.0; nx.to_numpy_array is the
#   replacement and already returns a plain 2-D ndarray, so the former
#   np.asarray / np.squeeze round-trip is no longer needed.
# * nodelist fixes the row/column order to ascending node id. Without it the
#   rows follow the order in which nodes first appear in the edge list, while
#   later cells use the row position directly as a node id (G[idx],
#   class_labels[idx]).
#   NOTE(review): this assumes node ids are the integers 0..N-1 - confirm.
adj_mat = nx.to_numpy_array(G, nodelist=sorted(G.nodes()))
adj_mat

array([[0., 1., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 1., 1., 0.]])

In [5]:
# One positional index per row of the adjacency matrix; these indices are what
# gets split into train/test further down.
n_rows = len(adj_mat)
indices = np.arange(n_rows)

# Class label of every node, listed in G.nodes() iteration order.
gt = [class_labels[node] for node in G.nodes()]

In [6]:
# Sorted list of the distinct class values; used as the candidate set (and
# tie-break order) when counting votes.
unique_classes = np.unique(class_labels)
labels = unique_classes.tolist()

In [7]:
# Split the nodes into 4 spectral clusters, passing the unweighted adjacency
# matrix directly as the precomputed affinity matrix.
clf = SpectralClustering(
    n_clusters=4,
    affinity='precomputed',
    n_init=100,
    random_state=1,
)
# fit() returns the estimator itself, so the labels can be read off the result.
clusterlabels = clf.fit(adj_mat).labels_

### Adjacency Matrix

In [11]:
# Evaluate cluster-restricted "local voting" for several test-set fractions.
# A held-out node receives the most frequent class among its graph neighbours
# that (a) lie in the same spectral cluster and (b) belong to the labelled
# (training) split.
#
# Changes compared with the previous version of this cell:
# * the list of test sizes is no longer called `k`, a name the old code rebound
#   inside the loop via `for k, v in predictions.items()`;
# * only `indices` is split - the dense adjacency matrix was copied on every
#   iteration although X_train/X_test/y_train/y_test were never used;
# * membership tests use sets instead of `in` on NumPy arrays, removing the
#   accidental quadratic cost;
# * the four copy-pasted per-cluster sections are a single loop.
test_sizes = [0.2, 0.4, 0.6, 0.8]

# Row positions of the members of each cluster. This does not depend on the
# train/test split, so it is computed once.
cluster_members = [np.where(clusterlabels == c)[0] for c in range(clf.n_clusters)]

for size in test_sizes:
    # Stratified split of the node indices. Same size / random_state / stratify
    # settings as before, so the partition should be unchanged.
    idx_train, idx_test = train_test_split(
        indices, test_size=size, random_state=0, stratify=class_labels
    )
    print("\nFor Test size: ",size*100)

    train_nodes = set(idx_train.tolist())
    test_nodes = set(idx_test.tolist())

    ## Local Voting
    predictions = dict()
    for members in cluster_members:
        same_cluster = set(members.tolist())
        for node in members.tolist():
            if node not in test_nodes:
                continue
            # Every class starts at zero, so a node without any labelled
            # same-cluster neighbour (or with a tie) resolves to the first
            # class in `labels`.
            votes = {label: 0 for label in labels}
            for neighbour in G[node]:
                if neighbour in same_cluster and neighbour in train_nodes:
                    votes[class_labels[neighbour]] += 1
            predictions[node] = max(votes, key=votes.get)

    y_true = [class_labels[node] for node in predictions]
    y_pred = list(predictions.values())

    print("\nLocal Voting Results for adjacency matrix:\n")
    print("Accuracy of Spectral Clustering: {0:.2f} %".format(metrics.accuracy_score(y_true,y_pred)*100))
    print("\nMicro Precision of Spectral Clustering: {0:.2f} %".format(metrics.precision_score(y_true,y_pred,average='micro',zero_division=0)*100))
    print("\nMicro Recall of Spectral Clustering: {0:.2f} %".format(metrics.recall_score(y_true,y_pred,average='micro',zero_division=0)*100))
    print("\nMacro Precision of Spectral Clustering: {0:.2f} %".format(metrics.precision_score(y_true,y_pred,average='macro',zero_division=0)*100))
    print("\nMacro Recall of Spectral Clustering: {0:.2f} %".format(metrics.recall_score(y_true,y_pred,average='macro',zero_division=0)*100))

    # Cluster size and number of labelled (training) nodes per cluster;
    # clusters are reported 1-based.
    for c, members in enumerate(cluster_members):
        labelled = sum(1 for node in members.tolist() if node in train_nodes)
        print("\nNo. of nodes in cluster {}: ".format(c + 1),len(members))
        print("\nLabelled nodes in Cluster {}: ".format(c + 1),labelled)



For Test size:  20.0

Local Voting Results for adjacency matrix:

Accuracy of Spectral Clustering: 60.01 %

Micro Precision of Spectral Clustering: 60.01 %

Micro Recall of Spectral Clustering: 60.01 %

Macro Precision of Spectral Clustering: 65.01 %

Macro Recall of Spectral Clustering: 59.38 %

No. of nodes in cluster 1:  7062

Labelled nodes in Cluster 1:  5613

No. of nodes in cluster 2:  5508

Labelled nodes in Cluster 2:  4423

No. of nodes in cluster 3:  2479

Labelled nodes in Cluster 3:  2019

No. of nodes in cluster 4:  2552

Labelled nodes in Cluster 4:  2025

For Test size:  40.0

Local Voting Results for adjacency matrix:

Accuracy of Spectral Clustering: 59.61 %

Micro Precision of Spectral Clustering: 59.61 %

Micro Recall of Spectral Clustering: 59.61 %

Macro Precision of Spectral Clustering: 64.57 %

Macro Recall of Spectral Clustering: 59.09 %

No. of nodes in cluster 1:  7062

Labelled nodes in Cluster 1:  4245

No. of nodes in cluster 2:  5508

Labelled nodes in C

### Weighted Adjacency Matrix

In [12]:
# Re-run the 4-way spectral clustering, this time passing the edge-weighted
# adjacency matrix as the precomputed affinity matrix.
clf = SpectralClustering(
    n_clusters=4,
    affinity='precomputed',
    n_init=100,
    random_state=1,
)
# fit() returns the estimator itself, so the labels can be read off the result.
clusterlabels = clf.fit(weighted_adj_mat).labels_

In [13]:
# Evaluate cluster-restricted "local voting" for several test-set fractions,
# using the clusters obtained from the weighted adjacency matrix.
# A held-out node receives the most frequent class among its graph neighbours
# that (a) lie in the same spectral cluster and (b) belong to the labelled
# (training) split.
# NOTE(review): edge weights only influence the clustering; each neighbour
# still casts exactly one vote here - confirm that this is intended.
#
# Changes compared with the previous version of this cell:
# * the results banner now says "weighted adjacency matrix" (it was a
#   copy-paste of the unweighted section's message);
# * the list of test sizes is no longer called `k`, a name the old code rebound
#   inside the loop via `for k, v in predictions.items()`;
# * only `indices` is split - the dense adjacency matrix was copied on every
#   iteration although X_train/X_test/y_train/y_test were never used;
# * membership tests use sets instead of `in` on NumPy arrays, removing the
#   accidental quadratic cost;
# * the four copy-pasted per-cluster sections are a single loop.
test_sizes = [0.2, 0.4, 0.6, 0.8]

# Row positions of the members of each cluster. This does not depend on the
# train/test split, so it is computed once.
cluster_members = [np.where(clusterlabels == c)[0] for c in range(clf.n_clusters)]

for size in test_sizes:
    # Stratified split of the node indices. Same size / random_state / stratify
    # settings as before, so the partition should be unchanged.
    idx_train, idx_test = train_test_split(
        indices, test_size=size, random_state=0, stratify=class_labels
    )
    print("For Test size: ",size*100)

    train_nodes = set(idx_train.tolist())
    test_nodes = set(idx_test.tolist())

    ## Local Voting
    predictions = dict()
    for members in cluster_members:
        same_cluster = set(members.tolist())
        for node in members.tolist():
            if node not in test_nodes:
                continue
            # Every class starts at zero, so a node without any labelled
            # same-cluster neighbour (or with a tie) resolves to the first
            # class in `labels`.
            votes = {label: 0 for label in labels}
            for neighbour in G[node]:
                if neighbour in same_cluster and neighbour in train_nodes:
                    votes[class_labels[neighbour]] += 1
            predictions[node] = max(votes, key=votes.get)

    y_true = [class_labels[node] for node in predictions]
    y_pred = list(predictions.values())

    print("\nLocal Voting Results for weighted adjacency matrix:\n")
    print("Accuracy of Spectral Clustering: {0:.2f} %".format(metrics.accuracy_score(y_true,y_pred)*100))
    print("\nMicro Precision of Spectral Clustering: {0:.2f} %".format(metrics.precision_score(y_true,y_pred,average='micro',zero_division=0)*100))
    print("\nMicro Recall of Spectral Clustering: {0:.2f} %".format(metrics.recall_score(y_true,y_pred,average='micro',zero_division=0)*100))
    print("\nMacro Precision of Spectral Clustering: {0:.2f} %".format(metrics.precision_score(y_true,y_pred,average='macro',zero_division=0)*100))
    print("\nMacro Recall of Spectral Clustering: {0:.2f} %".format(metrics.recall_score(y_true,y_pred,average='macro',zero_division=0)*100))

    # Cluster size and number of labelled (training) nodes per cluster;
    # clusters are reported 1-based.
    for c, members in enumerate(cluster_members):
        labelled = sum(1 for node in members.tolist() if node in train_nodes)
        print("\nNo. of nodes in cluster {}: ".format(c + 1),len(members))
        print("\nLabelled nodes in Cluster {}: ".format(c + 1),labelled)

For Test size:  20.0

Local Voting Results for adjacency matrix:

Accuracy of Spectral Clustering: 60.01 %

Micro Precision of Spectral Clustering: 60.01 %

Micro Recall of Spectral Clustering: 60.01 %

Macro Precision of Spectral Clustering: 65.01 %

Macro Recall of Spectral Clustering: 59.38 %

No. of nodes in cluster 1:  7062

Labelled nodes in Cluster 1:  5613

No. of nodes in cluster 2:  5508

Labelled nodes in Cluster 2:  4423

No. of nodes in cluster 3:  2479

Labelled nodes in Cluster 3:  2019

No. of nodes in cluster 4:  2552

Labelled nodes in Cluster 4:  2025
For Test size:  40.0

Local Voting Results for adjacency matrix:

Accuracy of Spectral Clustering: 59.61 %

Micro Precision of Spectral Clustering: 59.61 %

Micro Recall of Spectral Clustering: 59.61 %

Macro Precision of Spectral Clustering: 64.57 %

Macro Recall of Spectral Clustering: 59.09 %

No. of nodes in cluster 1:  7062

Labelled nodes in Cluster 1:  4245

No. of nodes in cluster 2:  5508

Labelled nodes in Clu

'\n## Global voting\nglobal_predictions = dict()\n\nvotes = dict()\nfor j in range(len(labels)):\n    votes[labels[j]] = 0\n\nfor node in nodeid0[0]:\n    if node in idx_train:\n        votes[class_labels[node]] += 1\n\nwinner = max(votes, key=votes.get)\n\nfor n in test_index0:\n    global_predictions[n] = winner\n\nvotes = dict()\nfor j in range(len(labels)):\n    votes[labels[j]] = 0\n\nfor node in nodeid1[0]:\n    if node in idx_train:\n        votes[class_labels[node]] += 1\n\nwinner = max(votes, key=votes.get)\n\nfor n in test_index1:\n    global_predictions[n] = winner\n\nvotes = dict()\nfor j in range(len(labels)):\n    votes[labels[j]] = 0\n\nfor node in nodeid2[0]:\n    if node in idx_train:\n        votes[class_labels[node]] += 1\n\nwinner = max(votes, key=votes.get)\n\nfor n in test_index2:\n    global_predictions[n] = winner\n\nvotes = dict()\nfor j in range(len(labels)):\n    votes[labels[j]] = 0\n\nfor node in nodeid3[0]:\n    if node in idx_train:\n        votes[class_l