In [1]:
import sys
import pickle
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.metrics import confusion_matrix

In [2]:
#loading the feature matrix and labels 
feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

In [3]:
graph_df = pd.read_csv('D:/NLP Project/ICA/edge_list.csv')
#loading the edgelist and constructing the url graph
g = nx.from_pandas_edgelist(graph_df,'url1','url2',edge_attr='edge_weight')

In [4]:
# Pickling weighted URL graph.
#pickle.dump(g, open('D:/NLP Project/ICA/url_graph_pickle.txt','wb'))

In [5]:
#g = pickle.load(open('D:/NLP Project/ICA/url_graph_pickle.txt',mode= 'rb'))

In [None]:
### Nodes with single neighbors won't be considered for thresholding - dangling nodes. (Will avoid index error)

In [4]:
nodes = list(g.nodes)
len(nodes)

17601

In [5]:
iter_labels = class_labels #making a copy of class labels for updations during iteration
# Contains true labels of all documents.

In [6]:
np.unique(class_labels,return_counts=True)

(array([0, 1, 2, 3]), array([4440, 4236, 5143, 3782], dtype=int64))

In [7]:
nx.info(g)

'Graph with 17601 nodes and 45259074 edges'

In [8]:
graph_df.head()

Unnamed: 0,url1,url2,edge_weight
0,5361,5362,7
1,5361,5363,8
2,5361,5364,7
3,5361,5365,6
4,5361,5366,6


In [9]:
len(graph_df)

90518148

In [11]:
edge_weight_counts = graph_df['edge_weight'].value_counts()
print(edge_weight_counts.head(10))

1     62866404
2     17974752
3      4942704
4      2163040
5      1145002
6       676434
7       365660
8       182488
9        87832
10       42580
Name: edge_weight, dtype: int64


In [12]:
g.degree[5361]

7738

In [13]:
dict(g.degree())

{5361: 7738,
 5362: 7359,
 5363: 6370,
 5364: 6867,
 5365: 6585,
 5366: 6427,
 5367: 8021,
 5368: 7431,
 5369: 6041,
 5370: 10597,
 5371: 8173,
 5372: 9535,
 5373: 5882,
 5790: 7060,
 5791: 4801,
 5792: 6735,
 5793: 7524,
 5794: 6307,
 5795: 9270,
 5796: 6818,
 5797: 5563,
 5798: 8674,
 5799: 7191,
 5800: 7134,
 5801: 7198,
 5802: 5231,
 5803: 8013,
 5804: 5899,
 5805: 6400,
 5806: 7544,
 5807: 7624,
 6793: 7654,
 6794: 4839,
 6795: 8446,
 6796: 5787,
 6798: 7733,
 6799: 6212,
 6800: 4397,
 6801: 5433,
 6802: 7229,
 6803: 5624,
 6804: 7716,
 6805: 8371,
 6806: 4923,
 6807: 5218,
 6808: 8343,
 6809: 8029,
 6810: 9925,
 6811: 6409,
 6812: 6485,
 6813: 7564,
 6814: 8849,
 6815: 7383,
 6816: 6793,
 6817: 7243,
 6818: 8914,
 6819: 8135,
 6820: 6746,
 6821: 6365,
 6822: 8373,
 6823: 7130,
 6824: 5936,
 6825: 6952,
 6826: 6211,
 6827: 7030,
 6828: 7679,
 6829: 6814,
 6830: 5529,
 6831: 6968,
 7133: 5478,
 7134: 7015,
 7135: 7332,
 7136: 7430,
 7137: 7300,
 7138: 7131,
 7139: 8585,
 7140: 6920

### Model 1 : Naive Bayes

In [14]:
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_NB = []
precision_list_NB = []
recall_list_NB = []
conf_mat_NB = []

#loading the feature matrix and labels 
feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test = train_test_split(feature_matrix, class_labels,test_size=k, random_state=0, stratify=class_labels)
    clf = GaussianNB() # Bootstrapping using Naive Bayes as Base Classifier
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    confusion_mat = confusion_matrix(y_test,y_pred)

    conf_mat_NB.append(confusion_mat)
    accuracy_list_NB.append(metrics.accuracy_score(y_test,y_pred))
    precision_list_NB.append(metrics.precision_score(y_test,y_pred,average='macro',zero_division=0))
    recall_list_NB.append(metrics.recall_score(y_test,y_pred,average='macro'))

    # Micro
    precision_list_NB.append(metrics.precision_score(y_test,y_pred,average='micro',zero_division=0))
    recall_list_NB.append(metrics.recall_score(y_test,y_pred,average='micro'))


For test size:  0.8
For test size:  0.6
For test size:  0.4
For test size:  0.2


In [15]:
accuracy_list_NB = ["%.2f" % elem for elem in accuracy_list_NB]
precision_list_NB = ["%.2f" % elem for elem in precision_list_NB]
recall_list_NB = ["%.2f" % elem for elem in recall_list_NB]

print(accuracy_list_NB)
print(precision_list_NB)
print(recall_list_NB)
print(conf_mat_NB)

['0.64', '0.64', '0.64', '0.66']
['0.65', '0.64', '0.65', '0.64', '0.66', '0.64', '0.69', '0.66']
['0.65', '0.64', '0.65', '0.64', '0.65', '0.64', '0.67', '0.66']
[array([[2327,  425,  653,  147],
       [ 510, 2068,  476,  335],
       [ 899,  746, 2338,  131],
       [  79,  511,  123, 2313]], dtype=int64), array([[1994,  178,  297,  195],
       [ 540, 1300,  252,  450],
       [1027,  296, 1556,  207],
       [  65,  235,   66, 1903]], dtype=int64), array([[1421,   84,  126,  145],
       [ 372,  775,  123,  425],
       [ 737,  137,  980,  203],
       [  35,  131,   26, 1321]], dtype=int64), array([[732,  34,  55,  67],
       [171, 379,  60, 237],
       [336,  60, 516, 117],
       [  7,  52,   5, 693]], dtype=int64)]


### Model 1 : ICA - NB with Label Counts

In [17]:
##Bootstrapping
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_ICA_Labels = []
precision_list_ICA_Labels = []
recall_list_ICA_Labels = []
conf_mat_ICA_Labels = []

feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test,idx_train,idx_test = train_test_split(feature_matrix, class_labels, indices,test_size=k, random_state=0, stratify=class_labels)
    clf = GaussianNB() # Bootstrapping using Naive Bayes as Base Classifier
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test) # y-pred is bootstrapped labels
    iter_labels = class_labels
    np.put(iter_labels,idx_test,y_pred)  # updating labels of test data with the predecited labels 
    iter_labels_list = list(iter_labels)
    distinct_labels = sorted(list(set(iter_labels_list)))

    # ICA starting

    adj_feats_train = np.zeros((len(X_train),len(distinct_labels)))
    adj_feats_test  = np.zeros((len(X_test),len(distinct_labels)))

    ########################################################################
    ##constructing additional features, train and Iterate until stabilized##

    #updating the adjacent features for training nodes
    for i in range(len(idx_train)):
        adjs = list(g[idx_train[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_train[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1 # Count for single node
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_train[i]] and g[idx_train[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_train[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei)
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]

        for j in range(len(distinct_labels)):
            this_label = distinct_labels[j]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_train[i][this_label] = cnt_of_adjacent_labels
        
          
    X_train_updated = np.concatenate((X_train,adj_feats_train),axis = 1)

    #updating the adjancent features for test nodes
    for i in range(len(idx_test)):
        adjs = list(g[idx_test[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_test[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_test[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]

        for j in range(len(distinct_labels)):
            this_label = distinct_labels[j]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_test[i][this_label] = cnt_of_adjacent_labels

    X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)  

    #learning the new model on updated feature matrix with adjacent labels
    clf_updated = GaussianNB()
    clf_updated.fit(X_train_updated,y_train)
    #print("\nStarting ICA Loop: ...\n")
    #staring the ICA inference loop
    loop_var = 0
    iter_var = 0
    y_pred_current = y_pred
    while (loop_var == 0 and iter_var < 15):
            y_pred_updated = clf_updated.predict(X_test_updated)
            if(np.array_equal(y_pred_current, y_pred_updated)):
                #algorithm stabilized
                #print("ICA Stabilized")
                loop_var = 1        
            else:
                loop_var = 0
                iter_var += 1
                #print("ICA Loop: "+str(iter_var))
                
                #updating the labels for test nodes with new predictions
                for i in range(len(idx_test)):
                    iter_labels_list[idx_test[i]] = y_pred_updated[i]
                
                #updating the adjacent features for test nodes
                for i in range(len(idx_test)):
                    adjs = list(g[idx_test[i]])
                    adjs_threshold = []
                    cnt_of_ew1 = 0
                    for nei in adjs:
                        if g[idx_test[i]][nei]['edge_weight'] == 1:
                            cnt_of_ew1 += 1
                    for nei in adjs:
                        if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                            adjs_threshold.append(nei)
                        if g[idx_test[i]][nei]['edge_weight']>1:
                            adjs_threshold.append(nei) 
                    adjs_threshold = [int(nei) for nei in adjs_threshold]
                    labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]

                    for j in range(len(distinct_labels)):
                        this_label = distinct_labels[j]
                        cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
                        adj_feats_test[i][this_label] = cnt_of_adjacent_labels
                                        
                X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)
                y_pred_current = y_pred_updated   
    
    #print('No. of iterations ICA ran: ',iter_var)            
    final_predictions = y_pred_updated

    #print('\nMetrics for Iterative Classification Algorithm for train size {:.1f}:\n'.format(1-k))
    # ICA
    # Macro
    confusion_mat = confusion_matrix(y_test,y_pred_updated)
    conf_mat_ICA_Labels.append(confusion_mat)

    accuracy_list_ICA_Labels.append(metrics.accuracy_score(y_test,y_pred_updated))
    precision_list_ICA_Labels.append(metrics.precision_score(y_test,y_pred_updated,average='macro',zero_division=0))
    recall_list_ICA_Labels.append(metrics.recall_score(y_test,y_pred_updated,average='macro'))

    # Micro
    precision_list_ICA_Labels.append(metrics.precision_score(y_test,y_pred_updated,average='micro',zero_division=0))
    recall_list_ICA_Labels.append(metrics.recall_score(y_test,y_pred_updated,average='micro'))


For test size:  0.8
For test size:  0.6
For test size:  0.4
For test size:  0.2


In [18]:
accuracy_list_ICA_Labels = ["%.2f" % elem for elem in accuracy_list_ICA_Labels]
precision_list_ICA_Labels = ["%.2f" % elem for elem in precision_list_ICA_Labels]
recall_list_ICA_Labels = ["%.2f" % elem for elem in recall_list_ICA_Labels]

print(accuracy_list_ICA_Labels)
print(precision_list_ICA_Labels)
print(recall_list_ICA_Labels)
print(conf_mat_ICA_Labels)

['0.69', '0.79', '0.83', '0.85']
['0.70', '0.69', '0.79', '0.79', '0.83', '0.83', '0.85', '0.85']
['0.69', '0.69', '0.79', '0.79', '0.83', '0.83', '0.84', '0.85']
[array([[2377,  350,  737,   88],
       [ 398, 2167,  550,  274],
       [ 690,  556, 2778,   90],
       [  57,  484,  115, 2370]], dtype=int64), array([[2324,  186,  266,   46],
       [ 208, 1985,  304,  261],
       [ 395,  232, 2061,   84],
       [  85,  143,   33, 1948]], dtype=int64), array([[1798,  125,  128,   36],
       [ 109, 1281,  140,  136],
       [ 187,  123, 1314,   38],
       [  63,   95,   30, 1438]], dtype=int64), array([[975,  72,  61,  10],
       [ 45, 607,  53,  70],
       [ 74,  40, 617,  23],
       [ 42,  37,   8, 787]], dtype=int64)]


### Model 2 : ICA - NB with Sum of Edge Weights 

In [19]:
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_ICA_EW= []
precision_list_ICA_EW = []
recall_list_ICA_EW = []
conf_mat_ICA_EW = []

feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test,idx_train,idx_test = train_test_split(feature_matrix, class_labels, indices,test_size=k, random_state=0, stratify=class_labels)
    clf = GaussianNB()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    iter_labels = class_labels
    np.put(iter_labels,idx_test,y_pred)  # updating labels of test data with the predecited labels 
    iter_labels_list = list(iter_labels)
    distinct_labels = sorted(list(set(iter_labels_list)))

    # ICA starting

    adj_feats_train = np.zeros((len(X_train),len(distinct_labels)))
    adj_feats_test  = np.zeros((len(X_test),len(distinct_labels)))

    ########################################################################
    ##constructing additional features, train and Iterate until stabilized##

    #updating the adjancent features for training nodes
    for i in range(len(idx_train)):
        adjs = list(g[idx_train[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_train[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_train[i]] and g[idx_train[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_train[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)

        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_train[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_train[i]])
            adj_feats_train[i][this_label] = edge_weight_adj_nodes[this_label]
        
          
    X_train_updated = np.concatenate((X_train,adj_feats_train),axis = 1)

    #updating the adjancent features for test nodes
    for i in range(len(idx_test)):
        adjs = list(g[idx_test[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_test[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_test[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)

        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
            adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]

    X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)  

    #learning the new model on updated feature matrix with adjacent labels
    clf_updated = GaussianNB()
    clf_updated.fit(X_train_updated,y_train)
    #print("\nStarting ICA Loop: ...\n")
    #staring the ICA inference loop
    loop_var = 0
    iter_var = 0
    y_pred_current = y_pred
    while (loop_var == 0 and iter_var < 15):
            y_pred_updated = clf_updated.predict(X_test_updated)
            if(np.array_equal(y_pred_current, y_pred_updated)):
                #algorithm stabilized
                #print("ICA Stabilized")
                loop_var = 1        
            else:
                loop_var = 0
                iter_var += 1
                #print("ICA Loop: "+str(iter_var))
                
                #updating the labels for test nodes with new predictions
                for i in range(len(idx_test)):
                    iter_labels_list[idx_test[i]] = y_pred_updated[i]
                
                #updating the adjancent features for test nodes
                for i in range(len(idx_test)):
                    adjs = list(g[idx_test[i]])
                    adjs_threshold = []
                    cnt_of_ew1 = 0
                    for nei in adjs:
                        if g[idx_test[i]][nei]['edge_weight'] == 1:
                            cnt_of_ew1 += 1
                    for nei in adjs:
                        if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                            adjs_threshold.append(nei)
                        if g[idx_test[i]][nei]['edge_weight']>1:
                            adjs_threshold.append(nei) 
                    adjs_threshold = [int(nei) for nei in adjs_threshold]
                    edge_weight_adj_nodes = [0]*len(distinct_labels)

                    for j in range(len(adjs_threshold)):
                        this_label = iter_labels_list[adjs_threshold[j]]
                        edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
                        adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]     
                                
                X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)
                y_pred_current = y_pred_updated   
    #print('No. of iterations ICA ran: ',iter_var)            
    final_predictions = y_pred_updated

    #print('ICA - Edge Weight Confusion matrix:\n')
    confusion_mat = confusion_matrix(y_test,y_pred_updated)
    conf_mat_ICA_EW.append(confusion_mat)
    #print(confusion_mat)

    #print('\nMetrics for Iterative Classification Algorithm for train size {:.1f}:\n'.format(1-k))
    # ICA
    # Macro
    accuracy_list_ICA_EW.append(metrics.accuracy_score(y_test,y_pred_updated))
    precision_list_ICA_EW.append(metrics.precision_score(y_test,y_pred_updated,average='macro',zero_division=0))
    recall_list_ICA_EW.append(metrics.recall_score(y_test,y_pred_updated,average='macro'))

    # Micro
    precision_list_ICA_EW.append(metrics.precision_score(y_test,y_pred_updated,average='micro',zero_division=0))
    recall_list_ICA_EW.append(metrics.recall_score(y_test,y_pred_updated,average='micro'))


For test size:  0.8
For test size:  0.6
For test size:  0.4
For test size:  0.2


In [20]:
accuracy_list_ICA_EW = ["%.2f" % elem for elem in accuracy_list_ICA_EW]
precision_list_ICA_EW = ["%.2f" % elem for elem in precision_list_ICA_EW]
recall_list_ICA_EW = ["%.2f" % elem for elem in recall_list_ICA_EW]

print(accuracy_list_ICA_EW)
print(precision_list_ICA_EW)
print(recall_list_ICA_EW)
print(conf_mat_ICA_EW)

['0.64', '0.75', '0.81', '0.86']
['0.65', '0.64', '0.75', '0.75', '0.82', '0.81', '0.87', '0.86']
['0.65', '0.64', '0.76', '0.75', '0.81', '0.81', '0.85', '0.86']
[array([[2327,  425,  653,  147],
       [ 510, 2068,  476,  335],
       [ 899,  746, 2338,  131],
       [  79,  511,  123, 2313]], dtype=int64), array([[2341,  149,  206,  126],
       [ 375, 1782,  219,  382],
       [ 566,  232, 1842,  132],
       [  55,  162,   40, 1952]], dtype=int64), array([[1905,   44,   56,   82],
       [ 190, 1156,   83,  237],
       [ 333,  144, 1124,   61],
       [  27,   33,   14, 1552]], dtype=int64), array([[1070,    7,    7,   34],
       [  61,  568,   26,  120],
       [ 119,   52,  548,   35],
       [   7,   11,    3,  853]], dtype=int64)]


### Model 3 : ICA-NB with Combined Features

In [21]:
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_ICA_Combined = []
precision_list_ICA_Combined = []
recall_list_ICA_Combined = []
conf_matrix_ICA_Combined = []

feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test,idx_train,idx_test = train_test_split(feature_matrix, class_labels, indices,test_size=k, random_state=0, stratify=class_labels)
    clf = GaussianNB()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    iter_labels = class_labels
    np.put(iter_labels,idx_test,y_pred)  # updating labels of test data with the predecited labels 
    iter_labels_list = list(iter_labels)
    distinct_labels = sorted(list(set(iter_labels_list)))

    # ICA starting

    adj_feats_train = np.zeros((len(X_train),2*len(distinct_labels))) # Multiply by 2, since we have 8 additional features combined.
    adj_feats_test  = np.zeros((len(X_test),2*len(distinct_labels)))

    ########################################################################
    ##constructing additional features, train and Iterate until stabilized##

    #updating the adjancent features for training nodes
    for i in range(len(idx_train)):       
        adjs = list(g[idx_train[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_train[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_train[i]] and g[idx_train[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_train[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)
        
        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_train[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_train[i]])
            adj_feats_train[i][this_label] = edge_weight_adj_nodes[this_label]
        
        for k in range(len(distinct_labels)):
            this_label = distinct_labels[k]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_train[i][this_label+len(distinct_labels)] = cnt_of_adjacent_labels
          
    X_train_updated = np.concatenate((X_train,adj_feats_train),axis = 1)

    #updating the adjancent features for test nodes
    for i in range(len(idx_test)):       
        adjs = list(g[idx_test[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_test[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_test[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)
        
        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
            adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]
        
        for k in range(len(distinct_labels)):
            this_label = distinct_labels[k]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_test[i][this_label+len(distinct_labels)] = cnt_of_adjacent_labels

    X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)  

    #learning the new model on updated feature matrix with adjacent labels
    clf_updated = GaussianNB()
    clf_updated.fit(X_train_updated,y_train)
    #print("\nStarting ICA Loop: ...\n")
    #staring the ICA inference loop
    loop_var = 0
    iter_var = 0
    y_pred_current = y_pred
    while (loop_var == 0 and iter_var < 15):
            y_pred_updated = clf_updated.predict(X_test_updated)
            if(np.array_equal(y_pred_current, y_pred_updated)):
                #algorithm stabilized
                #print("ICA Stabilized")
                loop_var = 1        
            else:
                loop_var = 0
                iter_var += 1
                #print("ICA Loop: "+str(iter_var))
                
                #updating the labels for test nodes with new predictions
                for i in range(len(idx_test)):
                    iter_labels_list[idx_test[i]] = y_pred_updated[i]
                
                #updating the adjancent features for test nodes
                for i in range(len(idx_test)):       
                    adjs = list(g[idx_test[i]])
                    adjs_threshold = []
                    cnt_of_ew1 = 0
                    for nei in adjs:
                        if g[idx_test[i]][nei]['edge_weight'] == 1:
                            cnt_of_ew1 += 1
                    for nei in adjs:
                        if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                            adjs_threshold.append(nei)
                        if g[idx_test[i]][nei]['edge_weight']>1:
                            adjs_threshold.append(nei) 
                    adjs_threshold = [int(nei) for nei in adjs_threshold]
                    labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]
                    edge_weight_adj_nodes = [0]*len(distinct_labels)
                    
                    for j in range(len(adjs_threshold)):
                        this_label = iter_labels_list[adjs_threshold[j]]
                        edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
                        adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]
                    
                    for k in range(len(distinct_labels)):
                        this_label = distinct_labels[k]
                        cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
                        adj_feats_test[i][this_label+len(distinct_labels)] = cnt_of_adjacent_labels       
                                
                X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)
                y_pred_current = y_pred_updated   
    print('No. of iterations ICA ran: ',iter_var)            
    final_predictions = y_pred_updated

    #print('ICA - Edge Weight Confusion matrix:\n')
    confusion_mat = confusion_matrix(y_test,y_pred_updated)
    conf_matrix_ICA_Combined.append(confusion_mat)
    #print(confusion_mat)

    #print('\nMetrics for Iterative Classification Algorithm for train size {:.1f}:\n'.format(1-k))
    # ICA
    # Macro
    accuracy_list_ICA_Combined.append(metrics.accuracy_score(y_test,y_pred_updated))
    precision_list_ICA_Combined.append(metrics.precision_score(y_test,y_pred_updated,average='macro',zero_division=0))
    recall_list_ICA_Combined.append(metrics.recall_score(y_test,y_pred_updated,average='macro'))

    # Micro
    precision_list_ICA_Combined.append(metrics.precision_score(y_test,y_pred_updated,average='micro',zero_division=0))
    recall_list_ICA_Combined.append(metrics.recall_score(y_test,y_pred_updated,average='micro'))


For test size:  0.8
No. of iterations ICA ran:  1
For test size:  0.6
No. of iterations ICA ran:  1
For test size:  0.4
No. of iterations ICA ran:  1
For test size:  0.2
No. of iterations ICA ran:  1


In [22]:
accuracy_list_ICA_Combined = ["%.2f" % elem for elem in accuracy_list_ICA_Combined]
precision_list_ICA_Combined = ["%.2f" % elem for elem in precision_list_ICA_Combined]
recall_list_ICA_Combined = ["%.2f" % elem for elem in recall_list_ICA_Combined]

print(accuracy_list_ICA_Combined)
print(precision_list_ICA_Combined)
print(recall_list_ICA_Combined)
print(conf_matrix_ICA_Combined)

['0.69', '0.79', '0.83', '0.85']
['0.70', '0.69', '0.79', '0.79', '0.83', '0.83', '0.85', '0.85']
['0.69', '0.69', '0.79', '0.79', '0.83', '0.83', '0.84', '0.85']
[array([[2378,  350,  737,   87],
       [ 397, 2169,  550,  273],
       [ 690,  556, 2778,   90],
       [  56,  484,  115, 2371]], dtype=int64), array([[2324,  186,  266,   46],
       [ 208, 1985,  304,  261],
       [ 395,  232, 2061,   84],
       [  85,  143,   33, 1948]], dtype=int64), array([[1798,  125,  128,   36],
       [ 110, 1282,  138,  136],
       [ 187,  122, 1315,   38],
       [  63,   95,   30, 1438]], dtype=int64), array([[975,  72,  61,  10],
       [ 45, 607,  53,  70],
       [ 74,  40, 617,  23],
       [ 42,  37,   8, 787]], dtype=int64)]
