In [2]:
import sys
import pickle
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.metrics import confusion_matrix

In [3]:
#loading the feature matrix and labels 
feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

In [4]:
graph_df = pd.read_csv('D:/NLP Project/ICA/edge_list.csv')
#loading the edgelist and constructing the url graph
g = nx.from_pandas_edgelist(graph_df,'url1','url2',edge_attr='edge_weight')

In [5]:
nodes = list(g.nodes)
len(nodes)

17601

In [6]:
iter_labels = class_labels #making a copy of class labels for updations during iteration
# Contains true labels of all documents.

In [7]:
np.unique(class_labels,return_counts=True)

(array([0, 1, 2, 3]), array([4440, 4236, 5143, 3782], dtype=int64))

## Random State - 1

### Model 1 : Naive Bayes

In [8]:
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_NB = []
precision_list_NB = []
recall_list_NB = []
conf_mat_NB = []

#loading the feature matrix and labels 
feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test = train_test_split(feature_matrix, class_labels,test_size=k, random_state=1, stratify=class_labels)
    clf = GaussianNB() # Bootstrapping using Naive Bayes as Base Classifier
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    confusion_mat = confusion_matrix(y_test,y_pred)

    conf_mat_NB.append(confusion_mat)
    accuracy_list_NB.append(metrics.accuracy_score(y_test,y_pred))
    precision_list_NB.append(metrics.precision_score(y_test,y_pred,average='macro',zero_division=0))
    recall_list_NB.append(metrics.recall_score(y_test,y_pred,average='macro'))

    # Micro
    precision_list_NB.append(metrics.precision_score(y_test,y_pred,average='micro',zero_division=0))
    recall_list_NB.append(metrics.recall_score(y_test,y_pred,average='micro'))


For test size:  0.8
For test size:  0.6
For test size:  0.4
For test size:  0.2


In [9]:
accuracy_list_NB = ["%.2f" % elem for elem in accuracy_list_NB]
precision_list_NB = ["%.2f" % elem for elem in precision_list_NB]
recall_list_NB = ["%.2f" % elem for elem in recall_list_NB]

print(accuracy_list_NB)
print(precision_list_NB)
print(recall_list_NB)
print(conf_mat_NB)

['0.65', '0.63', '0.64', '0.65']
['0.66', '0.65', '0.65', '0.63', '0.66', '0.64', '0.68', '0.65']
['0.66', '0.65', '0.64', '0.63', '0.65', '0.64', '0.66', '0.65']
[array([[2355,  408,  631,  158],
       [ 514, 2071,  443,  361],
       [ 997,  641, 2332,  144],
       [  73,  391,  128, 2434]], dtype=int64), array([[2050,  174,  260,  180],
       [ 564, 1325,  230,  423],
       [1155,  298, 1416,  217],
       [  66,  255,   70, 1878]], dtype=int64), array([[1416,   75,  131,  154],
       [ 388,  800,  135,  372],
       [ 779,  110,  950,  218],
       [  29,  145,   24, 1315]], dtype=int64), array([[733,  28,  46,  81],
       [164, 390,  68, 225],
       [369,  51, 484, 125],
       [  9,  62,   9, 677]], dtype=int64)]


### Model 2 : ICA - NB with Label Counts

In [10]:
##Bootstrapping
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_ICA_Labels = []
precision_list_ICA_Labels = []
recall_list_ICA_Labels = []
conf_mat_ICA_Labels = []

feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test,idx_train,idx_test = train_test_split(feature_matrix, class_labels, indices,test_size=k, random_state=1, stratify=class_labels)
    clf = GaussianNB() # Bootstrapping using Naive Bayes as Base Classifier
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test) # y-pred is bootstrapped labels
    iter_labels = class_labels
    np.put(iter_labels,idx_test,y_pred)  # updating labels of test data with the predecited labels 
    iter_labels_list = list(iter_labels)
    distinct_labels = sorted(list(set(iter_labels_list)))

    # ICA starting

    adj_feats_train = np.zeros((len(X_train),len(distinct_labels)))
    adj_feats_test  = np.zeros((len(X_test),len(distinct_labels)))

    ########################################################################
    ##constructing additional features, train and Iterate until stabilized##

    #updating the adjacent features for training nodes
    for i in range(len(idx_train)):
        adjs = list(g[idx_train[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_train[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1 # Count for single node
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_train[i]] and g[idx_train[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_train[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei)
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]

        for j in range(len(distinct_labels)):
            this_label = distinct_labels[j]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_train[i][this_label] = cnt_of_adjacent_labels
        
          
    X_train_updated = np.concatenate((X_train,adj_feats_train),axis = 1)

    #updating the adjancent features for test nodes
    for i in range(len(idx_test)):
        adjs = list(g[idx_test[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_test[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_test[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]

        for j in range(len(distinct_labels)):
            this_label = distinct_labels[j]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_test[i][this_label] = cnt_of_adjacent_labels

    X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)  

    #learning the new model on updated feature matrix with adjacent labels
    clf_updated = GaussianNB()
    clf_updated.fit(X_train_updated,y_train)
    #print("\nStarting ICA Loop: ...\n")
    #staring the ICA inference loop
    loop_var = 0
    iter_var = 0
    y_pred_current = y_pred
    while (loop_var == 0 and iter_var < 15):
            y_pred_updated = clf_updated.predict(X_test_updated)
            if(np.array_equal(y_pred_current, y_pred_updated)):
                #algorithm stabilized
                #print("ICA Stabilized")
                loop_var = 1        
            else:
                loop_var = 0
                iter_var += 1
                #print("ICA Loop: "+str(iter_var))
                
                #updating the labels for test nodes with new predictions
                for i in range(len(idx_test)):
                    iter_labels_list[idx_test[i]] = y_pred_updated[i]
                
                #updating the adjacent features for test nodes
                for i in range(len(idx_test)):
                    adjs = list(g[idx_test[i]])
                    adjs_threshold = []
                    cnt_of_ew1 = 0
                    for nei in adjs:
                        if g[idx_test[i]][nei]['edge_weight'] == 1:
                            cnt_of_ew1 += 1
                    for nei in adjs:
                        if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                            adjs_threshold.append(nei)
                        if g[idx_test[i]][nei]['edge_weight']>1:
                            adjs_threshold.append(nei) 
                    adjs_threshold = [int(nei) for nei in adjs_threshold]
                    labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]

                    for j in range(len(distinct_labels)):
                        this_label = distinct_labels[j]
                        cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
                        adj_feats_test[i][this_label] = cnt_of_adjacent_labels
                                        
                X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)
                y_pred_current = y_pred_updated   
    
    #print('No. of iterations ICA ran: ',iter_var)            
    final_predictions = y_pred_updated

    #print('\nMetrics for Iterative Classification Algorithm for train size {:.1f}:\n'.format(1-k))
    # ICA
    # Macro
    confusion_mat = confusion_matrix(y_test,y_pred_updated)
    conf_mat_ICA_Labels.append(confusion_mat)

    accuracy_list_ICA_Labels.append(metrics.accuracy_score(y_test,y_pred_updated))
    precision_list_ICA_Labels.append(metrics.precision_score(y_test,y_pred_updated,average='macro',zero_division=0))
    recall_list_ICA_Labels.append(metrics.recall_score(y_test,y_pred_updated,average='macro'))

    # Micro
    precision_list_ICA_Labels.append(metrics.precision_score(y_test,y_pred_updated,average='micro',zero_division=0))
    recall_list_ICA_Labels.append(metrics.recall_score(y_test,y_pred_updated,average='micro'))


For test size:  0.8
For test size:  0.6
For test size:  0.4
For test size:  0.2


In [11]:
accuracy_list_ICA_Labels = ["%.2f" % elem for elem in accuracy_list_ICA_Labels]
precision_list_ICA_Labels = ["%.2f" % elem for elem in precision_list_ICA_Labels]
recall_list_ICA_Labels = ["%.2f" % elem for elem in recall_list_ICA_Labels]

print(accuracy_list_ICA_Labels)
print(precision_list_ICA_Labels)
print(recall_list_ICA_Labels)
print(conf_mat_ICA_Labels)

['0.70', '0.79', '0.83', '0.87']
['0.70', '0.70', '0.80', '0.79', '0.83', '0.83', '0.86', '0.87']
['0.70', '0.70', '0.80', '0.79', '0.82', '0.83', '0.86', '0.87']
[array([[2409,  367,  696,   80],
       [ 375, 2207,  517,  290],
       [ 731,  549, 2731,  103],
       [  58,  372,  117, 2479]], dtype=int64), array([[2373,  203,  268,   52],
       [ 194, 1968,  243,  210],
       [ 383,  226, 2016,  113],
       [  72,  169,   38, 2033]], dtype=int64), array([[1856,  143,  146,   32],
       [  90, 1258,  105,  121],
       [ 202,  114, 1206,   61],
       [  73,  110,   13, 1511]], dtype=int64), array([[1034,   59,   55,    9],
       [  33,  606,   35,   67],
       [  68,   50,  577,   22],
       [  30,   32,   14,  830]], dtype=int64)]


### Model 3 : ICA - NB with Sum of Edge Weights 

In [12]:
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_ICA_EW= []
precision_list_ICA_EW = []
recall_list_ICA_EW = []
conf_mat_ICA_EW = []

feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test,idx_train,idx_test = train_test_split(feature_matrix, class_labels, indices,test_size=k, random_state=1, stratify=class_labels)
    clf = GaussianNB()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    iter_labels = class_labels
    np.put(iter_labels,idx_test,y_pred)  # updating labels of test data with the predecited labels 
    iter_labels_list = list(iter_labels)
    distinct_labels = sorted(list(set(iter_labels_list)))

    # ICA starting

    adj_feats_train = np.zeros((len(X_train),len(distinct_labels)))
    adj_feats_test  = np.zeros((len(X_test),len(distinct_labels)))

    ########################################################################
    ##constructing additional features, train and Iterate until stabilized##

    #updating the adjancent features for training nodes
    for i in range(len(idx_train)):
        adjs = list(g[idx_train[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_train[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_train[i]] and g[idx_train[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_train[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)

        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_train[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_train[i]])
            adj_feats_train[i][this_label] = edge_weight_adj_nodes[this_label]
        
          
    X_train_updated = np.concatenate((X_train,adj_feats_train),axis = 1)

    #updating the adjancent features for test nodes
    for i in range(len(idx_test)):
        adjs = list(g[idx_test[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_test[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_test[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)

        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
            adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]

    X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)  

    #learning the new model on updated feature matrix with adjacent labels
    clf_updated = GaussianNB()
    clf_updated.fit(X_train_updated,y_train)
    #print("\nStarting ICA Loop: ...\n")
    #staring the ICA inference loop
    loop_var = 0
    iter_var = 0
    y_pred_current = y_pred
    while (loop_var == 0 and iter_var < 15):
            y_pred_updated = clf_updated.predict(X_test_updated)
            if(np.array_equal(y_pred_current, y_pred_updated)):
                #algorithm stabilized
                #print("ICA Stabilized")
                loop_var = 1        
            else:
                loop_var = 0
                iter_var += 1
                #print("ICA Loop: "+str(iter_var))
                
                #updating the labels for test nodes with new predictions
                for i in range(len(idx_test)):
                    iter_labels_list[idx_test[i]] = y_pred_updated[i]
                
                #updating the adjancent features for test nodes
                for i in range(len(idx_test)):
                    adjs = list(g[idx_test[i]])
                    adjs_threshold = []
                    cnt_of_ew1 = 0
                    for nei in adjs:
                        if g[idx_test[i]][nei]['edge_weight'] == 1:
                            cnt_of_ew1 += 1
                    for nei in adjs:
                        if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                            adjs_threshold.append(nei)
                        if g[idx_test[i]][nei]['edge_weight']>1:
                            adjs_threshold.append(nei) 
                    adjs_threshold = [int(nei) for nei in adjs_threshold]
                    edge_weight_adj_nodes = [0]*len(distinct_labels)

                    for j in range(len(adjs_threshold)):
                        this_label = iter_labels_list[adjs_threshold[j]]
                        edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
                        adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]     
                                
                X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)
                y_pred_current = y_pred_updated   
    #print('No. of iterations ICA ran: ',iter_var)            
    final_predictions = y_pred_updated

    #print('ICA - Edge Weight Confusion matrix:\n')
    confusion_mat = confusion_matrix(y_test,y_pred_updated)
    conf_mat_ICA_EW.append(confusion_mat)
    #print(confusion_mat)

    #print('\nMetrics for Iterative Classification Algorithm for train size {:.1f}:\n'.format(1-k))
    # ICA
    # Macro
    accuracy_list_ICA_EW.append(metrics.accuracy_score(y_test,y_pred_updated))
    precision_list_ICA_EW.append(metrics.precision_score(y_test,y_pred_updated,average='macro',zero_division=0))
    recall_list_ICA_EW.append(metrics.recall_score(y_test,y_pred_updated,average='macro'))

    # Micro
    precision_list_ICA_EW.append(metrics.precision_score(y_test,y_pred_updated,average='micro',zero_division=0))
    recall_list_ICA_EW.append(metrics.recall_score(y_test,y_pred_updated,average='micro'))


For test size:  0.8
For test size:  0.6
For test size:  0.4
For test size:  0.2


In [13]:
accuracy_list_ICA_EW = ["%.2f" % elem for elem in accuracy_list_ICA_EW]
precision_list_ICA_EW = ["%.2f" % elem for elem in precision_list_ICA_EW]
recall_list_ICA_EW = ["%.2f" % elem for elem in recall_list_ICA_EW]

print(accuracy_list_ICA_EW)
print(precision_list_ICA_EW)
print(recall_list_ICA_EW)
print(conf_mat_ICA_EW)

['0.65', '0.74', '0.82', '0.87']
['0.66', '0.65', '0.75', '0.74', '0.82', '0.82', '0.87', '0.87']
['0.66', '0.65', '0.74', '0.74', '0.80', '0.82', '0.85', '0.87']
[array([[2355,  408,  631,  158],
       [ 514, 2071,  443,  361],
       [ 997,  641, 2332,  144],
       [  73,  391,  128, 2434]], dtype=int64), array([[2423,  137,  191,  145],
       [ 385, 1654,  191,  385],
       [ 664,  226, 1693,  155],
       [  40,  174,   57, 2041]], dtype=int64), array([[1994,   34,   62,   87],
       [ 164, 1118,   83,  209],
       [ 332,  124, 1046,   81],
       [  31,   68,   17, 1591]], dtype=int64), array([[1111,    6,   14,   26],
       [  65,  544,   28,  104],
       [ 118,   55,  520,   24],
       [   8,   12,    7,  879]], dtype=int64)]


### Model 4 : ICA-NB with Combined Features

In [14]:
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_ICA_Combined = []
precision_list_ICA_Combined = []
recall_list_ICA_Combined = []
conf_matrix_ICA_Combined = []

feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test,idx_train,idx_test = train_test_split(feature_matrix, class_labels, indices,test_size=k, random_state=1, stratify=class_labels)
    clf = GaussianNB()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    iter_labels = class_labels
    np.put(iter_labels,idx_test,y_pred)  # updating labels of test data with the predecited labels 
    iter_labels_list = list(iter_labels)
    distinct_labels = sorted(list(set(iter_labels_list)))

    # ICA starting

    adj_feats_train = np.zeros((len(X_train),2*len(distinct_labels))) # Multiply by 2, since we have 8 additional features combined.
    adj_feats_test  = np.zeros((len(X_test),2*len(distinct_labels)))

    ########################################################################
    ##constructing additional features, train and Iterate until stabilized##

    #updating the adjancent features for training nodes
    for i in range(len(idx_train)):       
        adjs = list(g[idx_train[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_train[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_train[i]] and g[idx_train[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_train[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)
        
        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_train[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_train[i]])
            adj_feats_train[i][this_label] = edge_weight_adj_nodes[this_label]
        
        for k in range(len(distinct_labels)):
            this_label = distinct_labels[k]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_train[i][this_label+len(distinct_labels)] = cnt_of_adjacent_labels
          
    X_train_updated = np.concatenate((X_train,adj_feats_train),axis = 1)

    #updating the adjancent features for test nodes
    for i in range(len(idx_test)):       
        adjs = list(g[idx_test[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_test[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_test[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)
        
        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
            adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]
        
        for k in range(len(distinct_labels)):
            this_label = distinct_labels[k]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_test[i][this_label+len(distinct_labels)] = cnt_of_adjacent_labels

    X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)  

    #learning the new model on updated feature matrix with adjacent labels
    clf_updated = GaussianNB()
    clf_updated.fit(X_train_updated,y_train)
    #print("\nStarting ICA Loop: ...\n")
    #staring the ICA inference loop
    loop_var = 0
    iter_var = 0
    y_pred_current = y_pred
    while (loop_var == 0 and iter_var < 15):
            y_pred_updated = clf_updated.predict(X_test_updated)
            if(np.array_equal(y_pred_current, y_pred_updated)):
                #algorithm stabilized
                #print("ICA Stabilized")
                loop_var = 1        
            else:
                loop_var = 0
                iter_var += 1
                #print("ICA Loop: "+str(iter_var))
                
                #updating the labels for test nodes with new predictions
                for i in range(len(idx_test)):
                    iter_labels_list[idx_test[i]] = y_pred_updated[i]
                
                #updating the adjancent features for test nodes
                for i in range(len(idx_test)):       
                    adjs = list(g[idx_test[i]])
                    adjs_threshold = []
                    cnt_of_ew1 = 0
                    for nei in adjs:
                        if g[idx_test[i]][nei]['edge_weight'] == 1:
                            cnt_of_ew1 += 1
                    for nei in adjs:
                        if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                            adjs_threshold.append(nei)
                        if g[idx_test[i]][nei]['edge_weight']>1:
                            adjs_threshold.append(nei) 
                    adjs_threshold = [int(nei) for nei in adjs_threshold]
                    labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]
                    edge_weight_adj_nodes = [0]*len(distinct_labels)
                    
                    for j in range(len(adjs_threshold)):
                        this_label = iter_labels_list[adjs_threshold[j]]
                        edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
                        adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]
                    
                    for k in range(len(distinct_labels)):
                        this_label = distinct_labels[k]
                        cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
                        adj_feats_test[i][this_label+len(distinct_labels)] = cnt_of_adjacent_labels       
                                
                X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)
                y_pred_current = y_pred_updated   
    print('No. of iterations ICA ran: ',iter_var)            
    final_predictions = y_pred_updated

    #print('ICA - Edge Weight Confusion matrix:\n')
    confusion_mat = confusion_matrix(y_test,y_pred_updated)
    conf_matrix_ICA_Combined.append(confusion_mat)
    #print(confusion_mat)

    #print('\nMetrics for Iterative Classification Algorithm for train size {:.1f}:\n'.format(1-k))
    # ICA
    # Macro
    accuracy_list_ICA_Combined.append(metrics.accuracy_score(y_test,y_pred_updated))
    precision_list_ICA_Combined.append(metrics.precision_score(y_test,y_pred_updated,average='macro',zero_division=0))
    recall_list_ICA_Combined.append(metrics.recall_score(y_test,y_pred_updated,average='macro'))

    # Micro
    precision_list_ICA_Combined.append(metrics.precision_score(y_test,y_pred_updated,average='micro',zero_division=0))
    recall_list_ICA_Combined.append(metrics.recall_score(y_test,y_pred_updated,average='micro'))


For test size:  0.8
No. of iterations ICA ran:  2
For test size:  0.6
No. of iterations ICA ran:  2
For test size:  0.4
No. of iterations ICA ran:  1
For test size:  0.2
No. of iterations ICA ran:  1


In [15]:
accuracy_list_ICA_Combined = ["%.2f" % elem for elem in accuracy_list_ICA_Combined]
precision_list_ICA_Combined = ["%.2f" % elem for elem in precision_list_ICA_Combined]
recall_list_ICA_Combined = ["%.2f" % elem for elem in recall_list_ICA_Combined]

print(accuracy_list_ICA_Combined)
print(precision_list_ICA_Combined)
print(recall_list_ICA_Combined)
print(conf_matrix_ICA_Combined)

['0.70', '0.79', '0.83', '0.87']
['0.70', '0.70', '0.80', '0.79', '0.83', '0.83', '0.86', '0.87']
['0.70', '0.70', '0.80', '0.79', '0.82', '0.83', '0.86', '0.87']
[array([[2408,  367,  697,   80],
       [ 375, 2208,  517,  289],
       [ 729,  549, 2733,  103],
       [  58,  372,  116, 2480]], dtype=int64), array([[2372,  204,  268,   52],
       [ 194, 1968,  243,  210],
       [ 384,  225, 2016,  113],
       [  72,  169,   38, 2033]], dtype=int64), array([[1856,  143,  146,   32],
       [  90, 1258,  105,  121],
       [ 202,  114, 1206,   61],
       [  73,  110,   13, 1511]], dtype=int64), array([[1035,   59,   54,    9],
       [  33,  606,   35,   67],
       [  68,   50,  577,   22],
       [  30,   32,   14,  830]], dtype=int64)]


## Random State - 2

### Model 1 : Naive Bayes

In [16]:
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_NB = []
precision_list_NB = []
recall_list_NB = []
conf_mat_NB = []

#loading the feature matrix and labels 
feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test = train_test_split(feature_matrix, class_labels,test_size=k, random_state=2, stratify=class_labels)
    clf = GaussianNB() # Bootstrapping using Naive Bayes as Base Classifier
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    confusion_mat = confusion_matrix(y_test,y_pred)

    conf_mat_NB.append(confusion_mat)
    accuracy_list_NB.append(metrics.accuracy_score(y_test,y_pred))
    precision_list_NB.append(metrics.precision_score(y_test,y_pred,average='macro',zero_division=0))
    recall_list_NB.append(metrics.recall_score(y_test,y_pred,average='macro'))

    # Micro
    precision_list_NB.append(metrics.precision_score(y_test,y_pred,average='micro',zero_division=0))
    recall_list_NB.append(metrics.recall_score(y_test,y_pred,average='micro'))


For test size:  0.8
For test size:  0.6
For test size:  0.4
For test size:  0.2


In [17]:
accuracy_list_NB = ["%.2f" % elem for elem in accuracy_list_NB]
precision_list_NB = ["%.2f" % elem for elem in precision_list_NB]
recall_list_NB = ["%.2f" % elem for elem in recall_list_NB]

print(accuracy_list_NB)
print(precision_list_NB)
print(recall_list_NB)
print(conf_mat_NB)

['0.66', '0.64', '0.65', '0.65']
['0.66', '0.66', '0.65', '0.64', '0.68', '0.65', '0.68', '0.65']
['0.66', '0.66', '0.65', '0.64', '0.67', '0.65', '0.67', '0.65']
[array([[2390,  427,  588,  147],
       [ 524, 2020,  457,  388],
       [ 942,  604, 2403,  165],
       [  61,  422,  127, 2416]], dtype=int64), array([[2012,  182,  310,  160],
       [ 536, 1280,  264,  462],
       [1040,  236, 1594,  216],
       [  51,  246,   71, 1901]], dtype=int64), array([[1442,   82,  114,  138],
       [ 353,  795,  135,  412],
       [ 751,  131, 1000,  175],
       [  22,  110,   25, 1356]], dtype=int64), array([[720,  32,  65,  71],
       [173, 402,  61, 211],
       [387,  65, 478,  99],
       [  8,  31,  14, 704]], dtype=int64)]


### Model 2 : ICA-NB with Labels Counts

In [18]:
##Bootstrapping
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_ICA_Labels = []
precision_list_ICA_Labels = []
recall_list_ICA_Labels = []
conf_mat_ICA_Labels = []

feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test,idx_train,idx_test = train_test_split(feature_matrix, class_labels, indices,test_size=k, random_state=2, stratify=class_labels)
    clf = GaussianNB() # Bootstrapping using Naive Bayes as Base Classifier
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test) # y-pred is bootstrapped labels
    iter_labels = class_labels
    np.put(iter_labels,idx_test,y_pred)  # updating labels of test data with the predecited labels 
    iter_labels_list = list(iter_labels)
    distinct_labels = sorted(list(set(iter_labels_list)))

    # ICA starting

    adj_feats_train = np.zeros((len(X_train),len(distinct_labels)))
    adj_feats_test  = np.zeros((len(X_test),len(distinct_labels)))

    ########################################################################
    ##constructing additional features, train and Iterate until stabilized##

    #updating the adjacent features for training nodes
    for i in range(len(idx_train)):
        adjs = list(g[idx_train[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_train[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1 # Count for single node
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_train[i]] and g[idx_train[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_train[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei)
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]

        for j in range(len(distinct_labels)):
            this_label = distinct_labels[j]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_train[i][this_label] = cnt_of_adjacent_labels
        
          
    X_train_updated = np.concatenate((X_train,adj_feats_train),axis = 1)

    #updating the adjancent features for test nodes
    for i in range(len(idx_test)):
        adjs = list(g[idx_test[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_test[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_test[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]

        for j in range(len(distinct_labels)):
            this_label = distinct_labels[j]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_test[i][this_label] = cnt_of_adjacent_labels

    X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)  

    #learning the new model on updated feature matrix with adjacent labels
    clf_updated = GaussianNB()
    clf_updated.fit(X_train_updated,y_train)
    #print("\nStarting ICA Loop: ...\n")
    #staring the ICA inference loop
    loop_var = 0
    iter_var = 0
    y_pred_current = y_pred
    while (loop_var == 0 and iter_var < 15):
            y_pred_updated = clf_updated.predict(X_test_updated)
            if(np.array_equal(y_pred_current, y_pred_updated)):
                #algorithm stabilized
                #print("ICA Stabilized")
                loop_var = 1        
            else:
                loop_var = 0
                iter_var += 1
                #print("ICA Loop: "+str(iter_var))
                
                #updating the labels for test nodes with new predictions
                for i in range(len(idx_test)):
                    iter_labels_list[idx_test[i]] = y_pred_updated[i]
                
                #updating the adjacent features for test nodes
                for i in range(len(idx_test)):
                    adjs = list(g[idx_test[i]])
                    adjs_threshold = []
                    cnt_of_ew1 = 0
                    for nei in adjs:
                        if g[idx_test[i]][nei]['edge_weight'] == 1:
                            cnt_of_ew1 += 1
                    for nei in adjs:
                        if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                            adjs_threshold.append(nei)
                        if g[idx_test[i]][nei]['edge_weight']>1:
                            adjs_threshold.append(nei) 
                    adjs_threshold = [int(nei) for nei in adjs_threshold]
                    labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]

                    for j in range(len(distinct_labels)):
                        this_label = distinct_labels[j]
                        cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
                        adj_feats_test[i][this_label] = cnt_of_adjacent_labels
                                        
                X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)
                y_pred_current = y_pred_updated   
    
    #print('No. of iterations ICA ran: ',iter_var)            
    final_predictions = y_pred_updated

    #print('\nMetrics for Iterative Classification Algorithm for train size {:.1f}:\n'.format(1-k))
    # ICA
    # Macro
    confusion_mat = confusion_matrix(y_test,y_pred_updated)
    conf_mat_ICA_Labels.append(confusion_mat)

    accuracy_list_ICA_Labels.append(metrics.accuracy_score(y_test,y_pred_updated))
    precision_list_ICA_Labels.append(metrics.precision_score(y_test,y_pred_updated,average='macro',zero_division=0))
    recall_list_ICA_Labels.append(metrics.recall_score(y_test,y_pred_updated,average='macro'))

    # Micro
    precision_list_ICA_Labels.append(metrics.precision_score(y_test,y_pred_updated,average='micro',zero_division=0))
    recall_list_ICA_Labels.append(metrics.recall_score(y_test,y_pred_updated,average='micro'))


For test size:  0.8
For test size:  0.6
For test size:  0.4
For test size:  0.2


In [19]:
accuracy_list_ICA_Labels = ["%.2f" % elem for elem in accuracy_list_ICA_Labels]
precision_list_ICA_Labels = ["%.2f" % elem for elem in precision_list_ICA_Labels]
recall_list_ICA_Labels = ["%.2f" % elem for elem in recall_list_ICA_Labels]

print(accuracy_list_ICA_Labels)
print(precision_list_ICA_Labels)
print(recall_list_ICA_Labels)
print(conf_mat_ICA_Labels)

['0.70', '0.78', '0.82', '0.85']
['0.71', '0.70', '0.79', '0.78', '0.82', '0.82', '0.84', '0.85']
['0.71', '0.70', '0.79', '0.78', '0.82', '0.82', '0.84', '0.85']
[array([[2427,  366,  698,   61],
       [ 378, 2182,  525,  304],
       [ 685,  495, 2823,  111],
       [  44,  423,  116, 2443]], dtype=int64), array([[2354,  205,  281,   43],
       [ 226, 1889,  261,  216],
       [ 368,  249, 2060,   86],
       [ 102,  185,   55, 1981]], dtype=int64), array([[1861,  121,  138,   31],
       [ 112, 1196,  126,  140],
       [ 201,  122, 1251,   56],
       [  83,   90,   24, 1489]], dtype=int64), array([[991,  71,  73,  12],
       [ 40, 586,  34,  67],
       [ 83,  38, 599,  30],
       [ 49,  39,   7, 802]], dtype=int64)]


### Model 3 : ICA-NB with Edge Weight

In [20]:
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_ICA_EW= []
precision_list_ICA_EW = []
recall_list_ICA_EW = []
conf_mat_ICA_EW = []

feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test,idx_train,idx_test = train_test_split(feature_matrix, class_labels, indices,test_size=k, random_state=2, stratify=class_labels)
    clf = GaussianNB()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    iter_labels = class_labels
    np.put(iter_labels,idx_test,y_pred)  # updating labels of test data with the predecited labels 
    iter_labels_list = list(iter_labels)
    distinct_labels = sorted(list(set(iter_labels_list)))

    # ICA starting

    adj_feats_train = np.zeros((len(X_train),len(distinct_labels)))
    adj_feats_test  = np.zeros((len(X_test),len(distinct_labels)))

    ########################################################################
    ##constructing additional features, train and Iterate until stabilized##

    #updating the adjancent features for training nodes
    for i in range(len(idx_train)):
        adjs = list(g[idx_train[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_train[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_train[i]] and g[idx_train[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_train[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)

        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_train[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_train[i]])
            adj_feats_train[i][this_label] = edge_weight_adj_nodes[this_label]
        
          
    X_train_updated = np.concatenate((X_train,adj_feats_train),axis = 1)

    #updating the adjancent features for test nodes
    for i in range(len(idx_test)):
        adjs = list(g[idx_test[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_test[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_test[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)

        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
            adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]

    X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)  

    #learning the new model on updated feature matrix with adjacent labels
    clf_updated = GaussianNB()
    clf_updated.fit(X_train_updated,y_train)
    #print("\nStarting ICA Loop: ...\n")
    #staring the ICA inference loop
    loop_var = 0
    iter_var = 0
    y_pred_current = y_pred
    while (loop_var == 0 and iter_var < 15):
            y_pred_updated = clf_updated.predict(X_test_updated)
            if(np.array_equal(y_pred_current, y_pred_updated)):
                #algorithm stabilized
                #print("ICA Stabilized")
                loop_var = 1        
            else:
                loop_var = 0
                iter_var += 1
                #print("ICA Loop: "+str(iter_var))
                
                #updating the labels for test nodes with new predictions
                for i in range(len(idx_test)):
                    iter_labels_list[idx_test[i]] = y_pred_updated[i]
                
                #updating the adjancent features for test nodes
                for i in range(len(idx_test)):
                    adjs = list(g[idx_test[i]])
                    adjs_threshold = []
                    cnt_of_ew1 = 0
                    for nei in adjs:
                        if g[idx_test[i]][nei]['edge_weight'] == 1:
                            cnt_of_ew1 += 1
                    for nei in adjs:
                        if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                            adjs_threshold.append(nei)
                        if g[idx_test[i]][nei]['edge_weight']>1:
                            adjs_threshold.append(nei) 
                    adjs_threshold = [int(nei) for nei in adjs_threshold]
                    edge_weight_adj_nodes = [0]*len(distinct_labels)

                    for j in range(len(adjs_threshold)):
                        this_label = iter_labels_list[adjs_threshold[j]]
                        edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
                        adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]     
                                
                X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)
                y_pred_current = y_pred_updated   
    #print('No. of iterations ICA ran: ',iter_var)            
    final_predictions = y_pred_updated

    #print('ICA - Edge Weight Confusion matrix:\n')
    confusion_mat = confusion_matrix(y_test,y_pred_updated)
    conf_mat_ICA_EW.append(confusion_mat)
    #print(confusion_mat)

    #print('\nMetrics for Iterative Classification Algorithm for train size {:.1f}:\n'.format(1-k))
    # ICA
    # Macro
    accuracy_list_ICA_EW.append(metrics.accuracy_score(y_test,y_pred_updated))
    precision_list_ICA_EW.append(metrics.precision_score(y_test,y_pred_updated,average='macro',zero_division=0))
    recall_list_ICA_EW.append(metrics.recall_score(y_test,y_pred_updated,average='macro'))

    # Micro
    precision_list_ICA_EW.append(metrics.precision_score(y_test,y_pred_updated,average='micro',zero_division=0))
    recall_list_ICA_EW.append(metrics.recall_score(y_test,y_pred_updated,average='micro'))


For test size:  0.8
For test size:  0.6
For test size:  0.4
For test size:  0.2


In [21]:
accuracy_list_ICA_EW = ["%.2f" % elem for elem in accuracy_list_ICA_EW]
precision_list_ICA_EW = ["%.2f" % elem for elem in precision_list_ICA_EW]
recall_list_ICA_EW = ["%.2f" % elem for elem in recall_list_ICA_EW]

print(accuracy_list_ICA_EW)
print(precision_list_ICA_EW)
print(recall_list_ICA_EW)
print(conf_mat_ICA_EW)

['0.66', '0.74', '0.82', '0.86']
['0.66', '0.66', '0.74', '0.74', '0.83', '0.82', '0.87', '0.86']
['0.66', '0.66', '0.74', '0.74', '0.81', '0.82', '0.84', '0.86']
[array([[2390,  427,  588,  147],
       [ 524, 2020,  457,  388],
       [ 942,  604, 2403,  165],
       [  61,  422,  127, 2416]], dtype=int64), array([[2406,  153,  196,  128],
       [ 390, 1598,  217,  387],
       [ 602,  278, 1748,  135],
       [  56,  178,   72, 2017]], dtype=int64), array([[1980,   39,   56,   76],
       [ 190, 1069,  105,  210],
       [ 313,  113, 1126,   78],
       [  27,   53,   16, 1590]], dtype=int64), array([[1093,   13,   13,   28],
       [  64,  548,   17,   98],
       [ 123,   59,  525,   43],
       [  18,    6,    2,  871]], dtype=int64)]


### Model 4 : ICA-NB with Combined features

In [22]:
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_ICA_Combined = []
precision_list_ICA_Combined = []
recall_list_ICA_Combined = []
conf_matrix_ICA_Combined = []

feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test,idx_train,idx_test = train_test_split(feature_matrix, class_labels, indices,test_size=k, random_state=2, stratify=class_labels)
    clf = GaussianNB()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    iter_labels = class_labels
    np.put(iter_labels,idx_test,y_pred)  # updating labels of test data with the predecited labels 
    iter_labels_list = list(iter_labels)
    distinct_labels = sorted(list(set(iter_labels_list)))

    # ICA starting

    adj_feats_train = np.zeros((len(X_train),2*len(distinct_labels))) # Multiply by 2, since we have 8 additional features combined.
    adj_feats_test  = np.zeros((len(X_test),2*len(distinct_labels)))

    ########################################################################
    ##constructing additional features, train and Iterate until stabilized##

    #updating the adjancent features for training nodes
    for i in range(len(idx_train)):       
        adjs = list(g[idx_train[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_train[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_train[i]] and g[idx_train[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_train[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)
        
        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_train[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_train[i]])
            adj_feats_train[i][this_label] = edge_weight_adj_nodes[this_label]
        
        for k in range(len(distinct_labels)):
            this_label = distinct_labels[k]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_train[i][this_label+len(distinct_labels)] = cnt_of_adjacent_labels
          
    X_train_updated = np.concatenate((X_train,adj_feats_train),axis = 1)

    #updating the adjancent features for test nodes
    for i in range(len(idx_test)):       
        adjs = list(g[idx_test[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_test[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_test[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)
        
        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
            adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]
        
        for k in range(len(distinct_labels)):
            this_label = distinct_labels[k]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_test[i][this_label+len(distinct_labels)] = cnt_of_adjacent_labels

    X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)  

    #learning the new model on updated feature matrix with adjacent labels
    clf_updated = GaussianNB()
    clf_updated.fit(X_train_updated,y_train)
    #print("\nStarting ICA Loop: ...\n")
    #staring the ICA inference loop
    loop_var = 0
    iter_var = 0
    y_pred_current = y_pred
    while (loop_var == 0 and iter_var < 15):
            y_pred_updated = clf_updated.predict(X_test_updated)
            if(np.array_equal(y_pred_current, y_pred_updated)):
                #algorithm stabilized
                #print("ICA Stabilized")
                loop_var = 1        
            else:
                loop_var = 0
                iter_var += 1
                #print("ICA Loop: "+str(iter_var))
                
                #updating the labels for test nodes with new predictions
                for i in range(len(idx_test)):
                    iter_labels_list[idx_test[i]] = y_pred_updated[i]
                
                #updating the adjancent features for test nodes
                for i in range(len(idx_test)):       
                    adjs = list(g[idx_test[i]])
                    adjs_threshold = []
                    cnt_of_ew1 = 0
                    for nei in adjs:
                        if g[idx_test[i]][nei]['edge_weight'] == 1:
                            cnt_of_ew1 += 1
                    for nei in adjs:
                        if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                            adjs_threshold.append(nei)
                        if g[idx_test[i]][nei]['edge_weight']>1:
                            adjs_threshold.append(nei) 
                    adjs_threshold = [int(nei) for nei in adjs_threshold]
                    labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]
                    edge_weight_adj_nodes = [0]*len(distinct_labels)
                    
                    for j in range(len(adjs_threshold)):
                        this_label = iter_labels_list[adjs_threshold[j]]
                        edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
                        adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]
                    
                    for k in range(len(distinct_labels)):
                        this_label = distinct_labels[k]
                        cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
                        adj_feats_test[i][this_label+len(distinct_labels)] = cnt_of_adjacent_labels       
                                
                X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)
                y_pred_current = y_pred_updated   
    print('No. of iterations ICA ran: ',iter_var)            
    final_predictions = y_pred_updated

    #print('ICA - Edge Weight Confusion matrix:\n')
    confusion_mat = confusion_matrix(y_test,y_pred_updated)
    conf_matrix_ICA_Combined.append(confusion_mat)
    #print(confusion_mat)

    #print('\nMetrics for Iterative Classification Algorithm for train size {:.1f}:\n'.format(1-k))
    # ICA
    # Macro
    accuracy_list_ICA_Combined.append(metrics.accuracy_score(y_test,y_pred_updated))
    precision_list_ICA_Combined.append(metrics.precision_score(y_test,y_pred_updated,average='macro',zero_division=0))
    recall_list_ICA_Combined.append(metrics.recall_score(y_test,y_pred_updated,average='macro'))

    # Micro
    precision_list_ICA_Combined.append(metrics.precision_score(y_test,y_pred_updated,average='micro',zero_division=0))
    recall_list_ICA_Combined.append(metrics.recall_score(y_test,y_pred_updated,average='micro'))


For test size:  0.8
No. of iterations ICA ran:  1
For test size:  0.6
No. of iterations ICA ran:  2
For test size:  0.4
No. of iterations ICA ran:  1
For test size:  0.2
No. of iterations ICA ran:  1


In [23]:
accuracy_list_ICA_Combined = ["%.2f" % elem for elem in accuracy_list_ICA_Combined]
precision_list_ICA_Combined = ["%.2f" % elem for elem in precision_list_ICA_Combined]
recall_list_ICA_Combined = ["%.2f" % elem for elem in recall_list_ICA_Combined]

print(accuracy_list_ICA_Combined)
print(precision_list_ICA_Combined)
print(recall_list_ICA_Combined)
print(conf_matrix_ICA_Combined)

['0.70', '0.78', '0.82', '0.85']
['0.71', '0.70', '0.79', '0.78', '0.82', '0.82', '0.84', '0.85']
['0.71', '0.70', '0.79', '0.78', '0.82', '0.82', '0.84', '0.85']
[array([[2427,  366,  698,   61],
       [ 377, 2183,  525,  304],
       [ 685,  495, 2823,  111],
       [  44,  422,  116, 2444]], dtype=int64), array([[2353,  205,  282,   43],
       [ 227, 1888,  260,  217],
       [ 367,  249, 2061,   86],
       [ 102,  185,   55, 1981]], dtype=int64), array([[1861,  121,  138,   31],
       [ 112, 1198,  124,  140],
       [ 200,  122, 1251,   57],
       [  83,   90,   24, 1489]], dtype=int64), array([[991,  71,  73,  12],
       [ 40, 586,  34,  67],
       [ 83,  38, 599,  30],
       [ 49,  39,   7, 802]], dtype=int64)]


## Random state - 3

### Model 1 : Naive Bayes

In [24]:
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_NB = []
precision_list_NB = []
recall_list_NB = []
conf_mat_NB = []

#loading the feature matrix and labels 
feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test = train_test_split(feature_matrix, class_labels,test_size=k, random_state=3, stratify=class_labels)
    clf = GaussianNB() # Bootstrapping using Naive Bayes as Base Classifier
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    confusion_mat = confusion_matrix(y_test,y_pred)

    conf_mat_NB.append(confusion_mat)
    accuracy_list_NB.append(metrics.accuracy_score(y_test,y_pred))
    precision_list_NB.append(metrics.precision_score(y_test,y_pred,average='macro',zero_division=0))
    recall_list_NB.append(metrics.recall_score(y_test,y_pred,average='macro'))

    # Micro
    precision_list_NB.append(metrics.precision_score(y_test,y_pred,average='micro',zero_division=0))
    recall_list_NB.append(metrics.recall_score(y_test,y_pred,average='micro'))


For test size:  0.8
For test size:  0.6
For test size:  0.4
For test size:  0.2


In [25]:
accuracy_list_NB = ["%.2f" % elem for elem in accuracy_list_NB]
precision_list_NB = ["%.2f" % elem for elem in precision_list_NB]
recall_list_NB = ["%.2f" % elem for elem in recall_list_NB]

print(accuracy_list_NB)
print(precision_list_NB)
print(recall_list_NB)
print(conf_mat_NB)

['0.65', '0.63', '0.63', '0.64']
['0.65', '0.65', '0.65', '0.63', '0.67', '0.63', '0.68', '0.64']
['0.66', '0.65', '0.65', '0.63', '0.65', '0.63', '0.66', '0.64']
[array([[2474,  401,  504,  173],
       [ 502, 1952,  500,  435],
       [1009,  659, 2278,  168],
       [  50,  414,  136, 2426]], dtype=int64), array([[2037,  175,  246,  206],
       [ 532, 1303,  227,  480],
       [1171,  279, 1416,  220],
       [  57,  221,   65, 1926]], dtype=int64), array([[1409,   65,  105,  197],
       [ 360,  795,  116,  424],
       [ 832,  127,  903,  195],
       [  22,   93,   35, 1363]], dtype=int64), array([[724,  26,  44,  94],
       [165, 373,  74, 235],
       [405,  43, 476, 105],
       [  4,  49,  12, 692]], dtype=int64)]


### Model 2 : ICA-NB with Label Counts

In [26]:
##Bootstrapping
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_ICA_Labels = []
precision_list_ICA_Labels = []
recall_list_ICA_Labels = []
conf_mat_ICA_Labels = []

feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test,idx_train,idx_test = train_test_split(feature_matrix, class_labels, indices,test_size=k, random_state=3, stratify=class_labels)
    clf = GaussianNB() # Bootstrapping using Naive Bayes as Base Classifier
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test) # y-pred is bootstrapped labels
    iter_labels = class_labels
    np.put(iter_labels,idx_test,y_pred)  # updating labels of test data with the predecited labels 
    iter_labels_list = list(iter_labels)
    distinct_labels = sorted(list(set(iter_labels_list)))

    # ICA starting

    adj_feats_train = np.zeros((len(X_train),len(distinct_labels)))
    adj_feats_test  = np.zeros((len(X_test),len(distinct_labels)))

    ########################################################################
    ##constructing additional features, train and Iterate until stabilized##

    #updating the adjacent features for training nodes
    for i in range(len(idx_train)):
        adjs = list(g[idx_train[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_train[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1 # Count for single node
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_train[i]] and g[idx_train[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_train[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei)
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]

        for j in range(len(distinct_labels)):
            this_label = distinct_labels[j]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_train[i][this_label] = cnt_of_adjacent_labels
        
          
    X_train_updated = np.concatenate((X_train,adj_feats_train),axis = 1)

    #updating the adjancent features for test nodes
    for i in range(len(idx_test)):
        adjs = list(g[idx_test[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_test[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_test[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]

        for j in range(len(distinct_labels)):
            this_label = distinct_labels[j]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_test[i][this_label] = cnt_of_adjacent_labels

    X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)  

    #learning the new model on updated feature matrix with adjacent labels
    clf_updated = GaussianNB()
    clf_updated.fit(X_train_updated,y_train)
    #print("\nStarting ICA Loop: ...\n")
    #staring the ICA inference loop
    loop_var = 0
    iter_var = 0
    y_pred_current = y_pred
    while (loop_var == 0 and iter_var < 15):
            y_pred_updated = clf_updated.predict(X_test_updated)
            if(np.array_equal(y_pred_current, y_pred_updated)):
                #algorithm stabilized
                #print("ICA Stabilized")
                loop_var = 1        
            else:
                loop_var = 0
                iter_var += 1
                #print("ICA Loop: "+str(iter_var))
                
                #updating the labels for test nodes with new predictions
                for i in range(len(idx_test)):
                    iter_labels_list[idx_test[i]] = y_pred_updated[i]
                
                #updating the adjacent features for test nodes
                for i in range(len(idx_test)):
                    adjs = list(g[idx_test[i]])
                    adjs_threshold = []
                    cnt_of_ew1 = 0
                    for nei in adjs:
                        if g[idx_test[i]][nei]['edge_weight'] == 1:
                            cnt_of_ew1 += 1
                    for nei in adjs:
                        if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                            adjs_threshold.append(nei)
                        if g[idx_test[i]][nei]['edge_weight']>1:
                            adjs_threshold.append(nei) 
                    adjs_threshold = [int(nei) for nei in adjs_threshold]
                    labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]

                    for j in range(len(distinct_labels)):
                        this_label = distinct_labels[j]
                        cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
                        adj_feats_test[i][this_label] = cnt_of_adjacent_labels
                                        
                X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)
                y_pred_current = y_pred_updated   
    
    #print('No. of iterations ICA ran: ',iter_var)            
    final_predictions = y_pred_updated

    #print('\nMetrics for Iterative Classification Algorithm for train size {:.1f}:\n'.format(1-k))
    # ICA
    # Macro
    confusion_mat = confusion_matrix(y_test,y_pred_updated)
    conf_mat_ICA_Labels.append(confusion_mat)

    accuracy_list_ICA_Labels.append(metrics.accuracy_score(y_test,y_pred_updated))
    precision_list_ICA_Labels.append(metrics.precision_score(y_test,y_pred_updated,average='macro',zero_division=0))
    recall_list_ICA_Labels.append(metrics.recall_score(y_test,y_pred_updated,average='macro'))

    # Micro
    precision_list_ICA_Labels.append(metrics.precision_score(y_test,y_pred_updated,average='micro',zero_division=0))
    recall_list_ICA_Labels.append(metrics.recall_score(y_test,y_pred_updated,average='micro'))


For test size:  0.8
For test size:  0.6
For test size:  0.4
For test size:  0.2


In [27]:
accuracy_list_ICA_Labels = ["%.2f" % elem for elem in accuracy_list_ICA_Labels]
precision_list_ICA_Labels = ["%.2f" % elem for elem in precision_list_ICA_Labels]
recall_list_ICA_Labels = ["%.2f" % elem for elem in recall_list_ICA_Labels]

print(accuracy_list_ICA_Labels)
print(precision_list_ICA_Labels)
print(recall_list_ICA_Labels)
print(conf_mat_ICA_Labels)

['0.70', '0.79', '0.83', '0.86']
['0.70', '0.70', '0.79', '0.79', '0.82', '0.83', '0.85', '0.86']
['0.70', '0.70', '0.79', '0.79', '0.82', '0.83', '0.85', '0.86']
[array([[2543,  330,  594,   85],
       [ 376, 2109,  574,  330],
       [ 809,  487, 2712,  106],
       [  39,  402,  128, 2457]], dtype=int64), array([[2432,  184,  291,   47],
       [ 232, 1838,  246,  248],
       [ 340,  277, 1979,   72],
       [  84,  163,   45, 2083]], dtype=int64), array([[1929,  118,  142,   24],
       [ 108, 1201,  119,  131],
       [ 190,   98, 1163,   69],
       [  81,  114,   30, 1524]], dtype=int64), array([[1050,   55,   58,   10],
       [  33,  586,   47,   55],
       [  71,   49,  540,   29],
       [  32,   47,   15,  844]], dtype=int64)]


### Model 3 : ICA-NB with Edge Weights

In [28]:
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_ICA_EW= []
precision_list_ICA_EW = []
recall_list_ICA_EW = []
conf_mat_ICA_EW = []

feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test,idx_train,idx_test = train_test_split(feature_matrix, class_labels, indices,test_size=k, random_state=3, stratify=class_labels)
    clf = GaussianNB()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    iter_labels = class_labels
    np.put(iter_labels,idx_test,y_pred)  # updating labels of test data with the predecited labels 
    iter_labels_list = list(iter_labels)
    distinct_labels = sorted(list(set(iter_labels_list)))

    # ICA starting

    adj_feats_train = np.zeros((len(X_train),len(distinct_labels)))
    adj_feats_test  = np.zeros((len(X_test),len(distinct_labels)))

    ########################################################################
    ##constructing additional features, train and Iterate until stabilized##

    #updating the adjancent features for training nodes
    for i in range(len(idx_train)):
        adjs = list(g[idx_train[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_train[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_train[i]] and g[idx_train[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_train[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)

        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_train[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_train[i]])
            adj_feats_train[i][this_label] = edge_weight_adj_nodes[this_label]
        
          
    X_train_updated = np.concatenate((X_train,adj_feats_train),axis = 1)

    #updating the adjancent features for test nodes
    for i in range(len(idx_test)):
        adjs = list(g[idx_test[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_test[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_test[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)

        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
            adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]

    X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)  

    #learning the new model on updated feature matrix with adjacent labels
    clf_updated = GaussianNB()
    clf_updated.fit(X_train_updated,y_train)
    #print("\nStarting ICA Loop: ...\n")
    #staring the ICA inference loop
    loop_var = 0
    iter_var = 0
    y_pred_current = y_pred
    while (loop_var == 0 and iter_var < 15):
            y_pred_updated = clf_updated.predict(X_test_updated)
            if(np.array_equal(y_pred_current, y_pred_updated)):
                #algorithm stabilized
                #print("ICA Stabilized")
                loop_var = 1        
            else:
                loop_var = 0
                iter_var += 1
                #print("ICA Loop: "+str(iter_var))
                
                #updating the labels for test nodes with new predictions
                for i in range(len(idx_test)):
                    iter_labels_list[idx_test[i]] = y_pred_updated[i]
                
                #updating the adjancent features for test nodes
                for i in range(len(idx_test)):
                    adjs = list(g[idx_test[i]])
                    adjs_threshold = []
                    cnt_of_ew1 = 0
                    for nei in adjs:
                        if g[idx_test[i]][nei]['edge_weight'] == 1:
                            cnt_of_ew1 += 1
                    for nei in adjs:
                        if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                            adjs_threshold.append(nei)
                        if g[idx_test[i]][nei]['edge_weight']>1:
                            adjs_threshold.append(nei) 
                    adjs_threshold = [int(nei) for nei in adjs_threshold]
                    edge_weight_adj_nodes = [0]*len(distinct_labels)

                    for j in range(len(adjs_threshold)):
                        this_label = iter_labels_list[adjs_threshold[j]]
                        edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
                        adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]     
                                
                X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)
                y_pred_current = y_pred_updated   
    #print('No. of iterations ICA ran: ',iter_var)            
    final_predictions = y_pred_updated

    #print('ICA - Edge Weight Confusion matrix:\n')
    confusion_mat = confusion_matrix(y_test,y_pred_updated)
    conf_mat_ICA_EW.append(confusion_mat)
    #print(confusion_mat)

    #print('\nMetrics for Iterative Classification Algorithm for train size {:.1f}:\n'.format(1-k))
    # ICA
    # Macro
    accuracy_list_ICA_EW.append(metrics.accuracy_score(y_test,y_pred_updated))
    precision_list_ICA_EW.append(metrics.precision_score(y_test,y_pred_updated,average='macro',zero_division=0))
    recall_list_ICA_EW.append(metrics.recall_score(y_test,y_pred_updated,average='macro'))

    # Micro
    precision_list_ICA_EW.append(metrics.precision_score(y_test,y_pred_updated,average='micro',zero_division=0))
    recall_list_ICA_EW.append(metrics.recall_score(y_test,y_pred_updated,average='micro'))


For test size:  0.8
For test size:  0.6
For test size:  0.4
For test size:  0.2


In [29]:
accuracy_list_ICA_EW = ["%.2f" % elem for elem in accuracy_list_ICA_EW]
precision_list_ICA_EW = ["%.2f" % elem for elem in precision_list_ICA_EW]
recall_list_ICA_EW = ["%.2f" % elem for elem in recall_list_ICA_EW]

print(accuracy_list_ICA_EW)
print(precision_list_ICA_EW)
print(recall_list_ICA_EW)
print(conf_mat_ICA_EW)

['0.65', '0.74', '0.83', '0.87']
['0.65', '0.65', '0.75', '0.74', '0.84', '0.83', '0.87', '0.87']
['0.66', '0.65', '0.74', '0.74', '0.81', '0.83', '0.84', '0.87']
[array([[2474,  401,  504,  173],
       [ 502, 1952,  500,  435],
       [1009,  659, 2278,  168],
       [  50,  414,  136, 2426]], dtype=int64), array([[2503,  131,  194,  126],
       [ 390, 1594,  177,  403],
       [ 636,  295, 1608,  129],
       [  35,  167,   42, 2131]], dtype=int64), array([[2063,   37,   44,   69],
       [ 174, 1074,   73,  238],
       [ 288,  112, 1025,   95],
       [  19,   47,   21, 1662]], dtype=int64), array([[1111,   12,   16,   34],
       [  59,  534,   33,   95],
       [ 115,   46,  494,   34],
       [  12,    9,    6,  911]], dtype=int64)]


### Model 4 : ICA-NB with Combined features

In [30]:
testSize = [0.8,0.6,0.4,0.2]
accuracy_list_ICA_Combined = []
precision_list_ICA_Combined = []
recall_list_ICA_Combined = []
conf_matrix_ICA_Combined = []

feature_matrix = np.load('D:/NLP Project/ICA/feature_matrix.npy')
class_labels = np.load('D:/NLP Project/ICA/class_lables.npy')
indices = np.arange(len(feature_matrix))

for k in testSize:
    print('For test size: ',k)
    X_train, X_test, y_train, y_test,idx_train,idx_test = train_test_split(feature_matrix, class_labels, indices,test_size=k, random_state=3, stratify=class_labels)
    clf = GaussianNB()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    iter_labels = class_labels
    np.put(iter_labels,idx_test,y_pred)  # updating labels of test data with the predecited labels 
    iter_labels_list = list(iter_labels)
    distinct_labels = sorted(list(set(iter_labels_list)))

    # ICA starting

    adj_feats_train = np.zeros((len(X_train),2*len(distinct_labels))) # Multiply by 2, since we have 8 additional features combined.
    adj_feats_test  = np.zeros((len(X_test),2*len(distinct_labels)))

    ########################################################################
    ##constructing additional features, train and Iterate until stabilized##

    #updating the adjancent features for training nodes
    for i in range(len(idx_train)):       
        adjs = list(g[idx_train[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_train[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_train[i]] and g[idx_train[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_train[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)
        
        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_train[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_train[i]])
            adj_feats_train[i][this_label] = edge_weight_adj_nodes[this_label]
        
        for k in range(len(distinct_labels)):
            this_label = distinct_labels[k]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_train[i][this_label+len(distinct_labels)] = cnt_of_adjacent_labels
          
    X_train_updated = np.concatenate((X_train,adj_feats_train),axis = 1)

    #updating the adjancent features for test nodes
    for i in range(len(idx_test)):       
        adjs = list(g[idx_test[i]])
        adjs_threshold = []
        cnt_of_ew1 = 0
        for nei in adjs:
            if g[idx_test[i]][nei]['edge_weight'] == 1:
                cnt_of_ew1 += 1
        for nei in adjs:
            if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                adjs_threshold.append(nei)
            if g[idx_test[i]][nei]['edge_weight']>1:
                adjs_threshold.append(nei) 
        adjs_threshold = [int(nei) for nei in adjs_threshold]
        labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]
        edge_weight_adj_nodes = [0]*len(distinct_labels)
        
        for j in range(len(adjs_threshold)):
            this_label = iter_labels_list[adjs_threshold[j]]
            edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
            adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]
        
        for k in range(len(distinct_labels)):
            this_label = distinct_labels[k]
            cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
            adj_feats_test[i][this_label+len(distinct_labels)] = cnt_of_adjacent_labels

    X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)  

    #learning the new model on updated feature matrix with adjacent labels
    clf_updated = GaussianNB()
    clf_updated.fit(X_train_updated,y_train)
    #print("\nStarting ICA Loop: ...\n")
    #staring the ICA inference loop
    loop_var = 0
    iter_var = 0
    y_pred_current = y_pred
    while (loop_var == 0 and iter_var < 15):
            y_pred_updated = clf_updated.predict(X_test_updated)
            if(np.array_equal(y_pred_current, y_pred_updated)):
                #algorithm stabilized
                #print("ICA Stabilized")
                loop_var = 1        
            else:
                loop_var = 0
                iter_var += 1
                #print("ICA Loop: "+str(iter_var))
                
                #updating the labels for test nodes with new predictions
                for i in range(len(idx_test)):
                    iter_labels_list[idx_test[i]] = y_pred_updated[i]
                
                #updating the adjancent features for test nodes
                for i in range(len(idx_test)):       
                    adjs = list(g[idx_test[i]])
                    adjs_threshold = []
                    cnt_of_ew1 = 0
                    for nei in adjs:
                        if g[idx_test[i]][nei]['edge_weight'] == 1:
                            cnt_of_ew1 += 1
                    for nei in adjs:
                        if cnt_of_ew1 == g.degree[idx_test[i]] and g[idx_test[i]][nei]['edge_weight']==1:
                            adjs_threshold.append(nei)
                        if g[idx_test[i]][nei]['edge_weight']>1:
                            adjs_threshold.append(nei) 
                    adjs_threshold = [int(nei) for nei in adjs_threshold]
                    labels_of_adjacent_nodes = [iter_labels_list[nei] for nei in adjs_threshold]
                    edge_weight_adj_nodes = [0]*len(distinct_labels)
                    
                    for j in range(len(adjs_threshold)):
                        this_label = iter_labels_list[adjs_threshold[j]]
                        edge_weight_adj_nodes[this_label] +=  int(g[idx_test[i]][adjs_threshold[j]]['edge_weight'])/len(g[idx_test[i]])
                        adj_feats_test[i][this_label] = edge_weight_adj_nodes[this_label]
                    
                    for k in range(len(distinct_labels)):
                        this_label = distinct_labels[k]
                        cnt_of_adjacent_labels = labels_of_adjacent_nodes.count(this_label) ## Voting by neighbor nodes.
                        adj_feats_test[i][this_label+len(distinct_labels)] = cnt_of_adjacent_labels       
                                
                X_test_updated = np.concatenate((X_test,adj_feats_test),axis = 1)
                y_pred_current = y_pred_updated   
    print('No. of iterations ICA ran: ',iter_var)            
    final_predictions = y_pred_updated

    #print('ICA - Edge Weight Confusion matrix:\n')
    confusion_mat = confusion_matrix(y_test,y_pred_updated)
    conf_matrix_ICA_Combined.append(confusion_mat)
    #print(confusion_mat)

    #print('\nMetrics for Iterative Classification Algorithm for train size {:.1f}:\n'.format(1-k))
    # ICA
    # Macro
    accuracy_list_ICA_Combined.append(metrics.accuracy_score(y_test,y_pred_updated))
    precision_list_ICA_Combined.append(metrics.precision_score(y_test,y_pred_updated,average='macro',zero_division=0))
    recall_list_ICA_Combined.append(metrics.recall_score(y_test,y_pred_updated,average='macro'))

    # Micro
    precision_list_ICA_Combined.append(metrics.precision_score(y_test,y_pred_updated,average='micro',zero_division=0))
    recall_list_ICA_Combined.append(metrics.recall_score(y_test,y_pred_updated,average='micro'))


For test size:  0.8
No. of iterations ICA ran:  1
For test size:  0.6
No. of iterations ICA ran:  2
For test size:  0.4
No. of iterations ICA ran:  1
For test size:  0.2
No. of iterations ICA ran:  1


In [31]:
accuracy_list_ICA_Combined = ["%.2f" % elem for elem in accuracy_list_ICA_Combined]
precision_list_ICA_Combined = ["%.2f" % elem for elem in precision_list_ICA_Combined]
recall_list_ICA_Combined = ["%.2f" % elem for elem in recall_list_ICA_Combined]

print(accuracy_list_ICA_Combined)
print(precision_list_ICA_Combined)
print(recall_list_ICA_Combined)
print(conf_matrix_ICA_Combined)

['0.70', '0.79', '0.83', '0.86']
['0.70', '0.70', '0.79', '0.79', '0.82', '0.83', '0.85', '0.86']
['0.70', '0.70', '0.79', '0.79', '0.82', '0.83', '0.85', '0.86']
[array([[2542,  330,  594,   86],
       [ 376, 2109,  574,  330],
       [ 809,  487, 2712,  106],
       [  39,  402,  128, 2457]], dtype=int64), array([[2432,  184,  291,   47],
       [ 232, 1838,  246,  248],
       [ 338,  277, 1981,   72],
       [  84,  163,   45, 2083]], dtype=int64), array([[1929,  118,  142,   24],
       [ 108, 1201,  119,  131],
       [ 190,   98, 1163,   69],
       [  81,  114,   30, 1524]], dtype=int64), array([[1050,   55,   58,   10],
       [  33,  586,   47,   55],
       [  72,   49,  539,   29],
       [  32,   47,   15,  844]], dtype=int64)]
