# HF&ML components

In [1]:
# coding=utf-8
import numpy as np
import pandas as pd
import copy
from sklearn.metrics.pairwise import cosine_similarity


#This input list contains N*N dictionary, which is the value of cosine similarity between node and node
def node_reassemble_cosine(df, threshold):
    verify_set = []
    #initialize
    for i in range(len(df)):
        #Pass Sets
        verify_set.append(set([key for key, value in df[i].items() if value >= threshold]))
    return verify_set

#This has two inputs: the original dataset (processed train data X) and the NRC adjustment standard, 
#which is to use the original dataset for NRC processing (for each features)
#NRC is abbreviation of Node Reassemble Calibrate
def node_reassemble_intensity(X, NRC):
    result = []
    ID = []
    for i in range(len(X)):
        ID.append(i)
    #After the corresponding df, use dictionary corresponding
    dic_X = dict(zip(ID, X))
    for i in range(len(X)):
        #strongest neighbors, and their features
        element_dic_X = [value for key, value in dic_X.items() if key in NRC[i]]
        #Add up their features as denominator
        sum_element_dic_X = sum(element_dic_X)
        #Get the value of the target
        target = [value for key, value in dic_X.items() if key == i][0]
        #Node reassemble
        result.append(((target/sum_element_dic_X)*len(element_dic_X)*target))
    return result
    

def get_NRC_X(path, feature_list, threshold):
    
    #read csv
    df = pd.read_csv(path,names=feature_list)
    #remove NA
    df = df.fillna(0)
    
    
    #Take the available features (assume the first is the ID, the last is the Label)
    X = pd.DataFrame(df, columns = feature_list[1:-1])
    X = np.squeeze(np.asarray(X))
    
    #Cosine Similarity
    Cos_X = cosine_similarity(X)
    
    #Prepare to make a corresponding table (dictionary)
    ID = []
    for i in range(len(Cos_X)):
        ID.append(i)

    result = []
    for i in range(len(Cos_X)):
        result.append(dict(zip(ID, Cos_X[i])))
        
    NRC = node_reassemble_cosine(result, threshold)
    NRC_df = node_reassemble_intensity(X, NRC)
    
    return NRC_df
    
    
#inputs two-dimensional, get the original citation data
def get_citation_ori_data(path, feature_list):
    #read csv
    df = pd.read_csv(path,names=feature_list)
    df = df.fillna(0)

    #Take available features (assuming the first is ID, the last is Label)
    X2 = pd.DataFrame(df, columns = feature_list[:])
    X2 = np.squeeze(np.asarray(X2))
    return X2

#[dic_x] inputs are two-dimensional, assemble the original citation data, a node corresponds to a citation list
def get_citation(df):
    dic_x = {}
    for i in range(len(df)):
        #Make an adjacent matrix used to find the citation relationship (cited group corresponding to each citing)
        #If citing is not currently in dictionary
        if df[i][0] not in dic_x.keys():
            #Create a list
            dic_x[df[i][0]]=[]
            #Add the current target cited node to the set corresponding to the citing node
            dic_x[df[i][0]].append(df[i][1])
        #If citing is currently in dictionary
        elif df[i][0] in dic_x.keys():
            #Add the current target cited node directly to the set corresponding to the citing node
            dic_x[df[i][0]].append(df[i][1])
        else:
            print('Error')
            break
    return dic_x

#[df] inputs are two-dimensional, but we only need to use the citing data, specify the first feature as citing, and count the number of times it is cited
def get_citation_df_for_hop(path, feature_list):
    #read csv
    df = pd.read_csv(path,names=feature_list)
    df = df.fillna(0)
    df2 = df.groupby(feature_list[0]).size().reset_index(name='Count')
    X2 = pd.DataFrame(df2, columns = [feature_list[0],"Count"])
    X2 = np.squeeze(np.asarray(X2))
    return X2

#[dic_citation] inputs are two-dimensional, but we only need to use the citing data to count the number of citations
def get_hop1_citation(df):
    dic_citation = {}
    for i in range(len(df)):
        dic_citation[df[i][0]]=df[i][1]
    return dic_citation


def get_hopk_citation(df, dic_x, dic_citation):
    dic_citation_next = {}
    for i in range(len(df)):
        #Current_node_citation_values
        #Get the Citation table, and the "those" nodes cited by each target node
        cur_ = [value for key, value in dic_x.items() if key == df[i][0]]
        cur_ = np.squeeze(cur_)

        #Current_node_citation_values_transform
        #Get the hop k-1 table and replace those nodes obtained in the previous step with their corresponding values
        cur_t = [value for key, value in dic_citation.items() if key in cur_]
        #sum
        new_ = sum(cur_t)
        #Update
        dic_citation_next[df[i][0]]=new_
    return dic_citation_next

def get_hopk_pre(df2, dic_x, dic_citation, k):
    if k == 1:
        return dic_citation
    elif k > 1:
        for i in range(k-1):
            if i == 0:
                cur_result = get_hopk_citation(df2, dic_x, dic_citation)
            elif i > 0:
                cur_result = get_hopk_citation(df2, dic_x, cur_result)
        return cur_result
    
def get_NRC_hopk(path, feature_list, threshold):
    
    #read csv
    df = pd.read_csv(path,names=feature_list)
    #remove NA
    df = df.fillna(0)
    
    
    # Take available features (assuming the first is ID, the last is Label)
    X = pd.DataFrame(df, columns = feature_list[1:-1])
    X = np.squeeze(np.asarray(X))
    
    #Cosine Similarity
    Cos_X = cosine_similarity(X)
    
    #(dictionary)
    ID = []
    for i in range(len(Cos_X)):
        ID.append(i)

    result = []
    for i in range(len(Cos_X)):
        result.append(dict(zip(ID, Cos_X[i])))
        
    NRC = node_reassemble_cosine(result, threshold)
#     NRC_df = node_reassemble_intensity(X, NRC)
    
    return NRC

def get_hopk(path, feature_list, threshold):
    #[original inputs]
    df1 = get_citation_ori_data(path, feature_list)
    #[dic_x] is converted into, {node A:[node B, node C, ...]} method, each node, corresponding to those other nodes referenced by itself
    dic_x = get_citation(df1)

    #[df] Get the front DATA of the hop 1 comparison table, which is used as the benchmark df for all hop corresponding tables
    df2 = get_citation_df_for_hop(path, feature_list)
    #[dic_citation] Obtain the hop 1 corresponding table through the benchmark df (only the first time the hop is obtained and used as initialize data, and then self-update through hop k)
    dic_citation = get_hop1_citation(df2)
    #[hop k table]
    hop_k = get_hopk_pre(df2, dic_x, dic_citation, threshold)
    return hop_k

def get_pro(df, hop1, hop_NRC, feature_list):
    # Take available features (assuming the first is ID, the last is Label)
    X_hop = pd.DataFrame(df, columns = feature_list[0:1])
    X_hop = np.squeeze(np.asarray(X_hop))


    #The node name corresponding to each number
    ID = []
    for i in range(len(X_hop)):
        ID.append(i)
    #After the corresponding df, use dictionary corresponding
    dic_hop = dict(zip(ID, X_hop))
    dic_hop_bp = copy.deepcopy(dic_hop)

    #Find the hop-k value of the neighbor corresponding to the target node
    for i in range(len(dic_hop_bp)):
        if dic_hop[i] in hop1.keys():
            element_dic_hop = [value for key, value in dic_hop.items() if key in hop_NRC[i]]
            hop1_value = [value for key, value in hop1.items() if key in element_dic_hop]
            dic_hop_bp[i]=hop1_value
        else:
            dic_hop_bp[i]=[0]


    #target node itself corresponds to the value of hop-k
    target = []
    for i in range(len(dic_hop_bp)):
        if dic_hop[i] in hop1.keys():
            target.append(hop1[dic_hop[i]])
        else:
            target.append(0)


    #This is the NP multiplier, if there is no reference information, it will be replaced by 1
    result = []
    for i in range(len(dic_hop_bp)):
        if target[i] == 0:
            result.append(1)
        else:
            tn = (target[i]/sum(dic_hop_bp[i]))*len(dic_hop_bp[i])
            result.append(tn)
    return result

def extractDigits(lst):
    return [[el] for el in lst]

def get_pro_NRC(path, feature_list, threshold):
    
    #read csv
    df = pd.read_csv(path,names=feature_list)
    #remove NA
    df = df.fillna(0)
    
    
    # Take available features (assuming the first is ID, the last is Label)
    X = pd.DataFrame(df, columns = feature_list[1:-1])
    X = np.squeeze(np.asarray(X))
    
    #Cosine Similarity
    Cos_X = cosine_similarity(X)
    
    #(dictionary)
    ID = []
    for i in range(len(Cos_X)):
        ID.append(i)

    result = []
    for i in range(len(Cos_X)):
        result.append(dict(zip(ID, Cos_X[i])))
        
    NRC = node_reassemble_cosine(result, threshold)
#     pro_ = get_pro(df, hopk, NRC, feature_list2)
#     pro_ = extractDigits(pro_)
    NRC_df = node_reassemble_intensity(X, NRC)
#     pro_NRC_df = np.multiply(pro_,NRC_df)
    return NRC_df

def get_pro_NRC_output(path, feature_list, threshold, output_name):
    #NRC calculates T1 (without ID, Label)
    result = get_pro_NRC(path, feature_list, threshold)
    np.savetxt(output_name, result, delimiter=",")
    #Read T1(NRC, ID to be added, Label)
    df = pd.read_csv(output_name,names =feature_list[1:-1])
    #Read T0(raw data, including ID, Label)
    df2 = pd.read_csv(path,names =feature_list)
    #Insert ID into the first feature
    df.insert(loc=0, column=feature_list[0], value=df2[feature_list[0]])
    #Insert Label to the last feature
    df.insert(loc=len(df2.columns)-1, column=feature_list[-1], value=df2[feature_list[-1]])
    df = df.fillna(0)
    df.to_csv(output_name, index=False,header=False,sep ='\t')
    

# (Cora) - Running Calibrate Algorithm

In [2]:
df_list = [] #Store features including ID, Label
feat_list = [] #A list to hold the feature vector for each node
label_list = [] #Used to store the list of categories corresponding to each node
node_map = {} #Recode the node
label_map = {} #Mapping labels to numbers
        
with open('cora/cora.content') as f1:
    for i,each_sample in enumerate(f1.readlines()): #Iterate over the features of each sample
        sample_clean = each_sample.strip().split()
        #Extract the features of each sample, where the first and last elements are the sample name and corresponding label
        feat_list.append(sample_clean[1:-1]) 
        df_list.append(sample_clean[:]) 
        #Map node names to node numbers
        node_map[sample_clean[0]]=i
        label = sample_clean[-1]
        if label not in label_map.keys():
            #Convert labels to numbers
            label_map[label] = len(label_map)
        label_list.append(label_map[label])
    feat_list = np.asarray(feat_list,dtype=np.float64)
    df_list = np.asarray(df_list)
    label_list = np.asarray(label_list,dtype=np.int64)
    
pd.DataFrame(df_list).to_csv('cora_content.csv',index=False, header = False)

cora_names = []
for i in range(len(feat_list[0])):
    cora_names.append(str(i))

cora_names.insert(0,'paper_id')
cora_names.insert(len(feat_list[0])+1,'class_label')

In [3]:
df_list = [] #Store features including ID, Label
feat_list = [] #A list to hold the feature vector for each node
label_list = [] #Used to store the list of categories corresponding to each node
node_map = {} #Recode the node
label_map = {} #Mapping labels to numbers
        
with open('cora/cora.cites') as f1:
    for i,each_sample in enumerate(f1.readlines()): #Iterate over the features of each sample
        sample_clean = each_sample.strip().split()
        #Extract the features of each sample, where the first and last elements are the sample name and corresponding label
        feat_list.append(sample_clean[1:-1]) 
        df_list.append(sample_clean[:]) 
        #Map node names to node numbers
        node_map[sample_clean[0]]=i
        label = sample_clean[-1]
        if label not in label_map.keys():
            #Convert labels to numbers
            label_map[label] = len(label_map)
        label_list.append(label_map[label])
    feat_list = np.asarray(feat_list,dtype=np.float64)
    df_list = np.asarray(df_list)
    label_list = np.asarray(label_list,dtype=np.int64)

pd.DataFrame(df_list).to_csv('cora_cites.csv',index=False, header = False)

In [4]:
#[Node] The first is ID, the last is Label
feature_list_cora_content=cora_names
#[Citation] Citation and Citation - 2D
feature_list_cora_cites=["Citing","Cited"]

In [8]:
#calculating running time
from datetime import datetime
start_time = datetime.now()

#[Node]Data storage
path = 'cora_content.csv'

#[Node]Set Cosine similarity size
threshold = [0.95, 0.90, 0.80, 0.70, 0.60, 0.50]

#[Citation]Data storage 
path2 = 'cora_cites.csv'


for i in threshold:
    #hop-k
    hopk = get_hopk(path2, feature_list_cora_cites, 1)
    #return cora_nc_cos_*.csv
    get_pro_NRC_output(path,feature_list_cora_content, i, 'cora_nc_cos_'+\
                                     str(i)+'.csv')

end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))

  result.append(((target/sum_element_dic_X)*len(element_dic_X)*target))
  result.append(((target/sum_element_dic_X)*len(element_dic_X)*target))
  result.append(((target/sum_element_dic_X)*len(element_dic_X)*target))
  result.append(((target/sum_element_dic_X)*len(element_dic_X)*target))
  result.append(((target/sum_element_dic_X)*len(element_dic_X)*target))
  result.append(((target/sum_element_dic_X)*len(element_dic_X)*target))
  result.append(((target/sum_element_dic_X)*len(element_dic_X)*target))
  result.append(((target/sum_element_dic_X)*len(element_dic_X)*target))
  result.append(((target/sum_element_dic_X)*len(element_dic_X)*target))
  result.append(((target/sum_element_dic_X)*len(element_dic_X)*target))
  result.append(((target/sum_element_dic_X)*len(element_dic_X)*target))
  result.append(((target/sum_element_dic_X)*len(element_dic_X)*target))
  result.append(((target/sum_element_dic_X)*len(element_dic_X)*target))
  result.append(((target/sum_element_dic_X)*len(element_dic_X)*t

Duration: 0:16:46.207919
