## Variable Definitions

In [None]:
# Directory layout, relative to the notebook's working directory.
# Each path keeps its trailing slash because later cells build file names
# by plain string concatenation.
ds_path = "../ds/"    # input graph files
log_path = "../log/"  # solver logs
df_path = "../df/"    # generated .csv files

In [None]:
# File-name endings for the three kinds of files handled by this notebook.
ds_suffix = ".stp"        # graph files
log_suffix = "_log.txt"   # solver logs
df_suffix = ".csv"        # feature tables

In [None]:
# Read/Write Dataframe 
import pandas as pd 
def read_df(target):
    """Load the CSV at ``target`` and hand it back as a pandas DataFrame.

    ``target`` is forwarded unchanged to ``pandas.read_csv``.
    """
    return pd.read_csv(target)

def write_df(df, target):
    """Serialise ``df`` as CSV to ``target``, leaving out the index column."""
    df.to_csv(path_or_buf=target, index=False)

In [None]:
# Get File lists
import os
def _index_by_prefix(root):
    """Map the first 8 characters of every file name to the file's path.

    ``root`` (with trailing slash) is expected to contain sub-folders which in
    turn contain the files. Entries of ``root`` that are not directories, and
    any hidden entry (name starting with '.'), are skipped: a stray file such
    as ``.DS_Store`` would otherwise make ``os.listdir`` raise, and checkpoint
    folders would pollute the index.

    If two files share the same 8-character prefix the one listed last wins.
    """
    index = {}
    for folder in os.listdir(root):
        folder_path = root + folder + '/'
        if folder.startswith('.') or not os.path.isdir(folder_path):
            continue
        for each in os.listdir(folder_path):
            if each.startswith('.'):
                continue
            index[each[:8]] = folder_path + each
    return index


# sample id (8-character file-name prefix) -> path of its log / graph file
log_dict = _index_by_prefix(log_path)
ds_dict = _index_by_prefix(ds_path)

# NOTE: despite its name this is a plain list of the file names in df_path.
df_dict = os.listdir(df_path)

## For each sample's feature computation, the steps are:
#### 1: Read the sample as a dataframe
#### 2: Calculate each feature and save it as a new column
#### 3: Save the dataframe as a .csv file

In [None]:
import numpy as np
def read_graph(name):
    """Parse a graph description file into ``[vertices, arcs, terminals]``.

    Only the first whitespace-separated token of each line is inspected:

    * ``Nodes <n>``      -- number of vertices
    * ``Edges <m>``      -- number of ``E`` lines the file must contain
    * ``E <i> <j> <c>``  -- an edge between ``i`` and ``j`` with integer cost ``c``
    * ``Terminals <t>``  -- number of terminals

    Every other line, including blank and whitespace-only lines, is ignored.

    Parameters
    ----------
    name : str or path-like
        Path of the graph file.

    Returns
    -------
    list
        ``[vertices, arcs, terminals]`` (the format D=(V,A,R)) where
        ``vertices`` is ``[1, ..., n]``, ``arcs`` is a list of
        ``((i, j), c)`` tuples in file order and ``terminals`` is
        ``[1, ..., t]``.

    Raises
    ------
    ValueError
        If the ``Nodes``, ``Edges`` or ``Terminals`` line is missing.
    AssertionError
        If the number of ``E`` lines differs from the declared edge count.
    """
    n_vertices = n_edges = n_terminals = None
    arcs = []
    with open(name) as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank or whitespace-only line: nothing to dispatch on.
                continue
            det = parts[0]
            if det == 'Nodes':
                n_vertices = int(parts[1])
            elif det == 'Edges':
                n_edges = int(parts[1])
            elif det == 'E':
                arcs.append(((int(parts[1]), int(parts[2])), int(parts[3])))
            elif det == 'Terminals':
                n_terminals = int(parts[1])

    missing = [label for label, value in (('Nodes', n_vertices),
                                          ('Edges', n_edges),
                                          ('Terminals', n_terminals))
               if value is None]
    if missing:
        raise ValueError(
            "%s: missing header line(s): %s" % (name, ', '.join(missing)))

    assert n_edges == len(arcs)

    vertices = list(range(1, n_vertices + 1))
    # NOTE(review): terminals are taken to be the vertices 1..t; individual
    # terminal lines (if the files contain any) are not read. Confirm that
    # the datasets are numbered this way.
    terminals = list(range(1, n_terminals + 1))
    ### The format of graphs is D=(V,A,R)
    return [vertices, arcs, terminals]

In [None]:
import re
def read_log(target):
    """Read a solver log and split it into runtimes, costs and solutions.

    Expected layout (one item per line, surrounding whitespace stripped):

    * line 0: ILP runtime
    * line 1: ILP cost
    * line 2: LP runtime
    * line 3: LP cost
    * line 4: whether the ILP cost equals the LP cost (not returned)
    * line 5 onwards: ILP solution terms, one blank line, LP solution terms

    Each solution term has the characters ``( ) ' ,`` replaced by spaces and
    is then split on whitespace, giving a list of string tokens.

    Blank lines other than the first separator (for example trailing empty
    lines) are dropped instead of being returned as empty terms, so callers
    can unpack every term safely.

    Returns
    -------
    dict
        Keys ``ilp_rt``, ``ilp_c``, ``lp_rt``, ``lp_c`` (strings exactly as
        read) and ``ilp_sol``, ``lp_sol`` (lists of token lists).

    Raises
    ------
    IndexError
        If the file has fewer than four lines.
    ValueError
        If no blank line separates the ILP terms from the LP terms.
    """
    with open(target) as f:
        lines = [line.strip() for line in f]

    ilp_rt, ilp_c, lp_rt, lp_c = lines[0], lines[1], lines[2], lines[3]
    # lines[4] is whether ilp_c == lp_c
    sols = lines[5:]
    if '' not in sols:
        raise ValueError(
            "%s: no blank line separating the ILP and LP solutions" % (target,))
    b = sols.index('')

    def tokenize(terms):
        # Skip empty lines so that no term comes back as an empty list.
        return [re.sub("[()',]", " ", term).split() for term in terms if term]

    return {
        "ilp_rt": ilp_rt,
        "ilp_c": ilp_c,
        "ilp_sol": tokenize(sols[:b]),
        "lp_rt": lp_rt,
        "lp_c": lp_c,
        "lp_sol": tokenize(sols[b + 1:]),
    }

In [None]:
import pandas as pd
import networkx as nx
import timeit
def generate_df(ds_target, log_target, betweenness_k=10, seed=None):
    """Build the per-edge feature table of one sample.

    Parameters
    ----------
    ds_target : str
        Path of the graph file, parsed with ``read_graph``.
    log_target : str
        Path of the solver log, parsed with ``read_log``.
    betweenness_k : int, optional
        Number of sampled source nodes for the approximate betweenness
        centrality. Capped at the number of nodes, because networkx raises
        when asked to sample more nodes than the graph has.
    seed : int or None, optional
        Forwarded to ``nx.betweenness_centrality`` to make the node sampling
        reproducible. ``None`` keeps the sampling random.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The DataFrame has one row per edge and these columns, in this order:
        ``Node1, Node2, Weight, Eigenvector Centrality Min/Max,
        Betweenness Centrality Min/Max, Degree Centrality Min/Max,
        Local Rank j, Local Rank i, Variance, Normalized Weight, LP, Label``.
        The Series holds ``ILP Objective``, ``ILP Runtime`` and ``FE Runtime``
        (feature computation time plus the LP runtime from the log).

    Raises
    ------
    ValueError
        If the graph has no edges.

    Notes
    -----
    * Edges are added to the networkx graph together with their ``Weight``
      attribute. Without it, ``weight='Weight'`` in the centrality calls
      falls back to a weight of 1 for every edge.
    * ``Variance`` is each edge's squared deviation from the mean normalized
      weight.
    """
    _, arcs, _ = read_graph(ds_target)
    if not arcs:
        raise ValueError("%s: graph has no edges" % (ds_target,))

    # Build the edge table in one step rather than appending row by row.
    df = pd.DataFrame(
        [(tail, head, cost) for (tail, head), cost in arcs],
        columns=['Node1', 'Node2', 'Weight'],
    )

    # Label (ILP solution) and Feature 0 (LP relaxation).
    log = read_log(log_target)
    df.insert(3, 'Label', _solution_values(df, log['ilp_sol']))
    df.insert(3, 'LP', _solution_values(df, log['lp_sol']))

    # Only the features below are timed; the LP runtime is added from the log.
    start = timeit.default_timer()

    # Feature 1: Normalized weight by dividing the max weight
    normalized = df['Weight'] / df['Weight'].max()
    df.insert(3, 'Normalized Weight', normalized)

    # Feature 2: Variance of normalized weight
    df.insert(3, 'Variance', (normalized - normalized.mean()) ** 2)

    # Feature 3: Local rank for edge (i,j) on vertex i and vertex j
    _insert_local_ranks(df)

    # Create Graph object (with weights, so the centralities can use them)
    G = nx.Graph()
    G.add_weighted_edges_from(
        zip(df['Node1'].tolist(), df['Node2'].tolist(), df['Weight'].tolist()),
        weight='Weight',
    )

    # Feature 4: Degree Centrality
    _insert_endpoint_extremes(df, nx.degree_centrality(G), 'Degree Centrality')

    # Feature 5: Betweenness Centrality (approximated from k sampled nodes)
    k = min(betweenness_k, G.number_of_nodes())
    between = nx.betweenness_centrality(
        G, k=k, normalized=True, weight='Weight', seed=seed)
    _insert_endpoint_extremes(df, between, 'Betweenness Centrality')

    # Feature 6: Eigenvector Centrality
    eigen = nx.eigenvector_centrality(G, max_iter=99999, weight='Weight')
    _insert_endpoint_extremes(df, eigen, 'Eigenvector Centrality')

    stop = timeit.default_timer()
    fe_rt = stop - start + float(log["lp_rt"])

    # Create the runtime series
    se = pd.Series({
        'ILP Objective': float(log['ilp_c']),
        'ILP Runtime': float(log['ilp_rt']),
        'FE Runtime': fe_rt,
    })

    return df, se


def _solution_values(df, solution):
    """Return one float per row of ``df``: the solution value of that edge.

    ``solution`` is a list of ``(i, j, value)`` string triples. A triple
    applies to an edge regardless of endpoint order; when several triples hit
    the same edge the last one wins. Edges without a triple get 0.0.
    """
    lookup = {}
    for i, j, v in solution:
        i, j = int(i), int(j)
        lookup[(min(i, j), max(i, j))] = float(v)
    endpoints = zip(df['Node1'].tolist(), df['Node2'].tolist())
    return np.array(
        [lookup.get((min(a, b), max(a, b)), 0.0) for a, b in endpoints],
        dtype=np.float64,
    )


def _insert_local_ranks(df):
    """Insert 'Local Rank i' and 'Local Rank j' at column position 3.

    For every vertex ``v`` in ``1..max node id`` the incident edges are sorted
    by weight (ties keep row order) and the edge at 1-based position ``p`` out
    of ``n`` gets rank ``p / n``. The rank goes to 'Local Rank i' when ``v``
    is the edge's Node1 and to 'Local Rank j' otherwise.
    """
    node1 = df['Node1'].to_numpy()
    node2 = df['Node2'].to_numpy()
    weight = df['Weight'].to_numpy()
    rank_i = np.zeros(len(df))
    rank_j = np.zeros(len(df))
    for vertex in range(1, int(max(node1.max(), node2.max())) + 1):
        rows = np.flatnonzero((node1 == vertex) | (node2 == vertex))
        if rows.size == 0:
            continue
        rows = rows[np.argsort(weight[rows], kind='stable')]
        ranks = np.arange(1, rows.size + 1) / rows.size
        is_node1 = node1[rows] == vertex
        rank_i[rows[is_node1]] = ranks[is_node1]
        rank_j[rows[~is_node1]] = ranks[~is_node1]
    df.insert(3, 'Local Rank i', rank_i)
    df.insert(3, 'Local Rank j', rank_j)


def _insert_endpoint_extremes(df, centrality, label):
    """Insert '<label> Max' then '<label> Min' at column position 3.

    ``centrality`` maps node -> score; each edge receives the larger and the
    smaller score of its two endpoints.
    """
    at_i = df['Node1'].map(centrality)
    at_j = df['Node2'].map(centrality)
    df.insert(3, label + ' Max', np.maximum(at_i, at_j))
    df.insert(3, label + ' Min', np.minimum(at_i, at_j))

In [None]:
def calculate_feature():
    """Compute and store the feature table of every selected sample.

    A sample is selected when the fifth line of its log (the flag saying
    whether the ILP cost equals the LP cost) starts with "F". Logs with fewer
    than five lines are skipped.

    For each selected sample the feature table from ``generate_df`` is written
    to ``df_path + <sample> + df_suffix``. The runtime series of all samples
    are collected into one table (one row per sample) and written to
    ``df_path + "evaluation.csv"``.

    Raises
    ------
    KeyError
        If a selected sample has no entry in ``ds_dict``.
    """
    target_list = []
    for each, log_target in log_dict.items():
        with open(log_target) as f:
            lines = f.readlines()
        if len(lines) > 4 and lines[4].startswith("F"):
            target_list.append(each)

    # Collect the per-sample series and build the table once at the end;
    # DataFrame.append is not available in pandas 2.x.
    rows = []
    for each in target_list:
        fe_df, se = generate_df(ds_dict[each], log_dict[each])
        write_df(fe_df, df_path + each + df_suffix)
        rows.append(se.rename(each))

    pd.DataFrame(rows).to_csv(df_path + "evaluation.csv")

In [None]:
# Run the pipeline: writes one feature .csv per selected sample plus
# evaluation.csv into df_path.
calculate_feature()