In [1]:
# Directory holding the per-instance feature CSVs and "evaluation.csv".
df_path = '../df/'
# Directory whose sub-folders hold the raw graph instance files.
ds_path = '../ds/'
# File extension of the raw graph instances.
ds_suffix = '.stp'
# File extension appended to an instance name to locate its feature CSV.
df_suffix = '.csv'

In [2]:
import os
# Map the first eight characters of every instance file name to the file's
# path; each sub-folder of ds_path is scanned in turn.
ds_dict = {}
for folder in os.listdir(ds_path):
    folder_path = ds_path + folder + '/'
    for each in os.listdir(folder_path):
        ds_dict[each[:8]] = folder_path + each

In [3]:
# Get files
import os
import pandas as pd
# Every file in df_path except the summary table is a per-instance sample.
df_files = os.listdir(df_path)
df_files.remove("evaluation.csv")
# Summary table, indexed by its first column.
df_eval = pd.read_csv(df_path + "evaluation.csv", index_col=0)
# One frame per remaining file, in directory-listing order.
df_samples = [pd.read_csv(df_path + file, index_col=0) for file in df_files]

In [4]:
# Order the instances by ILP runtime once; the ten with the largest runtime
# form the evaluation set and everything else is used for training.
runtime_order = df_eval.sort_values(by=['ILP Runtime']).index
train_list = runtime_order[:-10]
evalu_list = runtime_order[-10:]
# Instance name -> feature frame read from "<df_path><name><df_suffix>".
train_df = {
    filename: pd.read_csv(df_path + filename + df_suffix)
    for filename in train_list
}
evalu_df = {
    filename: pd.read_csv(df_path + filename + df_suffix)
    for filename in evalu_list
}

In [5]:
import pandas as pd
def split_x_y(df_list):
    """Stack several sample frames and separate features from the target.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames that each contain the columns 'Node1', 'Node2', 'Weight'
        and 'Label' alongside the feature columns.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is the concatenated frame without the four
        columns named above and ``y`` is the concatenated 'Label' column.
        Both carry a fresh 0..n-1 index.
    """
    merged = pd.concat(df_list, ignore_index=True)
    non_feature_columns = ['Node1', 'Node2', 'Weight', 'Label']
    labels = merged['Label']
    features = merged.drop(columns=non_feature_columns)
    return features, labels

In [6]:
# Feature Importance Check

In [7]:
# Pool the training and evaluation instances into a single feature/label set.
X, Y = split_x_y(list(train_df.values()) + list(evalu_df.values()))

In [11]:
from sklearn.ensemble import RandomForestClassifier
def takeSecond(elem):
    """Return the second item of a pair; sort key for (feature, importance)."""
    return elem[1]
# Default hyper-parameters apart from class balancing. The seed is fixed so
# that the importance ranking is the same on every Restart & Run All.
clf_rf = RandomForestClassifier(class_weight='balanced', random_state=0)
clf_rf.fit(X,Y)
features = X.columns
feature_importance = list(zip(features, clf_rf.feature_importances_))
# Rank once, then build the table in a single step instead of growing it
# row by row with .loc.
ranked_importance = sorted(feature_importance, key=takeSecond, reverse=True)
df = pd.DataFrame(
    {'Importance': [score for _, score in ranked_importance]},
    index=[feature for feature, _ in ranked_importance],
)
print(df.to_latex(index=True))

\begin{tabular}{lr}
\toprule
{} &  Importance \\
\midrule
LP                         &    0.461665 \\
Normalized Weight          &    0.108099 \\
Variance                   &    0.082869 \\
Degree Centrality Max      &    0.052033 \\
Eigenvector Centrality Max &    0.047566 \\
Betweenness Centrality Max &    0.047523 \\
Degree Centrality Min      &    0.045958 \\
Local Rank j               &    0.041085 \\
Eigenvector Centrality Min &    0.039031 \\
Betweenness Centrality Min &    0.038027 \\
Local Rank i               &    0.036145 \\
\bottomrule
\end{tabular}



In [None]:
from sklearn import svm
# Fit on the training instances; the evaluation instances are kept aside as
# the test set used when choosing the probability threshold.
X_train, Y_train = split_x_y(list(train_df.values()))
X_test, Y_test = split_x_y(list(evalu_df.values()))
# Default hyper-parameters apart from class balancing. probability=True is
# needed for predict_proba; its calibration shuffles the data internally, so
# the seed is fixed to keep the predicted probabilities reproducible.
clf_svm = svm.SVC(class_weight='balanced', probability=True, random_state=0)
clf_svm.fit(X_train,Y_train)

In [None]:
# Choose threshold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
# Candidate probability cut-offs; evaluation_confusion tries them in this order.
my_thresholds = [0.005,0.006,0.007,0.008,0.009]
# Filled by evaluation_confusion: classifier class name -> thresholds kept for it.
clf_thresholds = {}
def evaluation_confusion(clf):
    """Report pruning rate and positive-edge loss for each candidate threshold.

    For every value in ``my_thresholds`` the positive-class probability of
    ``clf`` on ``X_test`` is thresholded and compared with ``Y_test``.
    Two percentages are collected per threshold:

    * pruning rate  = 100 * (1 - (tp + fp) / number of samples)
    * loss rate     = 100 * (1 - tp / (tp + fn))

    The scan stops at the first threshold whose loss rate equals the previous
    one while differing from the first one; that threshold is dropped and only
    the earlier thresholds are stored in ``clf_thresholds`` under the
    classifier's class name. If the scan never stops early, a copy of all
    thresholds is stored. Both lists of percentages are printed.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba``.
    """
    clf_name = clf.__class__.__name__
    x_pruning = []
    y_loss = []
    clf_thresholds[clf_name] = []
    # The probabilities do not depend on the threshold, so compute them once.
    positive_proba = clf.predict_proba(X_test)[:, 1]
    for threshold in my_thresholds:
        Y_pred = (positive_proba >= threshold).astype('int')
        # labels=[0, 1] forces a 2x2 matrix even when only one class occurs in
        # Y_test/Y_pred; otherwise the four-way unpacking below fails.
        tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred, labels=[0, 1]).ravel()
        x_pruning.append(round(100*(1-(tp+fp)/len(Y_pred)),2))
        y_loss.append(round(100*(1-(tp)/(tp+fn)),2))
        # If the threshold change cannot make change anymore on performance
        if (y_loss[-1] != y_loss[0]) and (y_loss[-1] == y_loss[-2]):
            y_loss = y_loss[:-1]
            x_pruning = x_pruning[:-1]
            print("The threshold <= {} are useless for classifier {}".format(threshold, clf_name))
            clf_thresholds[clf_name] = my_thresholds[:my_thresholds.index(threshold)]
            break
    # Every threshold number is useful
    if len(clf_thresholds[clf_name]) == 0:
        clf_thresholds[clf_name] = my_thresholds.copy()
    print("Purning Rate(%):", x_pruning)
    print("Optimal Edges Loss Rate(%):", y_loss)

evaluation_confusion(clf_svm)

In [None]:
import numpy as np
def read_graph(name):
    """Parse a graph instance file into ``[vertices, arcs, terminals]``.

    Lines are split on whitespace and dispatched on their first token:

    * ``Nodes <n>``      - number of vertices
    * ``Edges <m>``      - number of undirected edges
    * ``E <i> <j> <c>``  - undirected edge, stored as the two directed arcs
      ``((i, j), c)`` and ``((j, i), c)``
    * ``Terminals <t>``  - number of terminals

    Every other line (including blank or whitespace-only lines) is ignored.

    NOTE(review): individual ``T <node>`` lines are not read; the terminals
    are taken to be the vertices ``1..t``. Confirm this matches the data set.

    Parameters
    ----------
    name : str
        Path of the instance file.

    Returns
    -------
    list
        ``[vertices, arcs, terminals]`` where ``vertices`` is ``[1..n]``,
        ``arcs`` is a list of ``((i, j), cost)`` tuples with integer entries
        and ``terminals`` is ``[1..t]``.

    Raises
    ------
    ValueError
        If the ``Nodes``, ``Edges`` or ``Terminals`` header is missing.
    AssertionError
        If the declared edge count differs from the number of ``E`` lines.
    """
    n_vertices = n_edges = n_terminals = None
    arcs = []
    with open(name) as f:
        for line in f:
            parts = line.split()
            # Skips empty lines as well as lines that only contain whitespace.
            if not parts:
                continue
            det = parts[0]
            if det == 'Nodes':
                n_vertices = int(parts[1])
            elif det == 'Edges':
                n_edges = int(parts[1])
            elif det == 'E':
                i, j, c = int(parts[1]), int(parts[2]), int(parts[3])
                arcs.append(((i, j), c))
                arcs.append(((j, i), c))
            elif det == 'Terminals':
                n_terminals = int(parts[1])

    missing = [
        header
        for header, value in (('Nodes', n_vertices), ('Edges', n_edges), ('Terminals', n_terminals))
        if value is None
    ]
    if missing:
        raise ValueError("{}: missing header line(s): {}".format(name, ', '.join(missing)))

    # Each undirected edge contributed exactly two arcs.
    assert n_edges == len(arcs) // 2, "declared edge count does not match the 'E' lines"

    vertices = list(range(1, n_vertices + 1))
    terminals = list(range(1, n_terminals + 1))
    ### The format of graphs is D=(V,A,R)
    return [vertices, arcs, terminals]

In [None]:
import gurobipy as gp
from gurobipy import GRB
def formulation_3_ILP(graph, warm=None, cutoff=None, limit=None, P=None):
    """Build and solve the flow-based Steiner tree ILP with Gurobi.

    The first terminal is used as root. For every remaining terminal ``k``
    one unit of flow ``x[i, j, k]`` is routed from the root to ``k``; an arc
    may only carry flow if its design variable ``y[i, j]`` is set, and the
    total cost of the selected arcs is minimised. Arcs entering the root are
    removed before the model is built.

    Parameters
    ----------
    graph : sequence ``(N, E, V)``
        ``N`` - list of node ids, ``E`` - list of ``((i, j), cost)`` arcs,
        ``V`` - list of terminals (``V[0]`` becomes the root).
    warm : iterable of ``(i, j)`` pairs, optional
        Arcs whose ``y`` variable gets the start value 1.0. The entries are
        converted with ``int`` so string pairs are accepted.
    cutoff : float, optional
        Value assigned to Gurobi's ``cutoff`` parameter.
    limit : int, optional
    P : list of ``((i, j), cost)`` arcs, optional
        When both ``limit`` and ``P`` are given, the sum of ``y`` over arcs
        that are not in ``P`` is bounded by ``limit``.

    Returns
    -------
    tuple
        ``(opt_vertices, opt_cost, opt_runtime)``: ``opt_vertices`` is a list
        of ``("i,j", value)`` pairs for every selected arc, ``opt_cost`` is
        the model's objective value and ``opt_runtime`` its solve time.
    """
    # N: all nodes, E: arcs, V: terminals
    N, E, V = graph

    # Choose the first terminal as root; the rest are the flow targets.
    root = V[0]
    V = V[1:]

    # Delete all the arcs that enter the root.
    E = [arc for arc in E if arc[0][1] != root]

    # (i, j) -> cost
    E_dict = gp.tupledict(E)

    m = gp.Model("Steiner_formulation_3")

    E_dict_keys = E_dict.keys()
    # One flow variable per (arc, target terminal).
    X_dict = [(i, j, k) for k in V for (i, j) in E_dict_keys]

    x = m.addVars(X_dict,lb=0,vtype=GRB.INTEGER, name='x') # size: |V| * |E|
    y = m.addVars(E_dict_keys,vtype=GRB.INTEGER, name='y') # size: |E|

    # objective 2.1: total cost of the selected arcs
    m.setObjective(gp.quicksum(E_dict[i, j] * y[i, j] for (i, j) in E_dict_keys), GRB.MINIMIZE)

    if cutoff is not None:
        m.Params.cutoff = cutoff

    # constraint 2.2: flow balance (out - in) per node and target terminal
    for i in N:
        for k in V:
            if i == root:
                balance = 1
            elif i == k:
                balance = -1
            else:
                balance = 0
            m.addConstr(x.sum(i,'*',k) - x.sum('*',i,k) == balance)

    # constraint 2.3: flow only on selected arcs
    for i,j,k in X_dict:
        m.addConstr(x[i,j,k] <= y[i,j])

    # Warm start
    if warm is not None:
        for i,j in warm:
            i,j = int(i), int(j)
            y[i,j].Start = 1.0

    # Soft pruning: arcs outside P stay in the model but their total use is
    # capped by `limit`.
    if limit is not None and P is not None:
        P = [arc for arc in P if arc[0][1] != root]
        y_P = [y[i,j] for ((i,j),c) in P]
        # y.sum() adds up the variables; iterating the tupledict itself (as
        # quicksum(y) would) yields its (i, j) keys rather than the variables.
        m.addConstr(y.sum() - gp.quicksum(y_P) <= limit)

    m.update()
    m.optimize()

    opt_cost = m.objVal

    # Collect the selected arcs. Solver values are floats, so compare against
    # 0.5 instead of testing for exact inequality with 0.
    opt_vertices = []
    for v in m.getVars():
        if v.varName.startswith('y') and v.x > 0.5:
            # 'y[i,j]' -> 'i,j'
            opt_vertices.append((v.varName[2:-1], v.x))

    opt_runtime = m.Runtime

    return opt_vertices, opt_cost, opt_runtime

In [None]:
# Probability cut-off used to keep an edge (one of the candidates in my_thresholds).
PRUNE_THRESHOLD = 0.006

part_b_list = {}
for name in evalu_df.keys():
    sample = evalu_df[name]
    x = sample.drop(columns=['Node1','Node2','Weight','Label'])
    y_proba = clf_svm.predict_proba(x)
    y = (y_proba[:,1] >= PRUNE_THRESHOLD).astype('int')
    edges = sample[['Node1','Node2','Weight','LP','Label']].assign(Predict=y)
    # Keep an edge if the LP value is positive OR the classifier predicts it.
    kept_edges = edges.loc[(edges['LP'] > 0) | (edges['Predict'] > 0)]
    # Every kept undirected edge becomes two directed arcs.
    arcs = []
    for i, j, c in zip(kept_edges['Node1'], kept_edges['Node2'], kept_edges['Weight']):
        i, j, c = int(i), int(j), int(c)
        arcs.append(((i,j),c))
        arcs.append(((j,i),c))
    # Graph is the un-pruned graph
    graph = read_graph(ds_dict[name])

    # PART A: Calculate ILP by Pruned Set
    # graph: 0: Nodes, 1: Arcs, 2: Terminals
    sol, obj, rt = formulation_3_ILP([graph[0], arcs, graph[2]])
    df_eval.loc[name,'ILP Objective Pruned'] = obj
    df_eval.loc[name,'ILP Runtime Pruned'] = rt
    # Arc costs are cast to int, so two different objective values differ by
    # at least 1; a 0.5 tolerance absorbs solver round-off without hiding a
    # genuinely worse solution. (`not ... <` keeps NaN references "different".)
    if not abs(float(obj) - float(df_eval.loc[name,'ILP Objective'])) < 0.5:
        part_b_list[name] = [sol, obj, arcs]
df_eval.to_csv(df_path+"evaluation.csv")

In [None]:
part_c_list = {}
for name in part_b_list.keys():
    # Graph is the un-pruned graph
    graph = read_graph(ds_dict[name])
    pruned_sol, pruned_obj, arcs = part_b_list[name]
    reference_obj = float(df_eval.loc[name,'ILP Objective'])

    # PART B:
    # The pruned ILP missed the reference objective, so solve on the full
    # graph with a warm start while allowing at most N arcs outside the
    # pruned set, for N = 1 and then N = 2.
    sol, obj = pruned_sol, pruned_obj
    for N in [1,2]:
        # ("i,j", value) pairs -> ['i', 'j'] lists expected by the warm start
        sol = [each[0].split(',') for each in sol]
        sol, obj, rt = formulation_3_ILP(graph, sol, obj, N, arcs)
        df_eval.loc[name,'ILP Objective N = {}'.format(N)] = obj
        df_eval.loc[name,'ILP Runtime N = {}'.format(N)] = rt
        df_eval.to_csv(df_path+"evaluation.csv")
        # Arc costs are integers, so a 0.5 tolerance only absorbs solver
        # round-off in the objective value.
        if abs(float(obj) - reference_obj) < 0.5:
            break

    if not abs(float(obj) - reference_obj) < 0.5:
        # Hand the pruned-graph solution on to Part C. It was already computed
        # in Part A for exactly these arcs, so it is reused rather than
        # solving the same ILP a second time.
        part_c_list[name] = [pruned_sol, pruned_obj]

In [None]:
for name in part_c_list.keys():
    graph = read_graph(ds_dict[name])
    # Solution and objective of the pruned-graph ILP for this instance.
    sol, obj = part_c_list[name]

    # PART C
    # If the optimal result is still not gained, solve the full graph using
    # only the warm start (and the pruned objective as cutoff).
    # Arc costs are integers, so a 0.5 tolerance only absorbs solver round-off.
    if not abs(float(obj) - float(df_eval.loc[name,'ILP Objective'])) < 0.5:
        # The stored pruned solution is used directly. Re-solving the pruned
        # ILP here would need this instance's pruned arcs, which are not part
        # of part_c_list; a leftover global `arcs` would belong to whichever
        # instance was processed last.
        sol = [each[0].split(',') for each in sol]
        sol, obj, rt = formulation_3_ILP(graph, sol, obj)
        df_eval.loc[name,'ILP Objective Warm'] = obj
        df_eval.loc[name,'ILP Runtime Warm'] = rt
        df_eval.to_csv(df_path+"evaluation.csv")

James Fitzpatrick, Deepak Ajwani, Paula Carroll:
Learning to Sparsify Travelling Salesman Problem Instances. CPAIOR 2021: 410-426

Dena Tayebi, Saurabh Ray, Deepak Ajwani:
Learning to Prune Instances of k-median and Related Problems. ALENEX 2022

Richard T. Wong:
A dual ascent approach for Steiner tree problems on a directed graph. Mathematical Programming 28 (1984): 271-287