# Feature Engineering

In [1]:
# Directory that holds the per-graph dataframe .csv files.
# Keep the trailing slash: read_df/save_df build paths by plain string concatenation.
df_path = "../df/"

In [2]:
# Read and save dataframe
import pandas as pd 
def read_df(file_name):
    """Load one dataframe from the dataframe directory.

    Args:
        file_name: File name appended verbatim to ``df_path``
            (the extension is not added here).

    Returns:
        The ``pandas.DataFrame`` parsed by ``pd.read_csv``.
    """
    csv_path = df_path + file_name
    return pd.read_csv(csv_path)

def save_df(df, file_name):
    """Write a dataframe as CSV into the dataframe directory.

    Args:
        df: Dataframe to persist; the index is not written.
        file_name: File name appended verbatim to ``df_path``
            (the extension is not added here).
    """
    csv_path = df_path + file_name
    df.to_csv(csv_path, index=False)

In [3]:
# Get the file list: every entry name in df_path (os.listdir does no extension filtering).
# NOTE(review): `files` is a notebook-level global read by calculate_feature and drop_col,
# and it is reassigned in the Linear Programming section; re-run this cell before
# calling those helpers again.
import os
files = os.listdir(df_path)

### For each feature computation, the steps are:
#### 1: Read each dataframe from .csv files
#### 2: Calculate the feature and save it as a new column
#### 3: Write the new dataframe back to the original .csv

In [4]:
def calculate_feature(func):
    """Apply a feature function to every dataframe listed in ``files``.

    Each file is read, handed to ``func`` and written back under the same
    name, so the stored CSV gains whatever columns ``func`` adds.

    Args:
        func: Callable taking a dataframe and modifying it in place; its
            return value is ignored.
    """
    for csv_name in files:
        frame = read_df(csv_name)
        # func is expected to mutate `frame`; nothing is taken from its return
        func(frame)
        save_df(frame, csv_name)

In [5]:
def drop_col(col_name):
    """Remove a column from every dataframe listed in ``files``.

    Each file is read, stripped of ``col_name`` and written back under the
    same name.

    Args:
        col_name: Column label (or labels) passed to ``DataFrame.drop(columns=...)``.
    """
    for csv_name in files:
        trimmed = read_df(csv_name).drop(columns=col_name)
        save_df(trimmed, csv_name)

### Feature Type 1: Weight related

#### Feature 1.1: Weight normalized by dividing by the max weight

In [6]:
def nor_by_max(df):
    """Add ``weight_nor_max``: each weight divided by the largest weight.

    The new column is inserted just before the current last column and the
    dataframe is modified in place.

    Args:
        df: Dataframe with a ``weight`` column.
    """
    weights = df['weight']
    scaled = weights / weights.max()
    insert_at = df.shape[1] - 1  # keep the existing last column last
    df.insert(insert_at, 'weight_nor_max', scaled)

In [7]:
# calculate_feature(nor_by_max)

#### Feature 1.2: Normalized weight by the absolute weight difference

In [8]:
def nor_by_abs(df):
    """Add ``weight_nor_abs``: min-max scaled weight, ``(w - min) / (max - min)``.

    The new column is inserted just before the current last column and the
    dataframe is modified in place. When every weight is equal the
    denominator is zero, so the resulting column is not finite.

    Args:
        df: Dataframe with a ``weight`` column.
    """
    weights = df['weight']
    lowest, highest = weights.min(), weights.max()
    scaled = (weights - lowest) / (highest - lowest)
    insert_at = df.shape[1] - 1  # keep the existing last column last
    df.insert(insert_at, 'weight_nor_abs', scaled)

In [9]:
# calculate_feature(nor_by_abs)

#### Feature 1.3: Edge weight normalized chi-square

In [10]:
def abs_variance(df):
    """Add ``weight_abs_var``: signed squared deviation of the normalized weight.

    For each row the deviation of ``weight_nor_abs`` from its column mean is
    multiplied by its own absolute value (so the sign is kept), then the
    whole column is shifted so that its smallest value becomes 0. No
    division by the range is performed. The column is inserted just before
    the current last column and the dataframe is modified in place.

    Args:
        df: Dataframe that already has a ``weight_nor_abs`` column.
    """
    normalized = df['weight_nor_abs']
    mean_weight = normalized.mean()
    signed_sq = (normalized - mean_weight) * (normalized - mean_weight).abs()
    # Shift so the minimum is 0
    signed_sq = signed_sq - signed_sq.min()
    insert_at = df.shape[1] - 1  # keep the existing last column last
    df.insert(insert_at, 'weight_abs_var', signed_sq)

In [11]:
# calculate_feature(abs_variance)

#### Feature 1.4: Local rank for edge (i,j) on vertex i and vertex j

In [10]:
def loc_rank(df):
    """Add ``loc_rank_i`` / ``loc_rank_j``: local weight rank of each edge at its endpoints.

    For every vertex id from 1 up to the largest id found in ``v1``/``v2``,
    the edges touching that vertex are sorted by ascending ``weight`` and
    each receives the fraction ``position / number_of_incident_edges``
    (1-based position, so values lie in (0, 1]). The fraction is stored in
    ``loc_rank_i`` when the vertex is the edge's ``v1`` and in ``loc_rank_j``
    otherwise. Both columns are inserted just before the current last
    column and the dataframe is modified in place.

    Args:
        df: Dataframe with ``v1``, ``v2`` and ``weight`` columns.
    """
    # The ranks are fractions, so the columns are created as float. Starting
    # from an integer dtype forces pandas to upcast on the first assignment,
    # which recent pandas versions deprecate and then reject.
    df.insert(len(df.columns)-1, 'loc_rank_i', np.zeros(len(df)))
    df.insert(len(df.columns)-1, 'loc_rank_j', np.zeros(len(df)))
    if len(df) == 0:
        # No edges: there is no largest vertex id to loop up to
        return

    # int() lets range() accept ids that were parsed as floats
    highest_vertex = int(max(df['v1'].max(), df['v2'].max()))
    for i in range(1, highest_vertex + 1):
        # Edges incident to vertex i, cheapest first, re-indexed 0..k-1
        loc_edges = df.loc[(df['v1'] == i) | (df['v2'] == i)]
        loc_edges = loc_edges.sort_values(by=['weight']).reset_index(drop=True)
        n_local = len(loc_edges)  # loop-invariant denominator
        for index, row in loc_edges.iterrows():
            same_edge = (df['v1'] == row['v1']) & (df['v2'] == row['v2'])
            # Vertex i is either the v1 ("i") or the v2 ("j") end of this edge
            target = 'loc_rank_i' if row['v1'] == i else 'loc_rank_j'
            df.loc[same_edge, target] = (index + 1) / n_local

In [13]:
import numpy as np
# calculate_feature(loc_rank)

### Feature Type 2: Centrality related

#### In this part, most features are calculated by NetworkX built-in functions

In [8]:
import networkx as nx

#### Feature 2.1 Degree Centrality

In [15]:
def degree_cen(df):
    """Add degree-centrality features for the endpoints of every edge.

    Builds an undirected graph from the ``v1``/``v2`` columns, computes
    ``nx.degree_centrality`` and adds four columns, each inserted just
    before the current last column, in this order: ``degree_cen_i``
    (centrality of ``v1``), ``degree_cen_j`` (centrality of ``v2``),
    ``degree_cen_max`` and ``degree_cen_min`` (row-wise max/min of the two).
    The dataframe is modified in place.

    Args:
        df: Dataframe with ``v1`` and ``v2`` columns.
    """
    # Build the graph straight from the endpoint columns (no per-row Python loop)
    G = nx.Graph()
    G.add_edges_from(zip(df['v1'], df['v2']))
    cen = nx.degree_centrality(G)

    # Look up each endpoint's centrality in one vectorised pass; any id
    # without an entry keeps the 0 default
    cen_i = df['v1'].map(cen).fillna(0.0).astype(float)
    cen_j = df['v2'].map(cen).fillna(0.0).astype(float)
    df.insert(len(df.columns)-1, 'degree_cen_i', cen_i)
    df.insert(len(df.columns)-1, 'degree_cen_j', cen_j)

    endpoint_cen = df[['degree_cen_i', 'degree_cen_j']]
    df.insert(len(df.columns)-1, 'degree_cen_max', endpoint_cen.max(axis=1))
    df.insert(len(df.columns)-1, 'degree_cen_min', endpoint_cen.min(axis=1))

In [16]:
# calculate_feature(degree_cen)
# drop_col('degree_cen_i')
# drop_col('degree_cen_j')

#### Feature 2.2 Betweenness Centrality

In [17]:
def between_cen(df, k=10, seed=42):
    """Add (sampled) betweenness-centrality features for the endpoints of every edge.

    Builds an undirected weighted graph from ``v1``/``v2``/``weight``,
    computes ``nx.betweenness_centrality`` using ``weight`` as the edge
    weight, and adds four columns, each inserted just before the current
    last column, in this order: ``between_cen_i`` (centrality of ``v1``),
    ``between_cen_j`` (centrality of ``v2``), ``between_cen_max`` and
    ``between_cen_min`` (row-wise max/min of the two). The dataframe is
    modified in place.

    Args:
        df: Dataframe with ``v1``, ``v2`` and ``weight`` columns.
        k: Number of pivot nodes sampled for the estimate. When the graph
            has no more than ``k`` nodes the exact computation is used
            instead, because sampling more nodes than exist would raise.
        seed: Seed for the pivot sampling so that repeated runs give the
            same feature values. Pass ``None`` for unseeded sampling.
    """
    # Build the graph straight from the columns (no per-row Python loop)
    G = nx.Graph()
    G.add_weighted_edges_from(zip(df['v1'], df['v2'], df['weight']))

    # Fall back to the exact algorithm on graphs too small to sample k pivots
    sample_size = k if (k is not None and k < G.number_of_nodes()) else None
    cen = nx.betweenness_centrality(G, k=sample_size, normalized=True,
                                    weight='weight', seed=seed)

    # Look up each endpoint's centrality in one vectorised pass; any id
    # without an entry keeps the 0 default
    cen_i = df['v1'].map(cen).fillna(0.0).astype(float)
    cen_j = df['v2'].map(cen).fillna(0.0).astype(float)
    df.insert(len(df.columns)-1, 'between_cen_i', cen_i)
    df.insert(len(df.columns)-1, 'between_cen_j', cen_j)

    endpoint_cen = df[['between_cen_i', 'between_cen_j']]
    df.insert(len(df.columns)-1, 'between_cen_max', endpoint_cen.max(axis=1))
    df.insert(len(df.columns)-1, 'between_cen_min', endpoint_cen.min(axis=1))

In [18]:
# calculate_feature(between_cen)
# drop_col('between_cen_i')
# drop_col('between_cen_j')

#### Feature 2.3 Vote Rank

In [6]:
def vote_rank(df):
    """Add VoteRank membership flags for the endpoints of every edge.

    Builds an undirected graph from ``v1``/``v2``, runs ``nx.voterank`` and
    adds four columns, each inserted just before the current last column,
    in this order: ``vote_rank_i`` (1.0 if ``v1`` is in the returned list,
    else 0.0), ``vote_rank_j`` (same for ``v2``), ``vote_rank_min`` and
    ``vote_rank_max`` (row-wise min/max of the two flags). The dataframe is
    modified in place.

    Args:
        df: Dataframe with ``v1`` and ``v2`` columns.
    """
    # Build the graph straight from the endpoint columns. Edge weights are
    # not attached because nx.voterank takes no weight argument.
    G = nx.Graph()
    G.add_edges_from(zip(df['v1'], df['v2']))
    ranked = set(nx.voterank(G))

    # Flag membership in one vectorised pass instead of one df.loc per vertex
    df.insert(len(df.columns)-1, 'vote_rank_i', df['v1'].isin(ranked).astype(float))
    df.insert(len(df.columns)-1, 'vote_rank_j', df['v2'].isin(ranked).astype(float))

    endpoint_flags = df[['vote_rank_i', 'vote_rank_j']]
    # Column order (min before max) is kept as in the stored dataframes
    df.insert(len(df.columns)-1, 'vote_rank_min', endpoint_flags.min(axis=1))
    df.insert(len(df.columns)-1, 'vote_rank_max', endpoint_flags.max(axis=1))

In [11]:
# calculate_feature(vote_rank)

In [12]:
# Found this feature useless from my evaluation
# drop_col("vote_rank_i")
# drop_col("vote_rank_j")

#### Feature 2.4 Eigenvector Centrality

In [22]:
def eigen_cen(df):
    """Add eigenvector-centrality features for the endpoints of every edge.

    Builds an undirected weighted graph from ``v1``/``v2``/``weight``,
    computes ``nx.eigenvector_centrality`` (weighted, up to 99999 power
    iterations) and adds four columns, each inserted just before the
    current last column, in this order: ``eigen_cen_i`` (centrality of
    ``v1``), ``eigen_cen_j`` (centrality of ``v2``), ``eigen_cen_max`` and
    ``eigen_cen_min`` (row-wise max/min of the two). The dataframe is
    modified in place.

    Args:
        df: Dataframe with ``v1``, ``v2`` and ``weight`` columns.
    """
    # Build the graph straight from the columns (no per-row Python loop)
    G = nx.Graph()
    G.add_weighted_edges_from(zip(df['v1'], df['v2'], df['weight']))
    cen = nx.eigenvector_centrality(G, max_iter=99999, weight='weight')

    # Look up each endpoint's centrality in one vectorised pass; any id
    # without an entry keeps the 0 default
    cen_i = df['v1'].map(cen).fillna(0.0).astype(float)
    cen_j = df['v2'].map(cen).fillna(0.0).astype(float)
    df.insert(len(df.columns)-1, 'eigen_cen_i', cen_i)
    df.insert(len(df.columns)-1, 'eigen_cen_j', cen_j)

    endpoint_cen = df[['eigen_cen_i', 'eigen_cen_j']]
    df.insert(len(df.columns)-1, 'eigen_cen_max', endpoint_cen.max(axis=1))
    df.insert(len(df.columns)-1, 'eigen_cen_min', endpoint_cen.min(axis=1))

In [23]:
# calculate_feature(eigen_cen)
# drop_col('eigen_cen_i')
# drop_col('eigen_cen_j')

### Feature Type 3: Linear Programming

In [24]:
# Read and save dataframe
import pandas as pd 
def read_df(file_name):
    """Load ``../df/<file_name>.csv`` into a dataframe.

    This definition replaces the ``read_df`` from the top of the notebook.
    That earlier version expected the extension to be part of the name, so
    both spellings are accepted here: ``"graph"`` and ``"graph.csv"`` open
    the same file instead of the latter resolving to ``graph.csv.csv``.

    Args:
        file_name: Base name of the CSV file, with or without ``.csv``.

    Returns:
        The ``pandas.DataFrame`` parsed by ``pd.read_csv``.
    """
    if not file_name.endswith(".csv"):
        file_name = file_name + ".csv"
    return pd.read_csv("../df/" + file_name)

def save_df(df, file_name):
    """Write a dataframe to ``../df/<file_name>.csv`` without the index.

    This definition replaces the ``save_df`` from the top of the notebook.
    That earlier version expected the extension to be part of the name, so
    both spellings are accepted here: ``"graph"`` and ``"graph.csv"`` write
    the same file instead of the latter producing ``graph.csv.csv``.

    Args:
        df: Dataframe to persist.
        file_name: Base name of the CSV file, with or without ``.csv``.
    """
    if not file_name.endswith(".csv"):
        file_name = file_name + ".csv"
    df.to_csv("../df/" + file_name, index=False)

In [25]:
import numpy as np
def read_graph(name):
    """Parse a graph description file into ``[vertices, arcs, terminals]``.

    Lines are dispatched on their first whitespace-separated token:
    ``Nodes <n>``, ``Edges <m>`` and ``Terminals <p>`` give the counts and
    ``E <i> <j> <c>`` gives an undirected edge with integer cost ``c``.
    Every other line (blank, whitespace-only or with another keyword) is
    ignored.

    Args:
        name: Path of the file to read.

    Returns:
        A three-element list ``[vertices, arcs, terminals]`` where
        ``vertices`` is ``[1, ..., n]``, ``arcs`` holds both orientations of
        every edge as ``((i, j), c)`` and ``((j, i), c)`` in file order, and
        ``terminals`` is ``[1, ..., p]``.
        NOTE(review): terminals are taken to be the first ``p`` vertex ids;
        any per-terminal lines in the file are not read. Confirm this holds
        for the datasets used.

    Raises:
        ValueError: If the ``Nodes``, ``Edges`` or ``Terminals`` line is missing.
        AssertionError: If the declared edge count differs from the number
            of ``E`` lines.
    """
    n_vertices = n_edges = n_terminals = None
    arcs = []
    # The context manager closes the file; no explicit close() is needed
    with open(name) as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank or whitespace-only line
                continue
            det = parts[0]
            if det == 'Nodes':
                n_vertices = int(parts[1])
            elif det == 'Edges':
                n_edges = int(parts[1])
            elif det == 'E':
                i, j, c = int(parts[1]), int(parts[2]), int(parts[3])
                # Store the undirected edge as two opposite arcs of equal cost
                arcs.append(((i, j), c))
                arcs.append(((j, i), c))
            elif det == 'Terminals':
                n_terminals = int(parts[1])

    counts = {'Nodes': n_vertices, 'Edges': n_edges, 'Terminals': n_terminals}
    missing = [key for key, value in counts.items() if value is None]
    if missing:
        raise ValueError("missing section(s) in graph file: " + ", ".join(missing))

    assert n_edges == len(arcs) / 2, "declared edge count does not match the E lines"

    vertices = list(range(1, n_vertices + 1))
    terminals = list(range(1, n_terminals + 1))
    ### The format of graphs is dG=(V,dE,Z)
    return [vertices, arcs, terminals]

In [26]:
import os
# Parse every graph file of one dataset folder into `graphs`, keyed by file name
# without its last four characters (presumably a 3-letter extension; verify).
graphs = {}
path = "../ds/"
size = "I080/"
# NOTE(review): this rebinds the global `files` that calculate_feature and drop_col
# iterate over; after this cell they would loop over the dataset files, not the
# dataframe .csv files.
files = os.listdir(path+size)
for file in files:
    file_name = file[:-4]
    graph = read_graph(path+size+file)
    graphs[file_name] = graph

In [27]:
import gurobipy as gp
from gurobipy import GRB
def get_lp(graph):
    """Build and solve the relaxed Steiner-tree model for one graph with Gurobi.

    The first terminal is used as the source vertex and every arc entering
    it is removed. Arc variables ``x[i, j]`` are continuous in [0, 1];
    vertex variables ``u[j]`` are integer in [-1, n]. The objective
    minimises the total cost of the selected arcs.
    NOTE(review): because ``u`` is declared INTEGER, the model handed to
    Gurobi is mixed-integer rather than a pure LP; confirm this is intended.

    Args:
        graph: ``[vertices, arcs, terminals]`` as produced by ``read_graph``,
            where ``arcs`` is a list of ``((i, j), cost)`` pairs.

    Returns:
        A list of ``((i, j), value)`` pairs, one per remaining arc, in the
        order the arc variables were created, where ``value`` is the
        variable's solution value after ``optimize()``.
    """
    # read dG=(V,dE,Z)
    vertices, arcs, terminals = graph

    n = len(vertices)  # n = number of vertices

    # choose the first terminal as the source vertex
    v0 = terminals[0]

    # delete all the arcs that enter the source vertex
    arcs = [arc for arc in arcs if arc[0][1] != v0]

    # create the tuple dictionary of arcs: (i, j) -> cost
    arcs_dict = gp.tupledict(arcs)

    # create the model
    model = gp.Model("Steiner")

    # create the decision variables
    x = model.addVars(arcs_dict.keys(), lb=0, ub=1, vtype=GRB.CONTINUOUS, name='x')
    u = model.addVars(vertices, lb=-1, ub=n, vtype=GRB.INTEGER, name='u')

    # set up the objective function
    # equation (1)
    model.setObjective(
        gp.quicksum(arcs_dict[i, j] * x[i, j] for (i, j) in arcs_dict.keys()),
        GRB.MINIMIZE,
    )

    # create the constraints for every vertex except the source
    non_source = [v for v in vertices if v != v0]
    for j in non_source:  # number: 3n - 3
        # equation (2)
        model.addConstr(x.sum('*', j) <= 1)
        # equation (3)
        model.addConstr(n * x.sum('*', j) >= u[j] + 1)
        # equation (4)
        model.addConstr((n + 1) * x.sum('*', j) <= n * (u[j] + 1))

    for (i, j) in arcs_dict.keys():  # number: 4m
        # equation (6)
        model.addConstr(1 - n * (1 - x[i, j]) <= u[j] - u[i])
        model.addConstr(1 + n * (1 - x[i, j]) >= u[j] - u[i])

    # Every terminal other than the source gets u >= 0. The source is fixed
    # below, so it is skipped here.
    for j in terminals[1:]:  # number: p - 1
        # equation (7)
        model.addConstr(u[j] >= 0)

    # equation (7)
    model.addConstr(u[v0] == 0)

    # update the model
    model.update()

    # optimize the model
    model.optimize()

    # Read the arc values straight from the variable dictionary instead of
    # parsing "x[i,j]" out of the variable names
    edges = [((i, j), var.X) for (i, j), var in x.items()]

    return edges

In [28]:
# for file in files:
#     # Read the graph of dataset first
#     file_name = file[:-4]
#     graph = graphs[file_name]
#     edges = get_lp(graph)
#     df = pd.read_csv(df_path+file_name+".csv")
#     df.insert(len(df.columns)-1,'lp_value_i_j',np.zeros(len(df)))
#     df.insert(len(df.columns)-1,'lp_value_j_i',np.zeros(len(df)))
#     for (i, j), v in edges:
#         df.loc[(df['v1'] == i) & (df['v2'] == j), 'lp_value_i_j'] = v
#         df.loc[(df['v2'] == i) & (df['v1'] == j), 'lp_value_j_i'] = v
#     df.insert(len(df.columns)-1,'lp_value',np.zeros(len(df)))
#     for index, row in df.iterrows():
#         df.loc[index, 'lp_value'] = max(row['lp_value_i_j'], row['lp_value_j_i'])
#     df = df.drop(columns='lp_value_i_j')
#     df = df.drop(columns='lp_value_j_i')
#     save_df(df, file_name+".csv")

In [13]:
# drop_col("vote_rank_max")
# drop_col("vote_rank_min")
# drop_col("lp_value")