# Pipeline

In [1]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pickle as pkl
from page_rank import *
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from utils import *

## Building dataset

Getting train data

In [2]:
train = pd.read_csv('data/train.txt', sep = ' ', names = ['node1', 'node2', 'is_linked'])

Getting node information

In [3]:
node_info = pd.read_csv('data/node_information.csv', header = None)

In [4]:
#node_info = node_info.rename(columns = {0 : 'id'})
col_names = {0 : 'id', '0' : 'id'}
for i, col in enumerate(node_info.columns):
    if i > 0:
        col_names[i] = str(i)

node_info = node_info.rename(columns = col_names)

Computing the graph

In [5]:
graph = nx.Graph()

for _, row in train.iterrows():
    if row['is_linked'] == 1:
        graph.add_edge(row['node1'], row['node2'], capacity = 1)

Adding node features

In [6]:
df = train.merge(node_info, how = 'inner', left_on = ['node1'], right_on = ['id'])
df = df.drop(['id'], axis = 1)
df = df.merge(node_info, how = 'inner', left_on = ['node2'], right_on = ['id'], suffixes = ('_1', '_2'))
df = df.drop(['id'], axis = 1)

Adding max flow (computation is heavy so we load the precomputed flows)

In [7]:
max_flow_data = pd.read_csv('data/cache/train_max_flow.csv')

In [8]:
df = df.merge(max_flow_data[['node1', 'node2', 'max_flow']], how = 'left', on = ['node1', 'node2'])

Adding page rank

In [9]:
page_rank_res = page_rank(graph)

In [10]:
df['page_rank_sum'] = df.apply(lambda x: page_rank_res[x['node1']] + page_rank_res[x['node2']], axis = 1)
df['page_rank_diff'] = df.apply(lambda x: np.abs(page_rank_res[x['node1']] - page_rank_res[x['node2']]), axis = 1)

In [11]:
df.sample(2)

Unnamed: 0,node1,node2,is_linked,1_1,2_1,3_1,4_1,5_1,6_1,7_1,...,926_2,927_2,928_2,929_2,930_2,931_2,932_2,max_flow,page_rank_sum,page_rank_diff
5025,396,3252,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.001154,0.000963
7813,1279,3280,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.00047,0.000282


Adding common neighbors overlap

In [12]:
df['common_neighbors'] = df.apply(lambda x: len(list(nx.common_neighbors(graph, x['node1'], x['node2']))), 
                                   axis = 1)

Adding degrees

In [13]:
df['degree_sum'] = df.apply(lambda x: nx.degree(graph, x['node1']) + nx.degree(graph, x['node2']), axis = 1)
df['degree_diff'] = df.apply(lambda x: np.abs(nx.degree(graph, x['node1']) - nx.degree(graph, x['node2'])), axis = 1)

Adding Jaccard coefficient

In [14]:
df['jaccard'] = df.apply(lambda x: len(set(graph.neighbors(x['node1'])) & set(graph.neighbors(x['node2']))) / len(set(graph.neighbors(x['node1'])) | set(graph.neighbors(x['node2']))), axis = 1)

Adding Adammic / Adar coefficient

In [15]:
df['adamic_adar'] = df.apply(lambda x: adamic_adar(graph, x['node1'], x['node2']), axis = 1)

In [18]:
df.sample(10)

Unnamed: 0,node1,node2,is_linked,1_1,2_1,3_1,4_1,5_1,6_1,7_1,...,931_2,932_2,max_flow,page_rank_sum,page_rank_diff,common_neighbors,degree_sum,degree_diff,jaccard,adamic_adar
10464,842,1441,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,0.000192,1e-06,0,2,0,0.0,0.0
2867,304,2299,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,0.000567,0.000379,0,6,4,0.0,0.0
2499,1525,1672,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3,0.001434,0.000859,0,15,9,0.0,0.0
881,6397,7174,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,0.003134,0.002754,0,34,30,0.03125,0.291207
5426,2908,3577,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1,0.000285,9.3e-05,0,3,1,0.0,0.0
7979,1255,7447,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2,0.000497,8.5e-05,0,5,1,0.0,0.0
1693,1168,4422,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3,0.001789,0.001224,1,19,13,0.055556,0.306928
6691,4611,7210,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5,0.00138,0.000208,1,14,2,0.076923,0.513898
2019,2316,5560,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2,0.00264,0.002263,0,28,24,0.0,0.0
1838,836,6386,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,18,0.004326,0.000745,2,46,8,0.045455,1.701013


Computing KatzB measure using matrix formulation

In [41]:
beta = 0.5

In [42]:
M, katzB_index, katzB_revex = katzB_matrix(graph, beta)

NameError: name 'katzB_matrix' is not defined

Scaling features with standard scaler

In [94]:
# scaler = StandardScaler()
# scaler.fit(df)

In [95]:
# cols = df.columns
# df = scaler.transform(df)
# df = pd.DataFrame(df, columns = cols)

## Train / Test split

In [19]:
train_set, test_set = train_test_split(df, test_size = 0.2)

In [20]:
X_train, y_train = train_set.drop(['is_linked'], axis = 1), train_set['is_linked']
X_test, y_test = test_set.drop(['is_linked'], axis = 1), test_set['is_linked']

## Training model

In [21]:
reg_log = LogisticRegression()
reg_log.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Save model

In [39]:
pkl.dump(reg_log, open('models//model.pkl', 'wb'))

In [156]:
model = pkl.load(open('models//model.pkl', 'rb'))

## Evaluating model

In [22]:
y_pred = reg_log.predict(X_test)

In [23]:
confusion_matrix(y_test, y_pred)

array([[923, 148],
       [360, 669]], dtype=int64)

In [24]:
print(f"Accuracy : {accuracy_score(y_test, y_pred)}")
print(f"Precision : {precision_score(y_test, y_pred)}")
print(f"Recall : {recall_score(y_test, y_pred)}")

Accuracy : 0.758095238095238
Precision : 0.8188494492044064
Recall : 0.6501457725947521


## Predicting on test set

In [29]:
test = pd.read_csv('data/test.txt', sep = ' ', names = ['node1', 'node2'])

In [30]:
df_test = test.merge(node_info, how = 'left', left_on = ['node1'], right_on = ['id'])
df_test = df_test.drop(['id'], axis = 1)
df_test = df_test.merge(node_info, how = 'left', left_on = ['node2'], right_on = ['id'], suffixes = ('_1', '_2'))
df_test = df_test.drop(['id'], axis = 1)

In [31]:
max_flow_data = pd.read_csv('data/cache/test_max_flow.csv')
df_test = df_test.merge(max_flow_data[['node1', 'node2', 'max_flow']], how = 'left', on = ['node1', 'node2'])

In [32]:
df_test['page_rank_sum'] = df_test.apply(lambda x: page_rank_res[x['node1']] + page_rank_res[x['node2']], axis = 1)
df_test['page_rank_diff'] = df_test.apply(lambda x: np.abs(page_rank_res[x['node1']] - page_rank_res[x['node2']]), axis = 1)

In [33]:
df_test['common_neighbors'] = df_test.apply(lambda x: len(list(nx.common_neighbors(graph, x['node1'], x['node2']))), 
                                   axis = 1)

In [34]:
df_test['degree_sum'] = df_test.apply(lambda x: nx.degree(graph, x['node1']) + nx.degree(graph, x['node2']), axis = 1)
df_test['degree_diff'] = df_test.apply(lambda x: np.abs(nx.degree(graph, x['node1']) - nx.degree(graph, x['node2'])), axis = 1)

In [35]:
df_test['jaccard'] = df_test.apply(lambda x: len(set(graph.neighbors(x['node1'])) & set(graph.neighbors(x['node2']))) / len(set(graph.neighbors(x['node1'])) | set(graph.neighbors(x['node2']))), axis = 1)

In [36]:
df_test['adamic_adar'] = df_test.apply(lambda x: adamic_adar(graph, x['node1'], x['node2']), axis = 1)

In [37]:
test['Predicted'] = reg_log.predict(df_test)
test.loc[test.node1 == test.node2, 'Predicted'] = 1

## Write submission

In [38]:
test.to_csv('data/submissions/6.csv', sep = ',', columns = ['Predicted'],
            index = True, index_label = 'ID')