# Pipeline

In [151]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pickle as pkl
from page_rank import *
from utils import *

## Building dataset

Getting train data

In [3]:
# Labelled node pairs: space-separated values, column names supplied here
# because they are not read from the file.
train = pd.read_csv(
    'data/train.txt',
    names = ['node1', 'node2', 'is_linked'],
    sep = ' ',
)

Getting node information

In [112]:
# header = None: no row is treated as a header, so pandas labels the columns
# with the integers 0, 1, 2, ... (not the strings '0', '1', ...).
node_info = pd.read_csv('data/node_information.csv', header = None)

In [118]:
# Name the first column 'id' and every other column after its position
# ('1', '2', ...), so that merging node_info twice yields '1_1', '1_2', ...
#
# The mapping is built by position rather than by label: right after
# read_csv(header = None) the labels are integers, so a rename keyed on the
# string '0' matches nothing and no 'id' column would exist on a first run.
# Keying on position also makes this cell idempotent -- re-running it on the
# already-renamed frame maps every label to itself.
col_names = {
    col: 'id' if position == 0 else str(position)
    for position, col in enumerate(node_info.columns)
}
node_info = node_info.rename(columns = col_names)

Computing the graph

In [114]:
# Undirected graph holding one unit-capacity edge per positively labelled pair.
# NOTE(review): every positive pair in `train` is used, including pairs that
# later fall into the held-out split of `df`, so graph-derived features can
# see held-out edges -- confirm this is intended.
graph = nx.Graph()

linked_pairs = train[train['is_linked'] == 1]
for _, pair in linked_pairs.iterrows():
    graph.add_edge(pair['node1'], pair['node2'], capacity = 1)

Adding node features

In [120]:
# Attach node1's features, then node2's. The feature names only collide in the
# second merge, which is where the '_1' (node1) / '_2' (node2) suffixes are
# applied. Inner joins drop pairs whose node has no row in node_info.
df = (
    train
    .merge(node_info, how = 'inner', left_on = 'node1', right_on = 'id')
    .drop(columns = 'id')
    .merge(node_info, how = 'inner', left_on = 'node2', right_on = 'id', suffixes = ('_1', '_2'))
    .drop(columns = 'id')
)

In [121]:
# Spot-check five random rows of the merged pair/feature table.
df.sample(5)

Unnamed: 0,node1,node2,is_linked,1_1,2_1,3_1,4_1,5_1,6_1,7_1,...,923_2,924_2,925_2,926_2,927_2,928_2,929_2,930_2,931_2,932_2
2742,2435,6868,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4047,3269,6761,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10486,3691,4434,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9432,257,311,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1281,3101,5967,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Adding max flow (computation is heavy so we load the precomputed flows)

In [160]:
# Load the cached per-pair max-flow values instead of recomputing them.
max_flow_data = pd.read_csv('data/cache/train_max_flow.csv')

In [161]:
# Preview the first rows of the cached max-flow table.
max_flow_data.head()

Unnamed: 0,id,node1,node2,is_linked,max_flow
0,0,939,3809,1,1
1,1,2442,5784,1,3
2,2,179,3809,1,4
3,3,857,2280,1,7
4,4,1358,5722,1,2


In [163]:
# Attach the cached max-flow value to each (node1, node2) pair; the left join
# keeps every pair and leaves NaN where no cached value exists.
# Any 'max_flow' column left over from a previous execution is dropped first,
# so re-running this cell does not produce 'max_flow_x' / 'max_flow_y'.
df = (
    df
    .drop(columns = ['max_flow'], errors = 'ignore')
    .merge(max_flow_data[['node1', 'node2', 'max_flow']], how = 'left', on = ['node1', 'node2'])
)

In [164]:
# Spot-check the table after the max-flow merge.
# NOTE(review): the stored output of this cell already lists page_rank_sum /
# page_rank_diff, which are only added in a later cell -- the notebook was
# executed out of order; re-run top to bottom before trusting the output.
df.sample(5)

Unnamed: 0,node1,node2,is_linked,1_1,2_1,3_1,4_1,5_1,6_1,7_1,...,926_2,927_2,928_2,929_2,930_2,931_2,932_2,page_rank_sum,page_rank_diff,max_flow
248,1507,3809,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.034007,0.033816,1
4246,437,3705,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001503,0.000372,6
9645,493,4441,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000946,0.000569,2
4064,1681,6225,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001811,0.000866,3
5621,966,1556,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000287,9.5e-05,1


Adding page rank

In [123]:
# page_rank is not defined in this notebook -- presumably it comes from the
# star import of the local page_rank module (TODO confirm).
# The result is indexed by node id in the feature cells below.
page_rank_res = page_rank(graph)

In [124]:
def _page_rank_sum(pair):
    """Sum of the page-rank scores of the two endpoints of one pair (row)."""
    return page_rank_res[pair['node1']] + page_rank_res[pair['node2']]


def _page_rank_gap(pair):
    """Absolute difference between the page-rank scores of the two endpoints."""
    return np.abs(page_rank_res[pair['node1']] - page_rank_res[pair['node2']])


# Row-wise lookups: each pair gets two features derived from its endpoints' scores.
df['page_rank_sum'] = df.apply(_page_rank_sum, axis = 1)
df['page_rank_diff'] = df.apply(_page_rank_gap, axis = 1)

In [166]:
# Spot-check two random rows now that the page-rank features are present.
df.sample(2)

Unnamed: 0,node1,node2,is_linked,1_1,2_1,3_1,4_1,5_1,6_1,7_1,...,926_2,927_2,928_2,929_2,930_2,931_2,932_2,page_rank_sum,page_rank_diff,max_flow
5951,4310,7555,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001079,0.000449,2
953,4702,4956,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000479,9.6e-05,2


## Train / Test split

In [167]:
# Hold out 20% of the labelled pairs for evaluation.
# random_state pins the shuffle so the split -- and every metric computed from
# it -- is the same on each Restart & Run All.
train_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)

In [168]:
# Separate the 'is_linked' label from the features in each split.
# Only the label is removed, so the raw node1 / node2 ids stay among the features.
X_train = train_set.drop(columns = ['is_linked'])
y_train = train_set['is_linked']

X_test = test_set.drop(columns = ['is_linked'])
y_test = test_set['is_linked']

## Training model

In [169]:
# Logistic regression with scikit-learn's default hyper-parameters,
# fitted on the training split.
reg_log = LogisticRegression()
reg_log.fit(X_train, y_train)

Save model

In [155]:
# Persist the fitted model. The context manager closes (and flushes) the file
# handle even if pickling raises, instead of leaving it to garbage collection.
with open('models//model.pkl', 'wb') as model_file:
    pkl.dump(reg_log, model_file)

In [156]:
# Reload the pickled model; the context manager closes the file handle.
# Unpickling executes arbitrary code, so only load files produced by this
# notebook -- never a model.pkl from an untrusted source.
with open('models//model.pkl', 'rb') as model_file:
    model = pkl.load(model_file)

## Evaluating model

In [171]:
# Class predictions of the in-memory model (reg_log, not the reloaded `model`)
# on the held-out split.
y_pred = reg_log.predict(X_test)

In [172]:
# Confusion matrix on the held-out split (true labels first, predictions second).
confusion_matrix(y_test, y_pred)

array([[839, 228],
       [411, 622]], dtype=int64)

In [173]:
# Report each held-out metric on its own line, formatted as "<name> : <value>".
held_out_metrics = (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
)
for label, metric in held_out_metrics:
    print(f"{label} : {metric(y_test, y_pred)}")

Accuracy : 0.6957142857142857
Precision : 0.731764705882353
Recall : 0.6021297192642788


## Predicting on test set

In [132]:
# Unlabelled pairs to score: space-separated, same two node columns as the
# training file but without a label column.
test = pd.read_csv(
    'data/test.txt',
    names = ['node1', 'node2'],
    sep = ' ',
)

In [133]:
# Same feature join as for the training pairs, but with left joins so that
# every test pair is kept (a node without a row in node_info yields NaN features).
df_test = (
    test
    .merge(node_info, how = 'left', left_on = 'node1', right_on = 'id')
    .drop(columns = 'id')
    .merge(node_info, how = 'left', left_on = 'node2', right_on = 'id', suffixes = ('_1', '_2'))
    .drop(columns = 'id')
)

In [134]:
# Page-rank features for the test pairs: each one combines the scores of the
# two endpoints, looked up row by row in page_rank_res.
pair_combiners = {
    'page_rank_sum': lambda first, second: first + second,
    'page_rank_diff': lambda first, second: np.abs(first - second),
}
for feature_name, combine in pair_combiners.items():
    df_test[feature_name] = df_test.apply(
        lambda pair: combine(page_rank_res[pair['node1']], page_rank_res[pair['node2']]),
        axis = 1,
    )

In [146]:
# Feed the model exactly the columns it was fitted on, in the same order.
# df_test is assembled separately from the training frame (for instance it
# never receives a 'max_flow' column), so check up front and fail with an
# explicit message rather than a shape / feature-name error inside predict.
expected_features = list(getattr(reg_log, 'feature_names_in_', X_train.columns))
missing_features = [col for col in expected_features if col not in df_test.columns]
if missing_features:
    raise ValueError(f"df_test lacks features the model was trained on: {missing_features}")

test['Predicted'] = reg_log.predict(df_test[expected_features])
# A pair made of the same node twice is always labelled as linked,
# whatever the model predicted for it.
test.loc[test.node1 == test.node2, 'Predicted'] = 1

## Write submission

In [150]:
# Submission file: the row index is written as the 'ID' column, followed by
# the 'Predicted' column only.
test[['Predicted']].to_csv(
    'data/submissions/0.csv',
    sep = ',',
    index = True,
    index_label = 'ID',
)