# Pipeline

In [80]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pickle as pkl
from page_rank import *
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from utils import *

## Building dataset

Getting train data

In [115]:
# Labelled node pairs: space-separated values, column names supplied explicitly.
train = pd.read_csv(
    'data/train.txt',
    sep=' ',
    names=['node1', 'node2', 'is_linked'],
)

Getting node information

In [116]:
# header=None tells pandas the file has no header row, so columns are labelled 0, 1, 2, ...
node_info = pd.read_csv(
    'data/node_information.csv',
    header=None,
)

In [117]:
# The first column becomes 'id' (both the int and the str spelling of 0 are mapped);
# every other column is renamed to its position as a string ('1', '2', ...).
col_names = {0: 'id', '0': 'id'}
col_names.update({position: str(position) for position in range(1, len(node_info.columns))})

node_info = node_info.rename(columns=col_names)

Computing the graph

In [118]:
# Undirected graph holding only the pairs labelled as linked (is_linked == 1);
# every edge carries capacity 1.
graph = nx.Graph()
graph.add_edges_from(
    (pair['node1'], pair['node2'], {'capacity': 1})
    for _, pair in train.iterrows()
    if pair['is_linked'] == 1
)

Adding node features

In [119]:
# Attach the node_info columns twice: once for node1 and once for node2.
# The second merge disambiguates the overlapping column names with the '_1' / '_2'
# suffixes; the redundant 'id' key is dropped after each join.
df = (
    train
    .merge(node_info, how='inner', left_on=['node1'], right_on=['id'])
    .drop(['id'], axis=1)
    .merge(node_info, how='inner', left_on=['node2'], right_on=['id'], suffixes=('_1', '_2'))
    .drop(['id'], axis=1)
)

Adding max flow (computation is heavy so we load the precomputed flows)

In [120]:
# Max-flow values are read from a cached CSV instead of being recomputed here.
max_flow_data = pd.read_csv(
    'data/cache/train_max_flow.csv',
)

In [121]:
# Left join on the node pair: every row of df is kept, and only the three listed
# columns of the cache take part in the merge.
flow_columns = ['node1', 'node2', 'max_flow']
df = df.merge(max_flow_data[flow_columns], how='left', on=['node1', 'node2'])

Adding page rank

In [122]:
# `page_rank` is not defined in this notebook; presumably it comes from the star import
# of the project-local `page_rank` module -- TODO confirm.
# The result is indexed by node id in the following cells, so it is presumably a
# node -> score mapping -- verify against the module.
page_rank_res = page_rank(graph)

In [123]:
def _pair_ranks(pair):
    """Look up the page-rank entry of both endpoints of a candidate pair."""
    return page_rank_res[pair['node1']], page_rank_res[pair['node2']]


def _rank_sum(pair):
    """Sum of the two endpoint page-rank values."""
    first, second = _pair_ranks(pair)
    return first + second


def _rank_gap(pair):
    """Absolute difference of the two endpoint page-rank values."""
    first, second = _pair_ranks(pair)
    return np.abs(first - second)


# Row-wise evaluation (axis=1): each helper receives one row of df.
df['page_rank_sum'] = df.apply(_rank_sum, axis=1)
df['page_rank_diff'] = df.apply(_rank_gap, axis=1)

In [124]:
# Peek at two randomly drawn rows of the assembled feature table; no random_state is
# passed, so the rows shown differ between runs.
df.sample(2)

Unnamed: 0,node1,node2,is_linked,1_1,2_1,3_1,4_1,5_1,6_1,7_1,...,926_2,927_2,928_2,929_2,930_2,931_2,932_2,max_flow,page_rank_sum,page_rank_diff
4303,1324,5312,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.00029,0.0001
8321,2996,6251,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.00038,0.000187


Adding the number of common neighbors

In [125]:
def _count_common_neighbors(pair):
    """Return how many nodes of `graph` nx.common_neighbors yields for the pair."""
    shared = nx.common_neighbors(graph, pair['node1'], pair['node2'])
    return sum(1 for _ in shared)


df['common_neighbors'] = df.apply(_count_common_neighbors, axis=1)

Scaling features with standard scaler (currently disabled: the two cells below are commented out)

In [94]:
# scaler = StandardScaler()
# scaler.fit(df)

In [95]:
# cols = df.columns
# df = scaler.transform(df)
# df = pd.DataFrame(df, columns = cols)

## Train / Test split

In [128]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that the split -- and
# therefore every metric computed from it -- is identical on each Restart & Run All.
train_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)

In [129]:
# Separate the label column from the features; every other column of the split frames
# (including node1 / node2) stays in the feature matrices.
label_column = 'is_linked'

X_train = train_set.drop([label_column], axis=1)
y_train = train_set[label_column]

X_test = test_set.drop([label_column], axis=1)
y_test = test_set[label_column]

## Training model

In [130]:
# Logistic regression with scikit-learn's default hyper-parameters, fitted on the
# training split. The fit call stays the last expression of the cell.
reg_log = LogisticRegression()
reg_log.fit(X=X_train, y=y_train)

Save model

In [141]:
# Persist the fitted model. The context manager flushes and closes the file as soon as
# the dump finishes instead of leaving an anonymous handle to the garbage collector.
with open('models//model.pkl', 'wb') as model_file:
    pkl.dump(reg_log, model_file)

In [156]:
# Reload the persisted model; the context manager closes the handle deterministically.
# NOTE: unpickling can execute arbitrary code -- only load model files that were
# produced by a trusted run of this notebook.
with open('models//model.pkl', 'rb') as model_file:
    model = pkl.load(model_file)

## Evaluating model

In [131]:
# Predictions of the in-memory estimator `reg_log` on the held-out split; the `model`
# object reloaded from disk is not used here.
y_pred = reg_log.predict(X_test)

In [132]:
# Confusion matrix of held-out labels vs. predictions, shown as the cell output.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[830, 178],
       [427, 665]], dtype=int64)

In [133]:
# Compute the three scores on the held-out split first, then report them.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Accuracy : {accuracy}")
print(f"Precision : {precision}")
print(f"Recall : {recall}")

Accuracy : 0.7119047619047619
Precision : 0.7888493475682088
Recall : 0.6089743589743589


## Predicting on test set

In [134]:
# Unlabelled node pairs to predict: same layout as the training file minus the label.
test = pd.read_csv(
    'data/test.txt',
    sep=' ',
    names=['node1', 'node2'],
)

In [135]:
# Same feature join as for the training frame, but with left joins so that every row
# of `test` is kept. The '_1' / '_2' suffixes separate the columns of node1 and node2.
df_test = (
    test
    .merge(node_info, how='left', left_on=['node1'], right_on=['id'])
    .drop(['id'], axis=1)
    .merge(node_info, how='left', left_on=['node2'], right_on=['id'], suffixes=('_1', '_2'))
    .drop(['id'], axis=1)
)

In [136]:
# Rebinds max_flow_data to the cached test-set flows (replacing the training cache
# loaded earlier), then left-joins them onto the test features by node pair.
max_flow_data = pd.read_csv('data/cache/test_max_flow.csv')
df_test = pd.merge(
    df_test,
    max_flow_data[['node1', 'node2', 'max_flow']],
    how='left',
    on=['node1', 'node2'],
)

In [137]:
# Page-rank features for the test pairs, reusing page_rank_res computed on the training
# graph. The columns are added in dict order: sum first, then absolute difference.
rank_features = {
    'page_rank_sum': lambda pair: page_rank_res[pair['node1']] + page_rank_res[pair['node2']],
    'page_rank_diff': lambda pair: np.abs(page_rank_res[pair['node1']] - page_rank_res[pair['node2']]),
}

for feature_name, feature_fn in rank_features.items():
    df_test[feature_name] = df_test.apply(feature_fn, axis=1)

In [138]:
# Number of common neighbours of each test pair, counted in the training graph.
shared_neighbor_counts = df_test.apply(
    lambda pair: len([*nx.common_neighbors(graph, pair['node1'], pair['node2'])]),
    axis=1,
)
df_test['common_neighbors'] = shared_neighbor_counts

In [139]:
# The prediction array is assigned positionally, so this assumes df_test still has
# exactly one row per row of `test`, in the same order -- TODO confirm the left merges
# above did not duplicate rows.
predicted_labels = reg_log.predict(df_test)
test['Predicted'] = predicted_labels

# A pair made of the same node twice is always reported as linked.
is_self_pair = test['node1'] == test['node2']
test.loc[is_self_pair, 'Predicted'] = 1

## Write submission

In [140]:
# Only the 'Predicted' column is written; the DataFrame index is exported under the
# header 'ID'.
submission_options = dict(sep=',', columns=['Predicted'], index=True, index_label='ID')
test.to_csv('data/submissions/3.csv', **submission_options)