# Exploration

In [30]:
import networkx as nx
import pandas as pd

## Train

In [28]:
train = pd.read_csv('data/train.txt', sep = ' ', names = ['node1', 'node2', 'is_linked'])

In [29]:
train.head()

Unnamed: 0,node1,node2,is_linked
0,939,3809,1
1,2442,5784,1
2,179,3809,1
3,857,2280,1
4,1358,5722,1


In [31]:
graph = nx.Graph()

for _, row in train.iterrows():
    if row['is_linked'] == 1:
        graph.add_edge(row['node1'], row['node2'])

In [36]:
print(f"The graph has {len(list(graph.nodes()))} nodes")
print(f"The graph has {len(list(graph.edges()))} edges")
print(f"Node 2442 neighbors are {list(graph.neighbors(2442))}")

The graph has 3597 nodes
The graph has 5248 edges
Node 2442 neighbors are [5784, 7021, 5864, 6720, 4782, 3682, 437]


## Test

In [41]:
test = pd.read_csv('data/test.txt', sep = ' ', names = ['node1', 'node2'])

In [42]:
test.head()

Unnamed: 0,node1,node2
0,3425,4524
1,1620,2617
2,4832,6317
3,4984,7298
4,385,5481


## Node information

In [2]:
node_info = pd.read_csv('data/node_information.csv')

In [20]:
node_info = node_info.rename(columns = {'0' : 'id'})
col_names = {}
for i, col in enumerate(node_info.columns):
    if col != 'id':
        col_names[col] = str(i)
node_info = node_info.rename(columns = col_names)

In [21]:
node_info.sample(5)

Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,924,925,926,927,928,929,930,931,932,933
2589,5141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
32,61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
3546,7434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1184,2223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1771,3350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [22]:
node_info['nb_features'] = sum(node_info[col] for col in node_info.columns if col not in ['id', 'nb_features'])
node_info[['id', 'nb_features']].sample(5)

Unnamed: 0,id,nb_features
1424,2666,16.0
618,1147,4.0
1597,2997,8.0
2767,5522,10.0
1073,2013,8.0


## Random predictions

In [23]:
random_pred = pd.read_csv('data/random_predictions.csv')

In [25]:
random_pred.shape

(3498, 2)

In [26]:
random_pred.sample(5)

Unnamed: 0,ID,Predicted
1541,1541,1
695,695,1
2161,2161,0
2512,2512,0
611,611,1
