In [None]:
# This notebook was built on Google Colab and uses data from Google Drive.
# It was also built across multiple sessions, saving the intermediate outputs to avoid exceeding the available RAM.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import networkx as nx
import math
import pickle

### Load data

In [None]:
# Root data folder on Google Drive.
# NOTE(review): the space before "/data" is part of the path as written — confirm it matches the Drive folder name.
parent_folder = "/content/drive/MyDrive/AAIC/Projects/Facebook Friend Recommendation /data"

# Build the directed training graph from the comma-separated edge list;
# node identifiers are parsed as integers.
train_graph = nx.read_edgelist(
    parent_folder + '/train_pos.csv',
    delimiter=',',
    create_using=nx.DiGraph(),
    nodetype=int,
)

# nx.info() was removed in NetworkX 3.0, so the one-line summary is built
# explicitly; this works on every NetworkX version.
print(
    f"{type(train_graph).__name__} with {train_graph.number_of_nodes()} nodes "
    f"and {train_graph.number_of_edges()} edges"
)

DiGraph with 1780722 nodes and 7550015 edges


In [None]:
# The train and test feature tables are stored in the same HDF5 file under
# separate keys; open it read-only for each key.
features_file = parent_folder + '/Features/sample_with_features_3.h5'
df_final_train, df_final_test = (
    pd.read_hdf(features_file, table_key, mode='r')
    for table_key in ('train_df', 'test_df')
)

In [None]:
# (rows, columns) of the training feature table loaded from the HDF5 file.
df_final_train.shape

(100001, 21)

In [None]:
# Preview the first rows of the training feature table.
df_final_train.head()

Unnamed: 0,source_node,destination_node,indicator_link,num_followers_s,num_followers_d,num_following_s,num_following_d,inter_followers,inter_following,does_follow_back,...,same_wcc_com,adar_index,jaccard_followers,jaccard_following,cosine_followers,cosine_following,page_rank_s,page_rank_d,katz_s,katz_d
0,273084,1505602,1,11,15,6,8,0,0,0,...,1,0.0,0.0,0.0,0.0,0.0,2.04529e-06,3.459963e-07,0.000773,0.000756
1,1593259,673140,1,8,3,19,8,0,0,0,...,1,0.0,0.0,0.0,0.0,0.0,1.373377e-06,1.886069e-06,0.000762,0.000806
2,355711,1610892,1,9,18,2,1,1,0,1,...,1,0.0,0.1,0.0,0.235702,0.0,1.394824e-06,2.875504e-07,0.000766,0.000739
3,80115,1473881,1,3,55,83,99,1,12,0,...,1,6.12441,0.011765,0.084507,0.063372,0.162623,1.827472e-07,8.809712e-07,0.000745,0.001098
4,336918,226190,1,14,16,7,5,0,0,1,...,1,0.0,0.0,0.0,0.0,0.0,1.198466e-06,5.000005e-07,0.000785,0.000758


In [None]:
# (rows, columns) of the test feature table loaded from the HDF5 file.
df_final_test.shape

(50001, 21)

In [None]:
# Preview the first rows of the test feature table.
df_final_test.head()

Unnamed: 0,source_node,destination_node,indicator_link,num_followers_s,num_followers_d,num_following_s,num_following_d,inter_followers,inter_following,does_follow_back,...,same_wcc_com,adar_index,jaccard_followers,jaccard_following,cosine_followers,cosine_following,page_rank_s,page_rank_d,katz_s,katz_d
0,848424,784690,1,6,6,14,9,1,0,1,...,1,0.0,0.052632,0.0,0.109109,0.0,6.557971e-07,1.559547e-06,0.000754,0.000786
1,1190268,217891,1,34,35,17,21,3,5,1,...,1,3.450508,0.0625,0.098039,0.124784,0.184428,1.239734e-06,2.329951e-06,0.000872,0.000799
2,1095925,325140,1,0,2,12,13,0,1,0,...,1,1.183295,0.0,0.071429,0.0,0.196116,1.65565e-07,1.989847e-06,0.000731,0.000777
3,571364,684722,1,1,6,7,2,0,0,0,...,1,0.0,0.0,0.0,0.0,0.0,6.428994e-07,4.050531e-07,0.000735,0.000759
4,1851322,840484,1,5,7,7,13,2,5,1,...,1,12.434176,0.2,0.333333,0.338062,0.524142,1.316063e-06,1.637346e-06,0.00075,0.000757


### Weight features

In a nutshell, the more neighbours a node has (in this case followers / following), the lower the weight of that node.

\begin{equation}
W = \frac{1}{\sqrt{1+|X|}}
\end{equation}

In our case, X can be 
1. followers
2. following

*  weight_in : weight of the incoming edges of the destination node (computed from the followers of the destination node)
*  weight_out : weight of the outgoing edges of the source node (computed from the nodes the source node is following)
*  weight_out_plus_in : weight_out + weight_in



In [None]:
from tqdm import tqdm

# Per-node edge weights W = 1 / sqrt(1 + |X|), where |X| is the number of
# followers (weight_in) or the number of followed nodes (weight_out).
weight_in = {}
weight_out = {}

# A DiGraph stores at most one edge per ordered node pair, so the in/out
# degree equals the number of distinct predecessors/successors. Reading the
# degree avoids materialising a throwaway set of neighbours for every node.
for node in tqdm(train_graph.nodes()):
    weight_in[node] = 1 / np.sqrt(1 + train_graph.in_degree(node))
    weight_out[node] = 1 / np.sqrt(1 + train_graph.out_degree(node))

100%|██████████| 1780722/1780722 [00:19<00:00, 91109.23it/s] 


In [None]:
# Mean of each weight dictionary over all nodes of the training graph.
weight_in_mean, weight_out_mean = (
    np.mean(list(node_weights.values()))
    for node_weights in (weight_in, weight_out)
)

In [None]:
def add_weight_features(df, weight_out, weight_in, weight_out_mean, weight_in_mean):
    """Append the three weight feature columns to ``df`` in place.

    Columns added (in this order):
      * ``weight_out_s``       - ``weight_out`` looked up by ``source_node``
      * ``weight_in_d``        - ``weight_in`` looked up by ``destination_node``
      * ``weight_out_plus_in`` - sum of the two columns above

    Nodes that have no entry in a weight dictionary are imputed with the
    corresponding mean weight.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``source_node`` and ``destination_node`` columns.
    weight_out, weight_in : dict
        Node id -> weight mappings.
    weight_out_mean, weight_in_mean : float
        Fallback values for nodes missing from the respective mapping.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    # Series.map with a dict yields NaN for keys that are absent; fillna then
    # performs the mean imputation.
    df['weight_out_s'] = df['source_node'].map(weight_out).fillna(weight_out_mean)
    df['weight_in_d'] = df['destination_node'].map(weight_in).fillna(weight_in_mean)
    df['weight_out_plus_in'] = df['weight_out_s'] + df['weight_in_d']
    return df


# Apply the identical feature construction to the train and test frames so the
# two can never drift apart.
for features_df in (df_final_train, df_final_test):
    add_weight_features(features_df, weight_out, weight_in, weight_out_mean, weight_in_mean)

In [None]:
# Preview the training table again to check the newly added weight columns.
df_final_train.head()

Unnamed: 0,source_node,destination_node,indicator_link,num_followers_s,num_followers_d,num_following_s,num_following_d,inter_followers,inter_following,does_follow_back,...,jaccard_following,cosine_followers,cosine_following,page_rank_s,page_rank_d,katz_s,katz_d,weight_out_s,weight_in_d,weight_out_plus_in
0,273084,1505602,1,11,15,6,8,0,0,0,...,0.0,0.0,0.0,2.04529e-06,3.459963e-07,0.000773,0.000756,0.25,0.377964,0.627964
1,1593259,673140,1,8,3,19,8,0,0,0,...,0.0,0.0,0.0,1.373377e-06,1.886069e-06,0.000762,0.000806,0.5,0.223607,0.723607
2,355711,1610892,1,9,18,2,1,1,0,1,...,0.0,0.235702,0.0,1.394824e-06,2.875504e-07,0.000766,0.000739,0.229416,0.57735,0.806766
3,80115,1473881,1,3,55,83,99,1,12,0,...,0.084507,0.063372,0.162623,1.827472e-07,8.809712e-07,0.000745,0.001098,0.133631,0.109109,0.24274
4,336918,226190,1,14,16,7,5,0,0,1,...,0.0,0.0,0.0,1.198466e-06,5.000005e-07,0.000785,0.000758,0.242536,0.353553,0.596089


### Save results

In [None]:
# Persist both feature tables to a new HDF5 file. The context manager closes
# the store even if one of the writes raises, so the file handle is not leaked.
with pd.HDFStore(parent_folder + '/Features/sample_with_features_4.h5') as hdf:
    hdf.put('train_df', df_final_train, format='table', data_columns=True)
    hdf.put('test_df', df_final_test, format='table', data_columns=True)