In [1]:
# This notebook was built on Google colab and uses data form google drive
# Also the notebook was built using multiple sessions saving the intermediate outputs to avoid overshooting the RAM

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import networkx as nx
import math
import pickle

In [3]:
parent_folder = "/content/drive/MyDrive/AAIC/Projects/Facebook Friend Recommendation /data"
train_graph = nx.read_edgelist(parent_folder + '/train_pos.csv', delimiter=',', create_using=nx.DiGraph(), nodetype=int)
print(nx.info(train_graph))

DiGraph with 1780722 nodes and 7550015 edges


### Load data

In [44]:
import random

total_train_size = 15100030
total_test_size = 3775008

train_sample_size = 100000
test_sample_size = 50000

skip_train = sorted(random.sample(range(1,total_train_size+1),total_train_size-train_sample_size))
skip_test = sorted(random.sample(range(1,total_test_size+1),total_test_size-test_sample_size))

In [66]:
df_final_train = pd.read_csv(parent_folder + '/train_after_eda.csv', skiprows=skip_train, names=['source_node', 'destination_node'])
df_final_train['indicator_link'] = pd.read_csv(parent_folder + '/train_y.csv', skiprows=skip_train, names=['indicator_link'])

In [67]:
df_final_train.shape

(100001, 3)

In [68]:
df_final_train.head()

Unnamed: 0,source_node,destination_node,indicator_link
0,273084,1505602,1
1,1593259,673140,1
2,355711,1610892,1
3,80115,1473881,1
4,336918,226190,1


In [69]:
df_final_test = pd.read_csv(parent_folder + '/test_after_eda.csv', skiprows=skip_test, names=['source_node', 'destination_node'])
df_final_test['indicator_link'] = pd.read_csv(parent_folder + '/test_y.csv', skiprows=skip_test, names=['indicator_link'])

In [70]:
df_final_test.shape

(50001, 3)

In [71]:
df_final_test.head()

Unnamed: 0,source_node,destination_node,indicator_link
0,848424,784690,1
1,1190268,217891,1
2,1095925,325140,1
3,571364,684722,1
4,1851322,840484,1


### Compute features



*   num_followers_s : number of followers of source node
*   num_following_s : number of users followed by source node
*   num_followers_d : number of followers of destination node
*   num_following_d : number of users followed by destination node
*   inter_followers : intersection of followers of source and destination
*   inter_following : intersection of users followed by source and destination




In [72]:
def get_follower_following_based_features(df_final):
    num_followers_s=[]
    num_following_s=[]
    num_followers_d=[]
    num_following_d=[]
    inter_followers=[]
    inter_following=[]
    for i,row in df_final.iterrows():
        try:
            s1=set(train_graph.predecessors(row['source_node']))
            s2=set(train_graph.successors(row['source_node']))
        except:
            s1 = set()
            s2 = set()
        try:
            d1=set(train_graph.predecessors(row['destination_node']))
            d2=set(train_graph.successors(row['destination_node']))
        except:
            d1 = set()
            d2 = set()
        num_followers_s.append(len(s1))
        num_following_s.append(len(s2))

        num_followers_d.append(len(d1))
        num_following_d.append(len(d2))

        inter_followers.append(len(s1.intersection(d1)))
        inter_following.append(len(s2.intersection(d2)))
    
    return  num_followers_s,num_following_s,num_followers_d,num_following_d,inter_followers,inter_following

In [73]:
df_final_train.columns

Index(['source_node', 'destination_node', 'indicator_link'], dtype='object')

In [74]:
df_final_train['num_followers_s'], df_final_train['num_followers_d'], \
df_final_train['num_following_s'], df_final_train['num_following_d'], \
df_final_train['inter_followers'], df_final_train['inter_following']= get_follower_following_based_features(df_final_train)

df_final_test['num_followers_s'], df_final_test['num_followers_d'], \
df_final_test['num_following_s'], df_final_test['num_following_d'], \
df_final_test['inter_followers'], df_final_test['inter_following']= get_follower_following_based_features(df_final_test)

In [75]:
df_final_train.head(2)

Unnamed: 0,source_node,destination_node,indicator_link,num_followers_s,num_followers_d,num_following_s,num_following_d,inter_followers,inter_following
0,273084,1505602,1,11,15,6,8,0,0
1,1593259,673140,1,8,3,19,8,0,0


In [77]:
df_final_test.head(2)

Unnamed: 0,source_node,destination_node,indicator_link,num_followers_s,num_followers_d,num_following_s,num_following_d,inter_followers,inter_following
0,848424,784690,1,6,6,14,9,1,0
1,1190268,217891,1,34,35,17,21,3,5


### Save results

In [78]:
hdf = pd.HDFStore(parent_folder + '/Features/sample_with_features_1.h5')
hdf.put('train_df',df_final_train, format='table', data_columns=True)
hdf.put('test_df',df_final_test, format='table', data_columns=True)
hdf.close()