# Task 1 - Prepare Graph Data for Feature Extraction

- *Load missing edges*
- *Create a DataFrame with missing edges and assign a class of 0*
- *Create a DataFrame with existing edges and assign a class of 1*
- *Combine Positive and Negative samples into one dataframe for feature extraction*

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the pickled collection of missing edges (used below as negative examples).
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only unpickle data files that come from a trusted source.

# A context manager guarantees the file handle is closed once loading finishes.
with open('./data/github-net-missing-edges-set.p', "rb") as missing_edges_file:
    missing_edges_set = pickle.load(missing_edges_file)
type(missing_edges_set)

set

In [3]:
# Build the DataFrame of negative examples from the missing-edge set.

# Every element of the set becomes one row: the two node ids of a pair
# that is treated as a missing edge.

edge_columns = ['Source', 'Target']
df_negative_edges = pd.DataFrame(data=list(missing_edges_set), columns=edge_columns)
print(df_negative_edges.shape)
df_negative_edges.head(3)

(289003, 2)


Unnamed: 0,Source,Target
0,21385,32676
1,26650,27849
2,19264,17030


In [4]:
# Load the positive link data.

# These are the existing links/edges from the GitHub dataset. The id columns are
# renamed to match the negative-edge frame and exact duplicate rows are dropped.

df_positive_edges = (
    pd.read_csv('./data/musae_git_edges.csv')
    .rename(columns={'id_1': 'Source', 'id_2': 'Target'})
    .drop_duplicates()
)
print(df_positive_edges.shape)
df_positive_edges.head(3)

(289003, 2)


Unnamed: 0,Source,Target
0,0,23977
1,1,34526
2,1,2370


In [5]:
# Frame link prediction as a binary classification problem.

# Label every existing edge as the positive class (Class = 1).

df_positive_edges['Class'] = 1
df_positive_edges.head(7)

Unnamed: 0,Source,Target,Class
0,0,23977,1
1,1,34526,1
2,1,2370,1
3,1,14683,1
4,1,29982,1
5,1,21142,1
6,1,20363,1


In [6]:
# Label every missing edge / negative link as the negative class (Class = 0).

df_negative_edges['Class'] = 0
df_negative_edges.head(7)

Unnamed: 0,Source,Target,Class
0,21385,32676,0
1,26650,27849,0
2,19264,17030,0
3,26544,20283,0
4,27487,37391,0
5,27856,20484,0
6,24158,26785,0


In [7]:
# Combine the positive and negative samples into a single DataFrame.
# concat is called without ignore_index, so each frame keeps its own row labels
# and the labels of the negative rows repeat labels already used by positive rows.
# iloc below is positional: the slice straddles the boundary between the last
# positive rows and the first negative rows.

df_combined = pd.concat([df_positive_edges, df_negative_edges])
print(df_combined.shape)
df_combined.iloc[289001:289006]

(578006, 3)


Unnamed: 0,Source,Target,Class
289001,25879,2347,1
289002,25616,2347,1
0,21385,32676,0
1,26650,27849,0
2,19264,17030,0


# Task 2 - Extract Features from Graph

- *From the combined pandas dataframe, generate a directed graph using NetworkX*
- *Use pagerank() func from NetworkX to generate PageRank values for Source and Target nodes and save them*
- *Compute non-trivial shortest path lengths for NetworkX nodes and save to dataset*
- *Use DiGraph's predecessors() and successors() methods to compute following and followers*
- *Use found values and intersection() to calculate mutual following and followers between source and target nodes*
- *Save all followship features to the network DataFrame*
- *Save the dataset to a csv*

In [8]:
# Generate a directed graph from the combined DataFrame.
# Each (Source, Target) row becomes one directed edge Source -> Target.
# NOTE(review): df_combined also contains the Class 0 (missing-edge) rows, so
# those pairs are inserted into the graph as edges as well -- confirm this is
# intended before interpreting the graph-derived features.

graph = nx.from_pandas_edgelist(df_combined[['Source', 'Target']], source='Source', target='Target', create_using=nx.DiGraph())

In [9]:
# Compute PageRank for every node with NetworkX's built-in function, then attach
# the score of each row's Source and Target node as two new columns.
# Nodes without a PageRank entry fall back to a score of 0.

page_rank_values = nx.pagerank(graph, alpha=0.85)
page_rank_columns = {'Page_Rank_Source': 'Source', 'Page_Rank_Target': 'Target'}
for score_column, node_column in page_rank_columns.items():
    df_combined[score_column] = df_combined[node_column].apply(
        lambda node: page_rank_values.get(node, 0)
    )

In [10]:
# Shortest path length between a Source and a Target node, ignoring their direct edge
def get_shortest_path(source, target):
    """Return the shortest path length from source to target in the global graph.

    If a direct edge source -> target exists it is removed for the duration of
    the search, so the result is the length of the best alternative route
    rather than the trivial value 1. The edge is always put back afterwards.

    Parameters:
        source: node the path starts from.
        target: node the path ends at.

    Returns:
        The path length, or -1 if no path exists.

    Only nx.NetworkXNoPath is handled; any other NetworkX error propagates.
    """
    had_direct_edge = graph.has_edge(source, target)
    if had_direct_edge:
        # Keep the edge's attributes so the restored edge is identical.
        edge_attrs = dict(graph[source][target])
        graph.remove_edge(source, target)  # Remove direct edge temporarily
    try:
        return nx.shortest_path_length(graph, source=source, target=target)
    except nx.NetworkXNoPath:
        return -1  # Return -1 if no path exists
    finally:
        # Restore the edge on every exit path. Without this, a "no path" result
        # would leave the shared graph permanently missing the edge and skew
        # every feature computed afterwards.
        if had_direct_edge:
            graph.add_edges_from([(source, target, edge_attrs)])

In [11]:
# Compute the shortest path length for every (Source, Target) pair and store it
# in the DataFrame.
# The two id columns are iterated directly instead of using a row-wise apply:
# a row of this frame mixes integer ids with float PageRank scores, so building
# one Series per row would hand the node ids over as floats and is much slower.

df_combined['Shortest_Path'] = [
    get_shortest_path(source, target)
    for source, target in zip(df_combined['Source'], df_combined['Target'])
]

In [12]:
# Calculate followship features
def extract_followship_features():
    """Compute follower/following counts for every row of df_combined.

    For each (Source, Target) pair the global graph is queried for the
    predecessors (followers) and successors (following) of both nodes.

    Returns:
        A tuple of six lists, each with one entry per row of df_combined, in
        this order: source followers, source following, target followers,
        target following, mutual followers, mutual following. The "mutual"
        values are the sizes of the intersections of the two nodes'
        predecessor sets and successor sets respectively.
    """
    source_followers, source_following = [], []
    target_followers, target_following = [], []
    mutual_followers, mutual_following = [], []

    # Iterate the two id columns directly: iterrows() would build a Series per
    # row and convert the integer node ids to floats in this mixed-dtype frame.
    for source, target in zip(df_combined['Source'], df_combined['Target']):

        # Followers (predecessors) and following (successors) of the source node
        source_predecessors = set(graph.predecessors(source))
        source_successors = set(graph.successors(source))

        # Followers (predecessors) and following (successors) of the target node
        target_predecessors = set(graph.predecessors(target))
        target_successors = set(graph.successors(target))

        # Per-node counts
        source_followers.append(len(source_predecessors))
        source_following.append(len(source_successors))
        target_followers.append(len(target_predecessors))
        target_following.append(len(target_successors))

        # Nodes that follow both endpoints / that both endpoints follow
        mutual_followers.append(len(source_predecessors & target_predecessors))
        mutual_following.append(len(source_successors & target_successors))

    return source_followers, source_following, target_followers, target_following, mutual_followers, mutual_following

In [13]:
# Store each followship feature list as its own column of the DataFrame.
# The column order matches the order of the lists returned by
# extract_followship_features().

followship_features = extract_followship_features()
followship_columns = (
    'Source_Followers',
    'Source_Following',
    'Target_Followers',
    'Target_Following',
    'Mutual_Followers',
    'Mutual_Following',
)
for column_name, column_values in zip(followship_columns, followship_features):
    df_combined[column_name] = column_values

In [14]:
# Preview the first ten rows of the DataFrame with all extracted features attached.

df_combined.head(10)

Unnamed: 0,Source,Target,Class,Page_Rank_Source,Page_Rank_Target,Shortest_Path,Source_Followers,Source_Following,Target_Followers,Target_Following,Mutual_Followers,Mutual_Following
0,0,23977,1,1.2e-05,2.7e-05,3,6,6,26,16,0,0
1,1,34526,1,1.4e-05,3.1e-05,4,9,14,10,7,0,0
2,1,2370,1,1.4e-05,1.8e-05,3,9,14,14,47,0,1
3,1,14683,1,1.4e-05,5.1e-05,4,9,14,54,72,0,1
4,1,29982,1,1.4e-05,0.000383,2,9,14,395,109,0,0
5,1,21142,1,1.4e-05,0.000314,2,9,14,393,306,0,1
6,1,20363,1,1.4e-05,8.8e-05,3,9,14,138,276,0,2
7,1,23830,1,1.4e-05,3.5e-05,3,9,14,32,18,0,0
8,1,34035,1,1.4e-05,1.5e-05,4,9,14,10,10,0,0
9,6067,19720,1,2.2e-05,2.2e-05,4,10,6,16,38,0,0


In [15]:
# Save the full feature dataset to CSV.
# NOTE(review): to_csv is called without index=..., so the pandas default applies
# and the row index is written as the first column. That index is not unique
# (positive and negative rows share labels) -- confirm downstream readers expect it.

df_combined.to_csv('./data/github-full-dataset.csv')