In [63]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [64]:
# Directory (relative to the working directory) that holds the input files read below:
# attributes.csv, solutionInput.csv and edges_train.edgelist.
data_path = "./fds-link-prediction-madhura/archive"

In [65]:
# Load the input CSV files from data_path.
df_attributes = pd.read_csv(f"{data_path}/attributes.csv")
# NOTE(review): this reads attributes.csv a second time, so df_predictions is an identical
# copy of df_attributes. Looks like a copy/paste slip -- confirm which file was intended.
df_predictions = pd.read_csv(f"{data_path}/attributes.csv")
# Node pairs to score; the columns 'int1' and 'int2' are used as the pair endpoints below.
df_solutionInput = pd.read_csv(f"{data_path}/solutionInput.csv")

In [66]:
# Save the solutionInput node pairs as test_edges.edgelist:
# one "node1,node2" line per pair, without header or index column.
test_edges = df_solutionInput.loc[:, ['int1', 'int2']].astype(str).values.tolist()
df_edges = pd.DataFrame(data=test_edges, columns=['node1', 'node2'])
df_edges.to_csv('test_edges.edgelist', header=False, index=False)

In [67]:
# Import both edge lists as graphs.
# Node ids are parsed as strings in BOTH graphs: with mixed types the id "5" in G and the
# id 5 in G_test would be different nodes, so a test pair could never be looked up in G.
G = nx.read_edgelist(f"{data_path}/edges_train.edgelist", delimiter=',', nodetype=str)
G_test = nx.read_edgelist("test_edges.edgelist", delimiter=',', nodetype=str, data=False)

In [68]:
def getFeature(G, i, j):
    """Build the topological feature vector for the node pair ``(i, j)`` in ``G``.

    The returned 1-D ``np.ndarray`` holds, in this order:
      0. preferential attachment -- product of the two nodes' neighbor counts
      1. number of common neighbors
      2. Jaccard coefficient of the pair
    """
    # Count neighbors by walking the neighbor iterators directly.
    neighbors_i = sum(1 for _ in G.neighbors(i))
    neighbors_j = sum(1 for _ in G.neighbors(j))
    pref_attachment = neighbors_i * neighbors_j

    shared_count = sum(1 for _ in nx.common_neighbors(G, i, j))

    # jaccard_coefficient yields (u, v, score) triples; exactly one pair is requested.
    _, _, jaccard_score = next(iter(nx.jaccard_coefficient(G, [(i, j)])))

    return np.array([pref_attachment, shared_count, jaccard_score])

def extract_edge_features(graph, edges):
    """Stack the ``getFeature`` vector of every ``(i, j)`` pair in ``edges`` into one array.

    Rows follow the iteration order of ``edges``.
    """
    per_edge = [getFeature(graph, src, dst) for src, dst in edges]
    return np.array(per_edge)

In [69]:

def extract_features(graph, nodes):
    """Return an array with one ``[degree, clustering coefficient]`` row per node in ``nodes``."""
    rows = [[graph.degree(node), nx.clustering(graph, node)] for node in nodes]
    return np.array(rows)

# Prepare features for the known (training) edges and for the candidate (test) pairs.
train_edges = list(G.edges())
# Cast ids to str so the candidate pairs use the same node-id type as G (read with nodetype=str).
test_edges = [(str(u), str(v)) for u, v in G_test.edges()]

X_train = extract_edge_features(G, train_edges)  # Features for train edges
y_train = [1] * len(train_edges)  # Every observed edge is a positive example

# Candidate pairs must be described by the topology of the TRAINING graph. G_test contains
# only the candidate pairs themselves, so features computed on it say nothing about the
# observed network and do not match the features the model is trained on.
unseen_nodes = {node for pair in test_edges for node in pair if node not in G}
if unseen_nodes:
    # Work on a copy so the training graph (and later negative sampling) stays untouched.
    # An isolated node yields 0 for all three pair features.
    print(f"{len(unseen_nodes)} test node(s) are absent from the training graph; treating them as isolated.")
    G_eval = G.copy()
    G_eval.add_nodes_from(unseen_nodes)
else:
    G_eval = G

X_test = extract_edge_features(G_eval, test_edges)  # Features for test pairs
# NOTE(review): placeholder only -- the true labels of the test pairs are not available here,
# so y_test must not be used to measure accuracy.
y_test = [1] * len(test_edges)

In [72]:
# All train edges are positive, so we create some negative samples. This is not needed for the test data since it's already 50/50.

def generate_negative_samples(graph, num_samples):
    """Draw ``num_samples`` distinct node pairs that are not edges of ``graph``.

    Pairs are unordered: ``(u, v)`` and ``(v, u)`` count as the same sample, and a
    node is never paired with itself. Randomness comes from NumPy's global RNG, so
    ``np.random.seed`` makes the result reproducible.

    Parameters
    ----------
    graph : simple undirected graph exposing ``nodes()``, ``has_edge(u, v)`` and
        ``number_of_edges()`` (e.g. ``networkx.Graph``).
    num_samples : int
        Number of non-edges to return; values <= 0 give an empty list.

    Returns
    -------
    list of ``(u, v)`` tuples holding the graph's original node objects.

    Raises
    ------
    ValueError
        If the graph has fewer than ``num_samples`` unconnected node pairs
        (the request could otherwise never be satisfied).
    """
    if num_samples <= 0:
        return []

    nodes = list(graph.nodes())
    n_nodes = len(nodes)

    # Fail fast on an impossible request instead of looping forever.
    # Self-loops are edges but not node *pairs*, so they are excluded from the edge count.
    self_loops = sum(1 for node in nodes if graph.has_edge(node, node))
    n_non_edges = n_nodes * (n_nodes - 1) // 2 - (graph.number_of_edges() - self_loops)
    if num_samples > n_non_edges:
        raise ValueError(
            f"Requested {num_samples} negative samples but the graph only has "
            f"{n_non_edges} unconnected node pairs."
        )

    negative_samples = []
    seen_pairs = set()
    while len(negative_samples) < num_samples:
        # Sample positions rather than node objects: this is O(1) per draw and keeps the
        # original node types (np.random.choice on the node list would coerce them).
        a, b = (int(k) for k in np.random.randint(0, n_nodes, size=2))
        if a == b:
            continue
        pair_key = (a, b) if a < b else (b, a)
        if pair_key in seen_pairs:
            continue
        u, v = nodes[a], nodes[b]
        if graph.has_edge(u, v):
            continue
        seen_pairs.add(pair_key)
        negative_samples.append((u, v))
    return negative_samples

# Generate negative samples.
# Seed NumPy's global RNG so the sampled non-edges are the same on every run.
np.random.seed(42)
# `train_edges` and `G` are the names defined earlier in this notebook; the previous
# `edges` / `g` are not defined anywhere and raise NameError on a fresh kernel.
num_negative_samples = len(train_edges)  # Same size as the positive set -> balanced classes
negative_edges = generate_negative_samples(G, num_negative_samples)
X_negative = extract_edge_features(G, negative_edges)  # Same features as for the positive edges
y_negative = [0] * len(negative_edges)  # Label for negative edges (0 for no link)

# Combine positive and negative samples.
# The positives are always the first len(train_edges) rows, so slicing them out first keeps
# this cell idempotent: re-running it does not stack another batch of negatives on top.
X_positive = X_train[:len(train_edges)]
y_positive = y_train[:len(train_edges)]
X_train = np.vstack((X_positive, X_negative))
y_train = np.concatenate((y_positive, y_negative))

In [82]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Baseline classifier. Alternatives that were tried instead:
#   SVC(kernel='rbf', random_state=42)
#   KNeighborsClassifier(n_neighbors=5)
model = LogisticRegression(random_state=42)

# 5-fold cross-validated accuracy on the combined positive/negative training pairs.
cv_scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='accuracy', cv=5)
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Cross-Validation Accuracy: {np.mean(cv_scores):.4f}")

Cross-Validation Accuracy Scores: [0.75530303 0.78787879 0.74469697 0.77083333 0.72765152]
Mean Cross-Validation Accuracy: 0.7573
