In [147]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


In [148]:
data_path = "./fds-link-prediction-madhura/archive"

In [149]:
df_attributes = pd.read_csv(f"{data_path}/attributes.csv")
df_predictions = pd.read_csv(f"{data_path}/attributes.csv")
df_solutionInput = pd.read_csv(f"{data_path}/solutionInput.csv")

In [150]:
#change data structure so it can be use in the model
solution_edges = df_solutionInput[['int1', 'int2']].astype(str).values.tolist()

In [151]:
# Load the graphs
G = nx.read_edgelist(f"{data_path}/edges_train.edgelist", delimiter=',', nodetype=str)



In [168]:
def getFeature(G, i, j):
    features = []
    
    # Preferential attachment
    pa = len(list(G.neighbors(i))) * len(list(G.neighbors(j)))
    features.append(pa)
    
    # Common neighbors
    common_neighbors = len(list(nx.common_neighbors(G, i, j)))
    features.append(common_neighbors)
    
    # Jaccard similarity
    jaccard = list(nx.jaccard_coefficient(G, [(i, j)]))[0][2]
    features.append(jaccard)
    
    return np.array(features)

In [169]:
def extract_edge_features(graph, edges):
    features = []
    for (i, j) in edges:
        edge_features = getFeature(graph, i, j)
        features.append(edge_features)
    return np.array(features)

In [170]:
# Prepare features for the edges in the data set
edges = list(G.edges())
X = extract_edge_features(G, edges)
y = [1] * len(edges)

In [171]:
# Generate negative samples for the data set
def generate_negative_samples(graph, num_samples):
    nodes = list(graph.nodes())
    negative_samples = []
    while len(negative_samples) < num_samples:
        u, v = np.random.choice(nodes, size=2, replace=False)
        if not graph.has_edge(u, v):
            negative_samples.append((u, v))
    return negative_samples

num_negative_samples = int(len(train_edges)*1.00)  # % of negative samples
negative_edges = generate_negative_samples(G, num_negative_samples)
X_negative = extract_edge_features(G, negative_edges)
y_negative = [0] * len(negative_edges)

# Combine positive and negative samples for data set
X = np.vstack((X, X_negative))
y = np.concatenate((y, y_negative))

In [172]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [173]:
clf = LogisticRegression(random_state=0, penalty='l2', C=1.5, solver='lbfgs', max_iter=200)
clf.fit(X_train, y_train)

scores = cross_val_score(clf, X_test, y_test, cv=5)

print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))


0.76 accuracy with a standard deviation of 0.01


In [174]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest model
rf_clf = RandomForestClassifier(random_state=0, n_estimators=100, max_depth=10)
rf_clf.fit(X_train, y_train)

# Cross-validation
rf_scores = cross_val_score(rf_clf, X_test, y_test, cv=5)
print("Random Forest: %0.2f accuracy with a standard deviation of %0.2f" % (rf_scores.mean(), rf_scores.std()))

Random Forest: 0.76 accuracy with a standard deviation of 0.01


In [175]:
from sklearn.neighbors import KNeighborsClassifier

# KNN model
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, y_train)

# Cross-validation
knn_scores = cross_val_score(knn_clf, X_test, y_test, cv=5)
print("KNN: %0.2f accuracy with a standard deviation of %0.2f" % (knn_scores.mean(), knn_scores.std()))

KNN: 0.74 accuracy with a standard deviation of 0.02


In [160]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boosting model
gb_clf = GradientBoostingClassifier(random_state=0, n_estimators=100, learning_rate=0.1)
gb_clf.fit(X_train, y_train)

# Cross-validation
gb_scores = cross_val_score(gb_clf, X_test, y_test, cv=5)
print("Gradient Boosting: %0.2f accuracy with a standard deviation of %0.2f" % (gb_scores.mean(), gb_scores.std()))

Gradient Boosting: 0.75 accuracy with a standard deviation of 0.02


In [161]:

solution_edges = df_solutionInput[['int1', 'int2']].astype(str).values.tolist()
X_solution = extract_edge_features(G, solution_edges)

predictions = knn_clf.predict(X_solution)

df_solutionInput['prediction'] = predictions

#df_solutionInput.to_csv('predicted_solution.csv', index=False)

df_solutionInput.head()

num_positive_predictions = df_solutionInput['prediction'].sum()

print(f"Number of positive predictions: {num_positive_predictions}")


Number of positive predictions: 633
