In [4]:
import dask.dataframe as dd
from dask_ml.model_selection import train_test_split
from dask_ml.linear_model import LogisticRegression
import mlflow
import mlflow.sklearn
import networkx as nx

In [5]:
mlflow.start_run()
df = dd.read_csv("train.csv")


In [6]:
df.head()

Unnamed: 0,source_node,destination_node
0,1,690569
1,1,315892
2,1,189226
3,2,834328
4,2,1615927


In [7]:
# Convert the source and destination nodes to Dask arrays
source_nodes = df['source_node'].to_dask_array(lengths=True)
destination_nodes = df['destination_node'].to_dask_array(lengths=True)

# Create a directed graph using NetworkX
G = nx.DiGraph()
edges = [(source, destination) for source, destination in zip(source_nodes.compute(), destination_nodes.compute())]
G.add_edges_from(edges)

In [9]:

# Calculate Common Neighbors
def common_neighbors_score(G, u, v):
    common_neighbors = set(G.successors(u)).intersection(G.successors(v))
    return len(common_neighbors)

In [10]:

# Calculate Jaccard's Coefficient
def jaccard_coefficient_score(G, u, v):
    u_neighbors = set(G.successors(u))
    v_neighbors = set(G.successors(v))
    intersection = u_neighbors.intersection(v_neighbors)
    union = u_neighbors.union(v_neighbors)
    return len(intersection) / len(union)


In [11]:
# Calculate Preferential Attachment
def preferential_attachment_score(G, u, v):
    u_neighbors = set(G.successors(u))
    v_neighbors = set(G.successors(v))
    return len(u_neighbors) * len(v_neighbors)

In [12]:


# Calculate scores for all pairs of nodes
for u, v in G.edges():
    common_neighbors = common_neighbors_score(G, u, v)
    jaccard_coefficient = jaccard_coefficient_score(G, u, v)
    preferential_attachment = preferential_attachment_score(G, u, v)
    
    print(f"Nodes {u} and {v}:")
    print(f"Common Neighbors Score: {common_neighbors}")
    print(f"Jaccard's Coefficient Score: {jaccard_coefficient}")
    print(f"Preferential Attachment Score: {preferential_attachment}")
    print("-------------")

# End the MLflow run
mlflow.end_run()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Nodes 57308 and 1484810:
Common Neighbors Score: 0
Jaccard's Coefficient Score: 0.0
Preferential Attachment Score: 0
-------------
Nodes 57308 and 609638:
Common Neighbors Score: 0
Jaccard's Coefficient Score: 0.0
Preferential Attachment Score: 0
-------------
Nodes 57308 and 223152:
Common Neighbors Score: 0
Jaccard's Coefficient Score: 0.0
Preferential Attachment Score: 0
-------------
Nodes 57308 and 978696:
Common Neighbors Score: 0
Jaccard's Coefficient Score: 0.0
Preferential Attachment Score: 0
-------------
Nodes 57308 and 338322:
Common Neighbors Score: 0
Jaccard's Coefficient Score: 0.0
Preferential Attachment Score: 0
-------------
Nodes 57310 and 912267:
Common Neighbors Score: 0
Jaccard's Coefficient Score: 0.0
Preferential Attachment Score: 0
-------------
Nodes 57310 and 1078685:
Common Neighbors Score: 0
Jaccard's Coefficient Score: 0.0
Preferential Attachment Score: 0
-------------
Nodes 57310 and 517441: