In [None]:
import random
import csv
import datetime
import os
import shutil
import sys
import gzip

import requests
import networkx as nx
import numpy as np
import matplotlib.pyplot as pl

from networkx import resource_allocation_index, jaccard_coefficient, adamic_adar_index, preferential_attachment
from sklearn import svm
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [None]:
# If dataset does not exist, download it.
#
# The archive is streamed to disk in small chunks with a text progress bar,
# then decompressed to "twitter_combined.txt" and the .gz file is removed.

if not os.path.exists("twitter_combined.txt"):
    url = "http://snap.stanford.edu/data/twitter_combined.txt.gz"
    local_filename = "twitter_combined.txt.gz"

    print("Dataset not found!")
    # Use the response as a context manager so the connection is always
    # released, and fail fast on HTTP errors instead of saving an error page.
    with requests.get(url, stream=True, timeout=30) as r:
        r.raise_for_status()
        # The header may be absent (e.g. chunked transfer); 0 means "unknown".
        total_size = int(r.headers.get('content-length', 0))
        downloaded = 0
        chunk_size = 1024
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                downloaded += len(chunk)
                f.write(chunk)
                progress = str(round(downloaded / 1048576, 2))
                if total_size > 0:
                    # Clamp so the bar never overflows its 50-character width.
                    done = min(50, int(50 * downloaded / total_size))
                    total = str(round(total_size / 1048576, 2))
                    sys.stdout.write("\rDownloading %s: [%s%s%s] %s/%sMB" % (local_filename, '=' * done, ">", ' ' * (50 - done), progress, total))
                else:
                    # Unknown total size: show only the amount downloaded so far
                    # (avoids dividing by zero).
                    sys.stdout.write("\rDownloading %s: %sMB" % (local_filename, progress))
                sys.stdout.flush()
    print("\nDownloaded! Extracting GZip...")
    with gzip.open(local_filename, 'rb') as gz:
        with open("twitter_combined.txt", 'wb') as twt:
            shutil.copyfileobj(gz, twt)
    print("Extracted Successfully! Deleting the GZip...")
    os.remove(local_filename)
    print("Done")

In [None]:
# If output directories do not exist, create them.
#
# Each directory is checked on its own: requiring BOTH to be missing would
# skip creation when only one of them exists and break later writes.

missing_dirs = [d for d in ("data", "graphs") if not os.path.isdir(d)]
if missing_dirs:
    print("Creating output directories...")
    for d in missing_dirs:
        os.makedirs(d, exist_ok=True)
    print("Done")

In [None]:
# Create graph from dataset [dataset taken from: http://snap.stanford.edu/data/ego-Twitter.html]
# and write the GraphML file for visual representation of the graph.
#
# The edge list is read into an undirected nx.Graph (read_edgelist's default
# graph type); node labels are kept as strings.

print("Reading dataset and creating graph...")
# read_edgelist accepts a path and opens/closes the file itself.
graph = nx.read_edgelist("twitter_combined.txt")
print("Number of Nodes:", graph.number_of_nodes())
print("Number of Edges:", graph.number_of_edges())

print("\nWriting GraphML file...")
nx.write_graphml(graph, "graphs/base.graphml")
print("Graph saved in graphs/base.graphml")


In [None]:
# Downscale the graph to 1% for faster training of the model during demonstration.
#
# SAMPLE_FRACTION is the share of the graph's edges used as positive samples
# (and the number of negative pairs drawn to match); raise it for a fuller run.

SAMPLE_FRACTION = 0.01
n = int(SAMPLE_FRACTION * graph.number_of_edges())

In [None]:
# Create 'n' number of random estimated edges.
#
# Draw `n` existing edges uniformly at random; these are the positive examples
# (edges labelled positive="True") and are saved as an edge list and as GraphML.

print("Estimating links between unconnected nodes...")
# random.sample needs a sequence: passing the EdgeView directly raises
# TypeError on Python 3.11+, so materialise it as a list first.
positive = random.sample(list(graph.edges()), n)
p_sample = nx.Graph()
p_sample.add_edges_from(positive, positive="True")
print("Number of Nodes: ", p_sample.number_of_nodes())
print("Number of Edges: ", p_sample.number_of_edges())

nx.write_edgelist(p_sample, "data/positive_sample.txt", data=["positive"])
print("Data saved in data/positive_sample.txt")

print("\nWriting GraphML file...")
nx.write_graphml(p_sample, "graphs/positive_sample.graphml")
print("Graph saved in graphs/positive_sample.graphml")

In [None]:
# Find out pairs of nodes that are not connected to each other.
#
# Draw random node pairs and keep those with no edge between them in `graph`
# until exactly `n` distinct negative examples (positive="False") are collected.

print("Looking for truly unconnected nodes...")
# Materialise the node list once: random.sample needs a sequence (a NodeView
# raises TypeError on Python 3.11+), and rebuilding it per draw is wasteful.
nodes = list(graph.nodes())

# Guard against an endless loop when the graph has fewer than `n` non-edges.
max_negative = (
    len(nodes) * (len(nodes) - 1) // 2
    - (graph.number_of_edges() - nx.number_of_selfloops(graph))
)
if n > max_negative:
    raise ValueError(
        "Cannot draw %d unconnected pairs: only %d exist in the graph" % (n, max_negative)
    )

n_sample = nx.Graph()
# Loop on the number of stored edges, not on the number of accepted draws:
# a pair drawn twice adds nothing new and must not count towards `n`.
while n_sample.number_of_edges() < n:
    u, v = random.sample(nodes, 2)
    if not graph.has_edge(u, v):
        n_sample.add_edge(u, v, positive="False")

# Snapshot the edges as a list. `n_sample.edges()` is a live view, so edges
# added to `n_sample` afterwards would otherwise appear in `negative` as well.
negative = list(n_sample.edges())

print("Number of Nodes: ", n_sample.number_of_nodes())
print("Number of Edges: ", n_sample.number_of_edges())

nx.write_edgelist(n_sample, "data/negative_sample.txt", data=["positive"])
print("Data saved in data/negative_sample.txt")

print("\nWriting GraphML file...")
nx.write_graphml(n_sample, "graphs/negative_sample.graphml")
print("Graph saved in graphs/negative_sample.graphml")

In [None]:
# Create a combined graph from both the positive and negative graphs.
#
# The combination is built on a copy so the negative-only graph `n_sample`
# stays intact, and the statistics printed are those of the combined graph.

print("Combining positive and negative graphs...")
combined_sample = n_sample.copy()
combined_sample.add_edges_from(positive, positive="True")
print("Number of Nodes: ", combined_sample.number_of_nodes())
print("Number of Edges: ", combined_sample.number_of_edges())

nx.write_edgelist(combined_sample, "data/combined_sample.txt", data=["positive"])
print("Data saved in data/combined_sample.txt")

print("\nWriting GraphML file...")
nx.write_graphml(combined_sample, "graphs/combined_sample.graphml")
print("Graph saved in graphs/combined_sample.graphml")

In [None]:
# Prepare the training model by removing the positive (assumed) edges from the original graph.
#
# `graph` is modified in place: after this cell it no longer contains the
# sampled positive edges (their endpoint nodes remain), and the statistics
# printed below describe this reduced training graph.

print("Preparing training model...")
graph.remove_edges_from(positive)
print("Number of Nodes: ", graph.number_of_nodes())
print("Number of Edges: ", graph.number_of_edges())

nx.write_edgelist(graph, "data/training_model.txt", data=False)
print("Data saved in data/training_model.txt")

print("\nWriting GraphML file...")
nx.write_graphml(graph, "graphs/training_model.graphml")
print("Graph saved in graphs/training_model.graphml")

In [None]:
def common_neighbors(G, edges) -> list:
    """Count the common neighbors of the endpoints of each node pair.

    Parameters
    ----------
    G : Graph
        The graph in which neighbors are looked up. It must support
        ``node in G`` and ``G.neighbors(node)``.
    edges : iterable
        Node pairs ``(x, y)`` to score; only the first two items of each
        entry are used. The pairs do not need to be edges of `G`.

    Returns
    -------
    cn_list : list
        One ``(x, y, count)`` tuple per scored pair, in input order, where
        ``count`` is the number of nodes adjacent to both ``x`` and ``y``.
        Pairs with an endpoint that is not in `G` are skipped.

    Examples
    --------
    >>> CommonNeighbors = common_neighbors(graph, graph.edges())
    """
    cn_list = []
    for edge in edges:
        x, y = edge[0], edge[1]
        # Skip pairs that reference nodes missing from the graph.
        if x not in G or y not in G:
            continue
        # G.neighbors() may return a one-shot iterator; repeated `in` tests
        # against it consume it and undercount, so compare sets instead.
        shared = set(G.neighbors(x)) & set(G.neighbors(y))
        cn_list.append((x, y, len(shared)))
    return cn_list

In [None]:
# Link-prediction scorers that make up the feature set, one feature column
# per entry and in this order. Each one is called as method(graph, pairs) and
# its results are read as (u, v, score) triples.
scoring_methods = [
    common_neighbors, resource_allocation_index, jaccard_coefficient,
    adamic_adar_index, preferential_attachment,
]

In [None]:
# Creating the feature set by applying the scoring method.
#
# Every scoring method contributes one column: a header (the method name)
# followed by the scores of all positive pairs and then all negative pairs.
# The last column is the class label ("1" = positive, "0" = negative).

print("Constructing feature set...")
label = ["Label"] + ["1" for _ in positive] + ["0" for _ in negative]

data = []
for method in scoring_methods:
    print("Using method:", method.__name__)
    feature = [method.__name__]
    for pairs in (positive, negative):
        feature.extend(score for _, _, score in method(graph, pairs))

    # A scorer that drops pairs would shift its column against the labels,
    # and zip() below would silently truncate: fail loudly instead.
    if len(feature) != len(label):
        raise ValueError(
            "%s returned %d scores for %d samples"
            % (method.__name__, len(feature) - 1, len(label) - 1)
        )
    data.append(feature)

data.append(label)
data = [list(row) for row in zip(*data)] # transposing the data

print("Writing data to CSV file...")
# newline="" lets the csv module control line endings (no blank rows on Windows).
with open("feature_set.csv", "w", newline="") as outfile:
    csv.writer(outfile).writerows(data)
print("Feature set saved as feature_set.csv")

In [None]:
# Load and shuffle the feature set.
#
# r : 2-D float array, one row per sample (the CSV header row is skipped)
# l : number of rows, b : number of columns (the last column is the label)

# Passing the path lets NumPy open and close the file itself; ndmin=2 keeps
# `r` two-dimensional even if the file holds a single data row.
r = np.loadtxt("feature_set.csv", delimiter=",", skiprows=1, ndmin=2)
l, b = r.shape
np.random.shuffle(r)  # shuffles rows in place; unseeded, so order differs per run

In [None]:
# Split the feature set for training and testing.
#
# The first 75% of the (already shuffled) rows form the training split, the
# rest the test split. All columns except the last are features; the last
# column is the label.

model_len = int(0.75 * l)

X_train = r[:model_len, :b - 1]
Y_train = r[:model_len, b - 1]

X_test = r[model_len:, :b - 1]
Y_test = r[model_len:, b - 1]

# Max-normalise each feature column using the TRAINING maxima only, and apply
# those same divisors to the test split. Normalising the test split with its
# own maxima would put the two splits on different scales.
X_train, feature_norms = normalize(X_train, axis=0, norm="max", return_norm=True)
X_test = X_test / feature_norms

# Standardise with statistics fitted on the training split.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fit a Support Vector Machine classifier (SVC with default settings) on the
# training split, then report accuracy and a confusion matrix on the test split.
# The reported time covers fitting, prediction and plotting.

start_time = datetime.datetime.now()

print("Running SVM Classifier (Start time:", start_time.strftime(r"%H:%M:%S.%f") + ")")

classifier = svm.SVC()
classifier.fit(X_train, Y_train)

res = classifier.predict(X_test)

print("SVM Accuracy:", 100 * accuracy_score(Y_test, res), "%")

ConfusionMatrixDisplay.from_predictions(Y_test, res)
pl.show()

end_time = datetime.datetime.now()
total_time = end_time - start_time
print("End time:", end_time.strftime(r"%H:%M:%S.%f"))
print("Time taken:", total_time)

In [None]:
# Training using Logistic Regression.
#
# Fits the model on the training split, then reports accuracy and a confusion
# matrix on the test split. The reported time covers fitting, prediction and
# plotting.

start_time = datetime.datetime.now()

print("Running Linear Classification using Logistic Regression (Start time:", datetime.datetime.strftime(start_time, r"%H:%M:%S.%f") + ")")

# `multi_class="ovr"` is omitted: the labels are binary, where it makes no
# difference, and the parameter is deprecated in recent scikit-learn releases.
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, Y_train)

res = classifier.predict(X_test)

print("Linear accuracy:", accuracy_score(Y_test, res) * 100, "%")

ConfusionMatrixDisplay.from_predictions(Y_test, res)
pl.show()

end_time = datetime.datetime.now()
total_time = end_time - start_time
print("End Time:", datetime.datetime.strftime(end_time, r"%H:%M:%S.%f"))
print("Time taken:", total_time)

In [None]:
# Fit a Multi-Layer Perceptron (MLPClassifier) on the training split, then
# report accuracy and a confusion matrix on the test split.
# The reported time covers fitting, prediction and plotting.
# NOTE(review): per the scikit-learn docs `learning_rate` is only used with
# solver="sgd"; the default solver is kept here - confirm this is intended.

start_time = datetime.datetime.now()

print("Running Multi Layer Classification using MLPClassifier (Start time:", start_time.strftime(r"%H:%M:%S.%f") + ")")

classifier = MLPClassifier(random_state=1, learning_rate="adaptive")
classifier.fit(X_train, Y_train)

res = classifier.predict(X_test)

print("Multi Layer Accuracy:", 100 * accuracy_score(Y_test, res), "%")

ConfusionMatrixDisplay.from_predictions(Y_test, res)
pl.show()

end_time = datetime.datetime.now()
total_time = end_time - start_time
print("End Time:", end_time.strftime(r"%H:%M:%S.%f"))
print("Time taken:", total_time)