Features and models used are based on: [Link Prediction in Social Networks using
Computationally Efficient Topological Features](https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=9f3c2d5364aab82a24e24e56f6013cfc4c404e13)

In [119]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import torch.nn as nn
import torch
import torch.optim as optim


We can ignore node labels as we don't want predictions to be made based on them.

In [120]:
def prep_data(path, train_percent):
    data = pd.read_csv(path)
    data.head()
    data = data.drop(['node_1', 'node_2'], axis = 'columns')
    label = data['link_exists']
    train_data = data.drop(['link_exists'], axis = 'columns')
    x_train, x_test, y_train, y_test = train_test_split(train_data, label, test_size=1-train_percent, random_state=42)
    return x_train, y_train, x_test, y_test


Prepare data for PyTorch (transformation from pandas dataframe to PyTorch tensor)

In [121]:
def prep_data_for_NN(x_train, y_train, x_test, y_test):
    x_train = pd.DataFrame.to_numpy(x_train)
    y_train = pd.Series.to_numpy(y_train)
    x_test = pd.DataFrame.to_numpy(x_test)
    y_test = pd.Series.to_numpy(y_test)

    x_train = torch.from_numpy(x_train).float()
    y_train = torch.from_numpy(y_train).float()
    x_test = torch.from_numpy(x_test).float()
    y_test = torch.from_numpy(y_test).float()
    print(x_train.shape)
    print(y_train.shape)
    print(x_test.shape)
    print(y_test.shape)
    print(x_train[0])
    print(y_train[0])


    # x_train = torch.tensor(x_train.values, dtype=torch.float32)
    # y_train = torch.tensor(y_train.values, dtype=torch.float32)
    # x_test = torch.tensor(x_test.values, dtype=torch.float32)
    # y_test = torch.tensor(y_test.values, dtype=torch.float32)

    return x_train, y_train, x_test, y_test
    

Prepare a dataframe to collect training results

In [122]:
model_performance_data = pd.DataFrame(columns=['Dataset' ,'Model', 'Percentage of data in training', 'Train Accuracy','Test Accuracy'])

In [123]:
def train_model(model, x_train, y_train, x_test, y_test):
    model.fit(x_train, y_train)
    acc_score_train = accuracy_score(y_train, model.predict(x_train))
    acc_score_test = accuracy_score(y_test, model.predict(x_test))
    return acc_score_train, acc_score_test

In [124]:
def model_performance(dataset, model, name, x_train, y_train, x_test, y_test, percentage_of_data_in_training):
    accuracy_score_train, accuracy_score_test = train_model(model, x_train, y_train, x_test, y_test)
    model_performance_data.loc[len(model_performance_data.index)] = [dataset, name, percentage_of_data_in_training,
                                                                      accuracy_score_train, accuracy_score_test]

In [125]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer_1 = nn.Linear(12, 64)
        self.layer_2 = nn.Linear(64, 64)
        self.layer_3 = nn.Linear(64, 1)

    def forward(self, x):
        return torch.sigmoid(self.layer_3(torch.relu(self.layer_2(torch.relu(self.layer_1(x))))))

In [126]:
#device for PyTorch
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [127]:
data = ['data/CondMat.csv', 'data/GenRel.csv', 'data/ErdosReny.csv', 'data/BarabasiAlbert.csv']

percentages = [0.1, 0.3, 0.5, 0.7, 0.9]

for dataset in data:
    for percent in percentages:
        x_train, y_train, x_test, y_test = prep_data(dataset, percent)
        model = GaussianNB()
        model_performance(dataset, model, 'GaussianNB', x_train, y_train, x_test, y_test, percent)
        model = DecisionTreeClassifier()
        model_performance(dataset, model, 'DecisionTreeClassifier', x_train, y_train, x_test, y_test, percent)
        model = KNeighborsClassifier()
        model_performance(dataset, model, 'KNeigborsClassifier', x_train, y_train, x_test, y_test, percent)
        model = AdaBoostClassifier(algorithm='SAMME')
        model_performance(dataset, model, 'AdaBoostClassifier', x_train, y_train, x_test, y_test, percent)
        model = BaggingClassifier()
        model_performance(dataset, model, 'BaggingClassifier', x_train, y_train, x_test, y_test, percent)
        model_performance_data.head()
        x_train, y_train, x_test, y_test = prep_data_for_NN(x_train, y_train, x_test, y_test)
        model = NeuralNetwork()

        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        for epoch in range(100):
            
            outputs = model(x_train)
            loss = criterion(outputs, y_train)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


        model.eval()

        with torch.no_grad():
            y_pred = model(x_train)
            train_accuracy = accuracy_score(y_train, y_pred)
            y_pred = model(x_test)
            test_accuracy = accuracy_score(y_test, y_pred)
            model_performance_data.loc[len(model_performance_data.index)] = [dataset, 'NeuralNetwork', percent, train_accuracy, test_accuracy]

model_performance_data.to_csv('model_performance.csv', index=False)

torch.Size([18268, 12])
torch.Size([18268])
torch.Size([164416, 12])
torch.Size([164416])
tensor([11.0000,  7.0000,  0.6061,  0.5273,  0.7857,  0.7143,  0.0000, 18.0000,
         0.0000, 77.0000,  0.0000,  4.0000])
tensor(0.)


ValueError: Using a target size (torch.Size([18268])) that is different to the input size (torch.Size([18268, 1])) is deprecated. Please ensure they have the same size.