In [19]:
import os
import numpy as np
import pandas as pd
import json
from itertools import product
import random

from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score

import torch
import torch.nn.functional as F

from torch_geometric.nn import GCNConv
from torch_geometric.utils import train_test_split_edges, negative_sampling
from torch_geometric.data import Data, DataLoader, Dataset
from torch_geometric.data.dataset import Dataset


from transformers import BertTokenizer, BertModel

In [20]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder once at module
# level; get_bert_embeddings() reads both of these globals on every call.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
def get_bert_embeddings(text):
    """Return a mean-pooled BERT embedding for ``text``.

    The text is tokenized with truncation and padding enabled, passed through
    the module-level ``model``, and the last hidden state is averaged over
    dimension 1.

    Args:
        text: Input accepted by the module-level ``tokenizer`` (the notebook
            passes one comment body string per call).

    Returns:
        A detached tensor holding the averaged last hidden state.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: the result is detached anyway, so skip autograd
    # bookkeeping instead of building (and then discarding) a graph per call.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach()

In [21]:
def create_bipartite_neg_edges(pos_edges, num_users, num_posts, num_neg_samples ):
    """Sample distinct (user, post) pairs that are not in ``pos_edges``.

    Args:
        pos_edges: Tensor of shape (2, E); column ``i`` is one ``(user, post)``
            positive edge.
        num_users: Number of user nodes; users have ids ``0 .. num_users - 1``.
        num_posts: Number of post nodes; posts have ids
            ``num_users .. num_users + num_posts - 1``.
        num_neg_samples: Number of negative edges to draw.

    Returns:
        A contiguous tensor of shape (2, num_neg_samples) whose columns are
        distinct (user, post) pairs absent from ``pos_edges``.

    Raises:
        ValueError: If fewer than ``num_neg_samples`` distinct negative pairs
            exist, which would otherwise make the rejection loop spin forever.
    """
    pos_edges_set = set(tuple(edge) for edge in pos_edges.T.tolist())

    # Only positives that fall inside the user x post grid reduce the pool.
    blocked = sum(
        1
        for user, post in pos_edges_set
        if 0 <= user < num_users and num_users <= post < num_users + num_posts
    )
    available = num_users * num_posts - blocked
    if num_neg_samples > available:
        raise ValueError(
            f"requested {num_neg_samples} negative edges but only {available} exist"
        )

    seen = set()
    sampled_neg_edges = []

    # The bound is the requested sample count. The previous bound,
    # pos_edges.T.size(1), is always 2 for a (2, E) tensor.
    while len(sampled_neg_edges) < num_neg_samples:
        user = random.randint(0, num_users - 1)
        post = random.randint(num_users, num_users + num_posts - 1)
        edge = (user, post)
        if edge in pos_edges_set or edge in seen:
            continue
        seen.add(edge)
        sampled_neg_edges.append(edge)

    if not sampled_neg_edges:
        # torch.tensor([]) would be a 1-D float tensor; keep the (2, N) contract.
        return torch.empty((2, 0), dtype=torch.long)

    return torch.tensor(sampled_neg_edges).T.contiguous()

In [22]:
class LinkpredictionDataset(Dataset):
    """Labelled edge dataset: positive edges (label 1) followed by negatives (label 0).

    Both constructor arguments are transposed and converted to nested lists,
    so each stored edge is one row of the transposed input.
    """

    def __init__(self, pos_edge_index, neg_edge_index):
        positives = pos_edge_index.T.tolist()
        negatives = neg_edge_index.T.tolist()

        self.pos_edge_index = positives
        self.neg_edge_index = negatives
        # Labels line up with self.edges: all positives first, then negatives.
        self.labels = [1 for _ in positives] + [0 for _ in negatives]
        self.edges = positives + negatives

    def __len__(self):
        """Total number of labelled edges."""
        return len(self.edges)

    def __getitem__(self, idx):
        """Return the ``(edge, label)`` pair at ``idx`` as two tensors."""
        edge = self.edges[idx]
        label = self.labels[idx]
        return torch.tensor(edge), torch.tensor(label)

In [23]:
class LinkPredictor(torch.nn.Module):
    """Six-layer MLP that maps an edge feature vector to a probability.

    The first layer expects inputs of width ``2 * in_channels``; the hidden
    widths are 1024, 512, 256, 128 and 64, and the output is a single
    sigmoid-activated unit.
    """

    def __init__(self, in_channels):
        super().__init__()
        widths = (in_channels * 2, 1024, 512, 256, 128, 64, 1)
        # Layers are registered as lin1 .. lin6, created in that order.
        for position, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f"lin{position}", torch.nn.Linear(fan_in, fan_out))

    def forward(self, edge_features):
        """Apply ReLU after each of the first five layers, sigmoid after the last."""
        hidden = edge_features
        for layer in (self.lin1, self.lin2, self.lin3, self.lin4, self.lin5):
            hidden = F.relu(layer(hidden))
        return torch.sigmoid(self.lin6(hidden))
        

In [24]:
# Load the comments together with their BERT embeddings. Embedding every
# comment is expensive, so the result is cached as JSON strings in a CSV and
# reused on later runs.
if os.path.exists('reddit_comments_with_embedding.csv'):
    data = pd.read_csv('reddit_comments_with_embedding.csv')
else:
    data = pd.read_csv('reddit_comments.csv')
    # Missing bodies are read as NaN (a float), which has no .lower(); treat
    # them as empty text so one bad row does not abort the whole run.
    data['body'] = data['body'].fillna('').astype(str).str.lower()
    data['embeddings'] = data['body'].apply(
        lambda text: json.dumps(get_bert_embeddings(text).tolist())
    )
    # index=False keeps the cache free of a spurious "Unnamed: 0" column on reload.
    data.to_csv('reddit_comments_with_embedding.csv', index=False)

# In both branches the column holds JSON strings here; decode them to tensors.
data['embeddings'] = data['embeddings'].apply(lambda x: torch.tensor(json.loads(x)))

In [25]:
# Build the bipartite graph: one node per distinct "subreddit" value (the
# "user" side, ids 0 .. len(user_ids) - 1) and one node per distinct
# "submission_id" (the "post" side, ids offset by len(user_ids)).
user_ids = data["subreddit"].unique()
post_ids = data["submission_id"].unique()

user_id_map = {raw_id: idx for idx, raw_id in enumerate(user_ids)}
post_id_map = {raw_id: idx + len(user_ids) for idx, raw_id in enumerate(post_ids)}

# Every comment row contributes one edge whose feature is the L2-normalised
# comment embedding. Iterating the three columns directly avoids the per-row
# Series construction of DataFrame.iterrows().
edges = []
edge_features = []
for user_id, post_id, embedding in zip(data["subreddit"], data["submission_id"], data["embeddings"]):
    normalized_embedding = normalize(np.array(embedding))[0]

    edges.append([user_id_map[user_id], post_id_map[post_id]])
    edge_features.append(normalized_embedding)

edge_index = torch.tensor(edges).T.contiguous()
# Stack first: building a tensor from a list of ndarrays is very slow.
edge_attr = torch.tensor(np.stack(edge_features))

num_nodes = len(user_ids) + len(post_ids)
node_features = torch.zeros(num_nodes, edge_attr.size(1))

# A node's feature is the sum of the features of its incident edges, added
# once at the user end and once at the post end of every edge.
incident_features = edge_attr.to(node_features.dtype)
node_features.index_add_(0, edge_index[0], incident_features)
node_features.index_add_(0, edge_index[1], incident_features)

node_features = torch.tensor(normalize(node_features.numpy()))

# NOTE: this rebinds `data` from the comments DataFrame to the graph object,
# so this cell cannot be re-run without first re-running the loading cell.
data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr)

In [26]:
num_users = len(user_ids)
num_posts = len(post_ids)

# Positive edges per split: 20% train, 5% validation, 5% test. Validation is
# drawn only from edges whose (user, post) value is not in train, and test
# only from edges in neither. Membership is checked against sets of tuples
# (same value-equality semantics as the list lookup, without the quadratic scan).
# NOTE: `random` is not seeded here, so the split differs between runs.
train_pos_edge_index = random.sample(edges, int(0.2*(len(edges))))
train_edge_keys = {tuple(edge) for edge in train_pos_edge_index}

val_pos_edge_index = random.sample([row for row in edges if tuple(row) not in train_edge_keys], int(0.05*(len(edges))))
held_out_keys = train_edge_keys | {tuple(edge) for edge in val_pos_edge_index}

test_pos_edge_index = random.sample([row for row in edges if tuple(row) not in held_out_keys], int(0.05*(len(edges))))

# Convert each list of [user, post] pairs to a (2, N) tensor.
train_pos_edge_index = (torch.tensor(train_pos_edge_index)).T
val_pos_edge_index = (torch.tensor(val_pos_edge_index)).T
test_pos_edge_index = (torch.tensor(test_pos_edge_index)).T

In [27]:
# Training negatives: user -> post pairs that are not edges of the graph.
# Every observed edge is excluded, not only the training positives; otherwise
# a true edge from another split or from the remaining graph could be drawn
# and labelled 0.
pos_edges_set = set(tuple(edge) for edge in edges)
all_possible_edges = [
    edge
    for edge in product(range(num_users), range(num_users, num_users+num_posts))
    if edge not in pos_edges_set
]
# One negative per training positive, sampled without replacement.
neg_edges = random.sample(all_possible_edges, train_pos_edge_index.size(1))
train_neg_edge_index = torch.tensor(neg_edges).T.contiguous()

In [28]:
# Validation negatives: user -> post pairs that are not edges of the graph.
# Every observed edge is excluded, not only the validation positives;
# otherwise a true edge from another split or from the remaining graph could
# be drawn and labelled 0.
pos_edges_set = set(tuple(edge) for edge in edges)
all_possible_edges = [
    edge
    for edge in product(range(num_users), range(num_users, num_users+num_posts))
    if edge not in pos_edges_set
]
# One negative per validation positive, sampled without replacement.
neg_edges = random.sample(all_possible_edges, val_pos_edge_index.size(1))
val_neg_edge_index = torch.tensor(neg_edges).T.contiguous()

In [29]:
# Test negatives: user -> post pairs that are not edges of the graph.
# Every observed edge is excluded, not only the test positives; otherwise a
# true edge from another split or from the remaining graph could be drawn and
# labelled 0.
pos_edges_set = set(tuple(edge) for edge in edges)
all_possible_edges = [
    edge
    for edge in product(range(num_users), range(num_users, num_users+num_posts))
    if edge not in pos_edges_set
]
# One negative per test positive, sampled without replacement.
neg_edges = random.sample(all_possible_edges, test_pos_edge_index.size(1))
test_neg_edge_index = torch.tensor(neg_edges).T.contiguous()

In [30]:
# Edges held out as supervision targets in any split.
train_edges_remove = (train_pos_edge_index.T.tolist())+(val_pos_edge_index.T.tolist())+(test_pos_edge_index.T.tolist())
# Set of tuples gives the same value-based membership test as the list,
# without rescanning the whole list for every edge.
held_out_edges = set(map(tuple, train_edges_remove))

# Training adjacency is every edge whose value is not held out; validation
# adds the training positives, and test additionally adds the validation positives.
train_adj = (torch.tensor([row for row in edges if tuple(row) not in held_out_edges])).T
val_adj = (torch.tensor(train_adj.T.tolist() + train_pos_edge_index.T.tolist())).T
test_adj = (torch.tensor(val_adj.T.tolist() + val_pos_edge_index.T.tolist())).T


In [31]:

# Wrap each split's positive and negative edges in a labelled dataset and a
# batch loader. Only the training loader shuffles; the evaluation loaders keep
# a fixed order so per-batch outputs are reproducible.
train_dataset = LinkpredictionDataset(train_pos_edge_index, train_neg_edge_index)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

val_dataset = LinkpredictionDataset(val_pos_edge_index, val_neg_edge_index)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

test_dataset = LinkpredictionDataset(test_pos_edge_index, test_neg_edge_index)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)



In [32]:
# NOTE(review): `device` is selected here, but none of the visible cells move
# the model or the features onto it - confirm whether GPU use was intended.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Node features as float32; their width sets the predictor's input size.
x = data.x.float()
linkpredictor_model = LinkPredictor(x.shape[1])
optimizer = torch.optim.Adam(params=linkpredictor_model.parameters(), lr=0.01)


In [33]:
def train(loader, x):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Uses the module-level ``linkpredictor_model`` and ``optimizer``.

    Args:
        loader: Iterable of ``(edges, labels)`` batches; each batch of edges is
            transposed so that row 0 and row 1 index into ``x``.
        x: Node feature matrix indexed by the edge endpoints.

    Returns:
        Sum of the per-batch binary cross-entropy losses divided by
        ``len(loader)``.
    """
    linkpredictor_model.train()
    batch_losses = []

    for edge_batch, labels in loader:
        endpoints = edge_batch.T.contiguous()
        first_nodes = endpoints[0]
        second_nodes = endpoints[1]

        optimizer.zero_grad()

        # An edge is represented by its two endpoint feature vectors side by side.
        pair_features = torch.cat((x[first_nodes], x[second_nodes]), dim=-1)
        scores = linkpredictor_model(pair_features).view(-1)

        batch_loss = F.binary_cross_entropy(scores, labels.float())
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(loader)

In [34]:
def test(pos_edge_index, neg_edge_index, x):
    """Score positive and negative edges and return classification metrics.

    Uses the module-level ``linkpredictor_model`` in eval mode with gradients
    disabled. Predictions above 0.5 count as positive for the thresholded
    metrics; ROC AUC is computed on the raw scores.

    Args:
        pos_edge_index: Edge index whose rows 0 and 1 index into ``x``; labelled 1.
        neg_edge_index: Edge index whose rows 0 and 1 index into ``x``; labelled 0.
        x: Node feature matrix.

    Returns:
        Tuple ``(accuracy, roc_auc, recall, precision, f1)``.
    """
    linkpredictor_model.eval()

    def score(edge_index):
        # Concatenate the two endpoint feature vectors of every edge.
        pair_features = torch.cat([x[edge_index[0]], x[edge_index[1]]], dim=-1)
        return linkpredictor_model(pair_features).view(-1)

    with torch.no_grad():
        pos_pred = score(pos_edge_index)
        neg_pred = score(neg_edge_index)

    y_pred = torch.cat([pos_pred, neg_pred])
    y_true = torch.cat([torch.ones(pos_pred.size()), torch.zeros(neg_pred.size())])
    y_label = y_pred > 0.5

    return (
        accuracy_score(y_true, y_label),
        roc_auc_score(y_true, y_pred),
        recall_score(y_true, y_label),
        precision_score(y_true, y_label),
        f1_score(y_true, y_label),
    )

In [35]:
# Per-epoch training log: one list per column, appended to once per epoch.
history = {
    column: []
    for column in (
        "epoch",
        "Loss",
        "Validation Accuracy",
        "Validation ROC",
        "Validation Recall",
        "Validation Precision",
        "Validation F1",
    )
}

# Final summary: last-epoch loss and validation metrics plus the test metrics.
final_results = {
    column: []
    for column in (
        "Loss",
        "Validation Accuracy",
        "Validation ROC",
        "Validation Recall",
        "Validation Precision",
        "Validation F1",
        "Test Accuracy",
        "Test ROC",
        "Test Recall",
        "Test Precision",
        "Test F1",
    )
}

# Directory that receives the per-epoch model checkpoints.
model_dir = "models_weights"

In [36]:
# torch.save does not create missing directories; make sure the checkpoint
# folder exists before the first epoch tries to write into it.
os.makedirs(model_dir, exist_ok=True)

for epoch in range(200):
    loss = train(train_loader,x)
    val_acc, val_roc, val_rec, val_prec, val_f1 = test(val_pos_edge_index, val_neg_edge_index, x)

    # Record this epoch's loss and validation metrics.
    epoch_row = {
        "epoch": epoch+1,
        "Loss": loss,
        "Validation Accuracy": val_acc,
        "Validation ROC": val_roc,
        "Validation Recall": val_rec,
        "Validation Precision": val_prec,
        "Validation F1": val_f1,
    }
    for column, value in epoch_row.items():
        history[column].append(value)

    # One checkpoint per epoch.
    # NOTE(review): the '.path' suffix may be a typo for '.pth'; it is left
    # unchanged so existing checkpoints keep their names.
    linkpredictor_path = os.path.join(model_dir,f'linkpredictor_without_GNN_epoch_{epoch+1}.path')
    torch.save(linkpredictor_model.state_dict(), linkpredictor_path)

    print(f'\nEpoch {epoch+1},\nLoss: {loss},\nValidation Accuracy: {val_acc}, \nValidation ROC: {val_roc}, \nValidation Recall: {val_rec}, \nValidation Precision: {val_prec}, \nValidation F1: {val_f1} ')

# Evaluate the model as it stands after the last epoch on the test edges.
test_acc, test_roc, test_rec, test_prec, test_f1 = test(test_pos_edge_index, test_neg_edge_index, x)

# Summary row: last-epoch loss and validation metrics plus the test metrics.
summary_row = {
    "Loss": loss,
    "Validation Accuracy": val_acc,
    "Validation ROC": val_roc,
    "Validation Recall": val_rec,
    "Validation Precision": val_prec,
    "Validation F1": val_f1,
    "Test Accuracy": test_acc,
    "Test ROC": test_roc,
    "Test Recall": test_rec,
    "Test Precision": test_prec,
    "Test F1": test_f1,
}
for column, value in summary_row.items():
    final_results[column].append(value)

print(f'\n\n\nTest Accuracy: {test_acc}, \nTest ROC: {test_roc}, \nTest Recall: {test_rec}, \nTest Precision: {test_prec}, \nTest F1: {test_f1} ')


Epoch 1,
Loss: 0.6763137408110529,
Validation Accuracy: 0.5050978792822186, 
Validation ROC: 0.554259315563551, 
Validation Recall: 0.2092169657422512, 
Validation Precision: 0.5124875124875125, 
Validation F1: 0.29713292788879236 

Epoch 2,
Loss: 0.6118559900440689,
Validation Accuracy: 0.54221044045677, 
Validation ROC: 0.5470823664272465, 
Validation Recall: 0.5314029363784666, 
Validation Precision: 0.5431429762401, 
Validation F1: 0.5372088229231087 

Epoch 3,
Loss: 0.5880480593112859,
Validation Accuracy: 0.5320146818923328, 
Validation ROC: 0.5826250635363748, 
Validation Recall: 0.32137030995106036, 
Validation Precision: 0.5553206483439042, 
Validation F1: 0.40712994058382845 

Epoch 4,
Loss: 0.5682798100799225,
Validation Accuracy: 0.5462887438825449, 
Validation ROC: 0.6145198246795239, 
Validation Recall: 0.25570962479608483, 
Validation Precision: 0.6105160662122687, 
Validation F1: 0.36044840471399825 

Epoch 5,
Loss: 0.5445478337207136,
Validation Accuracy: 0.5772838499

In [37]:

# DataFrame.to_csv does not create missing directories; make sure the output
# folder exists so the finished training run is not lost at the last step.
os.makedirs("results", exist_ok=True)

# Per-epoch training history.
df = pd.DataFrame(history)
df.to_csv(f"results/results_without_GNN_epoch_{200}")

# Final validation and test metrics.
df = pd.DataFrame(final_results)
df.to_csv(f"results/final_result_without_GNN_epoch_{200}")