# III Constructing link structures




## Table of Contents

1. [Loading the Data and Necessary Libraries](#loading-dependencies)
2. [Constructing Explicit Links table](#explicit)
3. [Constructing Implicit Links table](#implicit)

## Loading the Data and Necessary Libraries
<a class="anchor" id="loading-dependencies"></a>

In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

from itertools import permutations
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

# Raw comment table. `parentID` is converted to a nullable integer column:
# values that cannot be parsed as numbers are coerced to <NA> instead of raising.
df_c = pd.read_parquet('Comments.parquet')
parent_ids = pd.to_numeric(df_c['parentID'], errors='coerce')
df_c['parentID'] = parent_ids.astype('Int64')

# Comment table that additionally carries a `comment_vector` column, which the
# cells below join back onto comments via `commentID`.
df_embeddings = pd.read_parquet('Comments_with_embeddings.parquet')

## Constructing Explicit Links table
<a class="anchor" id="explicit"></a>
 
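This section builds a DataFrame of all explicit links: each row connects a parent comment (`_a`) to a comment whose `parentID` points to it (`_b`), together with the cosine similarity of their embeddings. The result is saved as a Parquet file (`explicit_links.parquet`).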

In [None]:
def pairwise_cosine_similarity(array1, array2):
    """Cosine similarity between row i of `array1` and row i of `array2`.

    The computation is vectorised with numpy (one pass over both arrays), so it
    finishes quickly and does not need a progress bar.

    Parameters
    ----------
    array1, array2 : array-like
        Two collections with the same number of entries. Each entry is
        flattened to a 1-D vector; entry i of `array1` must have the same
        number of elements as entry i of `array2`.

    Returns
    -------
    list of float
        One similarity per row pair, in input order. A pair in which either
        vector has zero norm gets similarity 0.0. Empty input yields `[]`.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of entries.
    """
    first = np.asarray(array1, dtype=float)
    second = np.asarray(array2, dtype=float)

    if len(first) != len(second):
        raise ValueError(
            f"array1 and array2 must have the same length, got {len(first)} and {len(second)}"
        )

    n_pairs = len(first)
    if n_pairs == 0:
        return []

    # Flatten every entry to a row vector so that (n,) and (n, d) inputs both work.
    first = first.reshape(n_pairs, -1)
    second = second.reshape(n_pairs, -1)

    # Row-wise dot product without materialising an n x n matrix.
    dots = np.einsum('ij,ij->i', first, second)
    norms = np.linalg.norm(first, axis=1) * np.linalg.norm(second, axis=1)

    # Leave 0.0 where a norm is zero instead of dividing by zero.
    similarities = np.zeros(n_pairs)
    np.divide(dots, norms, out=similarities, where=norms > 0)
    return list(similarities)
    
# Explicit links: one row per (parent comment, reply) pair.
# Naming convention used below: the `_a` suffix refers to the parent comment,
# the `_b` suffix to the comment whose parentID points at it.

# Slim views of df_c that serve as lookup tables for the joins.
user_comment_id = df_c[['commentID', 'userID']]
comment_connections = df_c[['articleID', 'commentID', 'parentID']]
times_df = df_c[['commentID', 'approveDate', 'createDate']]

# Same author lookup, keyed by parentID, so it can be joined onto the parent side
# without overwriting `user_comment_id`.
parent_authors = user_comment_id.rename(columns={'commentID': 'parentID', 'userID': 'userID_a'})

connections_df = (
    comment_connections
    # author of the comment itself (side b)
    .merge(user_comment_id, on='commentID', how='left')
    .rename(columns={'userID': 'userID_b'})
    # author of the parent comment (side a); stays missing when there is no
    # parent or the parent is not present in df_c
    .merge(parent_authors, on='parentID', how='left')
    # keep only rows where every field, including the parent author, is known
    .dropna()
    .assign(userID_a=lambda frame: frame['userID_a'].astype('Int64'))
)

e_df = (
    connections_df[['articleID', 'parentID', 'commentID', 'userID_a', 'userID_b']]
    .rename(columns={'parentID': 'commentID_a', 'commentID': 'commentID_b'})
    # approval time of the parent comment
    .merge(times_df[['commentID', 'approveDate']], left_on='commentID_a', right_on='commentID', how='left')
    .drop(columns=['commentID'])
    # creation time of the reply
    .merge(times_df[['commentID', 'createDate']], left_on='commentID_b', right_on='commentID', how='left')
    .drop(columns=['commentID'])
    .rename(columns={'approveDate': 'approveDate_a', 'createDate': 'createDate_b'})
)

print("Table construction complete!")

# Join the embeddings straight into e_df instead of assigning a column taken from
# a separately merged frame: that relied on both frames sharing the same
# positional index and would silently misalign if df_embeddings contained a
# commentID more than once. Only the two needed columns are carried along.
embedding_lookup = df_embeddings[['commentID', 'comment_vector']].drop_duplicates(subset='commentID')
e_df = (
    e_df
    .merge(embedding_lookup, left_on='commentID_a', right_on='commentID', how='left')
    .drop(columns=['commentID'])
    .rename(columns={'comment_vector': 'vectors_a'})
    .merge(embedding_lookup, left_on='commentID_b', right_on='commentID', how='left')
    .drop(columns=['commentID'])
    .rename(columns={'comment_vector': 'vectors_b'})
)

# Drops every row with any missing field, which includes pairs lacking an embedding.
e_df = e_df.dropna()
vectors_a_array = np.array(e_df['vectors_a'].tolist())
vectors_b_array = np.array(e_df['vectors_b'].tolist())

similarities = pairwise_cosine_similarity(vectors_a_array, vectors_b_array)
e_df["similarities"] = similarities

e_df.to_parquet('explicit_links.parquet')
e_df.head()

## Constructing Implicit Links table
<a class="anchor" id="implicit"></a>

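This section builds a DataFrame of all implicit links: pairs of comments on the same article where comment `a` was approved before comment `b` was created and the cosine similarity of their embeddings exceeds 0.5. The result is saved as a Parquet file (`implicit_links.parquet`).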

In [2]:
# GCN - Graph Comment Network (optimized): preparation for the implicit links.

# Empty placeholder for the result table; the next cell replaces it with the
# links it finds.
implicit_links_df = pd.DataFrame(columns=[
    'articleID', 'commentID_a', 'commentID_b', 'userID_a', 'userID_b',
    'approveDate_a', 'createDate_b', 'similarities',
])

list_articles = df_c.articleID.unique().tolist()

# Attach each comment's embedding; comments without one are dropped (inner join).
# The guard keeps this cell idempotent: merging a second time would create
# `comment_vector_x` / `comment_vector_y` and break every later lookup of
# `comment_vector`.
if 'comment_vector' not in df_c.columns:
    df_c = df_c.merge(df_embeddings[['commentID', 'comment_vector']], on='commentID', how='inner')


In [11]:

def pairwise_cosine_similarity_batched(vectors_a, vectors_b, batch_size=1000):
    """Cosine similarity between row i of `vectors_a` and row i of `vectors_b`.

    Only the matching row pairs are evaluated: each batch computes row-wise dot
    products and norms rather than a full batch x batch similarity matrix of
    which only the diagonal would be used.

    Parameters
    ----------
    vectors_a, vectors_b : array-like
        Two collections with the same number of entries. Each entry is
        flattened to a 1-D vector; paired entries must have equal size.
    batch_size : int, default 1000
        Number of row pairs processed per step.

    Returns
    -------
    numpy.ndarray
        1-D float array with one similarity per row pair, in input order. A pair
        in which either vector has zero norm gets 0.0. Empty input yields an
        empty array.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1 or the inputs differ in length.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    vectors_a = np.asarray(vectors_a, dtype=float)
    vectors_b = np.asarray(vectors_b, dtype=float)
    if len(vectors_a) != len(vectors_b):
        raise ValueError(
            f"vectors_a and vectors_b must have the same length, "
            f"got {len(vectors_a)} and {len(vectors_b)}"
        )

    n_pairs = len(vectors_a)
    if n_pairs == 0:
        return np.array([])

    # Flatten every entry to a row vector so that (n,) and (n, d) inputs both work.
    vectors_a = vectors_a.reshape(n_pairs, -1)
    vectors_b = vectors_b.reshape(n_pairs, -1)

    # Pre-filled with zeros: entries whose norm product is zero are never written.
    similarities = np.zeros(n_pairs)
    for start in range(0, n_pairs, batch_size):
        stop = start + batch_size
        batch_a = vectors_a[start:stop]
        batch_b = vectors_b[start:stop]
        dots = np.einsum('ij,ij->i', batch_a, batch_b)
        norms = np.linalg.norm(batch_a, axis=1) * np.linalg.norm(batch_b, axis=1)
        np.divide(dots, norms, out=similarities[start:stop], where=norms > 0)
    return similarities


# Implicit links: ordered pairs (a, b) of comments on the same article where a was
# approved before b was created and their embeddings are sufficiently similar.

# Minimum cosine similarity (exclusive) for a pair to count as an implicit link.
SIMILARITY_THRESHOLD = 0.5

IMPLICIT_LINK_COLUMNS = [
    'articleID', 'commentID_a', 'commentID_b', 'userID_a', 'userID_b',
    'approveDate_a', 'createDate_b', 'similarities',
]

article_dfs_to_concat = []

# groupby hands over each article's comments in one pass instead of re-scanning
# df_c with a boolean mask per article; sort=False keeps first-appearance order.
for article, article_df in tqdm(df_c.groupby('articleID', sort=False), desc="Identifying implicit links"):
    if len(article_df) < 2:
        continue  # a single comment cannot form a pair

    # Every ordered pair of distinct comments on this article.
    comment_permutations = pd.DataFrame.from_records(
        permutations(article_df['commentID'], 2),
        columns=['commentID_a', 'commentID_b'],
    )

    # Attributes needed for side a (approval time) and side b (creation time).
    side_a = article_df[['commentID', 'userID', 'approveDate', 'comment_vector']].rename(columns={
        'commentID': 'commentID_a',
        'userID': 'userID_a',
        'approveDate': 'approveDate_a',
        'comment_vector': 'comment_vector_a',
    })
    side_b = article_df[['commentID', 'userID', 'createDate', 'comment_vector']].rename(columns={
        'commentID': 'commentID_b',
        'userID': 'userID_b',
        'createDate': 'createDate_b',
        'comment_vector': 'comment_vector_b',
    })

    merged_df = (
        comment_permutations
        .merge(side_a, on='commentID_a', how='inner')
        .merge(side_b, on='commentID_b', how='inner')
    )

    # Keep only pairs where a was already approved when b was written.
    is_time_ordered = merged_df['approveDate_a'] < merged_df['createDate_b']
    comment_combis_df = merged_df.loc[is_time_ordered].reset_index(drop=True)
    if comment_combis_df.empty:
        continue

    vectors_a_array = np.array(comment_combis_df['comment_vector_a'].tolist())
    vectors_b_array = np.array(comment_combis_df['comment_vector_b'].tolist())

    similarities = pairwise_cosine_similarity_batched(vectors_a_array, vectors_b_array)
    comment_combis_df["similarities"] = similarities

    similar_comments = comment_combis_df[comment_combis_df['similarities'] > SIMILARITY_THRESHOLD].copy()
    if similar_comments.empty:
        continue

    similar_comments["articleID"] = article
    article_dfs_to_concat.append(similar_comments[IMPLICIT_LINK_COLUMNS])

# pd.concat raises on an empty list, so fall back to an empty table with the
# expected columns when no article produced a link.
if article_dfs_to_concat:
    implicit_links_df = pd.concat(article_dfs_to_concat, ignore_index=True)
else:
    implicit_links_df = pd.DataFrame(columns=IMPLICIT_LINK_COLUMNS)

implicit_links_df.to_parquet('implicit_links.parquet')
implicit_links_df.head()

Identifying implicit links: 100%|██████████████████████████████████████████████████████| 20/20 [00:03<00:00,  5.92it/s]


Unnamed: 0,articleID,commentID_a,commentID_b,userID_a,userID_b,approveDate_a,createDate_b,similarities
0,nyt://article/69a7090b-9f36-569e-b5ab-b0ba5bb3...,104388059,104389880,79346393,77830605,2020-01-01 02:18:54,2020-01-01 12:02:40,0.638534
1,nyt://article/69a7090b-9f36-569e-b5ab-b0ba5bb3...,104388059,104389944,79346393,1454164,2020-01-01 02:18:54,2020-01-01 12:25:47,0.505946
2,nyt://article/69a7090b-9f36-569e-b5ab-b0ba5bb3...,104388059,104391310,79346393,64729787,2020-01-01 02:18:54,2020-01-01 16:07:02,0.536459
3,nyt://article/69a7090b-9f36-569e-b5ab-b0ba5bb3...,104388059,104392647,79346393,69261094,2020-01-01 02:18:54,2020-01-01 19:07:53,0.645763
4,nyt://article/69a7090b-9f36-569e-b5ab-b0ba5bb3...,104388059,104390537,79346393,48636066,2020-01-01 02:18:54,2020-01-01 14:26:09,0.563657
