In [None]:
import networkx as nx
# Load the graph from a GML file; the path is relative to the kernel's working directory.
# NOTE(review): presumably a co-authorship graph keyed by ACM profile URLs -- confirm against the data.
graph = nx.read_gml("Watts-Lab/author-graph/data/expanded_graph.gml")

In [None]:
def get_titles(graph, author_id):
    """Return the distinct titles found on the edges incident to an author.

    Args:
        graph: Graph object exposing ``edges(node, data=True)``, yielding
            tuples whose third item is the edge-attribute dict. Each dict must
            contain a ``"paperinfo"`` sequence; element 1 is collected here
            (presumably the paper title -- TODO confirm against the GML schema).
        author_id: Node identifier of the author whose edges are scanned.

    Returns:
        list: Unique titles in first-seen edge order (empty when the author
        has no edges).

    Raises:
        KeyError: If an edge has no ``"paperinfo"`` attribute.
    """
    # dict.fromkeys de-duplicates while keeping insertion order. A plain set
    # would make the order depend on string hash randomization, so the result
    # (and every DataFrame indexed by it) would change between kernel runs.
    unique_titles = dict.fromkeys(
        edge[2]["paperinfo"][1]
        for edge in graph.edges(author_id, data=True)
    )
    return list(unique_titles)

In [None]:
# Collect the de-duplicated edge titles for one sample author profile and display them.
titles = get_titles(graph, "https://dl.acm.org/profile/99660481048")
titles

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd

In [None]:
# Fit a TF-IDF model on the sample author's titles, ignoring English stop words.
# `matrix` holds one row per title (it is converted with .toarray() in a later cell).
vectorizer = TfidfVectorizer(stop_words='english')
matrix = vectorizer.fit_transform(titles)

In [None]:
# Optional inspection aid: view the TF-IDF matrix as a labelled DataFrame
# (rows indexed by title, columns named after the fitted vocabulary terms).

feature_names = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(matrix.toarray(), columns=feature_names, index=titles)
# tfidf_df  # uncomment to display the frame

In [None]:
def calculate_connection_strengths(graph, author_id):
    """Score each neighbor of ``author_id`` by TF-IDF similarity of titles.

    A TF-IDF vocabulary is fitted on the author's own titles only. Each
    neighbor's titles are projected into that vocabulary and compared with the
    author's titles by cosine similarity; the strength is the mean over all
    (author title, neighbor title) pairs.

    Args:
        graph: Graph providing ``neighbors(node)`` plus the edge data that
            ``get_titles`` reads.
        author_id: Node identifier of the author.

    Returns:
        tuple: ``(connection_strengths, tfidf_df)`` where
        ``connection_strengths`` maps neighbor node -> mean cosine similarity
        and ``tfidf_df`` is the author's TF-IDF matrix as a DataFrame (rows
        indexed by title, columns are vocabulary terms). Both are empty when
        the author has no titles.
    """
    titles = get_titles(graph, author_id)
    if not titles:
        # TfidfVectorizer cannot be fitted on zero documents.
        return {}, pd.DataFrame()

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(titles)

    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=tfidf.get_feature_names_out(),
        index=titles,
    )

    connection_strengths = {}
    for neighbor in graph.neighbors(author_id):
        neighbor_titles = get_titles(graph, neighbor)
        if not neighbor_titles:
            # Nothing to compare against; cosine_similarity rejects 0-row input.
            connection_strengths[neighbor] = 0.0
            continue
        # transform (not fit_transform): reuse the author's vocabulary so the
        # two matrices share the same columns.
        neighbor_tfidf = tfidf.transform(neighbor_titles)
        connection_strengths[neighbor] = cosine_similarity(tfidf_matrix, neighbor_tfidf).mean()

    return connection_strengths, tfidf_df

# Display the (connection_strengths, tfidf_df) tuple for the sample author profile.
calculate_connection_strengths(graph, "https://dl.acm.org/profile/99660481048")

In [None]:
def calculate_connection_strengths(graph, author_id):
    """Score each neighbor of ``author_id`` by TF-IDF similarity of titles.

    Note: this definition shadows the earlier function of the same name in
    this notebook; unlike that one it returns only the strengths dict.

    A TF-IDF vocabulary is fitted on the author's own titles. Each neighbor's
    titles are projected into that vocabulary, and the strength is the mean
    cosine similarity over all (author title, neighbor title) pairs.

    Side effect: prints one line per neighbor with the strength scaled so that
    the strongest neighbor is 1.00. The scaled values are only printed.

    Args:
        graph: Graph providing ``neighbors(node)`` plus the edge data that
            ``get_titles`` reads.
        author_id: Node identifier of the author.

    Returns:
        dict: neighbor node -> raw (un-normalized) mean cosine similarity.
        Empty when the author has no titles or no neighbors.
    """
    titles = get_titles(graph, author_id)
    if not titles:
        # TfidfVectorizer cannot be fitted on zero documents.
        return {}

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(titles)

    connection_strengths = {}
    for neighbor in graph.neighbors(author_id):
        neighbor_titles = get_titles(graph, neighbor)
        if not neighbor_titles:
            # Nothing to compare against; cosine_similarity rejects 0-row input.
            connection_strengths[neighbor] = 0.0
            continue
        # transform (not fit_transform): reuse the author's vocabulary.
        neighbor_tfidf = tfidf.transform(neighbor_titles)
        connection_strengths[neighbor] = cosine_similarity(tfidf_matrix, neighbor_tfidf).mean()

    # default=0 keeps max() from raising when there are no neighbors.
    max_strength = max(connection_strengths.values(), default=0)
    if max_strength > 0:
        normalized_strengths = {k: v / max_strength for k, v in connection_strengths.items()}
    else:
        # Every similarity is zero: avoid 0/0 and report all neighbors as 0.
        normalized_strengths = {k: 0.0 for k in connection_strengths}

    for neighbor, strength in normalized_strengths.items():
        print(f"{neighbor}: {strength:.2f} strength")

    return connection_strengths

In [None]:
# Print the normalized strengths and display the raw strengths dict for the sample author profile.
calculate_connection_strengths(graph, "https://dl.acm.org/profile/99660481048")

In [None]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.matutils import hellinger

def preprocess_titles(titles):
    """Tokenize every title with gensim's ``simple_preprocess``.

    Args:
        titles: Iterable of title strings.

    Returns:
        list: One token list per input title, in input order. Tokenization is
        delegated to ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in titles]

def calculate_topic_similarity(graph, author_id, num_topics=5, passes=15, random_state=0):
    """Score each neighbor of ``author_id`` by LDA topic similarity of titles.

    An LDA model is trained on the author's own titles. For each neighbor, the
    per-title topic distributions are averaged into one distribution, which is
    compared with each of the author's title distributions. The strength is the
    mean of ``1 - Hellinger distance`` over the author's titles, so a larger
    value means a closer topical match.

    Args:
        graph: Graph providing ``neighbors(node)`` plus the edge data that
            ``get_titles`` reads.
        author_id: Node identifier of the author.
        num_topics: Number of LDA topics to fit.
        passes: Number of training passes over the author's corpus.
        random_state: Seed forwarded to ``LdaModel`` so repeated runs give the
            same scores; pass ``None`` for unseeded training.

    Returns:
        dict: neighbor node -> mean topic similarity. Empty when the author
        has no titles.
    """
    titles = get_titles(graph, author_id)
    if not titles:
        # No documents to train on.
        return {}

    processed_titles = preprocess_titles(titles)
    dictionary = corpora.Dictionary(processed_titles)
    corpus = [dictionary.doc2bow(tokens) for tokens in processed_titles]

    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

    def _dense_topics(bow):
        # Full-length topic vector for one document. minimum_probability=0
        # stops gensim from dropping low-probability topics, which would
        # leave truncated distributions and distort the Hellinger distance.
        dense = [0.0] * lda_model.num_topics
        for topic, prob in lda_model.get_document_topics(bow, minimum_probability=0):
            dense[topic] = prob
        return dense

    # Built once: these do not depend on the neighbor being scored.
    author_distributions = [_dense_topics(bow) for bow in corpus]

    connection_strengths = {}
    for neighbor in graph.neighbors(author_id):
        neighbor_tokens = preprocess_titles(get_titles(graph, neighbor))
        neighbor_distributions = [
            _dense_topics(dictionary.doc2bow(tokens)) for tokens in neighbor_tokens
        ]
        if not neighbor_distributions:
            # No titles to average over; avoid dividing by zero.
            connection_strengths[neighbor] = 0
            continue

        # Column-wise mean over the neighbor's per-title distributions.
        neighbor_avg_distribution = [
            sum(column) / len(neighbor_distributions)
            for column in zip(*neighbor_distributions)
        ]

        # hellinger() is a distance in [0, 1]; convert it to a similarity.
        sims = [
            1 - hellinger(neighbor_avg_distribution, author_distribution)
            for author_distribution in author_distributions
        ]
        connection_strengths[neighbor] = sum(sims) / len(sims)

    return connection_strengths

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from scipy.spatial.distance import cosine as cosine_distance

# Load the 'bert-base-uncased' tokenizer and model once at module level;
# encode_titles below reads both as globals.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def encode_titles(titles):
    """Embed titles with the module-level BERT ``tokenizer`` and ``model``.

    Each title is represented by the mean of its token hidden states from the
    last layer. Padding positions are excluded through the attention mask, so
    a title's vector does not depend on the other titles in the batch.

    Args:
        titles: List of title strings. Titles are truncated to 512 tokens.

    Returns:
        torch.Tensor: One row per title, ``model.config.hidden_size`` columns.
        A ``(0, hidden_size)`` tensor when ``titles`` is empty.
    """
    if len(titles) == 0:
        # Skip the tokenizer/model call for an empty batch.
        return torch.empty((0, model.config.hidden_size))

    encoded_input = tokenizer(titles, padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.no_grad():
        model_output = model(**encoded_input)

    hidden_states = model_output.last_hidden_state
    # Broadcast the mask over the hidden dimension: 1 for real tokens, 0 for padding.
    mask = encoded_input['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    # clamp guards against division by zero for a row with no unmasked tokens.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (hidden_states * mask).sum(dim=1) / token_counts

def calculate_bert_similarity(graph, author_id):
    """Score each neighbor of ``author_id`` by BERT embedding similarity of titles.

    Titles are embedded with ``encode_titles``. The strength for a neighbor is
    the mean cosine similarity over all (author title, neighbor title) pairs.

    Args:
        graph: Graph providing ``neighbors(node)`` plus the edge data that
            ``get_titles`` reads.
        author_id: Node identifier of the author.

    Returns:
        dict: neighbor node -> mean pairwise cosine similarity (0 for a
        neighbor without titles). Empty when the author has no titles.
    """
    titles = get_titles(graph, author_id)
    if not titles:
        # Nothing to embed, so no neighbor can be scored.
        return {}

    # Unit-normalize once; the author's vectors are the same for every neighbor.
    author_unit = torch.nn.functional.normalize(encode_titles(titles), dim=1)

    connection_strengths = {}
    for neighbor in graph.neighbors(author_id):
        neighbor_titles = get_titles(graph, neighbor)
        if not neighbor_titles:
            connection_strengths[neighbor] = 0
            continue

        neighbor_unit = torch.nn.functional.normalize(encode_titles(neighbor_titles), dim=1)
        # Dot products of unit vectors are cosine similarities; one matrix
        # product covers every (author title, neighbor title) pair.
        pairwise = author_unit @ neighbor_unit.T
        connection_strengths[neighbor] = pairwise.mean().item()

    return connection_strengths

In [None]:
# Display BERT-based strengths for a sample author.
# NOTE(review): this profile ID differs from the one used in the TF-IDF cells -- confirm that is intended.
calculate_bert_similarity(graph, "https://dl.acm.org/profile/81100280834")