In [3]:
import pandas as pd
from tqdm import tqdm

## Author Relations (mention relations as ground truth)

In [4]:
df = pd.read_csv('discord_author_game.csv')

In [None]:
name_to_id = dict(zip(df["Author"], df["AuthorID"]))
name_set = set(name_to_id.keys())

relations = set()

for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing messages"):
    author_id = row["AuthorID"]
    author_name = row["Author"]
    content = str(row["Content"])
    for name in name_set:
        if f"@{name}" in content and name != author_name:
            target_id = name_to_id[name]
            relations.add((author_id, target_id))

Processing messages: 100%|██████████| 150000/150000 [00:51<00:00, 2935.56it/s]


In [7]:
rel_df = pd.DataFrame(relations, columns=["FromAuthorID", "ToAuthorID"])


In [8]:
rel_df

Unnamed: 0,FromAuthorID,ToAuthorID
0,125328264230207488,215456151268098058
1,225323159145021440,215456151268098058
2,252458118775046144,194657909492285441
3,108632993685336064,215456151268098058
4,138144510285709312,456226577798135808
...,...,...
388,99292154232332288,215456151268098058
389,137785305586597888,456226577798135808
390,228013037318176769,215456151268098058
391,267784015547727873,265866596105322496


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import defaultdict
known = {"AuthorID", "Author", "Game"}
candidates = [c for c in df.columns if c not in known]

text_col = candidates[0]   # we’ll silently use the first extra column

# —————————————
# 3) Build sentiment‑filtered author→games map
# —————————————
analyzer            = SentimentIntensityAnalyzer()
SENTIMENT_THRESHOLD = 0.2

author_to_games   = defaultdict(list)
author_id_to_name = {}

for _, row in df.iterrows():
    aid  = str(row["AuthorID"])
    name = str(row["Author"])
    author_id_to_name[aid] = name

    if pd.isna(row["Game"]):
        continue

    message = str(row[text_col])
    score   = analyzer.polarity_scores(message)["compound"]
    if score < SENTIMENT_THRESHOLD:
        continue

    games = [g.strip().lower() for g in row["Game"].split(",") if g.strip()]
    author_to_games[aid].extend(games)


In [10]:
count = 0

for _, row in rel_df.iterrows():
    from_id = str(row["FromAuthorID"])
    to_id   = str(row["ToAuthorID"])

    from_games = author_to_games.get(from_id, [])
    to_games   = author_to_games.get(to_id, [])

    if from_games and to_games:
        count += 1

print(f"Number of author pairs with non-empty game mentions: {count}")

Number of author pairs with non-empty game mentions: 244


## Evaluation of related pairs (with cos)

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

# Build game index
all_games = sorted(set(game for games in author_to_games.values() for game in games))
game_to_index = {game: i for i, game in enumerate(all_games)}

# Build author vectors (only include authors with at least one game)
author_vectors = {}
author_gamesets = {}
for author_id, games in author_to_games.items():
    if games:
        vec = np.zeros(len(all_games))
        for game in games:
            if game in game_to_index:
                vec[game_to_index[game]] = 1
        author_vectors[author_id] = vec
        author_gamesets[author_id] = set(games)

# Compute cosine similarity for valid author pairs with overlapping games
similarities = []
valid_pairs = []

for _, row in tqdm(rel_df.iterrows(), total=len(rel_df)):
    aid1 = str(row["FromAuthorID"])
    aid2 = str(row["ToAuthorID"])
    
    if aid1 in author_vectors and aid2 in author_vectors:
        games1 = author_gamesets[aid1]
        games2 = author_gamesets[aid2]
        if games1 & games2:  # check for overlap
            v1 = author_vectors[aid1].reshape(1, -1)
            v2 = author_vectors[aid2].reshape(1, -1)
            sim = cosine_similarity(v1, v2)[0][0]
            similarities.append(sim)
            valid_pairs.append((aid1, aid2, sim))

if similarities:
    avg_sim = sum(similarities) / len(similarities)
    print(f"✅ Valid overlapping pairs: {len(similarities)}")
    print(f"🔍 Average cosine similarity (with overlap): {avg_sim:.4f}")
else:
    print("⚠️ No valid overlapping author pairs found.")


  0%|          | 0/393 [00:00<?, ?it/s]

100%|██████████| 393/393 [00:00<00:00, 3705.05it/s]

✅ Valid overlapping pairs: 172
🔍 Average cosine similarity (with overlap): 0.1106





## Evaluation of random pairs (with cos)

In [None]:
import random


authors_with_games = [author_id for author_id, games in author_to_games.items() if games]


num_samples = 1000  


similarities = []


for _ in tqdm(range(num_samples), desc="Calculating Similarities"):

    author1, author2 = random.sample(authors_with_games, 2)
    
    games1 = author_gamesets[author1]
    games2 = author_gamesets[author2]
    
    if games1 & games2: 
        v1 = author_vectors[author1].reshape(1, -1)
        v2 = author_vectors[author2].reshape(1, -1)
        
        
        sim = cosine_similarity(v1, v2)[0][0]
        similarities.append(sim)

if similarities:
    avg_sim = sum(similarities) / len(similarities)
    print(f"✅ Average cosine similarity for {num_samples} random pairs: {avg_sim:.4f}")
else:
    print("⚠️ No valid pairs found for cosine similarity calculation.")


Calculating Similarities: 100%|██████████| 1000/1000 [00:00<00:00, 22229.84it/s]

✅ Average cosine similarity for 1000 random pairs: 0.3587





## Evaluation of random pairs (with Jaccard similarity)

In [15]:
from tqdm import tqdm

def jaccard_similarity(list1, list2):
    set1, set2 = set(list1), set(list2)
    if not set1 and not set2:
        return 0.0
    return len(set1 & set2) / len(set1 | set2)

similarities = []

for _, row in tqdm(rel_df.iterrows(), total=len(rel_df)):
    aid1 = str(row["FromAuthorID"])
    aid2 = str(row["ToAuthorID"])

    games1 = author_to_games.get(aid1, [])
    games2 = author_to_games.get(aid2, [])

    if games1 and games2:
        sim = jaccard_similarity(games1, games2)
        similarities.append(sim)

if similarities:
    avg_similarity = sum(similarities) / len(similarities)
    print(f"🔍 Average Jaccard similarity between related authors: {avg_similarity:.4f}")
else:
    print("⚠️ No valid author pairs with non-empty game lists.")

100%|██████████| 393/393 [00:00<00:00, 23493.32it/s]

🔍 Average Jaccard similarity between related authors: 0.0263





In [35]:
rel_df

Unnamed: 0,FromAuthorID,ToAuthorID
0,125328264230207488,215456151268098058
1,225323159145021440,215456151268098058
2,252458118775046144,194657909492285441
3,108632993685336064,215456151268098058
4,138144510285709312,456226577798135808
...,...,...
388,99292154232332288,215456151268098058
389,137785305586597888,456226577798135808
390,228013037318176769,215456151268098058
391,267784015547727873,265866596105322496


## Evaluation of ranking by BM25 (with MRR)

In [None]:
from rank_bm25 import BM25Okapi
from collections import defaultdict
import pandas as pd
from tqdm import tqdm


valid_pairs = []
for _, row in rel_df.iterrows():
    a1, a2 = str(row["FromAuthorID"]), str(row["ToAuthorID"])
    g1 = author_to_games.get(a1, [])
    g2 = author_to_games.get(a2, [])
    if g1 and g2 and set(g1) & set(g2):
        valid_pairs.append((a1, a2))


documents = [author_to_games[aid] for aid in author_to_games]
author_ids = list(author_to_games.keys())
bm25 = BM25Okapi(documents)


def compute_mrr(pairs, author_to_games, bm25, author_ids, top_k=100):
    reciprocal_ranks = []

    for a1, a2 in tqdm(pairs, desc="Computing MRR"):
        if a1 not in author_to_games or a2 not in author_to_games:
            continue

        query_tokens = author_to_games[a1]
        if not query_tokens:
            continue

        scores = bm25.get_scores(query_tokens)
        sorted_indices = sorted(range(len(scores)), key=lambda i: -scores[i])
        sorted_authors = [author_ids[i] for i in sorted_indices]

        if a2 in sorted_authors[:top_k]:
            rank = sorted_authors.index(a2) + 1
            reciprocal_ranks.append(1 / rank)

    if reciprocal_ranks:
        mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
        print(f"✅ MRR: {mrr:.4f}")
    else:
        print("⚠️ No valid reciprocal ranks found.")


compute_mrr(valid_pairs, author_to_games, bm25, author_ids)


Computing MRR: 100%|██████████| 393/393 [00:04<00:00, 94.15it/s] 

✅ MRR: 0.0455





## Evaluation of ranking by TF-IDF (with MRR)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from tqdm import tqdm
import pandas as pd
import numpy as np

valid_pairs = []
for _, row in rel_df.iterrows():
    a1, a2 = str(row["FromAuthorID"]), str(row["ToAuthorID"])
    g1 = author_to_games.get(a1, [])
    g2 = author_to_games.get(a2, [])
    if g1 and g2 and set(g1) & set(g2):
        valid_pairs.append((a1, a2))


author_ids = list(author_to_games.keys())
docs = [" ".join(author_to_games[aid]) for aid in author_ids] 
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(docs)  # shape: (num_authors, num_tokens)


reciprocal_ranks = []
for a1, a2 in tqdm(valid_pairs, desc="Computing TF-IDF MRR"):
    try:
        idx_a1 = author_ids.index(a1)
        idx_a2 = author_ids.index(a2)
    except ValueError:
        continue 

    query_vec = tfidf_matrix[idx_a1]
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()  # shape: (num_authors,)
    ranked_indices = np.argsort(similarities)[::-1]  

    if idx_a2 in ranked_indices:
        rank = list(ranked_indices).index(idx_a2) + 1 
        reciprocal_ranks.append(1.0 / rank)


if reciprocal_ranks:
    mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
    print(f"✅ TF-IDF MRR: {mrr:.4f}")
else:
    print("⚠️ No valid reciprocal ranks found.")


In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
df = pd.read_csv("discord_author_game.csv")  
df["AuthorID"] = df["AuthorID"].astype(str)
df["Game"] = df["Game"].astype(str)


df["GameList"] = df["Game"].str.lower().str.split(",")
rows = []
for _, row in df.iterrows():
    aid = row["AuthorID"]
    for game in row["GameList"]:
        rows.append((aid, game.strip()))

df_expanded = pd.DataFrame(rows, columns=["AuthorID", "Game"])
df_expanded.drop_duplicates(inplace=True)


author_idx = {aid: i for i, aid in enumerate(df_expanded["AuthorID"].unique())}
game_idx = {g: i for i, g in enumerate(df_expanded["Game"].unique())}

row_ind = df_expanded["AuthorID"].map(author_idx)
col_ind = df_expanded["Game"].map(game_idx)
data = np.ones(len(df_expanded))

sparse_matrix = csr_matrix((data, (row_ind, col_ind)),
                           shape=(len(author_idx), len(game_idx)))


sim_matrix = cosine_similarity(sparse_matrix)

authors = list(author_idx.keys())


valid_pairs = []
aid_to_games = df_expanded.groupby("AuthorID")["Game"].apply(set).to_dict()
for i in range(len(authors)):
    for j in range(i+1, len(authors)):
        a1, a2 = authors[i], authors[j]
        if aid_to_games[a1] & aid_to_games[a2]:  
            valid_pairs.append((a1, a2))
            valid_pairs.append((a2, a1))  


reciprocal_ranks = []
aid_to_idx = {aid: i for i, aid in enumerate(authors)}

for a1, a2 in tqdm(valid_pairs, desc="Evaluating MRR"):
    idx1 = aid_to_idx[a1]
    idx2 = aid_to_idx[a2]
    sims = sim_matrix[idx1]
    ranking = np.argsort(sims)[::-1]
    ranked_authors = [authors[i] for i in ranking if authors[i] != a1]

    if a2 in ranked_authors:
        rank = ranked_authors.index(a2) + 1
        reciprocal_ranks.append(1 / rank)


if reciprocal_ranks:
    mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
    print(f"✅ Fast MRR: {mrr:.4f}")
else:
    print("⚠️ No valid reciprocal ranks found.")


Evaluating MRR: 100%|██████████| 3617212/3617212 [17:31<00:00, 3440.30it/s]

✅ Fast MRR: 0.0043





In [56]:
df

Unnamed: 0,AuthorID,Author,Content,Game
0,111248848113909760,saticron,\r\n💩💩💩🚽💩🚽🚽💩\r\n💩🚽🚽🚽💩🚽🚽💩\r\n💩💩🚽🚽💩🚽🚽💩\r\n💩🚽🚽🚽💩🚽...,
1,142447095830413312,psych3dout,k,
2,129272120336318464,bluntamputation,Memes,
3,123641250879504384,brightqwerty,O shiiiiiit,
4,123641250879504384,brightqwerty,The shit post channel is born,
...,...,...,...,...
149995,456226577798135808,Deleted User,Oh awesome :D,
149996,287175303149649922,meephi2133,im back,
149997,174738347602870273,shinonny,Wb,
149998,131041525185380352,dreams0129,steam has a ck2 sale,Crusader Kings II


## Evaluation of ranking by MF (with MRR)

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import numpy as np
from tqdm import tqdm


author_ids = list(author_to_games.keys())
game_set = set(g for games in author_to_games.values() for g in games)
game_ids = list(game_set)

author_idx = {a: i for i, a in enumerate(author_ids)}
game_idx = {g: i for i, g in enumerate(game_ids)}

rows, cols, data = [], [], []
for a in author_ids:
    for g in author_to_games[a]:
        rows.append(author_idx[a])
        cols.append(game_idx[g])
        data.append(1)

interaction_matrix = csr_matrix((data, (rows, cols)), shape=(len(author_ids), len(game_ids)))


n_components = min(32, interaction_matrix.shape[1] - 1)
svd = TruncatedSVD(n_components=n_components, random_state=42)
author_embeddings = svd.fit_transform(interaction_matrix)


sim_matrix = cosine_similarity(author_embeddings)


aid_to_idx = author_idx


def compute_mrr_svd(pairs, sim_matrix, author_ids, aid_to_idx, top_k=100):
    reciprocal_ranks = []

    for a1, a2 in tqdm(pairs, desc="Computing MRR with SVD"):
        if a1 not in aid_to_idx or a2 not in aid_to_idx:
            continue

        idx1 = aid_to_idx[a1]
        idx2 = aid_to_idx[a2]

        sims = sim_matrix[idx1]
        sorted_indices = np.argsort(sims)[::-1]
        ranked_aids = [author_ids[i] for i in sorted_indices if author_ids[i] != a1]

        if a2 in ranked_aids[:top_k]:
            rank = ranked_aids.index(a2) + 1
            reciprocal_ranks.append(1 / rank)

    if reciprocal_ranks:
        mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
        print(f"✅ SVD-based MRR: {mrr:.4f}")
    else:
        print("⚠️ No valid reciprocal ranks found.")

compute_mrr_svd(valid_pairs, sim_matrix, author_ids, aid_to_idx)


Computing MRR with SVD: 100%|██████████| 393/393 [00:00<00:00, 2812.43it/s]

✅ SVD-based MRR: 0.0615





## Evaluation of ranking with embedding (with MRR)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm


author_to_text = {aid: " ".join(games) for aid, games in author_to_games.items()}
author_ids = list(author_to_text.keys())
author_texts = [author_to_text[aid] for aid in author_ids]


from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
author_embeddings = model.encode(author_texts, show_progress_bar=True)


aid_to_idx = {aid: idx for idx, aid in enumerate(author_ids)}


sim_matrix = cosine_similarity(author_embeddings)


def compute_mrr_with_embedding(pairs, sim_matrix, author_ids, aid_to_idx, top_k=100):
    reciprocal_ranks = []

    for a1, a2 in tqdm(pairs, desc="Computing MRR with Embedding"):
        if a1 not in aid_to_idx or a2 not in aid_to_idx:
            continue

        idx1 = aid_to_idx[a1]
        idx2 = aid_to_idx[a2]

        sims = sim_matrix[idx1]
        sorted_indices = np.argsort(sims)[::-1]
        ranked_aids = [author_ids[i] for i in sorted_indices if author_ids[i] != a1]

        if a2 in ranked_aids[:top_k]:
            rank = ranked_aids.index(a2) + 1
            reciprocal_ranks.append(1 / rank)

    if reciprocal_ranks:
        mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
        print(f"✅ Embedding-based MRR: {mrr:.4f}")
    else:
        print("⚠️ No valid reciprocal ranks found.")


compute_mrr_with_embedding(valid_pairs, sim_matrix, author_ids, aid_to_idx)


Batches: 100%|██████████| 62/62 [00:02<00:00, 24.41it/s]
Computing MRR with Embedding: 100%|██████████| 393/393 [00:00<00:00, 3020.06it/s]

✅ Embedding-based MRR: 0.1528



