In [1]:
#!pip install torch_geometric
!pip install stellargraph

ERROR: Could not find a version that satisfies the requirement stellargraph (from versions: none)
ERROR: No matching distribution found for stellargraph


In [2]:
import os
import torch
import json
import numpy as np
import pandas as pd
import networkx as nx

from node2vec import Node2Vec

In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

In [4]:
print(f"GPU available: {torch.cuda.is_available()}")
print(f"GPU device: {torch.cuda.get_device_name(0)}")

GPU available: True
GPU device: NVIDIA GeForce RTX 3060 Ti


# Load and Preprocess Data

In [5]:
working_dir = os.getcwd()
train_news_file = os.path.join(working_dir, 'data','train','news.tsv')
train_behavior_file = os.path.join(working_dir, 'data','train','behaviors.tsv')
train_entity_embedding_file = os.path.join(working_dir, 'data','train','entity_embedding.vec')
train_triplets_file = os.path.join(working_dir, 'data','train','triplets.tsv')

train_news_with_sentiment_file = os.path.join(working_dir, 'data','train','news_with_sentiment.csv')

In [6]:
# Load behaviors data (user-article interactions)
behaviors = pd.read_csv(train_behavior_file,
                       sep='\t',
                       header=None,
                       names=['impression_id', 'user_id', 'time', 'history', 'impressions'])

# Drop NA values
behaviors = behaviors.dropna(subset=['user_id','history','impressions'])

# Process click history to get user-article pairs
user_article_pairs = []
for _, row in behaviors.iterrows():
    if pd.notna(row['history']):
        for article in row['history'].split():
            user_article_pairs.append((row['user_id'], article))

# Remove duplicates
user_article_df = pd.DataFrame(user_article_pairs, columns=['user_id', 'article_id']).drop_duplicates()

# Load meta data (article metadata)
news = pd.read_csv(train_news_file,
                  sep='\t',
                  header=None,
                  names=['article_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities', 'sentiment'])

# Create article-category relationships and remove duplicates
article_category_df = news[['article_id', 'category', 'subcategory']].drop_duplicates()

# Load sentiment data (team produced metadata)
news_tone = pd.read_csv(train_news_with_sentiment_file,
                  header=None,
                  names=['article_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities', 'sentiment'])

# Create article-category relationships and remove duplicates
article_tone_df = news[['article_id', 'sentiment']].drop_duplicates()

# Create Pairs
A bias in the data creates more user article pares than any article meta data pairs, we collected a random same of the most active users to adjust.

In [94]:
train_user_article = []
test_user_article = []
length = 0
for user_id, group in user_article_df.groupby('user_id'):
    length += len(group)
    train_user_article.append(group.iloc[:-1])      
    test_user_article.append(group.iloc[-1:]) 
    if length>30000:
        break

train_user_article_df = pd.concat(train_user_article).reset_index(drop=True)
test_user_article_df = pd.concat(test_user_article).reset_index(drop=True)

In [99]:
articles_in_sample = [row['article_id'] for _, row in train_user_article_df.iterrows()]

In [100]:
article_category_triples = []
article_subcategory_triples=[]
for _, row in article_category_df.iterrows():
    if row['article_id'] in articles_in_sample:
        article_category_triples.append((row['article_id'], 'BELONGS_TO', row['category']))
        article_subcategory_triples.append((row['article_id'], 'BELONGS_TO_SUBCATEGORY', row['subcategory']))

In [101]:
# Article-subcategory triples
article_tone_triples = [(row['article_id'], 'HAS_TONE', row['sentiment'])
                        for _, row in article_tone_df.iterrows()
                        if row['article_id'] in articles_in_sample]

In [102]:
# Create extra meta data triples
entity_triples = []
for _, row in news.iterrows():
    # Parse title entities
    if pd.notna(row['title_entities']):
        if row['article_id'] in articles_in_sample:
            entities = json.loads(row['title_entities'])
            for entity in entities:  
                entity_triples.append((row['article_id'], 'HAS_ENTITY', entity['WikidataId']))
                entity_triples.append((row['article_id'], 'IS_WIKI_TYPE', entity['Type']))

In [103]:
# Combine all triples
triples = user_article_triples + article_category_triples + article_subcategory_triples + entity_triples
triples_tone = user_article_triples + article_category_triples + article_subcategory_triples + entity_triples + article_tone_triples
nodepairs = [(row[0], row[2]) for row in triples]
nodepairs_tone = [(row[0], row[2]) for row in triples_tone]
user_article_df = user_article_df

# Node2Vec Model Training and Evaluation
### MIND Provided Features and MetaData

In [39]:
G = nx.Graph()
G.add_edges_from(nodepairs)

In [40]:
# Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
node2vec = Node2Vec(G, dimensions=10, walk_length=10, num_walks=5, workers=1)  # Use temp_folder for big graphs

# Embed nodes
model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)

Computing transition probabilities:   0%|          | 0/15021 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|█████████████████████████████████████████████████████████| 5/5 [00:09<00:00,  1.80s/it]


In [41]:
nodepairs[0]

('U76823', 'N57318')

In [46]:
# Look for most similar nodes
most_similar = model.wv.most_similar('U76823')  # Output node names are always strings

predicted = [(id_, score) for id_, score in most_similar if id_.startswith('N')]
predicted

[('N57318', 0.9844935536384583),
 ('N30610', 0.9794884324073792),
 ('N29764', 0.9766647219657898)]

### User Article Only

In [104]:
G = nx.Graph()
G.add_edges_from(train_user_article_df.values.tolist())

In [105]:
# Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
node2vec = Node2Vec(G, dimensions=2, walk_length=10, num_walks=5, workers=1)  # Use temp_folder for big graphs

# Embed nodes
model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)

Computing transition probabilities:   0%|          | 0/9423 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|█████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  1.78it/s]


In [109]:
test_user_article_df.values.tolist()[0]

['U100', 'N20575']

In [None]:
train_user_article_df

In [110]:
most_similar = model.wv.most_similar('U100', topn=100)  # Output node names are always strings

In [112]:
predicted = [(id_, score) for id_, score in most_similar if id_.startswith('N')]
predicted

[('N19432', 1.0),
 ('N56521', 1.0),
 ('N28684', 1.0),
 ('N54627', 1.0),
 ('N64241', 0.9999999403953552),
 ('N17652', 0.9999998807907104),
 ('N55558', 0.9999998211860657),
 ('N65172', 0.9999998211860657),
 ('N10039', 0.9999997615814209),
 ('N46256', 0.9999997019767761),
 ('N45452', 0.9999996423721313),
 ('N34930', 0.9999995827674866),
 ('N52910', 0.9999995231628418),
 ('N41456', 0.999999463558197),
 ('N22344', 0.999999463558197),
 ('N50500', 0.999999463558197),
 ('N46091', 0.9999991059303284),
 ('N17952', 0.9999991059303284),
 ('N40843', 0.9999990463256836),
 ('N1880', 0.999998927116394),
 ('N29778', 0.9999988675117493),
 ('N25946', 0.9999988675117493),
 ('N63842', 0.9999987483024597),
 ('N306', 0.9999984502792358),
 ('N14863', 0.9999983310699463),
 ('N9364', 0.9999982714653015),
 ('N10559', 0.9999982118606567),
 ('N60029', 0.9999980926513672),
 ('N33617', 0.9999980926513672),
 ('N30387', 0.9999979734420776),
 ('N62058', 0.9999975562095642),
 ('N43295', 0.9999971389770508),
 ('N19211', 