In [1]:
#!pip install pykeen==1.11.0
#!pip install class-resolver==0.5.4
#!pip install typing-extensions==4.12.2

In [2]:
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline_from_path
#from pykeen.models.predict import predict_tail_scores, predict_head_scores
from pykeen.predict import predict_triples, predict_target

import os
import torch
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import networkx as nx
from pprint import pprint

In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

In [8]:
# Report whether CUDA is usable. The device name is only queried when a GPU
# is present, because torch.cuda.get_device_name(0) raises on CPU-only
# machines and would otherwise abort the notebook here.
gpu_available = torch.cuda.is_available()
print(f"GPU available: {gpu_available}")
if gpu_available:
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
else:
    print("GPU device: none detected")

GPU available: True
GPU device: NVIDIA GeForce RTX 3060 Ti


# Load and Preprocess Data

In [4]:
# All dataset paths are resolved relative to the directory the notebook is
# launched from: <cwd>/data/<split>/<file>.
working_dir = os.getcwd()

# Training split
train_dir = os.path.join(working_dir, 'data', 'train')
train_news_file = os.path.join(train_dir, 'news.tsv')
train_behavior_file = os.path.join(train_dir, 'behaviors.tsv')
train_entity_embedding_file = os.path.join(train_dir, 'entity_embedding.vec')
train_triplets_file = os.path.join(train_dir, 'triplets.tsv')

# Validation split
val_dir = os.path.join(working_dir, 'data', 'val')
val_news_file = os.path.join(val_dir, 'news.tsv')
val_behavior_file = os.path.join(val_dir, 'behaviors.tsv')
val_entity_embedding_file = os.path.join(val_dir, 'entity_embedding.vec')
val_triplets_file = os.path.join(val_dir, 'triplets.tsv')


In [5]:
# Load behaviors data (user-article interactions).
# The file is read as tab-separated with no header row; column names are
# supplied explicitly below.
behaviors = pd.read_csv(train_behavior_file,
                       sep='\t',
                       header=None,
                       names=['impression_id', 'user_id', 'time', 'history', 'impressions'])

# Drop rows that are missing the user, the click history or the impression list.
behaviors = behaviors.dropna(subset=['user_id','history','impressions'])

# Process click history to get user-article pairs.
# Each history is a whitespace-separated string of article ids; split + explode
# expands it to one row per (user, article) without a Python-level iterrows()
# loop. `history` is guaranteed non-null by the dropna above, so astype(str)
# cannot turn a missing value into the literal 'nan'; it only keeps the .str
# accessor usable when the filtered frame is empty.
user_article_all = (
    behaviors[['user_id', 'history']]
    .assign(article_id=behaviors['history'].astype(str).str.split())
    .explode('article_id')
    .dropna(subset=['article_id'])  # a whitespace-only history explodes to NaN
    .loc[:, ['user_id', 'article_id']]
    .reset_index(drop=True)         # positional labels, one per extracted pair
)

# Every (user, article) pair in file order, duplicates included.
user_article_pairs = list(user_article_all.itertuples(index=False, name=None))

# Unique (user, article) interactions; the first occurrence of each pair is kept.
user_article_df = user_article_all.drop_duplicates()

# Load news data (article metadata); same tab-separated, header-less layout.
news = pd.read_csv(train_news_file,
                  sep='\t',
                  header=None,
                  names=['article_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities'])

# Create article-category relationships: one row per distinct
# (article, category, subcategory) combination.
article_category_df = news[['article_id', 'category', 'subcategory']].drop_duplicates()

# Create Triples

In [84]:
# Tunable knobs for the interaction subset used to build the graph.
MIN_CLICKS_PER_USER = 5            # users with fewer distinct clicks are dropped
MAX_SAMPLED_INTERACTIONS = 50000   # upper bound on sampled user-article pairs

# Filter users who have at least MIN_CLICKS_PER_USER clicks
click_counts = user_article_df['user_id'].value_counts()
active_users = click_counts[click_counts >= MIN_CLICKS_PER_USER].index.tolist()

user_article_active = user_article_df[user_article_df['user_id'].isin(active_users)]

# Sample up to MAX_SAMPLED_INTERACTIONS interactions. The size is capped at the
# number of available rows because DataFrame.sample raises ValueError when asked
# for more rows than exist (sampling without replacement).
n_sampled = min(MAX_SAMPLED_INTERACTIONS, len(user_article_active))
sampled_user_article = user_article_active.sample(n=n_sampled, random_state=42)

# User-article triples (user clicks article)
user_article_triples = [(user, 'CLICKS', article) for user, article in sampled_user_article[['user_id', 'article_id']].values]

# Article-category triples (article belongs to category).
# Columns are zipped directly instead of iterating rows with iterrows().
article_category_triples = [
    (article_id, 'BELONGS_TO', category)
    for article_id, category in zip(article_category_df['article_id'],
                                    article_category_df['category'])
]

# Article-subcategory triples
article_subcategory_triples = [
    (article_id, 'BELONGS_TO_SUBCATEGORY', subcategory)
    for article_id, subcategory in zip(article_category_df['article_id'],
                                       article_category_df['subcategory'])
]

# Create extra meta data triples from the entities linked in each title.
# `title_entities` holds a JSON list of objects; only the 'WikidataId' and
# 'Type' keys are read here. Abstract entities are not used in this cell.
entity_triples = []
for article_id, raw_entities in zip(news['article_id'], news['title_entities']):
    if pd.notna(raw_entities):
        for entity in json.loads(raw_entities):
            entity_triples.append((article_id, 'HAS_ENTITY', entity['WikidataId']))
            entity_triples.append((entity['WikidataId'], 'IS_TYPE', entity['Type']))

# Combine all triples
triples = user_article_triples + article_category_triples + article_subcategory_triples + entity_triples

# TransE Model Training and Evaluation

In [85]:
# Convert the Python list of (head, relation, tail) tuples to an array and let
# PyKEEN build the label-to-id mappings, adding an inverse relation per relation.
labeled_triples = np.array(triples)
tf = TriplesFactory.from_labeled_triples(
    labeled_triples,
    create_inverse_triples=True,
)

# Seeded split so the train/test partition is the same on every run.
splits = tf.split(random_state=256)
training, testing = splits

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [92058, 45501]


In [86]:
# Train and evaluate a TransE model with the PyKEEN pipeline.
# Hyperparameters are grouped by the pipeline component that receives them.
model_options = {'embedding_dim': 50}
training_options = {
    'num_epochs': 25,
    'batch_size': 512,  # larger batches chosen with GPU training in mind
}
optimizer_options = {'lr': 0.01}             # learning rate
sampler_options = {'num_negs_per_pos': 10}   # negatives drawn per positive triple
evaluator_options = {'filtered': True}

result = pipeline(
    training=training,
    testing=testing,
    model='TransE',
    model_kwargs=model_options,
    training_kwargs=training_options,
    optimizer_kwargs=optimizer_options,
    negative_sampler='basic',
    negative_sampler_kwargs=sampler_options,
    evaluator_kwargs=evaluator_options,
    random_seed=256,
    device='gpu',   # request GPU execution
    use_tqdm=True,  # show progress bars
)

# Persist the pipeline result under <working_dir>/results for later use.
results_dir = os.path.join(working_dir, 'results')
result.save_to_directory(results_dir)

INFO:pykeen.pipeline.api:Using device: gpu
INFO:pykeen.triples.triples_factory:Creating inverse triples.


Training epochs on cuda:0:   0%|          | 0/25 [00:00<?, ?epoch/s]

INFO:pykeen.triples.triples_factory:Creating inverse triples.


Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/711 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/45.5k [00:00<?, ?triple/s]

  return sum(
INFO:pykeen.evaluation.evaluator:Evaluation took 130.18s seconds
INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=89983, num_relations=10, create_inverse_triples=True, num_triples=182001) to file:///C:/Users/peter/Documents/School%20Projects/Advanced%20Data%20Mining/Graph%20Prediction%20on%20Article%20Data/results/training_triples
INFO:pykeen.pipeline.api:Saved to directory: C:\Users\peter\Documents\School Projects\Advanced Data Mining\Graph Prediction on Article Data\results


In [87]:

# Print the headline evaluation metrics, one per line, as "<label> <value>".
metric_results = result.metric_results
for label, metric_key in (
    ("Mean Rank:", 'mean_rank'),
    ("Mean Reciprocal Rank (MRR):", 'mean_reciprocal_rank'),
    ("Hits@1:", 'hits_at_1'),
    ("Hits@3:", 'hits_at_3'),
    ("Hits@10:", 'hits_at_10'),
):
    print(label, metric_results.get_metric(metric_key))

Mean Rank: 10993.625
Mean Reciprocal Rank (MRR): 0.035366449505090714
Hits@1: 0.005802070284169579
Hits@3: 0.04725170875365377
Hits@10: 0.08976725786246456


Experiment log (Mean Rank recorded while tuning; lower is better; "loss" in these notes appears to refer to the optimizer learning rate `lr`): loss 0.0001: 41003.828125, loss 0.001 Mean Rank: 41090.44921875, 17120 ComplEx -> TransE, 16409.64453125, 16730 added negative_sampler_kwargs, embeddings 100 -> 50, 16660, 14567 loss 0.001 -> 0.01, 16859 batch size 256 -> 128, 14105 batch size 128 -> 256 epochs 5 -> 25, 11578 added subcategory, 20820 batch 256 -> 512 loss 0.01 -> 0.001, 16377 -> 9334

In [88]:
# Trained model produced by the pipeline run.
model = result.model

# Score candidate tails for (target_user, CLICKS, ?).
# NOTE(review): no top-10 cut-off and no article-only restriction is applied
# in this call; confirm that any such filtering happens where `top_articles`
# is consumed.
target_user = 'U44625'
top_articles = predict_target(
    model=model,
    triples_factory=tf,
    head=target_user,
    relation='CLICKS',
    tail=None,  # leave the tail open so it is the predicted position
)