### 1. Data Loading

In [7]:
import pandas as pd

# Load the dataset
df = pd.read_csv('data/books_graph_facts.csv')

# Ensure the columns are named 'head', 'relation', 'tail'
df.columns = ['head', 'relation', 'tail']

# Display first few rows to confirm
print(df.head())


          head          relation           tail
0    book_3725         won_award       award_85
1    book_1435      published_by  publisher_152
2   author_944             wrote      book_3099
3  reader_4577              read      book_2631
4    book_1633  belongs_to_genre       genre_11


### 2. Data Tokenization

#### Training with pykeen
* pykeen.pipeline.pipeline() - To train and evaluate the model
* pykeen.models - Provide list of supported models 
* pykeen.datasets - Load the dataset
* pykeen.training - Model can be trained under stochastic local closed world assumption (LCWA)
* pykeen.sampling - to negative sample options like Basic, Bernoulli
* pykeen.evaluation - evaluate the model and returns the MetricResults (default - RankBasedEvaluator)
* stopper - early stopping of training the model
* Learning rate - lr_schedule_kwargs - to schedule the leanring rate with gamma params
* Triples - Classes for creating and storing training data from triples

In [10]:
from pykeen.triples import TriplesFactory

# Convert the dataset to a triples array (head, relation, tail)
triples = df[['head', 'relation', 'tail']].values

# Create a TriplesFactory from the triples
triples_factory = TriplesFactory.from_labeled_triples(triples)

# Split the data into training, testing, and validation sets
training, testing, validation = triples_factory.split([0.8, 0.1, 0.1])

using automatically assigned random_state=1127245888


### 3. Negative Sampling with Pykeen

- Uniform Negative sampling - Randomly corrupts either Head or Tail.  

        negative_sampler='basic',
        negative_sampler_kwargs=dict(corruption_scheme=('h', 'r', 't'))  # Inform which corruption methods are used
- Bernoulli Negative Sampling - Based on the probability (pr or (1-pr))  

        negative_sampler='bernoulli'
- Identify False negatives during training  

        negative_sampler='basic',
        
        negative_sampler_kwargs=dict(filtered=True) # to filter False negatives from sampling

        negative_sampler_kwargs=dict(filtered=True, filterer='bloom', 
                filterer_kwargs=dict(error_rate=0.0001)) # decreading the error rate will increase the computation cost.

        negative_sampler_kwargs=dict(filtered=True,
                filterer='python-set') # Guarantees that false negatives are filtered 


#### Version 1 implmentation: Usage of basic negative sampler

In [11]:
from pykeen.pipeline import pipeline

# Set up parameters for the pipeline
model_name = 'RotatE'  # You can change this to other models like TransE, ComplEx, etc.
epochs = 100  # Number of epochs to train
seed = 42  # Set random seed for reproducibility

# Train the model using the pipeline
result = pipeline(
    training=training,       # Training set
    testing=testing,         # Testing set
    validation=validation,   # Validation set
    model=model_name,        # Model name
    training_kwargs=dict(
        num_epochs=epochs,   # Training epochs
        use_tqdm_batch=False  # Disable tqdm progress bar for batches
    ),
    random_seed=seed,        # Set random seed
    negative_sampler='basic'  # Specify the negative sampler (basic, bernoulli, etc.)
)

# Print the final evaluation result (Hits@10 is used as an example metric)
hits_at_10 = result.metric_results.get_metric("hits_at_10")
print(f"Hits@10: {hits_at_10:.3f}")

Training epochs on cuda:0: 100%|██████████| 100/100 [11:47<00:00,  7.08s/epoch, loss=0.0329, prev_loss=0.0329]
Evaluating on cuda:0: 100%|██████████| 56.3k/56.3k [00:53<00:00, 1.05ktriple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 106.11s seconds


Hits@10: 0.029


In [15]:
import os

# Define the directory where you want to save the model
save_dir = './model/'

# Check if the directory exists, if not, create it
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# Save the model, results, and metadata to the directory
result.save_to_directory(save_dir)

INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=11570, num_relations=5, create_inverse_triples=False, num_triples=450636) to file:///C:/Users/EzhilPriyadharshiniK/OneDrive%20-%20Infoseck2k/Documents/Priya/Code/KnowledgeGraphs/model/training_triples
INFO:pykeen.pipeline.api:Saved to directory: C:\Users\EzhilPriyadharshiniK\OneDrive - Infoseck2k\Documents\Priya\Code\KnowledgeGraphs\model


#### Version 2 implmentation: Usage of basic negative sampler with filter 

In [16]:
from pykeen.pipeline import pipeline

# Set up parameters for the pipeline
model_name = 'RotatE'  # You can change this to other models like TransE, ComplEx, etc.
epochs = 100  # Number of epochs to train
seed = 42  # Set random seed for reproducibility

# Train the model using the pipeline
result = pipeline(
    training=training,       # Training set
    testing=testing,         # Testing set
    validation=validation,   # Validation set
    model=model_name,        # Model name
    training_kwargs=dict(
        num_epochs=epochs,   # Training epochs
        use_tqdm_batch=False  # Disable tqdm progress bar for batches
    ),
    random_seed=seed,        # Set random seed
    negative_sampler='basic',  # Specify the negative sampler (basic, bernoulli, etc.)
    negative_sampler_kwargs=dict(
        filtered=True,
        filterer='python-set',    
    ),
)

# Print the final evaluation result (Hits@10 is used as an example metric)
hits_at_10 = result.metric_results.get_metric("hits_at_10")
print(f"Hits@10: {hits_at_10:.3f}")

INFO:pykeen.pipeline.api:Using device: None
Training epochs on cuda:0: 100%|██████████| 100/100 [59:30<00:00, 35.70s/epoch, loss=0.0137, prev_loss=0.0138]  
Evaluating on cuda:0: 100%|██████████| 56.3k/56.3k [00:58<00:00, 966triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 90.38s seconds


Hits@10: 0.022


#### Version 3 implmentation: Usage of basic negative sampler. To perform the corruption just on tail along with data informed sampling

In [23]:
from pykeen.pipeline import pipeline

# Set up parameters for the pipeline
model_name = 'RotatE'  # You can change this to other models like TransE, ComplEx, etc.
epochs = 100  # Number of epochs to train
seed = 42  # Set random seed for reproducibility

# Train the model using the pipeline
result = pipeline(
    training=training,       # Training set
    testing=testing,         # Testing set
    validation=validation,   # Validation set
    model=model_name,        # Model name
    training_kwargs=dict(
        num_epochs=epochs,   # Training epochs
        use_tqdm_batch=False  # Disable tqdm progress bar for batches
    ),
    random_seed=seed,        # Set random seed
    negative_sampler='basic',  # Specify the negative sampler (basic, bernoulli, etc.)
    negative_sampler_kwargs=dict(
        corruption_scheme=['tail'], # corrupt the tail
        filtered=True,
        filterer='python-set',    
    ),
) 

# Print the final evaluation result (Hits@10 is used as an example metric)
hits_at_10 = result.metric_results.get_metric("hits_at_10")
print(f"Hits@10: {hits_at_10:.3f}")

INFO:pykeen.pipeline.api:Using device: None
Training epochs on cuda:0: 100%|██████████| 100/100 [13:51<00:00,  8.32s/epoch, loss=0.00676, prev_loss=0.00682]
Evaluating on cuda:0: 100%|██████████| 56.3k/56.3k [01:02<00:00, 905triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 108.79s seconds


Hits@10: 0.054
