In [2]:
import pandas as pd

# Read the knowledge-graph triples (head, relation, tail) from the CSV file
df = pd.read_csv('data/books_graph_facts.csv')

# Preview the leading rows to confirm the column layout
print(df.head(n=5))


          head          relation           tail
0    book_3725         won_award       award_85
1    book_1435      published_by  publisher_152
2   author_944             wrote      book_3099
3  reader_4577              read      book_2631
4    book_1633  belongs_to_genre       genre_11


In [3]:
# Every distinct entity id that appears as either a head or a tail
unique_entities = pd.unique(
    df[['head', 'tail']].to_numpy().ravel(order='K')
)

# Distinct relation labels
unique_relations = df['relation'].unique()

print("Unique Entities: {}".format(len(unique_entities)))
print("Unique Relations: {}".format(unique_relations))

Unique Entities: 11570
Unique Relations: ['won_award' 'published_by' 'wrote' 'read' 'belongs_to_genre']


In [4]:
# Number of triples per relation label
relation_counts = df['relation'].value_counts()

# Label and counts are emitted on separate lines by a single print call
print("Relation Frequency:", relation_counts, sep="\n")

Relation Frequency:
relation
read                257137
won_award            86226
belongs_to_genre     85646
wrote                85565
published_by         85426
Name: count, dtype: int64


In [5]:
# Triples that link a book to a genre
genre_relations = df[df['relation'] == 'belongs_to_genre']
# Print the label on its own line: interpolating the multi-line frame repr
# into an f-string after the label pushes the header row out of alignment
# with the data rows.
print("Books belonging to genres:")
print(genre_relations.head())

# Triples that link a book to its publisher
publisher_relations = df[df['relation'] == 'published_by']
print("Books published by publishers:")
print(publisher_relations.head())

Books belonging to genres:          head          relation      tail
4   book_1633  belongs_to_genre  genre_11
12  book_2510  belongs_to_genre  genre_15
22  book_1769  belongs_to_genre  genre_17
31  book_3084  belongs_to_genre  genre_13
61  book_4350  belongs_to_genre  genre_17
Books published by publishers:          head      relation           tail
1   book_1435  published_by  publisher_152
14  book_3503  published_by   publisher_97
15  book_1085  published_by   publisher_47
29   book_505  published_by  publisher_199
49   book_777  published_by  publisher_203


In [6]:
# Keep only 'wrote' triples whose head is an author entity. Entity ids carry
# their type as a prefix (e.g. 'author_944'), so match that prefix exactly
# rather than regex-searching for 'author' anywhere in the id. na=False makes
# a missing head a plain non-match instead of propagating NaN into the mask.
author_book_relations = df[
    (df['relation'] == 'wrote')
    & df['head'].str.startswith('author_', na=False)
]
print(author_book_relations.head())

          head relation       tail
2   author_944    wrote  book_3099
16  author_188    wrote  book_3740
17  author_175    wrote  book_3593
18   author_25    wrote   book_223
19  author_838    wrote  book_4485


### Identify the type of relation

* Symmetric: For every triple (A, R, B) that exists, the reverse triple (B, R, A) also exists.  
* Asymmetric: For every triple (A, R, B) that exists, the reverse triple (B, R, A) does not exist.  
* 1-to-N: A single head entity is associated with multiple tail entities for the same relation.  
* Compositional: Two chained relations together imply a third relation, e.g. (A, R1, B) and (B, R2, C) imply (A, R3, C).

![Relations between entities](./resources/Relations.png)

In [7]:
# Add a 'relation_type' column to df in place, filled with the placeholder
# 'unknown' for every row; a later cell overwrites it per relation with the
# result of get_relation_type.
df['relation_type'] = 'unknown'

In [8]:
# Show the first two rows to confirm the new column is present
df.iloc[:2]

Unnamed: 0,head,relation,tail,relation_type
0,book_3725,won_award,award_85,unknown
1,book_1435,published_by,publisher_152,unknown


In [9]:
def check_one_to_n(df, relation):
    """
    Check if a given relation is 1-to-N.

    A relation is 1-to-N if at least one head entity is associated with more
    than one *distinct* tail entity through that relation.

    Parameters
    ----------
    df : pandas.DataFrame
        Triples with 'head', 'relation' and 'tail' columns.
    relation : str
        Relation label to inspect.

    Returns
    -------
    bool
        True if some head has two or more distinct tails for ``relation``;
        False otherwise, including when ``relation`` does not occur in ``df``.
    """
    relation_rows = df[df['relation'] == relation]
    if relation_rows.empty:
        # No triples at all: nothing can be 1-to-N (and max() would be NaN).
        return False

    # Count distinct tails per head so that a repeated identical triple
    # (A, R, B), (A, R, B) is not mistaken for a 1-to-N association.
    tails_per_head = relation_rows.groupby('head')['tail'].nunique()

    return bool(tails_per_head.max() > 1)

In [10]:
def check_compositional_relation(df, relation1, relation2, implied_relation):
    """
    Check if two relations (relation1 and relation2) compose to imply a third relation.

    A chain is a pair of triples (A, relation1, B) and (B, relation2, C).
    The composition is reported as found when, for at least one chain, the
    implied triple (A, implied_relation, C) is actually present in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Triples with 'head', 'relation' and 'tail' columns.
    relation1 : str
        Relation of the first hop (A -> B).
    relation2 : str
        Relation of the second hop (B -> C).
    implied_relation : str
        Relation expected to link A directly to C.

    Returns
    -------
    bool
        True if at least one chain is backed by an existing implied triple;
        False if no chain exists or no chain has its implied triple in ``df``.

    Notes
    -----
    Prints one supporting chain as an example when the result is True.
    """
    def pairs_of(relation):
        # Distinct (head, tail) pairs of one relation. Rows with a missing
        # endpoint are dropped so NaN keys never join with each other.
        rows = df.loc[df['relation'] == relation, ['head', 'tail']]
        return rows.dropna().drop_duplicates()

    # Join first-hop tails to second-hop heads to enumerate every chain.
    # Resulting columns: head_1, tail_1 (== head_2), head_2, tail_2.
    chains = pairs_of(relation1).merge(
        pairs_of(relation2),
        left_on='tail',
        right_on='head',
        suffixes=('_1', '_2'),
    )

    # Keep only chains whose implied triple (head_1, implied_relation, tail_2)
    # exists; an inner join drops every unsupported chain.
    supported = chains.merge(
        pairs_of(implied_relation),
        left_on=['head_1', 'tail_2'],
        right_on=['head', 'tail'],
    )

    if supported.empty:
        return False

    example = supported.iloc[0]
    head1, tail1, tail2 = example['head_1'], example['tail_1'], example['tail_2']
    print(f"Compositional relation found: ({head1}, {relation1}, {tail1}) + ({tail1}, {relation2}, {tail2}) => ({head1}, {implied_relation}, {tail2})")
    return True

In [11]:
def get_relation_type(df, relation):
    """
    Determine if a relation is symmetric, asymmetric or 1-to-N.

    The checks are applied in a fixed priority order:

    1. 'symmetric'  - every triple (A, R, B) has its reverse (B, R, A).
    2. 'asymmetric' - no triple (A, R, B) has its reverse (B, R, A).
    3. '1-to-N'     - only some triples have a reverse, and check_one_to_n
                      reports a head with several tails.
    4. 'unknown'    - none of the above, or the relation has no triples.

    Compositional relations are not detected here; they need explicit
    relation names and are handled by check_compositional_relation.

    Parameters
    ----------
    df : pandas.DataFrame
        Triples with 'head', 'relation' and 'tail' columns.
    relation : str
        Relation label to classify.

    Returns
    -------
    str
        One of 'symmetric', 'asymmetric', '1-to-N', 'unknown'.
    """
    triples = df.loc[df['relation'] == relation, ['head', 'tail']]

    # A set of (head, tail) pairs gives constant-time reverse lookups, so the
    # whole check is linear in the number of triples instead of rescanning the
    # full frame once per row.
    pairs = set(zip(triples['head'], triples['tail']))
    if not pairs:
        # Without any triple there is no evidence for either property.
        return 'unknown'

    reverse_present = [(tail, head) in pairs for head, tail in pairs]

    if all(reverse_present):
        return 'symmetric'
    if not any(reverse_present):
        return 'asymmetric'
    # Only evaluated for partially-reversed relations, where it is needed.
    if check_one_to_n(df, relation):
        return '1-to-N'
    return 'unknown'

In [1]:
# Classify each distinct relation once instead of once per row
unique_relations = df['relation'].unique()

for relation in unique_relations:
    relation_type = get_relation_type(df, relation)

    # Stamp the classification onto every row that uses this relation
    df.loc[df['relation'].eq(relation), 'relation_type'] = relation_type

# Show the first rows to check that the column was filled in
print(df.head())

NameError: name 'df' is not defined

In [None]:
# Persist the annotated triples; index=False leaves the row index out of the file
df.to_csv(path_or_buf='updated_books_graph_with_relations_data.csv', index=False)