In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [63]:
# Load the three exported query result sets (Query 1-3), one DataFrame per CSV file
query1, query2, query3 = (
    pd.read_csv(csv_path)
    for csv_path in ('Query1.csv', 'Query2.csv', 'Query3.csv')
)

## Part 1: Quality Measure Computation of Rules

Given a set of Horn rules that are assumed to have been discovered by an automatic tool using quality measures such as support, head-coverage, and confidence, here is a reminder of how these measures are computed for a rule of the form $\vec{B} \rightarrow r(x, y)$:

- **Support** ($supp(\vec{B} \rightarrow r(x, y))$) is defined as:
  $$
  supp(\vec{B} \rightarrow r(x, y)) := \#\{(x, y) : \exists z_1, \ldots, z_m : \vec{B} \land r(x, y)\}
  $$

- **Head-Coverage** ($hc(\vec{B} \rightarrow r(x, y))$) is defined as:
  $$
  hc(\vec{B} \rightarrow r(x, y)) := \frac{supp(\vec{B} \rightarrow r(x, y))}{\#\{(x, y) : r(x, y)\}}
  $$

- **Confidence** ($conf(\vec{B} \rightarrow r(x, y))$) is defined as:
  $$
  conf(\vec{B} \rightarrow r(x, y)) := \frac{supp(\vec{B} \rightarrow r(x, y))}{\#\{(x, y) : \exists z_1, \ldots, z_m : \vec{B}\}}
  $$

### Considered Rules

Let's consider the following rules $r_1$, $r_2$, and $r_3$, where the atoms predicate(x, y) are written as $(?x\ \text{predicate}\ ?y)$:

- **r1**: $(?a\ \text{nationality}\ ?b) \Rightarrow (?a\ \text{deathPlace}\ ?b)$
- **r2**: $(?a\ \text{birthPlace}\ ?b) \land (?a\ \text{country}\ ?b) \Rightarrow (?a\ \text{deathPlace}\ ?b)$
- **r3**: $(?a\ \text{child}\ ?h) \land (?h\ \text{parent}\ ?b) \Rightarrow (?a\ \text{spouse}\ ?b)$


In [64]:
# Preview the first rows of the Query 1 results (the input later used for rule r1)
query1.head()

Unnamed: 0,writerNameLabel,birthPlaceLabel,nationalityLabel,deathPlaceLabel
0,Robert Zend,Budapest,Hungarian-Canadian,Canada
1,Robert Zend,Kingdom of Hungary (1920-1946),Hungarian-Canadian,Canada
2,Bruce Alistair McKelvie,Canada,Canadians,Canada
3,Leona Florentino,Captaincy General of the Philippines,Ilocano people,Captaincy General of the Philippines
4,Aziz Ahmad (writer),Hyderabad,Pakistani,Canada


In [65]:
# Preview the first rows of the Query 2 results (the input later used for rule r2)
query2.head()

Unnamed: 0,writerName,birthPlaceLabel,nationality,country,deathPlaceLabel
0,Caleb Whitefoord,Edinburgh,Scottish,Scotland,London
1,Caleb Whitefoord,Edinburgh,Scottish,United Kingdom,London
2,Carol Fenner,"North Hornell, New York",American,United States,"Battle Creek, Michigan"
3,Amelia Perrier,Cork (city),http://dbpedia.org/resource/United_Kingdom_of_...,Ireland,Sussex
4,Carola Prosperi,Turin,http://dbpedia.org/resource/Italy,Italy,Turin


In [66]:
# Preview the first 10 rows of the Query 3 results (the input later used for rule r3)
query3.head(10)

Unnamed: 0,writerName,childLabel,parentLabel,spouseLabel
0,Caitlin Thomas,Aeronwy Thomas,Caitlin Thomas,Dylan Thomas
1,Dylan Thomas,Aeronwy Thomas,Caitlin Thomas,Caitlin Thomas
2,Qi Xin,Xi Yuanping,Qi Xin,Xi Zhongxun
3,Julia Rush Cutler Ward,Samuel Ward (lobbyist),Samuel Ward (banker),Samuel Ward (banker)
4,Rabindranath Tagore,Rathindranath Tagore,Mrinalini Devi,Mrinalini Devi
5,Henrik Ibsen,Sigurd Ibsen,Suzannah Thoresen,Suzannah Thoresen
6,Barry Shipman,Nina Shipman,Barry Shipman,Gwynne Shipman
7,Belkis Cuza Malé,Ernesto Padilla,Belkis Cuza Malé,Heberto Padilla
8,Nalini Prava Deka,Jim Ankan Deka,Bhabananda Deka,Bhabananda Deka
9,Gail Omvedt,Prachi Patankar,Bharat Patankar,Bharat Patankar


## Calculating Metrics

>Let us consider the following rules r1, r2 and r3, where the atoms predicate(x, y) are written as (?x predicate ?y):

    • r1: (?a nationality ?b) => (?a deathPlace ?b)


In [67]:
# Turn each query DataFrame into a list of row dictionaries (one dict per row)
query1_data, query2_data, query3_data = (
    frame.to_dict(orient='records')
    for frame in (query1, query2, query3)
)

In [68]:
def calculate_metrics_for_rule_1(data):
    """
    Compute support, head coverage and confidence for rule r1:
    (?a nationality ?b) => (?a deathPlace ?b), i.e. a person's nationality
    is the same as their place of death.

    Counting is done per record of ``data``; duplicate records are not
    collapsed, so deduplicate beforehand if distinct pairs are required.

    Parameters
    ----------
    data : iterable of dict
        Records that may contain the keys 'nationalityLabel' and
        'deathPlaceLabel'. A value counts as missing when the key is absent
        or the value is None, NaN / pd.NA, or otherwise falsy (e.g. '').

    Returns
    -------
    dict
        'support'       : number of records whose nationality equals the death place
        'head_coverage' : support / number of records with a death place
        'confidence'    : support / number of records with a nationality
        Each ratio is 0 when its denominator is 0.
    """
    def has_value(instance, key):
        # Missing CSV cells come through as float NaN, which is truthy in
        # Python; test for NaN/None explicitly so such cells are not counted
        # as "specified" values.
        value = instance.get(key)
        return (not pd.isna(value)) and bool(value)

    # Initialize counters
    support = 0  # Instances where nationality matches deathPlace
    total_nationality = 0  # Instances with a specified nationality (rule body)
    total_deathPlace = 0  # Instances with a specified deathPlace (rule head)

    for instance in data:
        has_nationality = has_value(instance, 'nationalityLabel')
        has_deathPlace = has_value(instance, 'deathPlaceLabel')

        if has_nationality:
            total_nationality += 1
        if has_deathPlace:
            total_deathPlace += 1
        if (has_nationality and has_deathPlace
                and instance['nationalityLabel'] == instance['deathPlaceLabel']):
            support += 1

    # Calculate metrics, guarding against empty denominators
    head_coverage = support / total_deathPlace if total_deathPlace > 0 else 0
    confidence = support / total_nationality if total_nationality > 0 else 0

    return {
        'support': support,
        'head_coverage': head_coverage,
        'confidence': confidence
    }

In [69]:
# Evaluate rule 1 on the Query 1 records and show the resulting measures
data = query1_data
metrics = calculate_metrics_for_rule_1(data)
print("The metrics for rule 1 are:", metrics, sep="\n")


The metrics for rule 1 are:
{'support': 227, 'head_coverage': 0.13243873978996498, 'confidence': 0.13243873978996498}


>r2: (?a birthPlace ?b) and (?a country ?b) => (?a deathPlace ?b)


In [70]:
def calculate_metrics_for_rule_2(data):
    """
    Compute support, head coverage and confidence for rule r2:
    (?a birthPlace ?b) and (?a country ?b) => (?a deathPlace ?b), i.e. a
    person's birthPlace and country are the same, and this matches their
    place of death.

    Counting is done per record of ``data``; duplicate records are not
    collapsed, so deduplicate beforehand if distinct pairs are required.

    Parameters
    ----------
    data : iterable of dict
        Records that may contain the keys 'birthPlaceLabel', 'country' and
        'deathPlaceLabel'. A value counts as missing when the key is absent
        or the value is None, NaN / pd.NA, or otherwise falsy (e.g. '').

    Returns
    -------
    dict
        'support'       : records where birthPlace == country == deathPlace
        'head_coverage' : support / number of records with a death place
        'confidence'    : support / number of records where birthPlace == country
        Each ratio is 0 when its denominator is 0.
    """
    def has_value(instance, key):
        # Missing CSV cells come through as float NaN, which is truthy in
        # Python; test for NaN/None explicitly so such cells are not counted
        # as "specified" values.
        value = instance.get(key)
        return (not pd.isna(value)) and bool(value)

    # Initialize counters
    support = 0  # Instances where birthPlace and country match deathPlace
    total_conditions_met = 0  # Instances where birthPlace matches country (rule body)
    total_deathPlace = 0  # Instances with a specified deathPlace (rule head)

    for instance in data:
        has_birthPlace = has_value(instance, 'birthPlaceLabel')
        has_country = has_value(instance, 'country')
        has_deathPlace = has_value(instance, 'deathPlaceLabel')

        if has_deathPlace:
            total_deathPlace += 1
        if has_birthPlace and has_country and instance['birthPlaceLabel'] == instance['country']:
            total_conditions_met += 1
            # The body holds; the head holds too when the same place is the death place
            if has_deathPlace and instance['birthPlaceLabel'] == instance['deathPlaceLabel']:
                support += 1

    # Calculate metrics, guarding against empty denominators
    head_coverage = support / total_deathPlace if total_deathPlace > 0 else 0
    confidence = support / total_conditions_met if total_conditions_met > 0 else 0

    return {
        'support': support,
        'head_coverage': head_coverage,
        'confidence': confidence
    }




In [71]:
# Evaluate rule 2 on the Query 2 records and show the resulting measures
data = query2_data
metrics = calculate_metrics_for_rule_2(data)
print("The metrics for rule 2 are:", metrics, sep="\n")

The metrics for rule 2 are:
{'support': 23, 'head_coverage': 0.03576982892690513, 'confidence': 0.40350877192982454}


>r3: (?a child ?h) and (?h parent ?b) => (?a spouse ?b)

In [72]:
# Quality measures for rule r3: (?a child ?h) and (?h parent ?b) => (?a spouse ?b),
# computed per row of query3 with vectorized column operations instead of a
# row-wise DataFrame.apply (same counts, and well-defined on an empty frame).

# Support: rows where the child's parent is also the listed spouse.
# NaN never compares equal, so rows with a missing label are not counted.
support = (query3['parentLabel'] == query3['spouseLabel']).sum()

# For head coverage, we need the total instances where a spouse is identified
total_spouse_instances = query3['spouseLabel'].notna().sum()

# For confidence, we consider instances where a child and an identified other parent exist
total_body_instances = (query3['childLabel'].notna() & query3['parentLabel'].notna()).sum()

# Ratios fall back to 0 when their denominator is empty
head_coverage = support / total_spouse_instances if total_spouse_instances > 0 else 0
confidence = support / total_body_instances if total_body_instances > 0 else 0


In [73]:
# Report the three quality measures computed for rule 3, one per line
print(
    "Metrics for rule 3:",
    f"Support: {support}",
    f"Head Coverage: {head_coverage}",
    f"Confidence: {confidence}",
    sep="\n",
)

Metrics for rule 3:
Support: 104
Head Coverage: 0.3837638376383764
Confidence: 0.3837638376383764


>Analysis of Rules in Part 1:

- Rule 1: In 227 rows the nationality and the deathPlace of an entity are identical. The head coverage (0.132) is the fraction of rows with a known deathPlace that the rule reproduces, which is low. The confidence (0.132) is equally low, so the rule is not reliable for predicting an entity's death place from its nationality.

- Rule 2 has the highest confidence of the three rules (0.404, versus 0.132 for Rule 1 and 0.384 for Rule 3): when an entity's birthplace matches its country, there is a 40.35% chance that this location is also its death place. However, its low support (23) and low head coverage (0.036) mean that the rule covers only a small part of the dataset.

- Rule 3 has the highest head coverage (0.384) and a moderate confidence (0.384) compared to the other two rules: when a child-parent relationship is observed, the child's parent ?b is also recorded as the spouse of ?a in about 38% of the rows.

## Part 2 AMIE Results - Analysis

Analyse the results and observe the results of some rules that are obtained and compare the
obtained measures (support, HC and PCA-confidence) especially for the three used rules in
part 1.


Consider these rules from the Film dataset

>Rule 1: ?g  db:name  ?a  ?g  db:runtime  ?b   => ?a  db:runtime  ?b	0.050075643	0.995488722	662	665	-1

>Rule 2: ?g  db:name  ?a  ?g  db:writer  ?b   => ?a  db:director  ?b	0.024831527	0.395104895	339	858	-1

>Rule 3: ?h  db:director  ?b  ?a  db:name  ?h   => ?a  db:director  ?b	0.050395546	0.995658466	688	691	-1

Analysis: 

Rules 1 and 3 exhibit very high confidence (~0.995), meaning that whenever the body of the rule is matched, the head almost always holds as well.

- Rule 1 states that if an entity ?g has a db:name relation with ?a and a db:runtime relation with the value ?b, then ?a also has a db:runtime of ?b. However, the rule applies to a limited number of instances (662 supporting cases).

- Rule 2 states that if an entity ?g is associated with ?a through db:name and ?g is also associated with ?b through db:writer, then ?a is likely to have a db:director relation with ?b. Its confidence (0.395) is much lower than that of Rules 1 and 3: the head holds in fewer than 40% of the cases where the body matches (339 of 858), so the rule is far less reliable.

- Rule 3 indicates that if an entity ?h has a db:director relation with ?b, and ?a has a db:name relation with ?h, then ?a also has a db:director relation with ?b. The high confidence (0.996) makes it more reliable.

## Part 3 Use of generated rules
The idea here is to use the generated rules for predictions and debugging knowledge graphs.

>Rule 1 from AMIE : PREDICTING

SPARQL CONSTRUCT QUERY

```
# Selects every ?work that has both a dbo:writer and a dbo:director, together
# with the English foaf:name of each person, and re-emits those triples.
# NOTE(review): the CONSTRUCT template is identical to the WHERE pattern, so the
# result only contains triples that already match in the graph; no new
# dbo:director fact is derived from the rule body. Confirm this is intended
# for the "predicting" step.
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

CONSTRUCT {
  ?work dbo:writer ?writer .
  ?writer foaf:name ?writerLabel .
  ?work dbo:director ?director .
  ?director foaf:name ?directorLabel .
}
WHERE {
  ?work dbo:writer ?writer .
  ?writer foaf:name ?writerLabel .
  ?work dbo:director ?director .
  ?director foaf:name ?directorLabel .
  
  # Keep only English-tagged name literals
  FILTER (LANG(?writerLabel) = "en")
  FILTER (LANG(?directorLabel) = "en")
}
```

### Running the ASK Query:

In [75]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Initialize the SPARQLWrapper with the DBpedia endpoint
# (this client object is reused by the later query cells)
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# ASK query for Rule 2: asks whether at least one match of the pattern exists.
# NOTE(review): ?name is bound only by the first triple and ?a is not linked to
# it, so ?g and ?a are unrelated in this pattern. The query is satisfied as soon
# as some ?g has a name and a dbo:writer ?b, and any ?a has dbo:director ?b.
# Confirm whether a join such as "?a foaf:name ?name" was intended.
ask_query = """
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

ASK {
  ?g foaf:name ?name .
  ?g dbo:writer ?b .
  ?a dbo:director ?b .
}
"""

# Set the query to SPARQLWrapper
sparql.setQuery(ask_query)

# Set the return format to JSON (ASK results can be interpreted from JSON too)
sparql.setReturnFormat(JSON)

# Execute the query against the remote endpoint (network call) and convert the
# result into a Python dictionary
result = sparql.query().convert()

# The ASK query response is found under the "boolean" key in the result dictionary
exists = result["boolean"]

# Print the result
if exists:
    print("The pattern exists in the knowledge graph.")
else:
    print("The pattern does not exist in the knowledge graph.")


The pattern exists in the knowledge graph.


In [77]:
# Define the ASK query for Rule 3.
# Relies on the `sparql` client and the `JSON` constant created in an earlier cell.
# NOTE(review): ?h is used both as the subject of a dbo:director triple and as
# the value of foaf:name for ?a. If foaf:name values are literals in DBpedia
# this pattern can never match, which would explain a negative answer
# regardless of the rule's validity. TODO confirm and, if so, join on a shared
# name instead.
ask_query = """
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

ASK {
  ?h dbo:director ?b .
  ?a foaf:name ?name .
  ?a dbo:director ?b .
  FILTER EXISTS { ?a foaf:name ?h }
}
"""

# Set the query and its return format
sparql.setQuery(ask_query)
sparql.setReturnFormat(JSON)

# Execute the query (network call) and convert the results
result = sparql.query().convert()

# Extract and print the boolean result
exists = result["boolean"]
print("The pattern exists in the knowledge graph:" if exists else "The pattern does not exist in the knowledge graph.")


The pattern does not exist in the knowledge graph.


### 3. Select query

In [78]:
# Define SELECT query: find (?g, ?a, ?b) where ?g has a name and a dbo:writer ?b,
# ?a shares that name, and ?a does NOT have dbo:director ?b; also report any
# other predicate (?existingRole) that links ?a to ?b.
# Relies on the `sparql` client and the `JSON` constant created in an earlier cell.
# NOTE(review): the first OPTIONAL tests the same triple that FILTER NOT EXISTS
# excludes and binds no new variable, so it appears to be redundant.
# NOTE(review): nothing prevents ?a and ?g from being the same resource; the
# printed rows below show identical Work and Person values. Consider whether
# FILTER(?a != ?g) is needed.
select_query = """
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT DISTINCT ?g ?a ?b ?existingRole
WHERE {
  ?g foaf:name ?aName .
  ?g dbo:writer ?b .
  ?a foaf:name ?aName .
  OPTIONAL { ?a dbo:director ?b . }
  OPTIONAL { ?a ?existingRole ?b . FILTER(?existingRole != dbo:director) }
  
  FILTER NOT EXISTS { ?a dbo:director ?b . }
}
"""

# Set the query and its return format to JSON
sparql.setQuery(select_query)
sparql.setReturnFormat(JSON)

# Execute the query (network call)
results = sparql.query().convert()

# Iterate through the results and print them.
# Note: the loop variable `result` rebinds the name that held the earlier ASK responses.
for result in results["results"]["bindings"]:
    g = result["g"]["value"]
    # Variables may be unbound in a row (e.g. the OPTIONAL ?existingRole), hence the defaults
    a = result.get("a", {}).get("value", "No a found")
    b = result.get("b", {}).get("value", "No b found")
    existingRole = result.get("existingRole", {}).get("value", "No existing role found")
    print(f"Work: {g}, Person: {a}, Writer: {b}, Existing Role: {existingRole}")



Work: http://dbpedia.org/resource/Cabaret_(1927_film), Person: http://dbpedia.org/resource/Cabaret_(1927_film), Writer: http://dbpedia.org/resource/Becky_Gardiner, Existing Role: http://dbpedia.org/ontology/wikiPageWikiLink
Work: http://dbpedia.org/resource/Cabaret_(1927_film), Person: http://dbpedia.org/resource/Cabaret_(1927_film), Writer: http://dbpedia.org/resource/Owen_Davis, Existing Role: http://dbpedia.org/ontology/wikiPageWikiLink
Work: http://dbpedia.org/resource/Cabaret_(2019_film), Person: http://dbpedia.org/resource/Cabaret_(2019_film), Writer: http://dbpedia.org/resource/Bhushan_Kumar, Existing Role: http://dbpedia.org/ontology/wikiPageWikiLink
Work: http://dbpedia.org/resource/Cabaret_(2019_film), Person: http://dbpedia.org/resource/Cabaret_(2019_film), Writer: http://dbpedia.org/resource/Pooja_Bhatt, Existing Role: http://dbpedia.org/ontology/wikiPageWikiLink
Work: http://dbpedia.org/resource/Cabaret_(1927_film), Person: http://dbpedia.org/resource/Cabaret_(1927_film), 

Conclusion: The query identifies entities ?g with a specified name and a writer ?b for which the entity ?a sharing that name is not already linked to ?b through dbo:director, highlighting cases where the inference of Rule 2 (writer ⇒ director) is questionable.