# Triple Extraction

#### Preliminary Operations

In [31]:
#!pip install stanza spacy https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz SentencePiece

In [30]:
#modello spacy
#!python -m spacy download en_core_web_sm 

In [23]:
import pandas as pd
import numpy as np
from transformers import AutoModel, AutoTokenizer
from transformers import BertTokenizer, BertModel, AutoModelForSeq2SeqLM
import stanza
from stanza.server import CoreNLPClient
import os
import random
import spacy

In [None]:
#pd.set_option('display.max_rows', None)

## 1.REBEL

https://huggingface.co/Babelscape/rebel-large

In [4]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")
gen_kwargs = {
    "max_length": 256,
    "length_penalty": 0,
    "num_beams": 3,
    "num_return_sequences": 3,
}

In [5]:
def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
    return triplets

In [6]:
# Function to apply
def apply_extraction(row):
    # Tokenize the text
    model_inputs = tokenizer(row['claim'], max_length=256, padding=True, truncation=True, return_tensors='pt')



    # Generate
    generated_tokens = model.generate(
        model_inputs["input_ids"].to(model.device),
        attention_mask=model_inputs["attention_mask"].to(model.device),
        **gen_kwargs,
    )

    # Decode
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

    # Extract triplets
    triplets = []
    for idx, sentence in enumerate(decoded_preds):
        triplets.extend(extract_triplets(sentence))

    return triplets


In [None]:
# Apply the function to each row in the DataFrame
df['triplets'] = df.apply(apply_extraction, axis=1)

# Display the results
for index, row in df.iterrows():
    print(f"Claim : {row['claim']}")
    print(f"Triplets: {row['triplets']}")
    print("-" * 50)

## 2.CORENLP

https://stanfordnlp.github.io/CoreNLP/openie.html

In [8]:
%env NO_PROXY='localhost'
%env no_proxy='localhost'

env: NO_PROXY='localhost'
env: no_proxy='localhost'


In [19]:
# Examine the CoreNLP installation folder to make sure the installation is successful
!ls $CORENLP_HOME

"ls" non � riconosciuto come comando interno o esterno,
 un programma eseguibile o un file batch.


In [20]:
# Scarica e inizializza il modello e il server CoreNLP una sola volta
stanza.download('en')
nlp_model = stanza.Pipeline('en', package='genia')
corenlp_dir = './corenlp'
stanza.install_corenlp(dir=corenlp_dir)
os.environ["CORENLP_HOME"] = corenlp_dir

# Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001
client = CoreNLPClient(
    annotators=['openie'],
    memory='4G',
    endpoint='http://localhost:9006',
    be_quiet=True)

# Definisci la funzione extract_openie_triplets utilizzando le variabili globali
def extract_openie_triplets(text):
    # Utilizza il modello e il server CoreNLP globali
    document = client.annotate(text, output_format='json')

    # Estrai le triplette OpenIE dal documento annotato
    triples = []
    for sentence in document['sentences']:
      for triple in sentence['openie']:
        triples.append({
           'subject': triple['subject'],
           'relation': triple['relation'],
            'object': triple['object']
        })
    print(triples)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-02-11 12:51:43 INFO: Downloading default packages for language: en (English) ...
2024-02-11 12:51:44 INFO: File exists: C:\Users\c.farallo\stanza_resources\en\default.zip
2024-02-11 12:51:48 INFO: Finished downloading models and saved to C:\Users\c.farallo\stanza_resources.
2024-02-11 12:51:48 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-02-11 12:51:49 INFO: Loading these models for language: en (English):
| Processor | Package        |
------------------------------
| tokenize  | genia          |
| pos       | genia_nocharlm |
| lemma     | genia_nocharlm |
| depparse  | genia_nocharlm |

2024-02-11 12:51:49 INFO: Using device: cpu
2024-02-11 12:51:49 INFO: Loading: tokenize
2024-02-11 12:51:49 INFO: Loading: pos
2024-02-11 12:51:49 INFO: Loading: lemma
2024-02-11 12:51:49 INFO: Loading: depparse
2024-02-11 12:51:49 INFO: Done loading processors!
2024-02-11 12:51:50 INFO: Writing properties to tmp file: corenlp_server-8fd2bd047882487b.props


## 3.Spacy

https://spacy.io/usage/linguistic-features#pos-tagging

https://spacy.io/api/token#attributes

In [32]:
nlp = spacy.load("en_core_web_sm")

In [33]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup NOUN NN dep xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [None]:
## Prova del codice

for index, row in df.iterrows():
    text = str(row['claim'])
    doc = nlp(text)

    # Estrai soggetto, predicato e oggetto
    subjects = [token.text for token in doc if token.dep_ in ["nsubj", "nsubjpass"]]

    # Estendi la condizione per l'estrazione dei predicati
    predicates = [token.text for token in doc if token.dep_ in ["ROOT", "aux"]]

    objects = [token.text for token in doc if token.dep_ in ["dobj", "attr", "prep"]]

    # Stampa i risultati
    print(f"Claim: {text} - Subjects: {subjects}, Predicates: {predicates}, Objects: {objects}")

## Knowledge Base

## Test Dataset