In [1]:
import psycopg2
import pandas as pd


from time import strftime, gmtime

In [2]:
from itables import init_notebook_mode, show
import re
from importlib import reload

In [3]:
import coreferee

In [4]:
import warnings
# Install a global filter that ignores every warning raised from here on.
warnings.filterwarnings('ignore')

In [5]:
import postgresql_functions as pgf
import settings as stt

In [10]:
# Re-import the local postgresql_functions module so that edits made to its
# source file are picked up without restarting the kernel.
reload(pgf)

<module 'postgresql_functions' from '/home/francesco/shared_files/python_notebooks/Early-Modern-Astronomy/mathshistory/postgresql_functions.py'>

In [8]:
# DDL for astronomers.coref_sentence: serial primary key, a foreign key to
# astronomers.mathshistory(pk_mathshistory), a vector(300) column, and btree
# indexes on fk_mathshistory and st_id.
# NOTE(review): the vector type presumably comes from the pgvector extension —
# confirm it is installed in the target database.
q1 = """
-- DROP TABLE astronomers.coref_sentence;

CREATE TABLE astronomers.coref_sentence (
    pk_sentence serial4 NOT NULL,
    fk_mathshistory int4 NULL,
    st_id int4 NULL,
    sentence text NULL,
    sent_vector vector(300),
    CONSTRAINT coref_sentence_pk PRIMARY KEY (pk_sentence),
    CONSTRAINT coref_sentence_fk FOREIGN KEY (fk_mathshistory) REFERENCES astronomers.mathshistory(pk_mathshistory)
);
CREATE INDEX coref_sentence_fk_mathshistory_idx ON astronomers.coref_sentence USING btree (fk_mathshistory);
CREATE INDEX coref_sentence_st_id_idx ON astronomers.coref_sentence USING btree (st_id);
"""

In [12]:
# DDL for astronomers.coref_chunk: serial primary key, a foreign key to
# astronomers.coref_sentence(pk_sentence), the c_* text/offset columns and a
# vector(300) column. No secondary index is created for this table, and
# fk_mathshistory carries no foreign-key constraint here.
q2 = """
-- DROP TABLE astronomers.coref_chunk;

CREATE TABLE astronomers.coref_chunk (
	pk_chunk serial4 NOT NULL,
	fk_sentence int4 NULL,
	st_id int4 NULL,
	c_text varchar NULL,
	c_start_char int4 NULL,
	c_end_char int4 NULL,
	c_root_text varchar NULL,
	c_root_dep_ varchar NULL,
	c_root_ancestors varchar NULL,
	c_root_head_text varchar NULL,
	fk_mathshistory int4 NULL,
    chunk_vector vector(300),
	CONSTRAINT coref_chunk_pk PRIMARY KEY (pk_chunk),
	CONSTRAINT coref_chunk_fk FOREIGN KEY (fk_sentence) REFERENCES astronomers.coref_sentence(pk_sentence)
);
"""

In [23]:
# DDL for astronomers.coref_token: serial primary key, a foreign key to
# astronomers.coref_sentence(pk_sentence), the t_* token columns, a
# vector(300) column, and btree indexes on fk_mathshistory, st_id, t_head_i
# and t_index. Because of the foreign key, coref_sentence must already exist
# when this statement is executed.
q3 = """
-- DROP TABLE astronomers.coref_token;

CREATE TABLE astronomers.coref_token (
	pk_token serial4 NOT NULL,
	fk_sentence int4 NULL,
	st_id int4 NULL,
	t_index int4 NULL,
	t_text varchar NULL,
	t_lemma varchar NULL,
	t_pos_ varchar NULL,
	t_tag_ varchar NULL,
	t_dep_ varchar NULL,
	t_head_i int4 NULL,
	t_head_text varchar NULL,
	t_head_pos_ varchar NULL,
	t_head_tag_ varchar NULL,
	t_children varchar NULL,
    t_vector vector(300),
	fk_mathshistory int4 NULL,
	CONSTRAINT coref_token_pk PRIMARY KEY (pk_token),
	CONSTRAINT coref_token_fk FOREIGN KEY (fk_sentence) REFERENCES astronomers.coref_sentence(pk_sentence)
);
CREATE INDEX coref_token_fk_mathshistory_idx ON astronomers.coref_token USING btree (fk_mathshistory);
CREATE INDEX coref_token_st_id_idx ON astronomers.coref_token USING btree (st_id);
CREATE INDEX coref_token_t_head_i_idx ON astronomers.coref_token USING btree (t_head_i);
CREATE INDEX coref_token_t_index_idx ON astronomers.coref_token USING btree (t_index);"""

In [16]:
# DDL for astronomers.coref_entity: serial primary key, a foreign key to
# astronomers.coref_sentence(pk_sentence), the entity text and label columns
# and a vector(300) column. No secondary index is created for this table.
q4 = """
CREATE TABLE astronomers.coref_entity (
	pk_entity serial4 NOT NULL,
	fk_sentence int4 NULL,
	st_id int4 NULL,
	e_text varchar NULL,
	e_label varchar NULL,
	fk_mathshistory int4 NULL,
    e_vector vector(300),
	CONSTRAINT coref_entity_pk PRIMARY KEY (pk_entity),
	CONSTRAINT coref_entity_fk FOREIGN KEY (fk_sentence) REFERENCES astronomers.coref_sentence(pk_sentence)
);
"""

In [22]:
# Open a psycopg2 connection to the PostgreSQL server on localhost; the
# password is read from the project-local settings module.
cnn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="espace_intellectuel",
    user="postgres",
    password=stt.dbw,
)

In [24]:
# Execute the DDL held in q3: commit when it succeeds, otherwise roll the
# transaction back and show the error instead of raising.
with cnn.cursor() as cursor:
    try:
        cursor.execute(q3)
        cnn.commit()
    except Exception as exc:
        cnn.rollback()
        print(exc)

In [25]:
# Close the connection opened for the DDL statements.
cnn.close()

In [None]:
# Build the working DataFrame from the first element of `result` and name its
# three columns.
# NOTE(review): `result` is not assigned in any cell visible in this notebook —
# presumably it comes from a query helper in pgf; confirm and add the cell that
# produces it so the notebook runs on a fresh kernel.
textes = pd.DataFrame(result[0])
textes.columns = ['id', 'texte', 'size']

In [None]:
# Preview the first rows of the frame.
textes.head()

In [None]:
# Total number of rows, and the number of rows whose 'size' is above 1500.
len(textes), len(textes[textes['size'] > 1500])

In [None]:
### Choose one document
# Take the 'texte' value of the row at position 1 as the working sample.
txt = textes.iloc[1].texte 
print(txt)

In [None]:
#txt = "We have quoted above from Biancani concerning his high regard for Galileo. However, he did not always agree with Galileo's views. The first disagreement came in 1611 and concerned the mountains on the moon. Galileo had observed the surface of the moon through a telescope in 1609 and had used certain mathematical techniques to prove that there were lunar mountains. His claim appeared in Sidereus Nuncius published in May 1610. In May 1611 a group of scientists, mostly Jesuits, was brought together by cardinal Ferdinando Gonzaga in Mantua to discuss Galileo's claims. One of the major points discussed was Galileo's proof that there were mountains on the moon, and the report from the group came down firmly in favour of the traditional belief that the moon was perfectly smooth. Galileo suspected that Biancani was the author of the report and letters were exchanged in which Biancani dissociated himself from any insult towards Galileo saying that he was sorry if he had been offended but, nevertheless, pointing out that he did believe that the moon was perfectly smooth. He also disagreed with Galileo in 1613 when a dispute broke out between Galileo and Christoph Scheiner over sunspots. Galileo unfairly accused Scheiner of plagiarism but, although Scheiner's discovery of sunspots was certainly independent of any work by Galileo, his explanation was quite wrong. Biancani, however, defended his fellow Jesuit Scheiner."

In [None]:
#print(txt)

## Coreferee

In [None]:
# spaCy is imported here because no earlier cell brings the name into scope;
# without it the spacy.load call below raises NameError on a fresh kernel.
import spacy

# Load the large English pipeline.
nlp = spacy.load('en_core_web_lg')

In [None]:
### https://spacy.io/universe/project/coreferee
# Register the coreference component only once: add_pipe raises ValueError
# when a component with the same name is already in the pipeline, which made
# this cell fail when it was executed a second time on the same nlp object.
if 'coreferee' not in nlp.pipe_names:
    nlp.add_pipe('coreferee')

In [None]:
# Process the sample text with the pipeline held in nlp.
doc = nlp(txt)

In [None]:
# Print the coreference chains attached to the document.
doc._.coref_chains.print()

In [None]:
### Produce resolved text
# https://stackoverflow.com/questions/75204212/spacy-coreferee-how-to-cleanly-extract-coreferenced-text

# Walk the document token by token. A token that the coreference chains can
# resolve is replaced by the text of the token(s) it refers to (joined with
# " and " when there are several); every other token is copied unchanged.
pieces = []

for token in doc:
    repres = doc._.coref_chains.resolve(token)
    if repres:
        # Re-use the replaced token's own trailing whitespace instead of a
        # hard-coded " ": a pronoun directly followed by punctuation would
        # otherwise come out as "Galileo ." rather than "Galileo.".
        pieces.append(" and ".join(t.text for t in repres) + token.whitespace_)
    else:
        pieces.append(token.text_with_ws)

# Join once at the end rather than growing a string inside the loop.
resolved_text = "".join(pieces)

print(resolved_text)

In [None]:
def resolve_text(doc):
    """Return the text of ``doc`` with resolvable tokens replaced by their referents.

    Each token is looked up with ``doc._.coref_chains.resolve``. When the
    lookup returns referent tokens, their texts (joined with " and " if there
    are several) are emitted in place of the token; otherwise the token is
    emitted unchanged together with its trailing whitespace.

    Args:
        doc: an iterable of tokens exposing ``doc._.coref_chains.resolve``,
            i.e. a document processed by a pipeline that includes coreferee.
            Tokens must provide ``text``, ``whitespace_`` and ``text_with_ws``.

    Returns:
        str: the resolved text; an empty string for an empty document.
    """
    pieces = []

    for token in doc:
        repres = doc._.coref_chains.resolve(token)
        if repres:
            referent = " and ".join(t.text for t in repres)
            # Keep the replaced token's own trailing whitespace instead of
            # always appending " ", which produced a spurious space before
            # punctuation ("Galileo ." instead of "Galileo.").
            pieces.append(referent + token.whitespace_)
        else:
            pieces.append(token.text_with_ws)

    # Single join instead of repeated string concatenation in the loop.
    return "".join(pieces)

In [None]:
# Show the first 1000 characters of the resolved text.
resolve_text(doc)[:1000]

## Import into database

In [None]:
### Only the texts whose size is above 1500 are kept here!
# the remaining ones will have to be handled later

is_long = textes['size'] > 1500
ll = textes[is_long].values.tolist()
len(ll), ll[:2]

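SQL statement that sets `coreferenced_txt` to NULL on every row of `astronomers.mathshistory`:

```sql
UPDATE astronomers.mathshistory SET coreferenced_txt = null;
```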

In [None]:

### Write the coreference-resolved text of every selected document into
### astronomers.mathshistory.coreferenced_txt, one transaction per document.
# NOTE(review): `conn` is not created in any cell visible in this notebook (the
# connection opened earlier is bound to `cnn` and is closed afterwards) — make
# sure an open psycopg2 connection is bound to `conn` before running this cell.

# The values are passed as bound parameters so that psycopg2 does the quoting.
# The previous hand-escaped E'...' literal only escaped single quotes, so any
# backslash in the text was reinterpreted as an escape sequence by PostgreSQL.
update_sql = """
    UPDATE astronomers.mathshistory SET coreferenced_txt = %s
    WHERE pk_mathshistory = %s;
"""

for t in ll:

    # Errors collected for the current document only.
    error = []

    doc = nlp(t[1])

    rt = resolve_text(doc)

    with conn.cursor() as curs:
        try:
            # int() gives psycopg2 a plain Python integer for the key.
            curs.execute(update_sql, (rt, int(t[0])))
            conn.commit()
        except Exception as e:
            # Roll back first so the connection is usable for the next
            # document even if writing the log entry fails.
            conn.rollback()
            error.append([t[0], e])
            # Explicit encoding: the log line contains a non-ASCII dash.
            with open('spacy/logs_errors_coreferenced.txt', 'a', encoding='utf-8') as f:
                f.write(f'd{str(error)} — Error — {strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())}\n\n')