# Explore named entities and verbs

In this notebook I explore the detection and linking of named entities


* I use the pip_spacy environment to be able to use plugins and extensions not available in Conda


In [13]:
from importlib import reload
import pandas as pd

import psycopg2
import settings as stt

from itables import init_notebook_mode, show
import re

from time import strftime, gmtime

In [14]:
import postgresql_functions as pgf

In [15]:
# reload(pgf)

## Explore verbs
Get a list of named entities containing the term _University_

In [67]:
### connect to the local database
# The password is read from the local `settings` module (imported as `stt`).
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="espace_intellectuel",
    user="postgres",
    password=stt.dbw,
)
#conn

In [68]:
# conn.close()

In [69]:
# Frequency of each verb lemma (tokens whose t_pos_ is 'VERB'), most frequent first.
# NOTE(review): when the cells run top to bottom, `q` is reassigned by the next
# cell before any query is executed, so this statement is never sent to the database.
q = """-- frequency of verbs
select t_lemma, count(*) as freq 
from mathshistory.coref_token ct 
where ct.t_pos_ ='VERB'
group by t_lemma 
order by freq desc;"""

In [75]:
# Frequency of each (verb lemma, dependency label) pair among tokens tagged VERB,
# most frequent first. Each pair is returned as one "lemma : dep" string with the
# dependency label lower-cased.
q = """
-- frequency of verbs and of their synctactic roles
select concat_ws(' : ', t_lemma, lower(t_dep_)) , count(*) as freq 
from mathshistory.coref_token ct 
where ct.t_pos_ ='VERB'
group by t_lemma , t_dep_ 
order by freq desc;
"""

In [76]:
# Execute the current query `q` on the open connection through the project helper.
# NOTE(review): judging from how `result` is used in this notebook, the helper
# returns a pair whose first item holds the fetched rows and whose second item
# holds errors — confirm in postgresql_functions.
result = pgf.sql_explore(q, conn)
# print(f'Lines count: {len(result[0])}, errors count: {len(result[1])}, \nFirst lines: {result[0][:5]}')

In [77]:
# Name the columns while building the frame. Assigning `.columns` afterwards
# raises a length-mismatch ValueError when the query returns no rows, because an
# empty row list produces a frame with zero columns.
ne = pd.DataFrame(result[0], columns=['verb', 'freq'])
ne.head()

Unnamed: 0,verb,freq
0,write : root,6927
1,have : root,6424
2,publish : root,5386
3,become : root,4224
4,give : root,4077


In [118]:
### https://github.com/mwouts/itables/blob/main/docs/advanced_parameters.md
# Interactive, scrollable table sorted by decreasing frequency, with
# left-aligned cells and per-column filters in the footer.
show(
    ne.sort_values('freq', ascending=False),
    classes="display",
    columnDefs=[{"className": "dt-left", "targets": "_all"}],
    scrollY="400px",
    scrollCollapse=True,
    paging=False,
    column_filters="footer",
    dom="lrtip",
)

Unnamed: 0,name,freq
Loading... (need help?),,


In [84]:
# Verb lemma whose sentences are sampled by the query built in the next cell.
verb = "study"

### ROOT, xcomp
# SQL fragment inserted into the WHERE clause of that query; it ends with a newline.
dependency = "and ct.t_dep_ in ('ROOT')\n"

In [102]:
# Pick a page of 20 sentences (after skipping the first 50) in which a token with
# lemma `verb` matches the `dependency` filter, then attach to every sentence the
# list of its tokens (text, index, lemma, POS, dependency, head text, head POS,
# head index, head tag) ordered by token index and serialised as text.
#
# The page is ordered by sentence key: without an ORDER BY, OFFSET/LIMIT return an
# unspecified subset of rows that may differ from one execution to the next.
#
# NOTE(review): `verb` and `dependency` are interpolated directly into the SQL
# text; keep them notebook-controlled values.
q = f"""
with tw1 as (
select cs.pk_sentence, cs.sentence
from mathshistory.coref_sentence cs, mathshistory.coref_token ct 
where ct.fk_sentence = cs.pk_sentence
{dependency}   --and ct.t_dep_ = 'ROOT'
and ct.t_lemma = '{verb}'
order by cs.pk_sentence
offset 50
limit 20)
select concat(tw1.pk_sentence::varchar, ': ', tw1.sentence) sent, 
    array_agg(array[ct.t_text, ct.t_index::varchar, ct.t_lemma, ct.t_pos_ , ct.t_dep_,
    ct.t_head_text, t_head_pos_, t_head_i::varchar, ct.t_head_tag_]
    order by ct.t_index asc)::text token
from tw1, mathshistory.coref_token ct 
where ct.fk_sentence = tw1.pk_sentence
group by tw1.pk_sentence, tw1.sentence;
"""

# print(q)

In [103]:
# Run the sentence/token query and load the first item of the helper's result
# into a DataFrame. No column names are given, so the columns get pandas'
# default integer labels (later cells use 0 and 1).
result = pgf.sql_explore(q, conn)
# print(f'Lines count: {len(result[0])}, errors count: {len(result[1])}, \nFirst lines: {result[0][:1]}')
dfq = pd.DataFrame(result[0])

In [104]:
def _parse_pg_array_text(array_text):
    """Parse the text form of a two-dimensional PostgreSQL array into rows.

    PostgreSQL wraps an element in double quotes when it contains array syntax
    (comma, brace, quote, backslash, whitespace) and backslash-escapes quotes and
    backslashes inside it. Splitting the text on ',' therefore breaks on very
    common tokens such as the comma itself, which is serialised as ``","``.

    Parameters
    ----------
    array_text : str
        Array literal such as ``{{a,1},{",",2}}``.

    Returns
    -------
    list[list[str]]
        One list of element strings per inner array, quotes and escapes removed.
    """
    rows = []
    current_row = []
    field_chars = []
    depth = 0            # brace nesting level; elements live at depth 2
    in_quotes = False
    escaped = False
    for ch in array_text:
        if in_quotes:
            if escaped:
                field_chars.append(ch)
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_quotes = False
            else:
                field_chars.append(ch)
        elif ch == '"':
            in_quotes = True
        elif ch == '{':
            depth += 1
            if depth == 2:
                current_row, field_chars = [], []
        elif ch == '}':
            if depth == 2:
                current_row.append(''.join(field_chars))
                rows.append(current_row)
                field_chars = []
            depth -= 1
        elif ch == ',':
            # Commas at depth 1 only separate inner arrays and carry no data.
            if depth == 2:
                current_row.append(''.join(field_chars))
                field_chars = []
        else:
            field_chars.append(ch)
    return rows


# Column 1 holds the token array that the query serialised with `::text`.
dfq['new'] = dfq[1].apply(_parse_pg_array_text)
# print(dfq['new'][0])

In [105]:
def _format_token(fields):
    """Render one token as 'TEXT  (remaining,fields,lower-cased)  '."""
    surface, annotations = fields[0], fields[1:]
    return surface.upper() + '  (' + ','.join(a.lower() for a in annotations) + ')  '


# One formatted string per token, then keep only the sentence and the formatted tokens.
dfq['text'] = dfq['new'].apply(lambda tokens: [_format_token(t) for t in tokens])
dfq = dfq[[0, 'text']]

In [106]:
# Inspect the first row to check the sentence and its formatted token list.
dfq.iloc[0]

0       6391: Benedetti studied the first four books o...
text    [BENEDETTI  (99,benedetti,propn,nsubj,studied,...
Name: 0, dtype: object

In [107]:
# Give the two remaining columns readable names.
dfq.columns = ['sentence', 'tokens']
# Number of sentences retrieved.
dfq.shape[0]

20

In [122]:
### https://github.com/mwouts/itables/blob/main/docs/advanced_parameters.md
# Interactive, scrollable table of the sampled sentences (descending by sentence
# text), with left-aligned cells and per-column filters in the footer.
show(
    dfq.sort_values('sentence', ascending=False),
    classes="display",
    columnDefs=[{"className": "dt-left", "targets": "_all"}],
    scrollY="400px",
    scrollCollapse=True,
    paging=False,
    column_filters="footer",
    dom="lrtip",
)

Unnamed: 0,sentence,tokens
Loading... (need help?),,


## Get named entities from database related to 'University'

Get a list of named entities containing the term _University_

In [4]:
### connect to the local database
# The password is read from the local `settings` module (imported as `stt`).
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="espace_intellectuel",
    user="postgres",
    password=stt.dbw,
)
#conn

In [132]:
# Count the names of the form "University of ..." found in the corpus.
#   tw1: sentences whose text matches 'University'.
#   tw2: for each token 'University' (ct), the preposition 'of' attached to it
#        (ct1: head index = ct's index, t_dep_ = 'prep'), the token that follows
#        'of' (ct2) and up to three further consecutive PROPN tokens (ct3..ct5,
#        left-joined so shorter names are kept).
# The final SELECT joins the pieces with spaces and counts each distinct
# combination, most frequent first.
# NOTE(review): `tx3` is listed twice in the GROUP BY; redundant but harmless.
q1 = """
with tw1 as (
select pk_sentence,st_id, fk_mathshistory, sentence 
from mathshistory.coref_sentence cs
where sentence ~ 'University'
--limit 200
), tw2 as (
select ct.t_text tx, concat_ws(' ', ct.t_index, ct.t_pos_, ct.t_dep_, ct.t_head_text, ct.t_head_i),
ct1.t_text tx1 , concat_ws(' ', ct1.t_index, ct1.t_pos_, ct1.t_dep_, ct1.t_head_text, ct1.t_head_i) ,
ct2.t_text tx2, concat_ws(' ', ct2.t_index, ct2.t_pos_, ct2.t_dep_, ct2.t_head_text, ct2.t_head_i) ,
ct3.t_text tx3, concat_ws(' ', ct3.t_index, ct3.t_pos_, ct3.t_dep_, ct3.t_head_text, ct3.t_head_i) , 
ct4.t_text tx4, concat_ws(' ', ct4.t_index, ct4.t_pos_, ct4.t_dep_, ct4.t_head_text, ct4.t_head_i) ,
ct5.t_text tx5, concat_ws(' ', ct5.t_index, ct5.t_pos_, ct5.t_dep_, ct5.t_head_text, ct5.t_head_i)
from mathshistory.coref_token ct 
join tw1 on ct.fk_sentence = tw1.pk_sentence
join mathshistory.coref_token ct1 on ct1.fk_sentence = tw1.pk_sentence and ct1.t_head_i = ct.t_index 
join mathshistory.coref_token ct2 on ct2.fk_sentence = tw1.pk_sentence and ct2.t_index = ct1.t_index +1
left join mathshistory.coref_token ct3 on ct3.fk_sentence = tw1.pk_sentence 
		and ct3.t_index = ct2.t_index +1 and ct3.t_pos_ = 'PROPN'
left join mathshistory.coref_token ct4 on ct4.fk_sentence = tw1.pk_sentence 
		and ct4.t_index = ct3.t_index +1 and ct4.t_pos_ = 'PROPN'
left join mathshistory.coref_token ct5 on ct5.fk_sentence = tw1.pk_sentence 
		and ct5.t_index = ct4.t_index +1 and ct5.t_pos_ = 'PROPN'
where ct.t_text = 'University'
and ct1.t_dep_ ='prep'
and ct1.t_text in ('of')) --, 'at', 'in'
select concat_ws(' ', tx, tx1, tx2,tx3,tx4, tx5) ne, count(*) eff
from tw2
group by tx, tx1, tx2, tx3,tx3,tx4, tx5
order by eff desc;
"""

In [133]:
# Execute the name-counting query and print a short summary: the length of each
# of the two items returned by the helper and the first five rows.
# NOTE(review): the helper appears to return (rows, errors) — confirm in postgresql_functions.
result = pgf.sql_explore(q1, conn)
print(f'Lines count: {len(result[0])}, errors count: {len(result[1])}, \nFirst lines: {result[0][:5]}')

Lines count: 825, errors count: 0, 
First lines: [('University of California', 357), ('University of Berlin', 337), ('University of Chicago', 315), ('University of London', 259), ('University of Cambridge', 253)]


In [134]:
# Name the columns while building the frame. Assigning `.columns` afterwards
# raises a length-mismatch ValueError when the query returns no rows, because an
# empty row list produces a frame with zero columns.
ne = pd.DataFrame(result[0], columns=['name', 'freq'])
ne.head()

Unnamed: 0,name,freq
0,University of California,357
1,University of Berlin,337
2,University of Chicago,315
3,University of London,259
4,University of Cambridge,253


In [135]:
### https://github.com/mwouts/itables/blob/main/docs/advanced_parameters.md
show(ne.sort_values(by='freq', ascending=False), classes="display",

     columnDefs=[{"className": "dt-left", "targets": "_all"}],
     scrollY="400px", scrollCollapse=True, paging=False, column_filters="footer", dom="lrtip")

Unnamed: 0,name,freq
Loading... (need help?),,


## Verbs related to university, academy, etc.



In [137]:
### university, faculty, college, academy, ...

# the most frequently used is university

# Pattern interpolated into the query of the next cell.
term = "university"

In [138]:
# For every token matching `term` (case-insensitive regex, `~*`) that has the
# preposition 'of' attached to it, return:
#   h_text, h_pos : text and POS of the token's syntactic head,
#   h1_text       : text of the head of that head,
#   h1_pos        : "<POS of the head's head> <dependency label of the head>",
#   tx .. tx5     : the matched token, 'of', the token after 'of' and up to three
#                   further consecutive PROPN tokens,
#   eff           : number of occurrences of the combination (descending order).
# NOTE(review): `term` is interpolated directly into the SQL text; keep it a
# notebook-controlled value. `tx3` is listed twice in the GROUP BY (harmless).
q = f"""
with tw1 as (
select pk_sentence,st_id, fk_mathshistory, sentence 
from mathshistory.coref_sentence cs
where sentence ~* '{term}'
--limit 200
), tw2 as (
select h1.t_head_text h1_text, concat_ws(' ', h1.t_head_pos_,h1.t_dep_) h1_pos, 
			ct.t_head_text h_text, ct.t_head_pos_ h_pos, ct.t_text tx, concat_ws(' ', ct.t_index, ct.t_pos_, ct.t_dep_, ct.t_head_text, ct.t_head_i, 
			ct.t_head_text, ct.t_head_pos_),
ct1.t_text tx1 , concat_ws(' ', ct1.t_index, ct1.t_pos_, ct1.t_dep_, ct1.t_head_text, ct1.t_head_i) ,
ct2.t_text tx2, concat_ws(' ', ct2.t_index, ct2.t_pos_, ct2.t_dep_, ct2.t_head_text, ct2.t_head_i) ,
ct3.t_text tx3, concat_ws(' ', ct3.t_index, ct3.t_pos_, ct3.t_dep_, ct3.t_head_text, ct3.t_head_i) , 
ct4.t_text tx4, concat_ws(' ', ct4.t_index, ct4.t_pos_, ct4.t_dep_, ct4.t_head_text, ct4.t_head_i) ,
ct5.t_text tx5, concat_ws(' ', ct5.t_index, ct5.t_pos_, ct5.t_dep_, ct5.t_head_text, ct5.t_head_i)
from mathshistory.coref_token ct 
join tw1 on ct.fk_sentence = tw1.pk_sentence
join mathshistory.coref_token ct1 on ct1.fk_sentence = tw1.pk_sentence and ct1.t_head_i = ct.t_index 
--head 2nd level 
join mathshistory.coref_token h1 on h1.fk_sentence = tw1.pk_sentence and h1.t_index = ct.t_head_i 
join mathshistory.coref_token ct2 on ct2.fk_sentence = tw1.pk_sentence and ct2.t_index = ct1.t_index +1
left join mathshistory.coref_token ct3 on ct3.fk_sentence = tw1.pk_sentence 
		and ct3.t_index = ct2.t_index +1 and ct3.t_pos_ = 'PROPN'
left join mathshistory.coref_token ct4 on ct4.fk_sentence = tw1.pk_sentence 
		and ct4.t_index = ct3.t_index +1 and ct4.t_pos_ = 'PROPN'
left join mathshistory.coref_token ct5 on ct5.fk_sentence = tw1.pk_sentence 
		and ct5.t_index = ct4.t_index +1 and ct5.t_pos_ = 'PROPN'
where ct.t_text ~* '{term}'
and ct1.t_dep_ ='prep'
and ct1.t_text in ('of')) --, 'at', 'in'
select h1_text, h1_pos, h_text, h_pos, tx, tx1, tx2,tx3,tx4, tx5, count(*) eff
from tw2
group by h1_text, h1_pos, h_text, h_pos, tx, tx1, tx2, tx3,tx3,tx4, tx5
order by eff desc;
"""

In [139]:
# Execute the current query `q` on the open connection through the project helper.
# NOTE(review): the helper appears to return (rows, errors) — confirm in postgresql_functions.
result = pgf.sql_explore(q, conn)
# print(f'Lines count: {len(result[0])}, errors count: {len(result[1])}, \nFirst lines: {result[0][:5]}')

In [140]:
# Name the columns while building the frame. Assigning `.columns` afterwards
# raises a length-mismatch ValueError when the query returns no rows, because an
# empty row list produces a frame with zero columns.
univ = pd.DataFrame(
    result[0],
    columns=['h1_text', 'h1_pos', 'h_text', 'h_pos', 'tx', 'tx1', 'tx2', 'tx3', 'tx4', 'tx5', 'eff'],
)
univ.head()

Unnamed: 0,h1_text,h1_pos,h_text,h_pos,tx,tx1,tx2,tx3,tx4,tx5,eff
0,entered,VERB ROOT,entered,VERB,University,of,Berlin,,,,25
1,studied,VERB prep,at,ADP,University,of,Berlin,,,,22
2,professor,NOUN prep,at,ADP,University,of,California,,,,21
3,matriculated,VERB prep,at,ADP,University,of,Edinburgh,,,,19
4,went,VERB prep,to,ADP,University,of,Göttingen,,,,17


In [144]:
### https://github.com/mwouts/itables/blob/main/docs/advanced_parameters.md
show(univ.sort_values(by='eff', ascending=False), classes="display",

     columnDefs=[{"className": "dt-left", "targets": "_all"}],
     scrollY="400px", scrollCollapse=True, paging=False, column_filters="footer", dom="lrtip")

Unnamed: 0,h1_text,h1_pos,h_text,h_pos,tx,tx1,tx2,tx3,tx4,tx5,eff
Loading... (need help?),,,,,,,,,,,


In [142]:
# Total count per (h1_text, h_text) pair, summed over all names.
u_verbs_size = univ.groupby(['h1_text', 'h_text'])[['eff']].sum()
u_verbs_size.sort_values('eff', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,eff
h1_text,h_text,Unnamed: 2_level_1
professor,at,578
entered,entered,295
studied,at,264
Professor,at,211
appointed,at,200


In [143]:
### https://github.com/mwouts/itables/blob/main/docs/advanced_parameters.md
show(u_verbs_size.sort_values(by='eff', ascending=False), classes="display",

     columnDefs=[{"className": "dt-left", "targets": "_all"}],
     scrollY="400px", scrollCollapse=True, paging=False, column_filters="footer", dom="lrtip")

Unnamed: 0_level_0,Unnamed: 1_level_0,eff
h1_text,h_text,Unnamed: 2_level_1
Loading... (need help?),,


## Verbs and their syntactic role

In [12]:
# For every token 'University' that has the preposition 'of' attached to it, return:
#   h_text, h_pos : text and POS of the token's syntactic head,
#   h1_text       : text of the head of that head,
#   h1_pos        : "<POS of the head's head> <dependency label of the head>",
#   tx .. tx5     : 'University', 'of', the token after 'of' and up to three
#                   further consecutive PROPN tokens,
#   eff           : number of occurrences of the combination (descending order).
# NOTE(review): `tx3` is listed twice in the GROUP BY; redundant but harmless.
q = """
with tw1 as (
select pk_sentence,st_id, fk_mathshistory, sentence 
from mathshistory.coref_sentence cs
where sentence ~ 'University'
--limit 200
), tw2 as (
select h1.t_head_text h1_text, concat_ws(' ', h1.t_head_pos_,h1.t_dep_) h1_pos, 
			ct.t_head_text h_text, ct.t_head_pos_ h_pos, ct.t_text tx, concat_ws(' ', ct.t_index, ct.t_pos_, ct.t_dep_, ct.t_head_text, ct.t_head_i, 
			ct.t_head_text, ct.t_head_pos_),
ct1.t_text tx1 , concat_ws(' ', ct1.t_index, ct1.t_pos_, ct1.t_dep_, ct1.t_head_text, ct1.t_head_i) ,
ct2.t_text tx2, concat_ws(' ', ct2.t_index, ct2.t_pos_, ct2.t_dep_, ct2.t_head_text, ct2.t_head_i) ,
ct3.t_text tx3, concat_ws(' ', ct3.t_index, ct3.t_pos_, ct3.t_dep_, ct3.t_head_text, ct3.t_head_i) , 
ct4.t_text tx4, concat_ws(' ', ct4.t_index, ct4.t_pos_, ct4.t_dep_, ct4.t_head_text, ct4.t_head_i) ,
ct5.t_text tx5, concat_ws(' ', ct5.t_index, ct5.t_pos_, ct5.t_dep_, ct5.t_head_text, ct5.t_head_i)
from mathshistory.coref_token ct 
join tw1 on ct.fk_sentence = tw1.pk_sentence
join mathshistory.coref_token ct1 on ct1.fk_sentence = tw1.pk_sentence and ct1.t_head_i = ct.t_index 
--head 2nd level 
join mathshistory.coref_token h1 on h1.fk_sentence = tw1.pk_sentence and h1.t_index = ct.t_head_i 
join mathshistory.coref_token ct2 on ct2.fk_sentence = tw1.pk_sentence and ct2.t_index = ct1.t_index +1
left join mathshistory.coref_token ct3 on ct3.fk_sentence = tw1.pk_sentence 
		and ct3.t_index = ct2.t_index +1 and ct3.t_pos_ = 'PROPN'
left join mathshistory.coref_token ct4 on ct4.fk_sentence = tw1.pk_sentence 
		and ct4.t_index = ct3.t_index +1 and ct4.t_pos_ = 'PROPN'
left join mathshistory.coref_token ct5 on ct5.fk_sentence = tw1.pk_sentence 
		and ct5.t_index = ct4.t_index +1 and ct5.t_pos_ = 'PROPN'
where ct.t_text = 'University'
and ct1.t_dep_ ='prep'
and ct1.t_text in ('of')) --, 'at', 'in'
select h1_text, h1_pos, h_text, h_pos, tx, tx1, tx2,tx3,tx4, tx5, count(*) eff
from tw2
group by h1_text, h1_pos, h_text, h_pos, tx, tx1, tx2, tx3,tx3,tx4, tx5
order by eff desc;
"""

In [13]:
# Execute the current query `q` on the open connection through the project helper.
# NOTE(review): the helper appears to return (rows, errors) — confirm in postgresql_functions.
result = pgf.sql_explore(q, conn)
# print(f'Lines count: {len(result[0])}, errors count: {len(result[1])}, \nFirst lines: {result[0][:5]}')

In [16]:
# Name the columns while building the frame. Assigning `.columns` afterwards
# raises a length-mismatch ValueError when the query returns no rows, because an
# empty row list produces a frame with zero columns.
univ = pd.DataFrame(
    result[0],
    columns=['h1_text', 'h1_pos', 'h_text', 'h_pos', 'tx', 'tx1', 'tx2', 'tx3', 'tx4', 'tx5', 'eff'],
)
univ.head()

Unnamed: 0,h1_text,h1_pos,h_text,h_pos,tx,tx1,tx2,tx3,tx4,tx5,eff
0,entered,VERB ROOT,entered,VERB,University,of,Berlin,,,,25
1,studied,VERB prep,at,ADP,University,of,Berlin,,,,22
2,professor,NOUN prep,at,ADP,University,of,California,,,,21
3,matriculated,VERB prep,at,ADP,University,of,Edinburgh,,,,19
4,went,VERB prep,to,ADP,University,of,Göttingen,,,,17


In [17]:
### https://github.com/mwouts/itables/blob/main/docs/advanced_parameters.md
show(univ.sort_values(by='eff', ascending=False), classes="display",

     columnDefs=[{"className": "dt-left", "targets": "_all"}],
     scrollY="400px", scrollCollapse=True, paging=False, column_filters="footer", dom="lrtip")

Unnamed: 0,h1_text,h1_pos,h_text,h_pos,tx,tx1,tx2,tx3,tx4,tx5,eff
Loading... (need help?),,,,,,,,,,,
