In [54]:
from sklearn.feature_extraction.text import CountVectorizer
import csv
import json
import re
from collections import defaultdict
import itertools
import sys

In [2]:
path = "/Users/yidesdo21/Projects/outputs/10_ptc_ten_recent/bag_of_identifiers/"


### create a co-occurrence matrix for identifiers in the articles and write the matrix to a CSV file
- if two identifiers appear in the same article, then they co-occur.
- 1) we start with each article and its bag of identifiers,
- 2) then create a document-term matrix with the transform method in CountVectorizer. The term in the matrix is made up of the set of all identifiers from all the articles. The transform method allows us to define the vocabulary (the set of identifiers) and turn the vocabulary and the articles into a document-term matrix.
- 3) after that, we use matrix multiplication to get the word-word co-occurrence matrix.
- 4) create the CSV from the co-occurrence matrix, columns include: concept_id1, concept_id2, frequency

In [131]:
article1 = ["id:id_1", "id:id_2", "id:id_3"]
article2 = ["id:id_2", "id:id_4"]
article3 = ["id:id_1", "id:id_3"]

In [132]:
articles = [article1, article2, article3]

In [133]:
articles

[['id:id_1', 'id:id_2', 'id:id_3'],
 ['id:id_2', 'id:id_4'],
 ['id:id_1', 'id:id_3']]

In [121]:
# ## test 
# article1 = ['AlzheimerOntology:t_tau', 'AlzheimerOntology:tauopathy']
# article2 = ["9606", 'AlzheimerOntology:t_tau', 'AlzheimerOntology:tauopathy']
# article3 = ["9606", 'AlzheimerOntology:t_tau']
# articles = [article1, article2, article3]

In [134]:
# One pass over the articles: build the space-joined "document" for each bag
# and collect every identifier (lower-cased) into the shared vocabulary.
article_transform = []
unique_ids = set()

for bag in articles:
    article_transform.append(" ".join(bag))
    unique_ids.update(token.lower() for token in bag)

# A sorted list gives every identifier a stable column position.
vocab = sorted(unique_ids)

In [135]:
vocab

['id:id_1', 'id:id_2', 'id:id_3', 'id:id_4']

In [136]:
article_transform

['id:id_1 id:id_2 id:id_3', 'id:id_2 id:id_4', 'id:id_1 id:id_3']

In [137]:
# Each document is a bag of identifiers joined by single spaces, so one
# whitespace-delimited chunk is exactly one identifier. Matching on `\S+`
# keeps colons, hyphens, dots, etc. inside the token and does not drop
# identifiers shorter than three characters, which the previous
# `\w\w+:?\w+` pattern silently discarded.
# `binary=True` caps every count at 1, so an identifier repeated inside one
# article still contributes a single co-occurrence per article.
vectorizer = CountVectorizer(vocabulary=vocab, token_pattern=r"(?u)\S+", binary=True)
X = vectorizer.transform(article_transform)


In [138]:
Xc = (X.T * X) # term-by-term product of the binarized document-term matrix: entry (i, j) is the number of articles containing both identifier i and identifier j
Xc.setdiag(0) # an identifier trivially co-occurs with itself, so zero out the diagonal
Xc = Xc.tolil()   # switch to LIL format before the element-wise lookups below (original note: this avoids a sparse-efficiency warning -- TODO confirm which call triggers it)
# Optional dense views, handy for eyeballing small matrices:
# co_occur = Xc.toarray()
# print(Xc.todense()) # print out matrix in dense format

In [139]:
print(Xc)

  (0, 0)	0
  (0, 1)	1
  (0, 2)	2
  (1, 0)	1
  (1, 1)	0
  (1, 2)	1
  (1, 3)	1
  (2, 0)	2
  (2, 1)	1
  (2, 2)	0
  (3, 1)	1
  (3, 3)	0


In [140]:
indices = list(zip(*Xc.nonzero()))
# indices = list(map(set, zip(*Xc.nonzero())))

In [141]:
indices

[(0, 1), (0, 2), (1, 0), (1, 2), (1, 3), (2, 0), (2, 1), (3, 1)]

In [142]:
# The co-occurrence matrix is symmetric, so every pair shows up twice in
# `indices`, as (i, j) and as (j, i). Keep only the first orientation seen.
# A set of frozensets gives constant-time membership tests; the previous
# list-of-sets lookup rescanned every recorded pair on each iteration.
indices_set = set()
co_occur_dict = dict()
ind2vocab = dict(enumerate(vocab))   # column index -> identifier

for row, col in indices:
    pair = frozenset((row, col))
    if pair in indices_set:
        continue
    indices_set.add(pair)
    co_occur_dict[(ind2vocab[row], ind2vocab[col])] = Xc[row, col]
    

In [143]:
co_occur_dict

{('id:id_1', 'id:id_2'): 1,
 ('id:id_1', 'id:id_3'): 2,
 ('id:id_2', 'id:id_3'): 1,
 ('id:id_2', 'id:id_4'): 1}

In [17]:
# Xc.shape

(4, 4)

In [18]:
# co_occur_dict = dict()
# vocab2ind = dict(zip(vocab, range(len(vocab))))
# ind2vocab = dict(zip(range(len(vocab)), vocab))

# for i in range(len(vocab)-1):
#     print(co_occur[i])
#     for j in range(i+1,len(vocab)):   # skip the diagano and the duplicated half of the pyramid 
#         freq = co_occur[i][j]
#         if freq != 0:
#             print(freq)
#             co_occur_dict[(ind2vocab[i],ind2vocab[j])] = freq
#     print("-----------")

[0 1 2 0]
1
2
-----------
[1 0 1 1]
1
1
-----------
[2 1 0 0]
-----------


In [144]:
def write_to_csv(path, file_name, co_occur_dict):
    """Write the co-occurrence dictionary to a CSV file.

    input -- path: directory prefix for the output file; it is concatenated
                   as-is, so it must end with a path separator
             file_name: output file name without the ".csv" extension
             co_occur_dict: {(concept_id1, concept_id2): frequency}
    output -- None; the file <path><file_name>.csv is created or overwritten
              with one header row followed by one row per pair, in the
              dictionary's iteration order"""

    header = ["concept_id1", "concept_id2", "frequency"]  # co-occurrence frequency in the same abstract

    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by a blank line on Windows. The explicit encoding
    # keeps the output independent of the machine's locale.
    with open(path+file_name+'.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(
            [concept_id1, concept_id2, frequency]
            for (concept_id1, concept_id2), frequency in co_occur_dict.items()
        )

In [145]:
co_occur_dict

{('id:id_1', 'id:id_2'): 1,
 ('id:id_1', 'id:id_3'): 2,
 ('id:id_2', 'id:id_3'): 1,
 ('id:id_2', 'id:id_4'): 1}

In [146]:
write_to_csv(path,"test",co_occur_dict)

### write a function to create the co-occurrence CSV

In [147]:
def create_vocab(articles):
    """Build the identifier vocabulary and the per-article documents.

    input -- a list of lists. [["id_1", "id_2"], ["id_1"], ...]
             Each nested list is the bag of identifiers of one article
    output -- a tuple (vocab, article_transform):
              vocab is a sorted list of the distinct lower-cased identifiers,
              ["id_1", "id_2", ...]
              article_transform is a list with one string per article, the
              article's identifiers joined by single spaces (case unchanged)"""

    article_transform = []
    seen = set()

    for bag in articles:
        article_transform.append(" ".join(bag))
        # CountVectorizer lower-cases text before tokenizing, so the
        # vocabulary has to be lower-cased as well to match.
        seen.update(token.lower() for token in bag)

    return sorted(seen), article_transform

In [149]:
articles

[['id:id_1', 'id:id_2', 'id:id_3'],
 ['id:id_2', 'id:id_4'],
 ['id:id_1', 'id:id_3']]

In [150]:
create_vocab(articles)

(['id:id_1', 'id:id_2', 'id:id_3', 'id:id_4'],
 ['id:id_1 id:id_2 id:id_3', 'id:id_2 id:id_4', 'id:id_1 id:id_3'])

In [151]:
def create_co_occur(vocab, article_transform):
    """Create a co-occurrence dictionary.

    Two identifiers co-occur when they appear in the same article; the
    frequency of a pair is the number of articles that contain both.

    input -- vocab: the vocabulary list (lower-cased identifiers); the position
                    of an identifier in this list is its matrix row/column
             article_transform: a list of strings, one per article, each the
                    article's bag of identifiers joined by spaces
    output -- keys are tuples of strings and values are integers (frequency of
              co-occurrence), {("id_1", "id_2"): 10}.
              Each unordered pair appears once, with the identifier that comes
              first in vocab listed first; pairs are ordered by vocabulary
              position and pairs that never co-occur are omitted."""

    vocab = list(vocab)
    if not vocab:
        # CountVectorizer refuses an empty vocabulary; no identifiers means no pairs.
        return {}

    # create the document-term matrix
    # Every whitespace-delimited chunk is one identifier, so tokenize on `\S+`:
    # this keeps colons and other punctuation inside the token and does not
    # drop identifiers shorter than three characters.
    # binary=True caps each count at 1 so that an identifier repeated within an
    # article is still counted once for that article.
    vectorizer = CountVectorizer(vocabulary=vocab, token_pattern=r"(?u)\S+", binary=True)
    X = vectorizer.transform(article_transform)

    # create the co-occurrence matrix
    # `@` is a matrix product for both sparse matrices and sparse arrays.
    Xc = (X.T @ X).tocoo()

    # The matrix is symmetric with self-counts on the diagonal, so the strict
    # upper triangle (i < j) holds each unordered pair exactly once.
    # Sorting restores row-major order regardless of the sparse storage order.
    pairs = sorted(
        (int(i), int(j), int(freq))
        for i, j, freq in zip(Xc.row, Xc.col, Xc.data)
        if i < j and freq
    )

    return {(vocab[i], vocab[j]): freq for i, j, freq in pairs}

In [152]:
result = create_vocab(articles)
vocab, trans = result[0], result[1]

In [153]:
vocab

['id:id_1', 'id:id_2', 'id:id_3', 'id:id_4']

In [154]:
create_co_occur(vocab, trans)

{('id:id_1', 'id:id_2'): 1,
 ('id:id_1', 'id:id_3'): 2,
 ('id:id_2', 'id:id_3'): 1,
 ('id:id_2', 'id:id_4'): 1}

In [314]:
# ## test 
# article1 = ['AlzheimerOntology:t_tau', 'AlzheimerOntology:tauopathy']
# article2 = ["9606", 'AlzheimerOntology:t_tau', 'AlzheimerOntology:tauopathy']
# article3 = ["9606", 'AlzheimerOntology:t_tau']
# articles = [article1, article2, article3]

In [315]:
# articles

[['AlzheimerOntology:t_tau', 'AlzheimerOntology:tauopathy'],
 ['9606', 'AlzheimerOntology:t_tau', 'AlzheimerOntology:tauopathy'],
 ['9606', 'AlzheimerOntology:t_tau']]

In [316]:
# result = create_vocab(articles)
# vocab, trans = result[0], result[1]

In [317]:
# vocab

['9606', 'AlzheimerOntology:t_tau', 'AlzheimerOntology:tauopathy']

In [318]:
# trans

['AlzheimerOntology:t_tau AlzheimerOntology:tauopathy',
 '9606 AlzheimerOntology:t_tau AlzheimerOntology:tauopathy',
 '9606 AlzheimerOntology:t_tau']

In [155]:
write_to_csv(path,"test",co_occur_dict)

### create the co-occurrence matrix and CSV file for the 10 most recent articles

In [3]:
json_path = "/Users/yidesdo21/Projects/outputs/10_ptc_ten_recent/json/"
article_file = "combined_ten_recent_spans_nested_case.json"

In [4]:
# Read the JSON as UTF-8 explicitly, so decoding does not depend on the
# locale's default encoding.
with open(json_path+article_file, encoding="utf-8") as f:
    data = json.load(f)

In [5]:
len(data)

10

In [8]:
bag_identifiers = {}    # article title -> sorted bag of the distinct identifiers in that article
list_identifiers = []   # the same bags as a list of lists, used to build the co-occurrence vocabulary

for record in data:
    title, ents = record["title"], record["ents"]

    # de-duplicate the identifiers of this article, then fix their order
    ordered_ids = sorted({ent["identifier"] for ent in ents})

    list_identifiers.append(ordered_ids)
    bag_identifiers[title] = ordered_ids


In [9]:
bag_identifiers

{'CCC_000637925000033': ['351',
  '6647',
  '847',
  'AlzheimerOntology:Endocytosis',
  'AlzheimerOntology:Intervention',
  'AlzheimerOntology:Membrane',
  'AlzheimerOntology:Microglia',
  'AlzheimerOntology:Receptors',
  'AlzheimerOntology:amyloid_beta_protein',
  'AlzheimerOntology:antioxidant',
  'AlzheimerOntology:brain',
  'AlzheimerOntology:inflammation',
  'AlzheimerOntology:oxygen',
  'AlzheimerOntology:phagocytosis',
  'MESH:C405953',
  'MESH:D008345',
  'MESH:D010100',
  'MESH:D016229',
  'NDDUO:Intervention',
  'NDDUO:Study_type',
  'NDDUO:disease',
  'none',
  'obo:BFO_0000015',
  'obo:FMA_50801',
  'obo:IAO_0000230',
  'obo:OGMS_0000031',
  'span:Process'],
 'CCC_000647663200001': ['10090',
  '11820',
  '19164',
  'AlzheimerOntology:APP',
  'AlzheimerOntology:Subtypes',
  'AlzheimerOntology:amyloid_beta_protein',
  'AlzheimerOntology:amyloid_precursor_protein',
  'AlzheimerOntology:brain',
  'AlzheimerOntology:presence_of_amyloid_plaque',
  'AlzheimerOntology:presenilin',


In [10]:
# number of distinct identifiers in each article
bag_sizes = [len(bag) for bag in list_identifiers]
for size in bag_sizes:
    print(size)

# total over all articles; an identifier shared by several articles is counted once per article
cnt_ids = sum(bag_sizes)
print(cnt_ids)

27
37
16
24
29
22
13
27
22
25
242


In [284]:
# list_identifiers

In [12]:
vocab_ids, art_transform = create_vocab(list_identifiers)

In [163]:
len(vocab_ids)

137

In [59]:
# vocab_ids

In [64]:
# art_transform

In [63]:
# vocab_ids

In [169]:
# Co-occurrence frequencies for every identifier pair across the loaded articles.
ten_dict = create_co_occur(vocab_ids, art_transform)
# Re-order the pairs from most to least frequent. The sort is stable, so pairs
# with equal frequency keep the order they had in ten_dict, and the rebuilt
# dict preserves that order when it is written out.
ten_dict_sorted = dict(sorted(ten_dict.items(), key=lambda x: x[1], reverse=True))

In [171]:
write_to_csv(path,"ten_article_co_occur",ten_dict_sorted)

### create the id information CSV file for the 10 most recent articles

In [None]:
## expected dictionary 
# {"AlzheimerOntology:Subtypes": {"article": ["NLM_33988687", "CCC_000653155500009"],
#                                 "span": [[10,27], [13,15]],
#                                 "mention": ["Alzheimer Disease", "AD"],
#                                 "canonical name": ["Alzheimer Disease"],
#                                 "type": [],
#                                 "annotator": ["NIO"]}}

In [68]:
id_dict = defaultdict(lambda: defaultdict(set))  # either set or list, but list has too many duplications

In [69]:
id_dict

defaultdict(<function __main__.<lambda>()>, {})

In [70]:
# Accumulate, per identifier, everything the annotations say about it.
# Each field is a set, so repeated values collapse to a single entry.
for record in data:
    title, ents = record["title"], record["ents"]

    for ent in ents:
        info = id_dict[ent["identifier"]]
        info["article"].add(title)
        info["span"].add(str(ent["span"]))   # stored as a string so the span can live in a set
        info["mention"].add(ent["mention"])
        info["canonical name"].add(ent["concept"])
        info["type"].add(ent["type"])
        info["annotator"].add(ent["annotator"])


In [71]:
len(id_dict)

137

In [72]:
id_dict

defaultdict(<function __main__.<lambda>()>,
            {'AlzheimerOntology:Receptors': defaultdict(set,
                         {'article': {'CCC_000637925000033'},
                          'span': {'[1232, 1241]',
                           '[165, 173]',
                           '[1723, 1731]',
                           '[28, 37]',
                           '[371, 380]',
                           '[462, 471]',
                           '[758, 767]'},
                          'mention': {'receptor', 'receptors'},
                          'canonical name': {'Receptor'},
                          'type': {''},
                          'annotator': {'NIO'}}),
             'AlzheimerOntology:Microglia': defaultdict(set,
                         {'article': {'CCC_000637925000033'},
                          'span': {'[204, 213]',
                           '[42, 51]',
                           '[562, 571]',
                           '[779, 788]'},
                          'me

In [73]:
path

'/Users/yidesdo21/Projects/outputs/10_ptc_ten_recent/bag_of_identifiers/'

In [74]:
# Write one row per identifier with all the information collected in id_dict.
fields = ['concept_id', 'article', 'span', 'mention', 'canonical name', 'type', 'annotator']
file_name = "ten_article_id_info"

# newline='' lets the csv module control line endings itself (otherwise blank
# rows appear on Windows); the explicit encoding keeps the file independent of
# the machine's locale.
with open(path+file_name+'.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fields)
    writer.writeheader()

    # rows are ordered alphabetically by identifier
    for key in sorted(id_dict):
        row = {'concept_id': key}
        # NOTE(review): the values are sets, so each cell holds the set's repr
        # and the element order inside a cell can differ between runs.
        row.update(id_dict[key])
        writer.writerow(row)
    