In [38]:
import requests
from bs4 import BeautifulSoup
import os
import csv
import nltk

import spacy
nlp = spacy.load("en_core_sci_sm")

x = None
class AccuracyFileGenerator:
    
    def __init__(self, pmcid, context_pmcid, acc_dir_path, context_dir_path, paragraph_file_path=None, name=""):
        # if base_path:
        print(pmcid)
        self.output_base = os.path.join(acc_dir_path, pmcid)
        # else:
        #     self.output_base = pmcid
        self.context_pmcid = context_pmcid
        self.pmcid = pmcid
        self.ann_file_path = os.path.join(context_dir_path, pmcid + '.ann')
        self.txt_file_path = os.path.join(context_dir_path, pmcid + '.txt')
        
        self.indices = None
        self.name = name
        
        self.para_file_path = os.path.join("FullText", context_pmcid, f'{pmcid}_par.csv')
                                           
        with open(self.txt_file_path) as f:
            self.content = f.read()
        
        self.html_content = self.get_html_content(pmcid)
    
    
    def get_html_content(self, pmcid):
        """
        Gets the HTML content for the given PMCID
        """
        headers = {
            "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.33"
        }
        # pmcid="PMC4038460"
        content = requests.get(url=f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/", headers=headers)
        return content.text
        
    def get_lines(self):
        """
        Read all the lines of the brat annotation file
        """
        lines = []
        with open(self.ann_file_path) as f:
            for line in f:
                lines.append(line)
        return lines
    
    
    def get_target_span_relation_dict(self):
        """
        For each T_i in te brat file where T_i occurs in some relation R_i
        a dictionary is generated with three keys:
        target : stores the line 
        spans: stores all the spans where are contexts for T_i
        relations: stores all R_j where T_i is the target for R_j
        """
        lines = self.get_lines()
        d = {}
        tsrs = {} # Ti -> {target: [], spans: [[]], relations: [[]]}
        for line in lines:
            if "AnnotatorNotes" in line:
                continue
            parts = line.split("\t")
            if parts[0][0] == 'T':
                d[parts[0]] = parts
                tsrs[parts[0]] = {'target': parts, 'spans': [], 'relations': []}
            else:
                if len(parts) <= 1:
                    break
                # print(parts[1])
                _, target, context = parts[1].split()
                target, context = target.strip().split(":")[1], context.strip().split(":")[1]
                target, context = d[target], d[context]
                if target[0] in tsrs:
                    tsrs[target[0]]['spans'].append(context)
                    tsrs[target[0]]['relations'].append(parts)
        return tsrs
        

    def parse_indices_file(self):
        """
        Reads the start and end indices of paragraphs from the csv file into a list of lists
        """
        indices = []
        if self.para_file_path:
            with open(self.para_file_path) as f:
                for line in f:  
                    split = line.split(" ")
                    indices.append([int(split[0]), int(split[1].strip())])
            return indices
        
        with open(self.txt_file_path) as f:
            c = f.read()
        res = []
        start = 0
        for para in c.split('\n\n'):
            res.append([start, start+len(para)])
            start = start + len(para) + 2
        return res
    
        
    def adjust(self, original_str, p_start):
        """
        Adjusts offsets based on the index of the start of the paragraph in the original txt file
        """
        # print(original_str, p_start)
        if ";" in original_str:
            print(self.pmcid)
            space_idx = original_str.find(" ")
            text = original_str[:space_idx]
            remaining = original_str[space_idx+1:]
            indices = remaining.split(";")
            res_indices = []
            for index in indices:
                idx0, idx1 = index.split(" ")
                idx0 = str(int(idx0) - p_start)
                idx1 = str(int(idx1) - p_start)
                res_indices.append(idx0 + " " + idx1)
            res_indices_joined = ";".join(res_indices)
            return " ".join([text, res_indices_joined])
                
        ################ TEMP MODIFICATION #################
        # if len(original_str.split(" ")) > 3:
        #     splits = original_str.split(" ")
        #     text = splits[0]
        #     idx0 = splits[1]
        #     idx1 = splits[2].split(";")[0]
        # else:
        ################ TEMP MODIFICATION #################
        else:
            text, idx0, idx1 = original_str.split(" ")
            idx0 = str(int(idx0) - p_start)
            idx1 = str(int(idx1) - p_start)
            return " ".join([text, idx0, idx1])
    
    
    def pair_to_ann_str(self, pair):
        res = []
        for line in pair:
            res.append("\t".join(line))
        return "\n".join(res)
    
    def get_reference_article(self):
    
#         soup = BeautifulSoup(self.html_content)
#         curr_link = None
#         for link in soup.findAll('a'):

#             if link.text == citation_marker_text:
#                 curr_link = link.get('href')
#                 break

#         reference_pmcid = None
#         for div in soup.findAll('div'):
#             if div.get('id') == curr_link[1:]:
#                 for span in div.children:
#                     start = str(span).find("PMC")
#                     reference_pmcid = str(span)[start:start+10]
        
        text = None
        path = os.path.join('Reference', f"{self.context_pmcid}", f"{self.context_pmcid}.txt")
        with open(path) as f:
            text = f.read()
        
        return text
         
    def process_tsrs(self, tsrs):
        for _, tsr in tsrs.items():
            if not tsr['spans']:
                continue

            # seen_paragraphs = set()
            paragraphs = []
            tsr['target']
            p_start, p_end = None, None
            for span in tsr['spans']:
                if not p_start:
                    pair = [tsr['target'], span]
                    ###### TEMP MODIFICATION ######
                    # c_start, c_end = int(pair[1][1].split(" ")[1]), int(pair[1][1].split(" ")[2])
                    # if ";" in pair[1][1].split(" ")[2]:
                    #     c_start, c_end = int(pair[1][1].split(" ")[1]), int(pair[1][1].split(" ")[2].split(" ")[-1])
                    # else:
                    c_start, c_end = int(pair[1][1].split(" ")[1]), int(pair[1][1].split(" ")[-1])
                    ###### TEMP MODIFICATION ######
                    p_start, p_end = None, None
                    for id0, id1 in self.indices:
                        if id0 <= c_start and id1 >= c_end:
                            p_start, p_end = id0, id1
                            break
                    # if (p_start, p_end) not in seen_paragraphs:
                        # seen_paragraphs.add((p_start, p_end))
                    with open(self.txt_file_path) as f:
                        content = f.read()
                    paragraph = content[p_start:p_end]
                    paragraphs.append(paragraph)

                    # if len(seen_paragraphs) > 1:
                    #     print("UNHANDLED CASE")


                span[1] = self.adjust(span[1], p_start)
            if not p_start:
                print("ERROR, ERROR, ERROR")
            tsr['target'][1] = self.adjust(tsr['target'][1], p_start)
            tsr['paragraph'] = paragraphs
        return tsrs
    
    def generate_ann_file(self, tsrs):
     
        idx = 1
        global x
        x = tsrs
        for _, tsr in tsrs.items():
            
            if not tsr['spans']:
                continue
            res = []
            res.append("\t".join(tsr['target']))
            for span, rel in zip(tsr['spans'], tsr['relations']):  
                res.append("\t".join(span))
                res.append("\t".join(rel))

            with open(self.output_base + f"_{idx}.ann", "w") as f:
                f.write("".join(res))
            
#             doc = nlp(tsr['paragraph'][0])
#             spacy_sents = list(doc.sents)
            
#             sentences = []
#             for s in spacy_sents:
#                 if len(str(s).strip()) > 0:
#                     sentences.append(str(s).strip())
                    
#             # sentences = nltk.sent_tokenize(tsr['paragraph'][0])
#             sentence_labels = [[sent, "0"] for sent in sentences]
#             for span in tsr['spans']:
#                 context = span[2]
#                 for i in range(len(sentence_labels)):
#                     if context.strip() in sentence_labels[i][0]:
#                         sentence_labels[i][1] = "1"
            
#             with open("aa.csv", "a") as f:
#                 row_writer = csv.writer(f)
#                 base = [self.name, self.context_pmcid, self.pmcid, tsr['target'][2].strip()]
#                 for l in sentence_labels:
#                     row_writer.writerow(base + l)
                
                
            idx += 1
        
    
    def generate_txt_file(self, tsrs):
        idx = 1
        for _, tsr in tsrs.items():
            if not tsr['spans']:
                continue
            # res = []
            # # for paragraph in tsr['paragraphs']:
            # #     res.append(paragraph)
            # res.append()
            citation_marker_text = tsr['target'][2].strip()
            with open(self.output_base + f"_{str(idx)}.txt" , "w") as f:
                f.write(tsr['paragraph'][0])
                f.write("\n")
                f.write("**************************************************************")
                f.write("\n"*3)
                f.write(self.get_reference_article())
            idx += 1


    def run(self):
        tsrs = self.get_target_span_relation_dict()
        self.indices = self.parse_indices_file()
        tsrs = self.process_tsrs(tsrs)
        self.generate_ann_file(tsrs)
        self.generate_txt_file(tsrs)

In [3]:
with open("aa.csv", "w") as f:
    w = csv.writer(f)
    w.writerow(['Annotator', 'ReferencePMCID', 'CitingPMCID', 'CitationMarker', 'Sentence', 'Label'])

In [None]:
# /var/www/html/brat/data/CitationIntegrity/Nikki/Context

In [39]:
testDir = 'Jodi'

context_dir = os.path.join(testDir, 'Context')
accuracy_dir = os.path.join(testDir, 'Accuracy')

curr_acc_dir = os.path.join(accuracy_dir, 'PMC3607626')
curr_context_path = os.path.join(context_dir, 'PMC3607626')

In [40]:
#1	AnnotatorNotes T11	About THIS study - unclear what part relates to the CITED study.

In [41]:
o = AccuracyFileGenerator('PMC3297570', 'PMC3607626', acc_dir_path=curr_acc_dir, context_dir_path=curr_context_path, name='Eugene')
o.run()

PMC3297570


In [42]:
testDir = 'Jodi'

context_dir = os.path.join(testDir, 'Context')
accuracy_dir = os.path.join(testDir, 'Accuracy')

for directory in os.listdir(context_dir):
    if not directory.startswith("PMC"):
        continue
    curr_acc_dir = os.path.join(accuracy_dir, directory)
    curr_context_path = os.path.join(context_dir, directory)
    os.makedirs(curr_acc_dir)
    visited_files = set()
    for filename in os.listdir(os.path.join(context_dir, directory)):
        if not filename.startswith("PMC"):
            continue
        pmcid = filename[:filename.find('.')]
        if pmcid in visited_files:
            continue
        if pmcid == "PMC3010068":
            continue
        visited_files.add(pmcid)
        print(pmcid)
        o = AccuracyFileGenerator(pmcid, directory, acc_dir_path=curr_acc_dir, context_dir_path=curr_context_path, name='Jodi')
        o.run()

PMC4482400
PMC4482400
PMC5962687
PMC5962687
PMC5503620
PMC5503620
PMC3291531
PMC3291531
PMC4406659
PMC4406659
PMC5913322
PMC5913322
PMC4957882
PMC4957882
PMC4033777
PMC4033777


KeyboardInterrupt: 

### Eugene

In [10]:
testDir = 'Eugene'

In [11]:
context_dir = os.path.join(testDir, 'Context')
accuracy_dir = os.path.join(testDir, 'Accuracy')

for directory in os.listdir(context_dir):
    if not directory.startswith("PMC"):
        continue
    curr_acc_dir = os.path.join(accuracy_dir, directory)
    curr_context_path = os.path.join(context_dir, directory)
    os.makedirs(curr_acc_dir)
    visited_files = set()
    for filename in os.listdir(os.path.join(context_dir, directory)):
        if not filename.startswith("PMC"):
            continue
        pmcid = filename[:filename.find('.')]
        if pmcid in visited_files:
            continue
        visited_files.add(pmcid)
        # print(pmcid)
        o = AccuracyFileGenerator(pmcid, directory, acc_dir_path=curr_acc_dir, context_dir_path=curr_context_path, name='Eugene')
        o.run()

PMC4482400
PMC5962687
PMC5503620
PMC3291531
PMC3010068
PMC4406659
PMC5913322
PMC4957882
PMC4033777
PMC4151150
PMC3050809
PMC5349099
PMC3630392
PMC4404204
PMC3000346
PMC4342580
PMC5068955
PMC4384303
PMC5881750
PMC4044952
PMC4038460
PMC6377406
PMC3842116
PMC3549971
PMC5084273
PMC2995166
PMC6357844
PMC6358975
PMC2995477
PMC4026148
PMC6210981
PMC3020954
PMC4702311
PMC5487699
PMC3771941
PMC3776211
PMC3509553
PMC2886067
PMC4599244
PMC3182203
PMC2942840
PMC6295637
PMC5797541
PMC4399516
PMC4154524
PMC4821123
PMC5397424
PMC4458864
PMC4968454
PMC4362394
PMC6007455
PMC4740398
PMC4161327
PMC6436236
PMC4915581
PMC4407866
PMC4490250
PMC5017755
PMC6032046
PMC3654344
PMC3658960
PMC4738362
PMC4311266
PMC4375079
PMC4174929
PMC5531429
PMC4199491
PMC6096637
PMC4231111
PMC5454371
PMC4030114
PMC3749360
PMC4401705
PMC4076741
PMC4540762
PMC5346261
PMC5068862
PMC4895704
PMC6166321
PMC4363689
PMC6073260
PMC4783239
PMC4673647
PMC4113061
PMC3837041
PMC5560232
PMC5253162
PMC5703837
PMC3984159
PMC5419585
PMC3619105

### Nikki

In [28]:
testDir = 'Nikki'

In [29]:
testDir = 'Nikki'

context_dir = os.path.join(testDir, 'Context')
accuracy_dir = os.path.join(testDir, 'Accuracy')

for directory in os.listdir(context_dir):
    if not directory.startswith("PMC"):
        continue
    curr_acc_dir = os.path.join(accuracy_dir, directory)
    curr_context_path = os.path.join(context_dir, directory)
    os.makedirs(curr_acc_dir)
    visited_files = set()
    for filename in os.listdir(os.path.join(context_dir, directory)):
        if not filename.startswith("PMC"):
            continue
        pmcid = filename[:filename.find('.')]
        if pmcid in visited_files:
            continue
        visited_files.add(pmcid)
        print(pmcid)
        o = AccuracyFileGenerator(pmcid, directory, acc_dir_path=curr_acc_dir, context_dir_path=curr_context_path, name='Nikki')
        o.run()

PMC4482400
PMC5962687
PMC5962687
PMC5503620
PMC3291531
PMC3010068
PMC4406659
PMC5913322
PMC4957882
PMC4033777
PMC4151150
PMC3050809
PMC5349099
PMC3630392
PMC4404204
PMC3000346
PMC4342580
PMC5068955
PMC4384303
PMC5881750
PMC4044952
PMC4038460
PMC6377406
PMC6377406
PMC3842116
PMC3549971
PMC5084273
PMC2995166
PMC2995166
PMC6357844
PMC6358975
PMC2995477
PMC4026148
PMC6210981
PMC3020954
PMC4702311
PMC5487699
PMC3771941
PMC3771941
PMC3776211
PMC3509553
PMC2886067
PMC4599244
PMC3182203
PMC3182203
PMC2942840
PMC6295637
PMC6295637
PMC6295637
PMC5797541
PMC4399516
PMC4154524
PMC4821123
PMC5397424
PMC4458864
PMC4968454
PMC4362394
PMC6007455
PMC4740398
PMC4161327
PMC6436236
PMC4915581
PMC4407866
PMC4490250
PMC5017755
PMC6032046
PMC3654344
PMC3658960
PMC4738362
PMC4311266
PMC4375079
PMC4174929
PMC5531429
PMC4199491
PMC6096637
PMC4231111
PMC5454371
PMC4030114
PMC3749360
PMC4401705
PMC4076741
PMC4540762
PMC5346261
PMC5068862
PMC4895704
PMC6166321
PMC4363689
PMC6073260
PMC4783239
PMC4673647
PMC4113061

# Jodi

In [None]:
# T3	Citation 28746 28828	(currently more than 500 genes in total to interrogate, with others emerging) [35]
#1	AnnotatorNotes T3	informal reference to personal communication follows (making this similar to a MultiCitation)
# In PMC3654344

In [None]:
testDir = 'Jodi'

context_dir = os.path.join(testDir, 'Context')
accuracy_dir = os.path.join(testDir, 'Accuracy')

for directory in os.listdir(context_dir):
    if not directory.startswith("PMC"):
        continue
    curr_acc_dir = os.path.join(accuracy_dir, directory)
    curr_context_path = os.path.join(context_dir, directory)
    os.makedirs(curr_acc_dir)
    visited_files = set()
    for filename in os.listdir(os.path.join(context_dir, directory)):
        if not filename.startswith("PMC"):
            continue
        pmcid = filename[:filename.find('.')]
        if pmcid == "PMC3010068": #or pmcid == "PMC3654344" or pmcid == "PMC3297570":
            continue
        if pmcid in visited_files:
            continue
        visited_files.add(pmcid)
        # print(pmcid)
        o = AccuracyFileGenerator(pmcid, directory, acc_dir_path=curr_acc_dir, context_dir_path=curr_context_path, name='Jodi')
        o.run()

PMC4482400
PMC5962687
PMC5503620
PMC3291531
PMC4406659
PMC5913322
PMC4957882
PMC4033777
PMC4151150
PMC3050809
PMC5349099
PMC3630392
PMC4404204
PMC3000346
PMC4342580
PMC5068955
PMC4384303
PMC5881750
PMC4044952
PMC4038460
PMC6377406
PMC3842116
PMC3549971
PMC5084273
PMC2995166
PMC6357844
PMC6358975
PMC2995477
PMC4026148
PMC6210981
PMC3020954
PMC4702311
PMC5487699
PMC3771941
PMC3776211
PMC3509553
PMC2886067
PMC4599244
PMC3182203
PMC2942840
PMC6295637
PMC5797541
PMC4399516
PMC4154524
PMC4821123
PMC5397424
PMC4458864
PMC4968454
PMC4362394
PMC6007455
PMC4740398
PMC4161327
PMC6436236
PMC4915581
PMC4407866
PMC4490250
PMC5017755
PMC6032046
PMC3654344
PMC3658960
PMC4738362
PMC4311266
PMC4375079
PMC4174929
PMC5531429
PMC4199491
PMC6096637
PMC4231111
PMC5454371
PMC4030114
PMC3749360
PMC4401705
PMC4076741
PMC4540762
PMC5346261
PMC5068862
PMC4895704
PMC6166321
PMC4363689
PMC6073260
PMC4783239
PMC4673647
PMC4113061
PMC3837041
PMC5560232
PMC5253162
PMC5703837
PMC3984159
PMC5419585
PMC3619105
PMC3749330

PMC6377406
PMC2995166

In [45]:
import spacy

In [46]:
nlp = spacy.load("en_core_sci_sm")

In [52]:
text = """
Myeloid derived suppressor cells (MDSC) are immature 
myeloid cells with immunosuppressive activity. 
They accumulate in tumor-bearing mice and humans 
with different types of cancer, including hepatocellular 
carcinoma (HCC).
"""
doc = nlp(text)

print(list(doc.sents))

[
Myeloid derived suppressor cells (MDSC) are immature 
myeloid cells with immunosuppressive activity., 
They accumulate in tumor-bearing mice and humans 
with different types of cancer, including hepatocellular 
carcinoma (HCC)., 
]


In [53]:
x = list(doc.sents)

In [63]:
[str(i).strip() for i in x][0]

'Myeloid derived suppressor cells (MDSC) are immature \nmyeloid cells with immunosuppressive activity.'

In [70]:
import spacy

In [72]:
nlp = spacy.load("en_core_web_sm")

In [89]:
s = "I like my coffee"
doc = nlp(s)
for ent in doc.ents:
    print(ent.text, ent.label_)

In [79]:
a = 0.5 * 0.75 *0.3 * 0.25 * 0.3 * 0.2 * 0.1 * 0.75 * 0.4 * 0.2

In [78]:
b = 0.5 * 0.75 * 0.4 * 0.1 * 0.4 * 0.01 * 0.5 * 0.75 * 0.4 * 0.2

In [80]:
a > b

True

'The Bank of America Plaza at St Louis'

In [88]:
spacy.explain("FAC")

'Buildings, airports, highways, bridges, etc.'