In [1]:
import PyPDF2,os,sys,joblib
from nltk.util import ngrams
from collections import Counter
from prettytable import PrettyTable
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import RegexpTokenizer
# Raw string: `\w` must reach the regex engine verbatim. In a normal string
# literal it is an invalid escape sequence, which newer Python versions warn about.
tokenizer = RegexpTokenizer(r'[A-Za-z0-9-]*\w+')
# import textract, re
# import pdfminer
import pandas as pd
import shutil
from nltk.tokenize import sent_tokenize

In [2]:
def lowercase(text):
    """Return a lower-cased copy of the string ``text``."""
    lowered = text.lower()
    return lowered

def _collect_figure_context(sentences, fig_value, before=5, after=8):
    """
    Concatenate the sentences surrounding every reference to figure ``fig_value``.

    A reference is a sentence ending in 'fig.' / 'fig .' / 'figs.' / 'figs .'
    whose following sentence contains ``fig_value`` within its first four
    characters (presumably because the sentence tokenizer splits "fig. 3"
    right after the abbreviation - TODO confirm).

    For each reference the text gathered so far is extended with up to
    ``before - 1`` preceding sentences, the matching sentence itself and up to
    ``after`` following sentences. Sentences are joined without a separator.
    """
    endings = ('fig.', 'fig .', 'figs.', 'figs .')
    total = len(sentences)
    para = ''
    # The last sentence has no successor that could carry the figure number,
    # so it can never be a reference; stopping one short avoids an IndexError.
    for idx in range(total - 1):
        sentence = sentences[idx]
        if not sentence.endswith(endings):
            continue
        if fig_value not in sentences[idx + 1][:4]:
            continue

        # Preceding context, walked backwards and prepended.
        stop = -1 if idx - before < 0 else idx - before
        for j in range(idx - 1, stop, -1):
            para = sentences[j] + para

        para += sentence

        # Following context, clamped to the end of the page.
        stop = total if idx + after >= total else idx + after + 1
        for j in range(idx + 1, stop):
            para += sentences[j]
    return para


def get_file_paragraphs(file_name, fig_no, start_page=4, before=5, after=8):
    """
    Extract the text of a PDF that discusses the experiment shown in a figure.

    Parameters
    ----------
    file_name : path of the PDF file to read.
    fig_no : figure label made of two space-separated parts, e.g. "Fig 3";
        only the second part (the figure number) is used for matching.
        A label without a space raises IndexError.
    start_page : zero-based index of the first page to scan (default 4, the
        value that used to be hard-coded).
    before, after : size of the sentence window kept around each figure
        reference (defaults 5 and 8, the values that used to be hard-coded).

    Returns
    -------
    str : lower-cased text collected from all scanned pages, '' if the figure
    is never referenced.

    Pages whose text cannot be extracted or tokenized are skipped with a
    warning instead of being silently ignored.
    """
    import warnings

    fig_value = fig_no.split(' ')[1]
    full_text = ""
    # `with` guarantees the file is closed even if the reader raises.
    with open(file_name, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        num_pages = pdf_reader.numPages
        for page_no in range(start_page, num_pages):
            page = pdf_reader.getPage(page_no)
            try:
                page_text = lowercase(page.extractText())
                sentences = sent_tokenize(page_text)
            except Exception as exc:
                # Best effort: one unreadable page must not abort the whole
                # document, but the failure should be visible.
                warnings.warn("%s: skipping page %d (%s)" % (file_name, page_no, exc))
                continue
            full_text += _collect_figure_context(sentences, fig_value, before, after)
    return full_text

In [4]:
# Load the annotation spreadsheet and keep its rows as an array of values;
# the following cell addresses the columns of each row by position.
code_xl = pd.read_excel(r'annotated_graph_data.xlsx')
records = code_xl.values

In [6]:
"""
Create a dictionary to store feature values retrieved from the filtered patents
"""
data = {}
for row in records:
    # Columns of the annotation sheet are addressed by position.
    patent_id, fig_id = row[1], row[2]
    tumor_cell, chemo_agent_virus = row[3], row[4]
    y_value, x_value = row[5], row[6]

    figures = data.setdefault(patent_id, {})
    if fig_id not in figures:
        # First reading of this figure: create its record. The list fields
        # collect one value per spreadsheet row, the remaining fields are
        # filled in by later cells.
        figures[fig_id] = {
            'chemo/virus': [],
            'tumor/cell_survival': [],  # 0 or 1
            'days': [],
            'tumor_size_value/cell_survival_value': [],
            'para': "",
            'experiment_type': "",
            'species': "",
            "cancer_type": "",
            "cell_lines": [],
            "quantity": "",
        }

    entry = figures[fig_id]
    entry['chemo/virus'].append(chemo_agent_virus)
    entry['tumor/cell_survival'].append(tumor_cell)
    entry['days'].append(x_value)
    entry['tumor_size_value/cell_survival_value'].append(y_value)

In [7]:
"""
Read the patents with the specified patent ids
"""
# The patent id is joined onto the data-set folder to locate each PDF.
path = '../Patent_data_set'
for patent_id, figures in data.items():
    file_path = os.path.join(path, patent_id)
    for key, entry in figures.items():  # key is fig_id
        # Store the text that discusses this figure alongside its readings.
        information = get_file_paragraphs(file_path, key)
        entry['para'] = information



In [8]:
def get_experiment_type(paragraph, tumor_cell_survival_list):
    """
    Classify the experiment described by ``paragraph`` as "in vivo" or "in vitro".

    Keywords in the text decide first, with the in vivo hints taking
    precedence over the in vitro ones. When the text contains no keyword the
    annotation flags decide: a sum of zero means "in vivo", anything else
    "in vitro".
    """
    in_vivo_hints = ('invivo', 'in vivo', 'inject')
    in_vitro_hints = ('invitro', 'in vitro')

    if any(hint in paragraph for hint in in_vivo_hints):
        return "in vivo"
    if any(hint in paragraph for hint in in_vitro_hints):
        return "in vitro"
    # No textual evidence: fall back to the annotated flags.
    return "in vivo" if sum(tumor_cell_survival_list) == 0 else "in vitro"
        
def get_species(paragraph,exp_type,invivolist): # only for invivo experiments
    """
    Return the first animal of ``invivolist`` that occurs in ``paragraph``.

    Only "in vivo" experiments have a species; for any other ``exp_type``,
    and when no listed animal is mentioned, the empty string is returned
    (previously the no-match case fell through and returned None).

    NOTE(review): matching is by plain substring, so short names such as
    'rat' or 'cat' also match inside longer words; list order decides which
    animal wins.
    """
    if exp_type != 'in vivo':
        return ''
    for animal in invivolist:
        if animal in paragraph:
            return animal
    return ''

In [9]:
def get_cancer_list(cancer_list_file):
    """
    Read the cancer names stored in ``cancer_list_file`` and group them by
    their number of tokens.

    The file is expected to hold names separated by ", ". Names are
    lower-cased, stripped of surrounding whitespace (so a trailing newline
    does not end up inside the last name), lemmatized, and empty entries are
    dropped.

    Returns a list of lists in which index ``k`` holds the names made of
    ``k + 1`` space-separated tokens. The list always has at least four
    buckets and grows if a longer name is present.
    """
    # `with` closes the file handle deterministically.
    with open(cancer_list_file, 'r') as handle:
        raw_names = handle.read().lower().split(", ")

    size_based_canc_list = [list() for _ in range(4)]
    for raw_name in raw_names:
        name = raw_name.strip()
        if not name:
            continue
        c = lemmatizer.lemmatize(name)
        n_tokens = len(c.split(" "))
        # Grow on demand instead of failing on names longer than four tokens.
        while len(size_based_canc_list) < n_tokens:
            size_based_canc_list.append([])
        size_based_canc_list[n_tokens - 1].append(c)
    return size_based_canc_list


def get_cancer_type(paragraph, caner_list):
    """
    Return the cancer name that occurs most often in ``paragraph``.

    ``caner_list`` is a list of lists of cancer names (as produced by
    ``get_cancer_list``). Occurrences are counted with ``str.count``; ties
    are resolved in favour of the alphabetically last name.

    Returns '' when the list is empty or when none of the names occurs in
    the paragraph (previously an unrelated name with zero occurrences was
    returned, or an IndexError was raised for an empty list).
    """
    candidates = [
        (paragraph.count(gram), gram)
        for n_gram_list in caner_list
        for gram in n_gram_list
    ]
    if not candidates:
        return ''
    best_count, best_name = max(candidates)
    return best_name if best_count > 0 else ''

def get_cell_lines(paragraph,all_cell_lines):
    """
    Select the cell-line records whose name is mentioned in ``paragraph``.

    Each element of ``all_cell_lines`` is an indexable record whose first
    item is the cell-line name. A record is kept when its name occurs in the
    paragraph, is longer than two characters and has not been kept before;
    the order of ``all_cell_lines`` is preserved.
    """
    seen_names = set()
    matches = []
    for record in all_cell_lines:
        name = record[0]
        if name not in paragraph:
            continue
        if name in seen_names or len(name) <= 2:
            continue
        seen_names.add(name)
        matches.append(record)
    return matches

def get_ngrams_tokens(tokens_list, n):
    """
    Build every n-gram of ``tokens_list`` and return each one as a single
    space-joined string.
    """
    return [' '.join(gram) for gram in ngrams(tokens_list, n)]

def num_there(s):
    """
    Tell whether the token ``s`` contains at least one digit character.
    """
    for char in s:
        if char.isdigit():
            return True
    return False

def get_quantity(paragraph,quantity_list_unigram):
    """
    Extract the quantity of chemotherapeutic agent / oncolytic virus strain
    mentioned in ``paragraph``.

    The paragraph is tokenized and every token that contains one of the unit
    names in ``quantity_list_unigram`` (substring match) is inspected:

    * if the preceding token contains a digit, the result is
      "<number> <unit>" - or "<number> multiplicity of <next token>" for the
      'multiplicity' unit followed by 'of';
    * otherwise, if the token itself contains a digit, the token is the result.

    Independently, "moi of <number>" is accepted as long as nothing has been
    found yet. Later matches overwrite earlier ones, so the last match wins.
    Returns '' when nothing matches.

    Neighbouring tokens are only read when they exist: the first token has no
    predecessor (it used to wrap around to the last token) and look-ahead
    near the end of the text no longer raises IndexError.
    """
    result=""
    tokens = tokenizer.tokenize(paragraph)
    unigram = get_ngrams_tokens(tokens,1)
    count = len(unigram)
    for quantity in quantity_list_unigram:
        for i in range(count):
            token = unigram[i]
            # Look-ahead is only valid when two more tokens follow.
            has_two_after = i + 2 < count
            if(quantity in token):
                previous = unigram[i-1] if i > 0 else ''
                if(num_there(previous)):
                    if(quantity=='multiplicity' and has_two_after and unigram[i+1]=='of'):
                        result = previous + ' '+token + ' '+ unigram[i+1]+ ' '+unigram[i+2]
                    else:
                        result = previous+ ' '+token
                elif(num_there(token)):
                    result = token
            if(token=='moi' and len(result)==0):
                if(has_two_after and unigram[i+1]=='of' and num_there(unigram[i+2])):
                    result = token + ' '+ unigram[i+1]+ ' '+unigram[i+2]
    return result

In [10]:
invivo_animals = [
    'scid mice', 'mice', 'mouse', 'rabbit', 'rat', 'dog',
    'goat', 'sheep', 'pig', 'cat', 'primate',
]
# NOTE: joblib.load unpickles the file, so it must come from a trusted source.
all_cell_lines = joblib.load('all_cell_lines.sav')
canc_list = get_cancer_list("cancer list.txt")  # cancer names grouped by token count
quantity_list_uni = ['vp', 'vp/mm', 'moi', 'micromolar', 'multiplicity', 'pfu', 'tcid']

# Run the extraction functions on the paragraph of every figure and store
# the resulting feature values in the data dictionary.
for patent_id, figures in data.items():
    for key, entry in figures.items():  # key is fig_id
        paragraph = entry['para']
        tumor_cell_survival_list = entry['tumor/cell_survival']

        exp_type = get_experiment_type(paragraph, tumor_cell_survival_list)
        entry['experiment_type'] = exp_type
        entry['species'] = get_species(paragraph, exp_type, invivo_animals)
        entry['cancer_type'] = get_cancer_type(paragraph, canc_list)
        entry['cell_lines'] = get_cell_lines(paragraph, all_cell_lines)
        entry['quantity'] = get_quantity(paragraph, quantity_list_uni)

In [11]:
# data[patent_id][key]

In [12]:
# Persist the populated feature dictionary to disk with joblib.
joblib.dump(data,'features_dictionary.sav')

['features_dictionary.sav']

In [13]:
"""
Converting the dictionary data into a list of rows, so that it can be written to an Excel sheet as output

Headings (Column titles) for the output sheet:  patent_id, cancer type, chemo agent/ oncolytic virus, quantity, cell survival %, tumor volume, days, species, experiment type, cell lines
"""
result = []
for patent_id in data:
    for fig in data[patent_id]:
        entry = data[patent_id][fig]
        flags = entry['tumor/cell_survival']
        readings = entry['tumor_size_value/cell_survival_value']
        rows = len(flags)
        for i in range(rows):
            # A reading goes into exactly one of the two value columns,
            # depending on its flag; the other column stays empty.
            if flags[i] == 0:  # tumor
                cell_survival, tumor_volume = '', readings[i]
            else:  # cell survival
                cell_survival, tumor_volume = readings[i], ''
            row = [
                patent_id,
                entry['cancer_type'],
                entry['chemo/virus'][i],
                entry['quantity'],
                cell_survival,
                tumor_volume,
                entry['days'][i],
                entry['species'],
                entry['experiment_type'],
                entry['cell_lines'],
            ]
            result.append(row)

In [14]:
# len(result)

In [15]:
# result

In [16]:
"""
Exporting the results to an Excel sheet 'final_output.xlsx'
"""
# Column titles of the output sheet, in the order the rows were assembled.
output_columns = [
    'patent_id',
    'cancer type',
    'chemo agent/ oncolytic virus',
    'quantity',
    'cell survival %',
    'tumor volume (in mm3)',
    'days',
    'species',
    'experiment type',
    'cell lines',
]
df = pd.DataFrame(result, columns=output_columns)
df.to_excel('final_output.xlsx')