In [1]:
import PyPDF2 
# import textract
from nltk.util import ngrams
from collections import Counter
from prettytable import PrettyTable
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import RegexpTokenizer
# Raw string literal: `\w` is not a valid Python string escape, so the plain
# literal only worked because Python passes unknown escapes through (with a
# warning on recent versions). The regex handed to the tokenizer is unchanged.
tokenizer = RegexpTokenizer(r'[A-Za-z0-9-]*\w+')

In [2]:
def get_file_text(file_name):
    """Extract and concatenate the text of every page of a PDF file.

    Parameters
    ----------
    file_name : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The extracted text of all pages, in page order, joined without any
        separator. Pages whose text extraction raises are skipped.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
    # legacy PyPDF2 names; PyPDF2 >= 3.0 replaced them with PdfReader / pages /
    # extract_text. Kept as-is to match the version this notebook was run with.
    page_texts = []
    # `with` closes the handle even when the reader fails on a malformed PDF.
    with open(file_name, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        for page_number in range(pdf_reader.numPages):
            page = pdf_reader.getPage(page_number)
            try:
                page_texts.append(page.extractText())
            except Exception:
                # Best-effort extraction: skip a page that cannot be decoded,
                # but let KeyboardInterrupt / SystemExit propagate.
                continue
    return "".join(page_texts)

In [4]:
# Raw (un-normalized) text of the patent PDF analysed in the rest of the notebook.
doc_text = get_file_text('../Patent_data_set/US3323996.pdf') #get raw document data



In [7]:
# doc_text

In [8]:
def get_cancer_list(cancer_list_file):
    """Load cancer names from a comma-separated text file, grouped by word count.

    Parameters
    ----------
    cancer_list_file : str
        Path of a text file containing cancer names separated by commas.

    Returns
    -------
    list of list of str
        ``result[k]`` holds the lower-cased, lemmatized names made of ``k + 1``
        words. The list always has at least four buckets (unigrams to 4-grams);
        extra buckets are appended only if the file contains longer names.
    """
    # `with` makes sure the file handle is released after reading.
    with open(cancer_list_file, 'r') as list_file:
        raw_names = list_file.read().lower().split(",")

    size_based_canc_list = [list() for _ in range(4)]
    for raw_name in raw_names:
        # Lemmatize word by word so names are normalized the same way as the
        # document tokens, which are lemmatized one token at a time.
        # split() with no argument also discards stray spaces and newlines.
        words = [lemmatizer.lemmatize(word) for word in raw_name.split()]
        if not words:
            # Empty entry (e.g. trailing comma or blank file): nothing to store.
            continue
        # Grow the bucket list instead of raising IndexError on long names.
        while len(size_based_canc_list) < len(words):
            size_based_canc_list.append([])
        size_based_canc_list[len(words) - 1].append(" ".join(words))
    return size_based_canc_list

# Names of all cancer types to search for (176 entries in this list file).
canc_list = get_cancer_list("cancer list.txt")
# canc_list[k] is the list of names made of k + 1 words:
# unigrams, bigrams, trigrams and 4-grams.

In [9]:
# canc_list

In [10]:
# Sanity check: number of cancer names loaded across all word-count buckets.
total = sum(len(bucket) for bucket in canc_list)
total

176

In [11]:
def get_normalized_tokens(data):
    """Tokenize text and return its normalized tokens.

    Each token produced by the module-level ``tokenizer`` is lower-cased;
    English stop words and the bare tokens '.' and '_' are discarded, and the
    remaining tokens are lemmatized.

    Parameters
    ----------
    data : str
        Raw text to normalize.

    Returns
    -------
    list of str
        Normalized tokens in their original order.
    """
    kept = []
    for raw_token in tokenizer.tokenize(data):
        lowered = raw_token.lower()
        if lowered in stop_words or lowered in ('.', '_'):
            continue
        kept.append(lemmatizer.lemmatize(lowered))
    return kept

In [12]:
# Lower-cased, stop-word-free, lemmatized tokens of the whole document.
normalized_tokens = get_normalized_tokens(doc_text)

In [13]:
len(normalized_tokens) # number of tokens left in the document after normalization

3688

In [14]:
def get_ngrams_tokens(tokens_list, n):
    """Build the n-grams of a token sequence as space-joined strings.

    Parameters
    ----------
    tokens_list : list of str
        Tokens in document order.
    n : int
        Number of tokens per n-gram.

    Returns
    -------
    list of str
        One string per n-gram, in order of occurrence, with the tokens of
        each n-gram joined by a single space.
    """
    return [' '.join(gram) for gram in ngrams(tokens_list, n)]

In [25]:
# `grams` holds four lists built from the normalized document tokens:
# grams[0] unigrams, grams[1] bigrams, grams[2] trigrams, grams[3] 4-grams.
grams = [get_ngrams_tokens(normalized_tokens, size) for size in range(1, 5)]
# To inspect the size of each list:
# for i in grams:
#     print(len(i))

In [26]:
def _is_word_subphrase(phrase, longer_phrase):
    """Return True if `phrase` occurs inside `longer_phrase` on whole-word boundaries."""
    # Padding with spaces turns a substring test into a whole-word test, so
    # "glioma" is not considered part of "astroglioma tumor".
    return (" " + phrase + " ") in (" " + longer_phrase + " ")


def find_cancers_from_doc(doc_grams, cancers_list):
    """Count occurrences of known cancer names among a document's n-grams.

    Names are searched from the longest (4 words) to the shortest (1 word).
    A shorter name is skipped entirely when it appears, as whole words, inside
    a longer name that was already found, so that e.g. "sarcoma" is not
    reported separately once "kaposi sarcoma" has been found.

    Parameters
    ----------
    doc_grams : list of list of str
        ``doc_grams[k]`` is the list of space-joined (k + 1)-grams of the
        document; indices 0 to 3 are read.
    cancers_list : list of list of str
        ``cancers_list[k]`` is the list of cancer names made of k + 1 words;
        indices 0 to 3 are read.

    Returns
    -------
    tuple of dict
        ``(found_4gr, found_3gr, found_2gr, found_1gr)``; each dict maps a
        cancer name to its number of occurrences (only names occurring at
        least once are present), in the order the names appear in
        ``cancers_list``.

    Raises
    ------
    IndexError
        If either argument has fewer than four lists.
    """
    found_by_size = []  # filled longest first: 4-gram hits, then 3-gram hits, ...
    for size_index in (3, 2, 1, 0):
        # Count every n-gram once instead of rescanning the list per name.
        gram_counts = Counter(doc_grams[size_index])
        found = dict()
        for canc in cancers_list[size_index]:
            already_covered = any(
                _is_word_subphrase(canc, longer_name)
                for longer_found in found_by_size
                for longer_name in longer_found
            )
            if already_covered:
                continue
            occurrences = gram_counts.get(canc, 0)
            if occurrences:
                found[canc] = occurrences
        found_by_size.append(found)

    found_4gr, found_3gr, found_2gr, found_1gr = found_by_size
    return found_4gr, found_3gr, found_2gr, found_1gr

In [27]:
# Cancer names found in the document, one dict (name -> count) per name length.
found_4gr, found_3gr, found_2gr, found_1gr = find_cancers_from_doc(grams, canc_list)

In [28]:
# Tabulate every cancer name found in the document, most frequent first.
cancers = []
for found in (found_1gr, found_2gr, found_3gr, found_4gr):
    cancers.extend(found.items())
cancers = sorted(cancers, key=lambda entry: entry[1], reverse=True)

x = PrettyTable()
x.field_names = ["S.no.", "Cancer Type", "Frequency"]
for index, (cancer_name, frequency) in enumerate(cancers, start=1):
    x.add_row([index, cancer_name, frequency])
print(x)

+-------+-------------+-----------+
| S.no. | Cancer Type | Frequency |
+-------+-------------+-----------+
|   1   |   sarcoma   |     4     |
|   2   |  carcinoma  |     4     |
+-------+-------------+-----------+


In [29]:
def get_virus_names(doc_grams):
    """Collect the document n-grams that look like virus names.

    Parameters
    ----------
    doc_grams : list of list of str
        ``doc_grams[0]`` are unigrams, ``doc_grams[1]`` bigrams and
        ``doc_grams[2]`` trigrams, each n-gram being a space-joined string.

    Returns
    -------
    list of list of str
        Three lists, in document order:
        index 0 - unigrams that contain 'virus' but are not exactly 'virus';
        index 1 - bigrams whose last word contains 'virus';
        index 2 - trigrams whose last word contains 'virus'.
    """
    single_words = [
        word for word in doc_grams[0]
        if 'virus' in word and word != 'virus'
    ]
    grams_v = [single_words]
    for size_index in (1, 2):
        grams_v.append([
            gram for gram in doc_grams[size_index]
            if 'virus' in gram.split(" ")[-1]
        ])
    return grams_v

In [30]:
# Unigrams, bigrams and trigrams of the document that contain 'virus'.
grams_v = get_virus_names(grams)

In [31]:
# Tabulate the virus terms that occur more than twice, most frequent first.
viruses = grams_v[0] + grams_v[1] + grams_v[2]
frequent_viruses = [
    (name, count)
    for name, count in Counter(viruses).most_common()
    if count > 2
]

x = PrettyTable()
x.field_names = ["S.no.", "Virus", "Frequency"]
for index, (name, count) in enumerate(frequent_viruses, start=1):
    x.add_row([index, name, count])
print(x)

+-------+-----------------+-----------+
| S.no. |      Virus      | Frequency |
+-------+-----------------+-----------+
|   1   | influenza virus |     12    |
+-------+-----------------+-----------+
