In [27]:
#load data in text variable
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.data = ''

    def handle_data(self, data):
        self.data+=data;
        
with open('data', 'r') as f:
    parser = MyHTMLParser()
    parser.feed(f.read())
    text = parser.data

In [70]:
#get words based on frequency and which are unusual
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

def remove_stopwords(list_of_tokens):
    stopset = set(stopwords.words('english'))
    stopset=stopset.union(set(['fig']))
    
    return [i.lower() for i in list_of_tokens if i.lower() not in stopset]

def unusual_words(text, english_vocab):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    unusual = text_vocab - english_vocab
    return sorted(unusual)

def getwords(text):
    raw_list_of_list = [nltk.tokenize.sent_tokenize(x) for x in text.split('\n') if len(x)!= 0]
    sentences = [item for sublist in raw_list_of_list for item in sublist]#flatten raw_list_of_list to get list of sentences
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    a=[] #stores all noun phrases
    c=[] #unusual words not used in common english
    wordnet_lemmatizer = WordNetLemmatizer()
    for sentence in sentences:
        temp = nltk.pos_tag(nltk.tokenize.word_tokenize(sentence))
        #only take noun phrases
        a.extend([wordnet_lemmatizer.lemmatize(x[0], pos=get_wordnet_pos(x[1])) for x in temp if x[1] in ['NN', 'NNS', 'NNP', 'NNPS']])
        unusualset = set(unusual_words([x[0] for x in temp], english_vocab))
        c.extend([wordnet_lemmatizer.lemmatize(x[0], pos=get_wordnet_pos(x[1])) for x in temp if x[0] in unusualset and get_wordnet_pos(x[1]) != ''] )
    
    a = [x for (x, y) in nltk.FreqDist(a).most_common(50)]
    a.extend([x for (x, y) in nltk.FreqDist(c).most_common(50)])
    return list(set(a))
getwords(text)

['herein',
 'level',
 'invention',
 'path',
 'implant',
 'spanning',
 'anchor',
 'fibrosus',
 'vertebra',
 'arm',
 'location',
 'illustrate',
 'locate',
 'displacement',
 'spinae',
 'spine',
 'patient',
 'bone',
 'mean',
 'comprise',
 'screw',
 'displace',
 'body',
 'disc',
 'structure',
 'view',
 'adjust',
 'motion',
 'fixation',
 'method',
 'embodiment',
 'contact',
 'prosthesis',
 'direction',
 'mm',
 'consist',
 'FIG',
 'force',
 'exert',
 'thickness',
 'place',
 'use',
 'device',
 'FIGS',
 'Implants',
 'disorder',
 'redistribute',
 'tissue',
 'pulposus',
 'tendon',
 'etc',
 'moment',
 'section',
 'span',
 'example',
 'surface',
 'target',
 'muscle',
 'describe',
 'portion',
 'configure',
 'include',
 'connect',
 'alter',
 'erector',
 'refers',
 'distance',
 'process',
 'position',
 'represent',
 'treatment',
 'region',
 'effect',
 'material',
 'end',
 'have',
 'segment',
 'feature']

In [69]:
#use tf-idf to get important words

from sklearn.feature_extraction.text import TfidfVectorizer

def get_words_tfidf(text):
    corpus = [nltk.tokenize.sent_tokenize(x) for x in text.split('\n') if len(x)!= 0]
    corpus = [item for sublist in corpus for item in sublist]#flatten raw_list_of_list to get list of sentences
    vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1,1))
    X = vectorizer.fit_transform(corpus).toarray()
    features = vectorizer.get_feature_names()
    stopset = set(stopwords.words('english'))
    
    Set = set()
    for i in range(0, len(X)):
        for j in range(0, len(X[i])):
            #remove stopwords, numbers and only include if importance is more than .5
            if(X[i][j] > .5 and not features[j].isdigit() and features[j] not in stopset):
                Set.add(features[j])
    return list(Set)

get_words_tfidf(text)

# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(min_df=1)
# counts = vectorizer.fit_transform(corpus).toarray()
# from sklearn.feature_extraction.text import TfidfTransformer
# transformer = TfidfTransformer(smooth_idf=True)
# tfidf = transformer.fit_transform(counts)

['substantially',
 'alternative',
 'open',
 'displacement',
 'background',
 'detailed',
 'maybe',
 'see',
 'descriptions',
 'mm',
 'vertebroplasty',
 'claimed',
 'brief',
 'stiffness',
 'portion',
 'pass',
 'unit',
 'operatively',
 'identical',
 'end',
 'toward',
 'used',
 'similarly',
 'vertebra',
 'encloses',
 'eyelet',
 'rigid',
 'cross',
 'spinalis',
 'midline',
 'degeneration',
 'external',
 'view',
 'terms',
 'skin',
 'disclosure',
 'surgery',
 'figs',
 'segments',
 'top',
 'flexors',
 'coatings',
 'similar',
 'longissimus',
 'spanning',
 'implant',
 'biomechanics',
 'indicate',
 'bone',
 'iliocostalis',
 'kyphoplasty',
 'shown',
 'contact',
 'description',
 'transdermally',
 'field',
 '4a',
 'varying',
 'anatomy',
 'moment',
 'screws',
 'summary',
 'failure',
 'multiple',
 'different',
 'process',
 'fsu',
 'segment',
 'path',
 'fig',
 'loads',
 'lateral',
 'inflation',
 'ring',
 'locking',
 'ta',
 'titanium',
 '2a',
 'fracture',
 'flexible',
 'portions',
 'drugs',
 '2b',
 'drawi

In [65]:
#use rake from https://github.com/aneesha/RAKE and Rose, S., D. Engel, N. Cramer, and W. Cowley (2010). 
from rake import Rake
import nltk
def getphrases_rake(text):
    rake = Rake("SmartStoplist.txt")
    keywords = rake.run(text)
    phrases = [x for (x, y) in keywords if y > 40]
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    phrases = [phrase for phrase in phrases if len(set(phrase.split(' ')) - english_vocab) < len(set(phrase.split(' ')))]
    return phrases
#     for phrase in phrases:
#         if len(set(phrase.split(' ')) - english_vocab) < len(set(phrase.split(' ')))
getphrases_rake(text)

['intervertebral disc and all adjoining ligaments between them and excludes other connecting tissues such as muscles',
 'there is association between strenuous work like lifting and lumbar disc problems',
 'mm\nby using appropriately sized and positioned implants and methods as described herein',
 'reduced segmental motion during flexion and increased spinal stability during flexion',
 'by using appropriately sized and positioned implants as described herein',
 'wherein said implanting is accomplished without removing an intervertebral disk',
 'wherein said implanting further includes not cutting an interspinalis muscle',
 'change their mechanical properties as shear stress is applied',
 'forward bending moment which includes body weight and any external load',
 'soft cartilaginous disks between each vertebrae called intervertebral discs',
 'need for highly invasive surgeries requiring significant trauma',
 'installing implants and prostheses for less invasive spinal treatments',
 'an 

In [71]:
#use textrank from https://github.com/davidadamojr/TextRank
import textrank

def getphrases_textrank(text):
    phrases = textrank.extract_key_phrases(text)
    return phrases
getphrases_textrank(text)

{'13-14',
 '13-17',
 '13-19',
 '33-38',
 '3A-B show',
 '4A-B show',
 '4a-b',
 'A',
 'A FSU',
 'A bone-facing',
 'A frontal',
 'A method',
 'A sagittal',
 'A transverse',
 'A vertebra',
 'Alternative',
 'Anatomy',
 'B',
 'BACKGROUND Spinal',
 'BRIEF DESCRIPTIONS',
 'Biocompatible',
 'Bone A',
 'Bone B',
 'Bone C',
 'C',
 'C1-C7',
 'Cranial',
 'Current surgical',
 'D1',
 'D2',
 'DDD',
 'DESCRIPTION Spinal',
 'DESCRIPTIONS OF',
 'DETAILED DESCRIPTION',
 'DISCLOSURE Selectively',
 'Dependent',
 'Different',
 'Displacement',
 'Displacement distance',
 'Displacement segment',
 'Enlarged portion',
 'Excessive loading',
 'Exemplary',
 'FIELD OF',
 'FIG',
 'FIGS',
 'FSU',
 'G',
 'IAR',
 'Implant',
 'Implant fixation',
 'Inferior end',
 'Intervertebral discs',
 'L',
 'L1-L5',
 'LA',
 'LB',
 'LC',
 'Lp',
 'Lw',
 'Medial',
 'OF DISCLOSURE',
 'OF DRAWINGS',
 'OF THE',
 'Other',
 'Other exemplary',
 'PGs',
 'PTFE',
 'Proximal',
 'Rigid',
 'S',
 'SUMMARY OF',
 'Segment',
 'Soft compliant',
 'Specific

In [68]:
#use LDA
import gensim
def get_bow_LDA(text):
    documents = [x for x in text.split('\n') if len(x)!= 0]
    num_topics = 15
    from nltk.corpus import stopwords
    stoplist = set(stopwords.words('english'))

    texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]
    dictionary = gensim.corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, update_every=1, chunksize=10000, passes=1)

    return [' '.join([dictionary[x] for (x,y) in lda.get_topic_terms(i, 5)]) for i in range(0, num_topics)]
get_bow_LDA(text)

['may spinal spinae muscles erector',
 'may displacement body implants figs.',
 'may implant target vertebral tissue',
 'displacement annulus nucleus fibrosus may',
 'may implant embodiments present views',
 'may segments spine implant (fig.',
 'may implant displacement vertebral embodiments,',
 'may implant displacement portion exemplary',
 'may implant embodiments, vertebral could',
 'surface view thoracic could posterior',
 'may implant vertebral body target',
 'body vertebral intervertebral fig. mm',
 'vertebral may disc moment spinal',
 'vertebral target body tissue spine',
 'may implant could implants surface']

In [36]:
#get 10 urls from google search result
import urllib.parse # Used to read the html document
from bs4 import BeautifulSoup
import re
# check if url is proper
regex = re.compile(
        r'^(?:http|ftp)s?://' # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
        r'localhost|' #localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

def geturl(query):
    urls = []
    for start in range(0,1):
        query = urllib.parse.urlencode({"q": query})
        r = requests.get('https://www.google.com/search?' + query)
        soup = BeautifulSoup(r.text)

        ### Looks like google contains URLs in <cite> tags.
        ### So for each cite tag on each page (10), print its contents (url)
        for cite in soup.findAll('cite'):
            url = cite.text
            if("http" not in url):
                url = "http://" + url
            if(not url.endswith(('.pdf', '.ppt')) and regex.match(url) is not None):#pdfs are not being able to get parsed
                urls.append(url)
    return urls[0:5]

In [66]:
#get data from url
import requests
import re

def getdata_from_url(url):
    if("http" not in url):
        url = "http://" + url
    try:
        r = requests.get(url,  timeout=0.5, verify=False)
        soup = BeautifulSoup(r.text, 'html.parser')
        [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
        raw_text = soup.getText()
        raw_text= [x.strip(' \t\n\r') for x in raw_text.split('\n') if len(x) != 0]
        raw_text=[re.sub('[\t+]', '', x) for x in raw_text] #remove tabs from middle
        raw_text=[x for x in raw_text if len(x) != 0] #remove empty strings
        return raw_text
    except Exception as e:
        return []
    
# getdata_from_url("https://en.wikipedia.org/wiki/Hi,_How_Are_You")
    

In [67]:
#run all extracted features from patent in google search and get urls 
#and for each url get features and display relevant words

#try rake method to extract phrases from dataset and use lda to get important words from google search links.
# [y  for x in getwords() for y in geturl(x)]

urls = [(x, geturl(x))  for x in getphrases_rake(text)]
print("obtained relevant urls")

for (x, y) in urls:
    print('getting topics for: ' + x + " : with " + str(len(y)) + " urls")
    words = set()
    for url in y:
        #get lda words
        print(url)
        url_data = " ".join(getdata_from_url(url))
        if(url_data.strip() != ''):
            for sent in get_bow_LDA(url_data):
                for word in sent.split(' '):
                    if(word.isalpha()):
                        words.add(word)
    print(words)

obtained relevant urls
getting topics for: intervertebral disc and all adjoining ligaments between them and excludes other connecting tissues such as muscles : with 5 urls
https://en.wikipedia.org/wiki/Functional_spinal_unit
https://books.google.com/books?isbn=1478443367
https://quizlet.com/45435732/ap-1-exam-3-flash-cards/
https://www.dartmouth.edu/~humananatomy/part_1/chapter_2.html
http://www.proz.com/kudoz/polish.../5558084-segment_ruchowy.html
{'images', 'bones', 'joint', 'ebooksfree', 'muscles', 'searchsearch', 'resultbasics', 'bone', 'news', 'sign', 'motion', 'allimagesvideosnewsshoppingmapsbookssearch', 'may', 'relevancesorted', 'history', 'biomechanical', 'passive', 'play', 'programs', 'much', 'complete', 'google', 'synovial', 'muscle', 'spinal', 'neutral', 'page', 'unit', 'fsu', 'maps', 'availablegoogle', 'region', 'go', 'solutions', 'drive'}
getting topics for: there is association between strenuous work like lifting and lumbar disc problems : with 5 urls
https://www.ninds.n