In [1]:
#load data in text variable
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.data = ''

    def handle_data(self, data):
        self.data+=data;
        
with open('data', 'r') as f:
    parser = MyHTMLParser()
    parser.feed(f.read())
    text = parser.data

In [2]:
#get words based on frequency
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

def remove_stopwords(list_of_tokens):
    stopset = set(stopwords.words('english'))
    stopset=stopset.union(set(['fig']))
    
    return [i.lower() for i in list_of_tokens if i.lower() not in stopset]

def unusual_words(text, english_vocab):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    unusual = text_vocab - english_vocab
    return sorted(unusual)

def getwords(text):
    raw_list_of_list = [nltk.tokenize.sent_tokenize(x) for x in text.split('\n') if len(x)!= 0]
    sentences = [item for sublist in raw_list_of_list for item in sublist]#flatten raw_list_of_list to get list of sentences
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    a=[] #stores all noun phrases
    c=[] #unusual words not used in common english
    wordnet_lemmatizer = WordNetLemmatizer()
    for sentence in sentences:
        temp = nltk.pos_tag(nltk.tokenize.word_tokenize(sentence))
        #only take noun phrases
        a.extend([wordnet_lemmatizer.lemmatize(x[0], pos=get_wordnet_pos(x[1])) for x in temp if x[1] in ['NN', 'NNS', 'NNP', 'NNPS']])
        unusualset = set(unusual_words([x[0] for x in temp], english_vocab))
        c.extend([wordnet_lemmatizer.lemmatize(x[0], pos=get_wordnet_pos(x[1])) for x in temp if x[0] in unusualset and get_wordnet_pos(x[1]) != ''] )
    
    a = [x for (x, y) in nltk.FreqDist(a).most_common(50)]
    a.extend([x for (x, y) in nltk.FreqDist(c).most_common(50)])
    return list(set(a))
getwords(text)

In [3]:
#use tf-idf to get important words

from sklearn.feature_extraction.text import TfidfVectorizer

def get_words_tfidf(text):
    corpus = [nltk.tokenize.sent_tokenize(x) for x in text.split('\n') if len(x)!= 0]
    corpus = [item for sublist in corpus for item in sublist]#flatten raw_list_of_list to get list of sentences
    vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1,1))
    X = vectorizer.fit_transform(corpus).toarray()
    features = vectorizer.get_feature_names()
    stopset = set(stopwords.words('english'))
    
    Set = set()
    for i in range(0, len(X)):
        for j in range(0, len(X[i])):
            #remove stopwords, numbers and only include if importance is more than .5
            if(X[i][j] > .5 and not features[j].isdigit() and features[j] not in stopset):
                Set.add(features[j])
    return list(Set)

get_words_tfidf(text)

# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(min_df=1)
# counts = vectorizer.fit_transform(corpus).toarray()
# from sklearn.feature_extraction.text import TfidfTransformer
# transformer = TfidfTransformer(smooth_idf=True)
# tfidf = transformer.fit_transform(counts)



In [5]:
#use rake from https://github.com/aneesha/RAKE and Rose, S., D. Engel, N. Cramer, and W. Cowley (2010). 
from rake import Rake
import nltk
def getphrases_rake(text):
    rake = Rake("SmartStoplist.txt")
    keywords = rake.run(text)
    phrases = [x for (x, y) in keywords if y > 30]
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    phrases = [phrase for phrase in phrases if len(set(phrase.split(' ')) - english_vocab) < len(set(phrase.split(' ')))]
    return phrases
#     for phrase in phrases:
#         if len(set(phrase.split(' ')) - english_vocab) < len(set(phrase.split(' ')))
getphrases_rake(text)

In [6]:
#use textrank from https://github.com/davidadamojr/TextRank
import textrank

def getphrases_textrank(text):
    phrases = textrank.extract_key_phrases(text)
    return phrases
getphrases_textrank(text)

In [7]:
#use LDA

import gensim
def get_bow_LDA(text):
    documents = [x for x in text.split('\n') if len(x)!= 0]
    num_topics = 15
    from nltk.corpus import stopwords
    stoplist = set(stopwords.words('english'))

    texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]
    dictionary = gensim.corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, update_every=1, chunksize=10000, passes=1)

    return [' '.join([dictionary[x] for (x,y) in lda.get_topic_terms(i, 5)]) for i in range(0, num_topics)]
get_bow_LDA(text)

In [8]:
#get 10 urls from google search result
import urllib.parse # Used to read the html document
from bs4 import BeautifulSoup
import re
# check if url is proper
regex = re.compile(
        r'^(?:http|ftp)s?://' # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
        r'localhost|' #localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

def geturl(query):
    urls = []
    for start in range(0,1):
        query = urllib.parse.urlencode({"q": query})
        r = requests.get('https://www.google.com/search?' + query)
        soup = BeautifulSoup(r.text)

        ### Looks like google contains URLs in <cite> tags.
        ### So for each cite tag on each page (10), print its contents (url)
        for cite in soup.findAll('cite'):
            url = cite.text
            if("http" not in url):
                url = "http://" + url
            if(not url.endswith(('.pdf', '.ppt')) and regex.match(url) is not None):#pdfs are not being able to get parsed
                urls.append(url)
    return urls[0:5]
# getget("Hi How")

In [9]:
#get data from url
import requests
import re

def getdata_from_url(url):
    if("http" not in url):
        url = "http://" + url
    try:
        r = requests.get(url,  timeout=0.001, verify=False)
        soup = BeautifulSoup(r.text, 'html.parser')
        [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
        raw_text = soup.getText()
        raw_text= [x.strip(' \t\n\r') for x in raw_text.split('\n') if len(x) != 0]
        raw_text=[re.sub('[\t+]', '', x) for x in raw_text] #remove tabs from middle
        raw_text=[x for x in raw_text if len(x) != 0] #remove empty strings
        return raw_text
    except Exception as e:
        print(e)
        return []
    
# getdata_from_url("https://en.wikipedia.org/wiki/Hi,_How_Are_You")
    

In [10]:
#run all extracted features from patent in google search and get urls 
#and for each url get features and display relevant words

#try rake method to extract phrases from dataset and use lda to get important words from google search links.
# [y  for x in getwords() for y in geturl(x)]

urls = [(x, geturl(x))  for x in getphrases_rake(text)]
print("obtained relevant urls")

for (x, y) in urls:
    print('getting topics for: ' + x + " : with " + str(len(y)) + " urls")
    words = set()
    for url in y:
        #get lda words
        print(url)
        url_data = " ".join(getdata_from_url(url))
        if(url_data.strip() != ''):
            for sent in get_bow_LDA(url_data):
                for word in sent.split(' '):
                    if(word.isalpha()):
                        words.add(word)
    print(words)

obtained relevant urls
getting topics for: intervertebral disc and all adjoining ligaments between them and excludes other connecting tissues such as muscles : with 5 urls
https://en.wikipedia.org/wiki/Functional_spinal_unit
HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /wiki/Functional_spinal_unit (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb743399b0>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
https://books.google.com/books?isbn=1478443367
HTTPSConnectionPool(host='books.google.com', port=443): Max retries exceeded with url: /books?isbn=1478443367 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb74324320>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
https://quizlet.com/45435732/ap-1-exam-3-flash-cards/
HTTPSConnectionPool(host='quizlet.com', port=443): Max

HTTPConnectionPool(host='www.freepatentsonline.com', port=80): Max retries exceeded with url: /5716416.html (Caused by ConnectTimeoutError(<requests.packages.urllib3.connection.HTTPConnection object at 0x7fcb74440a90>, 'Connection to www.freepatentsonline.com timed out. (connect timeout=0.001)'))
https://www.google.ch/patents/US7901460?hl=de&cl=en
HTTPSConnectionPool(host='www.google.ch', port=443): Max retries exceeded with url: /patents/US7901460?hl=de&cl=en (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb7467a128>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
https://www.google.ch/patents/US8211175
HTTPSConnectionPool(host='www.google.ch', port=443): Max retries exceeded with url: /patents/US8211175 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb740d8390>: Failed to establish a new connection: [Errno 101] Network is unreachable',)

HTTPSConnectionPool(host='www.ncbi.nlm.nih.gov', port=443): Max retries exceeded with url: /pmc/articles/PMC4516732/ (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb74324780>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
https://books.google.com/books?isbn=0323070868
HTTPSConnectionPool(host='books.google.com', port=443): Max retries exceeded with url: /books?isbn=0323070868 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb74324860>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
set()
getting topics for: installing implants and prostheses for less invasive spinal treatments : with 5 urls
http://www.doctorclavel.com/en/spine-surgery/
HTTPConnectionPool(host='www.doctorclavel.com', port=80): Max retries exceeded with url: /en/spine-surgery/ (Caused by NewConnectionError('<requests.packages.urllib3.connectio

HTTPConnectionPool(host='www.everydayhealth.com', port=80): Max retries exceeded with url: /.../parkinsons-disease-progression.aspx (Caused by ConnectTimeoutError(<requests.packages.urllib3.connection.HTTPConnection object at 0x7fcb74310a58>, 'Connection to www.everydayhealth.com timed out. (connect timeout=0.001)'))
https://www.rheumatoidarthritis.org/ra/symptoms/progression/
HTTPSConnectionPool(host='www.rheumatoidarthritis.org', port=443): Max retries exceeded with url: /ra/symptoms/progression/ (Caused by ConnectTimeoutError(<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb742b9860>, 'Connection to www.rheumatoidarthritis.org timed out. (connect timeout=0.001)'))
http://www.healthline.com/health/dementia/stages
HTTPConnectionPool(host='www.healthline.com', port=80): Max retries exceeded with url: /health/dementia/stages (Caused by ConnectTimeoutError(<requests.packages.urllib3.connection.HTTPConnection object at 0x7fcb740d87b8>, 'Connection to www.healt

HTTPSConnectionPool(host='www.google.ch', port=443): Max retries exceeded with url: /patents/US9468466 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb742f9780>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
http://www.google.com/patents/US20170027621
HTTPConnectionPool(host='www.google.com', port=80): Max retries exceeded with url: /patents/US20170027621 (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7fcb74324588>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
https://books.google.com/books?isbn=1463487614
HTTPSConnectionPool(host='books.google.com', port=443): Max retries exceeded with url: /books?isbn=1463487614 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb74324080>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
https://

HTTPSConnectionPool(host='books.google.com', port=443): Max retries exceeded with url: /books?isbn=8131242552 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb74324080>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
set()
getting topics for: which is specifically disclosed herein without departing : with 5 urls
https://books.google.com/books?isbn=1481673629
HTTPSConnectionPool(host='books.google.com', port=443): Max retries exceeded with url: /books?isbn=1481673629 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb74324cf8>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
https://books.google.com/books?isbn=1481665952
HTTPSConnectionPool(host='books.google.com', port=443): Max retries exceeded with url: /books?isbn=1481665952 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSCon

HTTPSConnectionPool(host='prezi.com', port=443): Max retries exceeded with url: /suzpayi7ijqj/ (Caused by ConnectTimeoutError(<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb74324dd8>, 'Connection to prezi.com timed out. (connect timeout=0.001)'))
http://eorthopod.com/lumbar-spine-anatomy/
HTTPConnectionPool(host='eorthopod.com', port=80): Max retries exceeded with url: /lumbar-spine-anatomy/ (Caused by ConnectTimeoutError(<requests.packages.urllib3.connection.HTTPConnection object at 0x7fcb74324438>, 'Connection to eorthopod.com timed out. (connect timeout=0.001)'))
https://www.mayfieldclinic.com/PE-AnatSpine.htm
HTTPSConnectionPool(host='www.mayfieldclinic.com', port=443): Max retries exceeded with url: /PE-AnatSpine.htm (Caused by ConnectTimeoutError(<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb74324c50>, 'Connection to www.mayfieldclinic.com timed out. (connect timeout=0.001)'))
https://www.depuysynthes.com/patients/aab

HTTPConnectionPool(host='www.supraalloys.com', port=80): Max retries exceeded with url: /medical-titanium.php (Caused by ConnectTimeoutError(<requests.packages.urllib3.connection.HTTPConnection object at 0x7fcb74310630>, 'Connection to www.supraalloys.com timed out. (connect timeout=0.001)'))
https://en.wikipedia.org/wiki/Titanium_biocompatibility
HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /wiki/Titanium_biocompatibility (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb742c2780>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
http://www.mrisafety.com/SafetyInfoFromList.asp?LSub=44
HTTPConnectionPool(host='www.mrisafety.com', port=80): Max retries exceeded with url: /SafetyInfoFromList.asp?LSub=44 (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7fcb74339fd0>: Failed to establish a new connection: [Errno 101]

HTTPSConnectionPool(host='www.google.ch', port=443): Max retries exceeded with url: /patents/US7985193 (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb742c2358>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
http://www.google.se/patents/US5154608?hl=sv
HTTPConnectionPool(host='www.google.se', port=80): Max retries exceeded with url: /patents/US5154608?hl=sv (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7fcb742b9860>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
https://en.wikipedia.org/wiki/Patent_claim
HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /wiki/Patent_claim (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb74324160>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
set()
getting 

HTTPConnectionPool(host='www.freepatentsonline.com', port=80): Max retries exceeded with url: /y2016/0213402.html (Caused by ConnectTimeoutError(<requests.packages.urllib3.connection.HTTPConnection object at 0x7fcb74474080>, 'Connection to www.freepatentsonline.com timed out. (connect timeout=0.001)'))
http://www.google.com/patents/US20150173888
HTTPConnectionPool(host='www.google.com', port=80): Max retries exceeded with url: /patents/US20150173888 (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7fcb740d8208>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
http://www.google.com/patents/US8845724
HTTPConnectionPool(host='www.google.com', port=80): Max retries exceeded with url: /patents/US8845724 (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7fcb742f9b00>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
set()
getting topics for: de

HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /wiki/Iliocostalis (Caused by NewConnectionError('<requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x7fcb74336da0>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
http://www.anatomyexpert.com/app/structure/5243/471/
HTTPConnectionPool(host='www.anatomyexpert.com', port=80): Max retries exceeded with url: /app/structure/5243/471/ (Caused by ConnectTimeoutError(<requests.packages.urllib3.connection.HTTPConnection object at 0x7fcb744741d0>, 'Connection to www.anatomyexpert.com timed out. (connect timeout=0.001)'))
http://www.google.com/patents/US9468466
HTTPConnectionPool(host='www.google.com', port=80): Max retries exceeded with url: /patents/US9468466 (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7fcb74324588>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
http://docpla

HTTPConnectionPool(host='www.somatics.de', port=80): Max retries exceeded with url: /artikel/for-professionals/2.../103-spinal-mechanics (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x7fcb74324240>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
set()


In [53]:
urls

[('intervertebral disc and all adjoining ligaments between them and excludes other connecting tissues such as muscles',
  ['https://en.wikipedia.org/wiki/Functional_spinal_unit',
   'https://books.google.com/books?isbn=1478443367',
   'https://quizlet.com/45435732/ap-1-exam-3-flash-cards/',
   'https://www.dartmouth.edu/~humananatomy/part_1/',
   'http://www.proz.com/kudoz/polish.../5558084-segment_ruchowy.html']),
 ('there is association between strenuous work like lifting and lumbar disc problems',
  ['https://www.ninds.nih.gov/Disorders/.../Low-Back-Pain-Fact-Sheet',
   'http://www.nytimes.com/health/guides/disease/herniated...disk/overview.html',
   'https://www.uptodate.com/.../low-back-pain-in-adults-beyond-the-basics',
   'https://www.mayfieldclinic.com/PE-HLdisc.htm',
   'http://www.disability-benefits-help.org/working.../degenerative-disc-disease']),
 ('mm\nby using appropriately sized and positioned implants and methods as described herein',
  ['https://www.ncbi.nlm.nih.gov/p