In [2]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import sys

In [3]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


In [4]:
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)

In [5]:
def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

In [6]:
def extract_sample(url, output_directory):
    """
    Extracts the html from the webpage, finds the div tag with the id = 'sampletext' and performs text splitting to remove any other html elements.
    It then writes the reamining text to a new txt file in a directory specified by output_directory.
    """
    raw_html =  simple_get(url)  
    try:
        html = BeautifulSoup(raw_html, 'html.parser')  
    except TypeError:
        print('THE ERROR OCCURED AT THIS URL: '+ str(url))
        print('RAW HTML: ')
        print('\n')
        print(raw_html)
        

    for p in html.select('div'):
        try:
            if p['id'] == 'sampletext':
                text = p.get_text()
        except KeyError:
            continue
        
    #https://stackoverflow.com/questions/904746/how-to-remove-all-characters-after-a-specific-character-in-python

    text = text.split('Sample Type', 1)[-1]
    text = text.split('Keywords', 1)[0]
    text = text.replace('(adsbygoogle = window.adsbygoogle || []).push({});', '')
    text = text.replace('(Medical Transcription Sample Report)', '')
    text = text.rstrip()
    

    title = text.split('Sample Name: ')[-1]
    title = title.split('Description: ')[0]
    text = text.split(title)[-1]
    title = title.replace ('\n', '')
    title = title.replace(' ', '_')
    title = title.replace('-','')
    title = title.replace('/','_')
    

    
    file_name = output_directory + '/' +  title + '.txt'

    

    text_file = open(file_name, "w+")
    text_file.write(text)
    text_file.close()

In [7]:
#https://www.crummy.com/software/BeautifulSoup/bs4/doc/#get-text
def retrieve_top_layer_urls(url):
    """
    Retrieves the url of the subsections without the MTSamples website.
    """
    links =[]
    raw_html =  simple_get(url)  
    html = BeautifulSoup(raw_html, 'html.parser')  
    for link in html.find_all('a'):
        links.append(link.get('href'))
    links = [x for x in links if x is not None] 
    links =  [x for x in links if x.startswith('/site/pages/browse')]
    del links[:40]  
    
    links =  [ ('http://mtsamples.com' + x) for x in links ]

    return links

In [3]:
def retrieve_bottom_layer_urls(url):
    """
    Retrieves the url of the inidividual sample pages from the subsection pages. This has some redudancy as a url may appear multiple times on page. In the end it 
    doesn't matter because the extract_sample() function will overwrite any already existing documents with the same name. 
    """
    links =[]
    raw_html =  simple_get(url)  
    html = BeautifulSoup(raw_html, 'html.parser')  
    for link in html.find_all('a'):
        links.append(link.get('href'))
    links = [x for x in links if x is not None]

   
    links =  [x for x in links if x.startswith('/site/pages/sample.asp')]
    
    links =  [ ('http://mtsamples.com' + x) for x in links ]

   
    return links

In [11]:
def mt_samples_extractor():
    """
    This function loops through all the available url links that link to inidividual samples and then calls the extract_sample() function to extract the text from the sample pages.
    """
    top_level_links = retrieve_top_layer_urls('http://mtsamples.com/')
    for i, link in enumerate(top_level_links):
        print(i, " : ", link)

    # for x in top_level_links:
    #     bottom_level_links = retrieve_bottom_layer_urls(x)
    #     for y in bottom_level_links:
    #         extract_sample(y, sys.argv[1])

In [None]:
##### To launch the code, in the command line write:  python web_extract.py text_folder_name
##### Where text_folder_name is the folder into which you want the text to be written to
mt_samples_extractor()

# My Code

Clean up code with checks and correct functions and move to other branch

In [567]:
link = "https://mtsamples.com/site/pages/sitemap.asp"
base = "https://mtsamples.com"
res = get(link)
# if res.status_code == 200:
#     print("Success")
html = BeautifulSoup(res.content, 'html.parser')

In [79]:
section_dict = {}

for section in html.find("ul", {"class": "list-unstyled"}).findChildren("a" , recursive=True):
    if section.find("img") is not None:
        current_section = section.text.strip()
        section_dict[current_section] = []
    else:
        section_dict[current_section].append(f"{base}{section.get('href')}")

In [568]:
def get_values(html):
    section_words = []
    new_html = str(html).split('<hr/>')[1]
    for item in html.find_all("b"):
        text = item.text.strip(" :")
        if text.isupper():
            section_words.append(text)
                
    return section_words

In [599]:
final_dict = {}
for key, value in section_dict.items():
    print(key)
    for link in section_dict[key]:
        res = get(link)
        html = BeautifulSoup(res.content, 'html.parser')
        final_dict[link] = [get_values(html)]

Allergy / Immunology
Autopsy
Bariatrics
Cardiovascular / Pulmonary
Chiropractic
Consult - History and Phy.
Cosmetic / Plastic Surgery
Dentistry
Dermatology
Diets and Nutritions
Discharge Summary
Emergency Room Reports
Endocrinology
ENT - Otolaryngology
Gastroenterology
General Medicine
Hematology - Oncology
Hospice - Palliative Care
IME-QME-Work Comp etc.
Lab Medicine - Pathology
Letters
Nephrology
Neurology
Neurosurgery
Obstetrics / Gynecology
Office Notes
Ophthalmology
Orthopedic
Pain Management
Pediatrics - Neonatal
Physical Medicine - Rehab
Podiatry
Psychiatry / Psychology
Radiology


IndexError: list index out of range

In [598]:
import pandas as pd
df = pd.DataFrame.from_dict(final_dict, orient="columns").melt(var_name="link", value_name="words")#, columns=["Link", "Words"])
df

Unnamed: 0,link,words
0,https://mtsamples.com/site/pages/sample.asp?ty...,"[SUBJECTIVE, MEDICATIONS, ALLERGIES, OBJECTIVE..."
1,https://mtsamples.com/site/pages/sample.asp?ty...,"[HISTORY, PAST MEDICAL HISTORY, PAST SURGICAL ..."
2,https://mtsamples.com/site/pages/sample.asp?ty...,"[CHIEF COMPLAINT, PAST MEDICAL HISTORY, IMMUNI..."
3,https://mtsamples.com/site/pages/sample.asp?ty...,"[HISTORY, PAST MEDICAL HISTORY, PAST SURGICAL ..."
4,https://mtsamples.com/site/pages/sample.asp?ty...,"[HISTORY, IMPRESSION, RECOMMENDATIONS]"
5,https://mtsamples.com/site/pages/sample.asp?ty...,"[SUBJECTIVE, REVIEW OF SYSTEMS, PAST MEDICAL H..."
6,https://mtsamples.com/site/pages/sample.asp?ty...,"[ADMITTING DIAGNOSIS, DISCHARGE DIAGNOSIS, HOS..."


In [457]:
# I need sample name
# I need url
# I need everything between "Description" and "Keywords"