[Feature] Delete cloudsearch data upon scraping all data, Change s3 keys
RyanSept committed Mar 23, 2017
1 parent cdfe9d2 commit 8ee6dbc
Showing 6 changed files with 58 additions and 13 deletions.
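At a glance: each page scrape now also returns a batch of CloudSearch delete operations, scrape_site() saves that batch to a delete_*.json file (now git-ignored), and the next run replays it against CloudSearch before uploading fresh data; the S3 archive keys move under a data/ prefix. The delete mechanism is CloudSearch's ordinary batch document API: documents of "type": "delete" uploaded through upload_documents remove the matching ids from the search domain. A minimal standalone sketch of that call, not part of this commit; the endpoint URL and ids below are placeholders:

import json

import boto3

# Hypothetical document-service endpoint; the scrapers get theirs from
# CLOUDSEARCH_DOCTORS_ENDPOINT / CLOUDSEARCH_COS_ENDPOINT in the config.
cloudsearch = boto3.client(
    "cloudsearchdomain",
    endpoint_url="https://doc-example.us-east-1.cloudsearch.amazonaws.com")

# Same shape as the entries scrape_page() collects into delete_batch.
delete_batch = [
    {"type": "delete", "id": "1"},
    {"type": "delete", "id": "2"},
]

response = cloudsearch.upload_documents(
    documents=json.dumps(delete_batch), contentType="application/json")
print response.get("status")  # "success" when the batch is accepted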
3 changes: 2 additions & 1 deletion .gitignore
@@ -1 +1,2 @@
-*.pyc
+*.pyc
+delete_*.json
49 changes: 43 additions & 6 deletions healthtools/scrapers/base_scraper.py
@@ -14,34 +14,49 @@ def __init__(self):
         self.s3 = None
         self.s3_key = None
         self.document_id = 0 # id for each entry, to be incremented
+        self.delete_file = None # contains docs to be deleted after scrape
 
     def scrape_site(self):
         '''
         Scrape the whole site
         '''
         self.get_total_number_of_pages()
         all_results = []
+        delete_batch = []
         skipped_pages = 0
 
         print "Running {} ".format(type(self).__name__)
         for page_num in range(1, self.num_pages_to_scrape + 1):
             url = self.site_url.format(page_num)
             try:
                 print "Scraping page %s" % str(page_num)
-                entries = self.scrape_page(url)
+                scraped_page = self.scrape_page(url)
+                entries = scraped_page[0]
+                delete_docs = scraped_page[1]
 
                 all_results.extend(entries)
+                delete_batch.extend(delete_docs)
                 print "Scraped {} entries from page {} | {}".format(len(entries), page_num, type(self).__name__)
             except Exception as err:
                 skipped_pages += 1
                 print "ERROR: scrape_site() - source: {} - page: {} - {}".format(url, page_num, err)
                 continue
         print "| {} completed. | {} entries retrieved. | {} pages skipped.".format(type(self).__name__, len(all_results), skipped_pages)
 
-        all_results_json = json.dumps(all_results)
-        self.upload_data(all_results_json)
-        self.archive_data(all_results_json)
-
-        return all_results
+        if all_results:
+            all_results_json = json.dumps(all_results)
+            delete_batch = json.dumps(delete_batch)
+
+            self.delete_cloudsearch_docs()
+            self.upload_data(all_results_json)
+            self.archive_data(all_results_json)
+
+            # store delete operations for next scrape
+            delete_file = open(self.delete_file, "w")
+            delete_file.write(delete_batch)
+            delete_file.close()
+
+        return all_results
 
     def scrape_page(self, page_url):
         '''
@@ -53,17 +68,21 @@ def scrape_page(self, page_url):
             rows = table.find_all("tr")
 
             entries = []
+            delete_batch = []
             for row in rows:
                 # only the columns we want
                 # -1 because fields/columns has extra index; id
                 columns = row.find_all("td")[:len(self.fields) - 1]
                 columns = [text.text.strip() for text in columns]
                 columns.append(self.generate_id())
 
                 entry = dict(zip(self.fields, columns))
                 entry = self.format_for_cloudsearch(entry)
                 entries.append(entry)
 
+                delete_batch.append({"type": "delete", "id": entry["id"]})
                 self.document_id += 1
-            return entries
+            return entries, delete_batch
         except Exception as err:
             print "ERROR: Failed to scrape data from page {} -- {}".format(page_url, str(err))

@@ -92,6 +111,24 @@ def archive_data(self, payload):
         except Exception as err:
             print "ERROR - archive_data() - {} - {}".format(self.s3_key, str(err))
 
+    def delete_cloudsearch_docs(self):
+        '''
+        Delete documents that were uploaded to cloudsearch in the last scrape
+        '''
+        try:
+            # get documents to be deleted
+            with open(self.delete_file, "r") as delete_file:
+                delete_docs = delete_file.read()
+
+            # delete
+            response = self.cloudsearch.upload_documents(
+                documents=delete_docs, contentType="application/json"
+            )
+            print "DEBUG - delete_cloudsearch_docs() - {} - {}".format(type(self).__name__, response.get("status"))
+            return response
+        except Exception as err:
+            print "ERROR - delete_cloudsearch_docs() - {} - {}".format(type(self).__name__, str(err))
+
     def get_total_number_of_pages(self):
         '''
         Get the total number of pages to be scraped
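Net effect in base_scraper.py: scrape_site() finishes a run by writing the pending delete operations to self.delete_file, and the following run calls delete_cloudsearch_docs() to replay that file before the new batch is uploaded. A rough usage sketch, assuming the subclass in healthtools/scrapers/doctors.py is named DoctorsScraper (name assumed, not shown in this diff):

from healthtools.scrapers.doctors import DoctorsScraper

scraper = DoctorsScraper()

# First run: scrapes, uploads to CloudSearch and S3, and writes
# delete_doctors.json with one {"type": "delete", "id": ...} per entry.
# (No delete file exists yet, so delete_cloudsearch_docs() only logs an error.)
scraper.scrape_site()

# Next run: delete_cloudsearch_docs() replays delete_doctors.json against
# CloudSearch before the fresh data is uploaded and archived.
scraper.scrape_site()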
3 changes: 2 additions & 1 deletion healthtools/scrapers/clinical_officers.py
@@ -19,7 +19,8 @@ def __init__(self):
         self.cloudsearch = boto3.client(
             "cloudsearchdomain", **CLOUDSEARCH_COS_ENDPOINT)
         self.s3 = boto3.client("s3", **S3_CONFIG)
-        self.s3_key = "clinical_officers.json"
+        self.s3_key = "data/clinical_officers.json"
+        self.delete_file = "delete_clinical_officers.json"
 
     def format_for_cloudsearch(self, entry):
         '''
3 changes: 2 additions & 1 deletion healthtools/scrapers/doctors.py
@@ -19,7 +19,8 @@ def __init__(self):
         self.cloudsearch = boto3.client(
             "cloudsearchdomain", **CLOUDSEARCH_DOCTORS_ENDPOINT)
         self.s3 = boto3.client("s3", **S3_CONFIG)
-        self.s3_key = "doctors.json"
+        self.s3_key = "data/doctors.json"
+        self.delete_file = "delete_doctors.json"
 
     def format_for_cloudsearch(self, entry):
         '''
3 changes: 2 additions & 1 deletion healthtools/scrapers/foreign_doctors.py
@@ -18,7 +18,8 @@ def __init__(self):
         self.cloudsearch = boto3.client(
             "cloudsearchdomain", **CLOUDSEARCH_DOCTORS_ENDPOINT)
         self.s3 = boto3.client("s3", **S3_CONFIG)
-        self.s3_key = "foreign_doctors.json"
+        self.s3_key = "data/foreign_doctors.json"
+        self.delete_file = "delete_foreign_doctors.json"
 
     def format_for_cloudsearch(self, entry):
         '''
10 changes: 7 additions & 3 deletions healthtools/tests/test_scrapers.py
@@ -17,17 +17,17 @@ def test_it_gets_the_total_number_of_pages(self):

     def test_it_scrapes_doctors_page(self):
         entries = self.doctors_scraper.scrape_page(
-            "http://medicalboard.co.ke/online-services/retention/?currpage=1")
+            "http://medicalboard.co.ke/online-services/retention/?currpage=1")[0]
         self.assertTrue(len(entries[0]["fields"]) == 10)
 
     def test_it_scrapes_foreign_doctors_page(self):
         entries = self.foreign_doctors_scraper.scrape_page(
-            "http://medicalboard.co.ke/online-services/foreign-doctors-license-register/?currpage=1")
+            "http://medicalboard.co.ke/online-services/foreign-doctors-license-register/?currpage=1")[0]
         self.assertTrue(len(entries[0]["fields"]) == 10)
 
     def test_it_scrapes_clinical_officers_page(self):
         entries = self.clinical_officers_scraper.scrape_page(
-            "http://clinicalofficerscouncil.org/online-services/retention/?currpage=1")
+            "http://clinicalofficerscouncil.org/online-services/retention/?currpage=1")[0]
         self.assertTrue(len(entries[0]["fields"]) == 7)
 
     def test_it_scrapes_whole_doctors_site(self):
@@ -98,3 +98,7 @@ def test_clinical_officers_scraper_archives_to_s3(self):
         self.clinical_officers_scraper.s3.delete_object(
             Bucket="cfa-healthtools-ke",
             Key=self.clinical_officers_scraper.s3_key)
+
+    def test_foreign_doctors_scraper_deletes_cloudsearch_docs(self):
+        response = self.foreign_doctors_scraper.delete_cloudsearch_docs()
+        self.assertEqual(response.get("status"), "success")
