In [1]:
import pandas as pd
import os
import requests
import fitz
import time

In [2]:
# Folder the downloaded PDFs are written to: the download loop joins it with each
# generated '<year>_<tif_number>.pdf' file name.
COLLECTION_NAME = 'TIFpdfs'
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether
# it is still needed.
SOURCE_FOLDER = '../parsed_pdfs'

In [3]:
# Table of documents to fetch, one row per PDF. The download loop reads the
# 'url', 'year' and 'tif_number' columns of each row.
sources = pd.read_csv('source_status.csv')

In [4]:
def check_broken(save_path):
    """Report whether the PDF stored at ``save_path`` is unusable.

    Parameters
    ----------
    save_path : str
        Location of the file to inspect.

    Returns
    -------
    bool
        ``True`` when the file is missing, is empty, cannot be parsed by
        PyMuPDF, or parses to a document with zero pages; ``False`` otherwise.
    """
    # A missing or zero-byte file can never be a valid PDF. Answering here avoids
    # an exception from the parser when a failed download left nothing on disk.
    if not os.path.isfile(save_path) or os.path.getsize(save_path) == 0:
        return True

    try:
        with fitz.open(save_path) as pdf:
            # A document that opens but has no pages is treated as broken too.
            return len(pdf) == 0
    except fitz.FileDataError:
        # PyMuPDF could not make sense of the file contents.
        return True

In [5]:
def _remove_if_present(path):
	"""Delete ``path``, doing nothing when it does not exist."""
	try:
		os.remove(path)
	except FileNotFoundError:
		pass


def _stream_to_file(url, save_path, timeout):
	"""Stream ``url`` into ``save_path``; raise on any network, HTTP or disk error."""
	target_dir = os.path.dirname(save_path)
	if target_dir:
		os.makedirs(target_dir, exist_ok=True)

	try:
		# The context manager releases the connection even when streaming fails.
		with requests.get(url, stream=True, timeout=timeout) as response:
			# Without this an HTTP error page would be saved as if it were the PDF.
			response.raise_for_status()
			with open(save_path, 'wb') as pdf_file:
				for chunk in response.iter_content(chunk_size=1024):
					if chunk:
						pdf_file.write(chunk)
	except BaseException:
		# Never leave a partial file behind (this also covers KeyboardInterrupt):
		# a later run would otherwise find it and mistake it for a download.
		_remove_if_present(save_path)
		raise


def download_pdf(url, year, save_path, ignore_corrupted=False, dont_redownload=False,
		timeout=30, retry_delay=10):
	"""Download the PDF at ``url`` to ``save_path`` unless a usable copy already exists.

	Parameters
	----------
	url : str
		Address of the PDF.
	year :
		Accepted for compatibility with existing callers; not used by the download.
	save_path : str
		Destination file. Its parent folder is created when missing.
	ignore_corrupted : bool
		When true the file is never validated with ``check_broken``; any existing
		or freshly downloaded file is accepted as is.
	dont_redownload : bool
		When true a network/HTTP/disk error is not retried.
	timeout : float or tuple
		Passed to ``requests.get`` so a stalled server cannot block forever.
	retry_delay : float
		Seconds to wait before retrying after an error.

	Returns
	-------
	bool
		``True`` when ``save_path`` holds an accepted file, ``False`` when the
		download failed or stayed corrupted. No file is left behind on ``False``.
	"""
	corruption_retry_available = True

	if os.path.exists(save_path):
		if ignore_corrupted or not check_broken(save_path):
			print("Already have, skipping download")
			return True

		print(f'Download is corrupted, downloading again')
		_remove_if_present(save_path)
		# The re-download below is the one retry this file gets.
		corruption_retry_available = False

	error_retry_available = not dont_redownload

	while True:
		retry_notice = None
		try:
			_stream_to_file(url, save_path, timeout)
			is_broken = not ignore_corrupted and check_broken(save_path)
		except (requests.ConnectionError, requests.Timeout, requests.HTTPError) as e:
			status = getattr(getattr(e, 'response', None), 'status_code', None)
			if status is not None and 400 <= status < 500 and status not in (408, 429):
				# Client errors such as 404 are permanent: waiting and retrying
				# would only slow the whole run down.
				error_retry_available = False
			retry_notice = 'Error while downloading document. Waiting a bit then trying again'
		except Exception as e:
			print(f'General error while getting work: {e}')
			retry_notice = 'Waiting a bit then trying again'

		if retry_notice is not None:
			# The attempt raised; make sure nothing half-written survives.
			_remove_if_present(save_path)
			if not error_retry_available:
				print(f'Cannot download {url}')
				return False

			error_retry_available = False
			print(retry_notice)
			time.sleep(retry_delay)
			continue

		if not is_broken:
			return True

		# The transfer completed but the file is not a readable PDF.
		_remove_if_present(save_path)
		if not corruption_retry_available:
			# Something fishy is going on
			print(f'Cannot download {url}')
			return False

		corruption_retry_available = False
		print(f'Corrupted download of {url}, retrying')

In [9]:
# The destination folder must exist before the first file is written into it.
os.makedirs(COLLECTION_NAME, exist_ok=True)

# URLs whose download ultimately failed, kept so they can be inspected or retried
# afterwards instead of being recoverable only from the printed log.
failed_urls = []

for _, row in sources.iterrows():
	save_path = os.path.join(COLLECTION_NAME, f'{row["year"]}_{row["tif_number"]}.pdf')

	# download_pdf signals failure through its return value.
	if not download_pdf(row['url'], row['year'], save_path):
		failed_urls.append(row['url'])

print(f'{len(failed_urls)} of {len(sources)} downloads failed')
	

Corrupted download of https://www.chicago.gov/content/dam/city/depts/dcd/tif/03reports/T_129_DiverseyNaragansettAR04.pdf, retrying
Cannot download https://www.chicago.gov/content/dam/city/depts/dcd/tif/03reports/T_129_DiverseyNaragansettAR04.pdf
Corrupted download of https://www.chicago.gov/content/dam/city/depts/dcd/tif/03reports/T_130_EdgewaterAshlandAR04.pdf, retrying
Cannot download https://www.chicago.gov/content/dam/city/depts/dcd/tif/03reports/T_130_EdgewaterAshlandAR04.pdf
Already have, skipping download


KeyboardInterrupt: 