-
Notifications
You must be signed in to change notification settings - Fork 149
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implemented simple image download retry #261
Changes from all commits
b99aafc
acdc611
6e1640c
2469d2c
2011380
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -251,10 +251,15 @@ def getPageTitlesAPI(config={}, session=None): | |
try: | ||
r = session.post(url=config['api'], data=params) | ||
break | ||
except ConnectionError as err: | ||
print "Connection error: %s" % (str(err),) | ||
except requests.exceptions.RequestException as err: | ||
print "Request error: %s" % (str(err),) | ||
retryCount += 1 | ||
time.sleep(20) | ||
except TypeError as e: | ||
if '__str__ returned non-string' in str(e): | ||
print 'urllib3 had an SSL handshake error with pyOpenSSL' | ||
retryCount += 1 | ||
time.sleep(20) | ||
handleStatusCode(r) | ||
# FIXME Handle HTTP errors here! | ||
jsontitles = getJSON(r) | ||
|
@@ -540,9 +545,13 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None): | |
r = session.post(url=config['index'], data=params, headers=headers) | ||
handleStatusCode(r) | ||
xml = fixBOM(r) | ||
except requests.exceptions.ConnectionError as e: | ||
print ' Connection error: %s'%(str(e[0])) | ||
except requests.exceptions.RequestException as e: | ||
print ' Request error: %s'%(str(e[0])) | ||
xml = '' | ||
except TypeError as e: | ||
if '__str__ returned non-string' in str(e): | ||
print 'urllib3 had an SSL handshake error with pyOpenSSL' | ||
xml = '' | ||
c += 1 | ||
|
||
return xml | ||
|
@@ -1058,53 +1067,86 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None): | |
if not os.path.isdir(imagepath): | ||
print 'Creating "%s" directory' % (imagepath) | ||
os.makedirs(imagepath) | ||
|
||
|
||
maxseconds = 100 # max seconds to wait in a single sleeping | ||
maxretries = config['retries'] # x retries and skip | ||
increment = 20 # increment every retry | ||
|
||
c = 0 | ||
lock = True | ||
if not start: | ||
lock = False | ||
for filename, url, uploader in images: | ||
retried = 0 | ||
completed = False | ||
if filename == start: # start downloading from start (included) | ||
lock = False | ||
if lock: | ||
continue | ||
|
||
delay(config=config, session=session) | ||
|
||
# saving file | ||
# truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash | ||
# limit). Later .desc is added to filename, so better 100 as max) | ||
filename2 = urllib.unquote(filename) | ||
if len(filename2) > other['filenamelimit']: | ||
# split last . (extension) and then merge | ||
filename2 = truncateFilename(other=other, filename=filename2) | ||
print 'Filename is too long, truncating. Now it is:', filename2 | ||
filename3 = u'%s/%s' % (imagepath, filename2) | ||
imagefile = open(filename3, 'wb') | ||
r = requests.get(url=url) | ||
imagefile.write(r.content) | ||
imagefile.close() | ||
# saving description if any | ||
try: | ||
title = u'Image:%s' % (filename) | ||
xmlfiledesc = getXMLFileDesc( | ||
config=config, | ||
title=title, | ||
session=session) # use Image: for backwards compatibility | ||
except PageMissingError: | ||
xmlfiledesc = '' | ||
logerror( | ||
config=config, | ||
text=u'The page "%s" was missing in the wiki (probably deleted)' % (title.decode('utf-8')) | ||
) | ||
|
||
f = open('%s/%s.desc' % (imagepath, filename2), 'w') | ||
# <text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text> | ||
if not re.search(r'</mediawiki>', xmlfiledesc): | ||
# failure when retrieving desc? then save it as empty .desc | ||
xmlfiledesc = '' | ||
f.write(xmlfiledesc.encode('utf-8')) | ||
f.close() | ||
delay(config=config, session=session) | ||
|
||
while not completed: | ||
if retried > 0 and retried < maxretries: | ||
wait = increment * retried < maxseconds and increment * \ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Place the code on one line; the comment should precede the code. |
||
retried or maxseconds # incremental until maxseconds | ||
# print xml | ||
print ' In attempt %d, Image "%s" failed to download. Waiting %d seconds and reloading...'%(retried, filename, wait) | ||
time.sleep(wait) | ||
if retried >= maxretries: | ||
print ' We have retried %d times. Now skipping.' % (retried) | ||
print ' Image download error for "%s", network error or whatever...' % (filename) | ||
break | ||
retried += 1 | ||
|
||
# saving file | ||
# truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash | ||
# limit). Later .desc is added to filename, so better 100 as max) | ||
filename2 = urllib.unquote(filename) | ||
if len(filename2) > other['filenamelimit']: | ||
# split last . (extension) and then merge | ||
filename2 = truncateFilename(other=other, filename=filename2) | ||
print 'Filename is too long, truncating. Now it is:', filename2 | ||
filename3 = u'%s/%s' % (imagepath, filename2) | ||
try: | ||
r = requests.get(url=url) | ||
if r.status_code != 200: | ||
print ' Image download error: %d'%(r.status_code) | ||
continue | ||
imagefile = open(filename3, 'wb') | ||
imagefile.write(r.content) | ||
imagefile.close() | ||
except requests.exceptions.RequestException as e: | ||
print ' Image download error: %s'%(str(e[0])) | ||
continue | ||
except TypeError as e: | ||
if '__str__ returned non-string' in str(e): | ||
print 'urllib3 had an SSL handshake error with pyOpenSSL' | ||
continue | ||
|
||
# saving description if any | ||
try: | ||
title = u'Image:%s' % (filename) | ||
xmlfiledesc = getXMLFileDesc( | ||
config=config, | ||
title=title, | ||
session=session) # use Image: for backwards compatibility | ||
except PageMissingError: | ||
xmlfiledesc = '' | ||
logerror( | ||
config=config, | ||
text=u'The page "%s" was missing in the wiki (probably deleted)' % (title.decode('utf-8')) | ||
) | ||
|
||
f = open('%s/%s.desc' % (imagepath, filename2), 'w') | ||
# <text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text> | ||
if not re.search(r'</mediawiki>', xmlfiledesc): | ||
# failure when retrieving desc? then save it as empty .desc | ||
xmlfiledesc = '' | ||
f.write(xmlfiledesc.encode('utf-8')) | ||
f.close() | ||
completed = True | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Did you test this code? I am only reading the code, and it seems like this will always result in […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. `completed` is set to False before the while loop. |
||
|
||
c += 1 | ||
if c % 10 == 0: | ||
print ' Downloaded %d images' % (c) | ||
|
@@ -1370,11 +1412,17 @@ def getParameters(params=[]): | |
try: | ||
check = checkAPI(api=api, session=session) | ||
break | ||
except requests.exceptions.ConnectionError as e: | ||
print 'Connection error: %s'%(str(e)) | ||
except requests.exceptions.RequestException as e: | ||
print 'Request error: %s'%(str(e)) | ||
retry += 1 | ||
print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay) | ||
time.sleep(retrydelay) | ||
except TypeError as e: | ||
if '__str__ returned non-string' in str(e): | ||
print 'urllib3 had an SSL handshake error with pyOpenSSL' | ||
retry += 1 | ||
print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay) | ||
time.sleep(retrydelay) | ||
if api and check: | ||
index2 = check[1] | ||
api = check[2] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we make this something the user can configure, or should we enforce it as a hard limit?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's currently a fixed value for the XML part as well.
I've only followed that convention. The number of retries is configurable, though.