-
Notifications
You must be signed in to change notification settings - Fork 1
/
archives.py
111 lines (86 loc) · 3.01 KB
/
archives.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Author attribution for this module.
__author__ = '/u/Praisebetoscience'
import time
import requests
from requests.exceptions import ConnectionError, ConnectTimeout, HTTPError
from urllib.parse import urlencode
import re
# Maximum number of characters of ``text`` that ArchiveContainer keeps before
# cutting it off and appending "...".
LEN_MAX = 35
# time.strftime pattern (YYYYMMDDhhmmss) used to build the timestamp segment
# of the web.archive.org snapshot URL.
ARCHIVE_ORG_FORMAT = "%Y%m%d%H%M%S"
# requests exceptions that the archive() methods catch and report as a failed
# attempt (they return False) instead of propagating.
RECOVERABLE_EXC = (ConnectionError, ConnectTimeout, HTTPError)
def ratelimit(max_per_second):
    """Build a decorator that throttles calls to the function it wraps.

    Consecutive calls to the decorated function are spaced at least
    ``1 / max_per_second`` seconds apart; a call that arrives too early
    blocks in ``time.sleep`` for the remaining time.  The very first call
    never waits.

    Args:
        max_per_second: Maximum call rate.  May be fractional, e.g. ``0.5``
            allows one call every two seconds.

    Returns:
        A decorator producing a wrapper with the same signature, name and
        docstring as the wrapped function.

    Raises:
        ZeroDivisionError: If ``max_per_second`` is zero.
    """
    # Function-scope import so this block does not depend on the file header.
    from functools import wraps

    min_interval = 1.0 / float(max_per_second)

    def decorate(func):
        # -inf makes the first computed wait negative, so the first call
        # goes through immediately regardless of the clock's origin.
        last_time_called = float("-inf")

        @wraps(func)
        def rate_limited_func(*args, **kargs):
            nonlocal last_time_called
            # time.monotonic() measures wall-clock elapsed time and cannot go
            # backwards.  time.clock() (used previously) was removed in
            # Python 3.8 and measured CPU time on Unix, which does not
            # advance while sleeping.
            elapsed = time.monotonic() - last_time_called
            left_to_wait = min_interval - elapsed
            if left_to_wait > 0:
                time.sleep(left_to_wait)
            try:
                return func(*args, **kargs)
            finally:
                # Stamp the call even when func raises: the attempt was still
                # made, so the next call must be spaced from it as well.
                last_time_called = time.monotonic()

        return rate_limited_func

    return decorate
class ArchiveIsArchive(object):
    """Submit a URL to archive.is and record the outcome.

    Attributes:
        archived: The snapshot URL returned by :meth:`archive`, or ``False``
            when the submission failed.
        error_link: An ``https://archive.is/?url=...&run=1`` link built from
            the original URL.
    """

    def __init__(self, url):
        """Archive ``url`` immediately and build the matching error link."""
        self.archived = self.archive(url)
        self.error_link = "https://archive.is/?" + urlencode({'url': url, 'run': 1})

    @staticmethod
    @ratelimit(0.5)
    def archive(url, timeout=120):
        """Post ``url`` to archive.is and extract the snapshot address.

        Args:
            url: The page to archive.
            timeout: Seconds passed to ``requests`` as the connect/read
                timeout, so a stalled server cannot block the caller forever.

        Returns:
            The ``http(s)://archive.is/<id>`` snapshot URL found in the
            response body, or ``False`` if the request failed, timed out,
            returned a non-200 status, or contained no snapshot URL.
        """
        params = {'url': url}
        try:
            res = requests.post('https://archive.is/submit/', data=params,
                                timeout=timeout)
        # Timeout is added because setting a timeout can raise ReadTimeout,
        # which RECOVERABLE_EXC does not cover.
        except RECOVERABLE_EXC + (requests.exceptions.Timeout,):
            return False
        if res.status_code != 200:
            return False
        # The snapshot id is restricted to alphanumerics.  The former class
        # [0-z] also spans ':;<=>?@[\]^_`' and could capture non-snapshot
        # links such as "https://archive.is/?url=".
        url_re = re.search(
            r'^.*(?:archiveurl.{0,10}|replace\(")(?P<url>https?://archive\.is/[0-9a-z]{1,6}).*$',
            res.text, flags=re.I | re.M)
        if url_re:
            return url_re.group('url')
        return False
class ArchiveOrgArchive(object):
    """Ask the Wayback Machine (web.archive.org) to save a URL.

    Attributes:
        archived: Result of :meth:`archive` — a snapshot URL, ``None`` or
            ``False``.
        error_link: The ``https://web.archive.org/save/<url>`` link for the
            original URL.
    """

    def __init__(self, url):
        """Archive ``url`` immediately and build the matching error link."""
        self.archived = self.archive(url)
        self.error_link = "https://web.archive.org/save/" + url

    @staticmethod
    @ratelimit(0.5)
    def archive(url, timeout=120):
        """Request a Wayback Machine capture of ``url``.

        Args:
            url: The page to archive.
            timeout: Seconds passed to ``requests`` as the connect/read
                timeout, so a stalled server cannot block the caller forever.

        Returns:
            A ``https://web.archive.org/<timestamp>/<url>`` link on HTTP 200,
            ``None`` on HTTP 403 (presumably the capture is refused for this
            URL — confirm with callers), and ``False`` for any other status
            or when the request failed or timed out.
        """
        try:
            res = requests.get('https://web.archive.org/save/' + url,
                               timeout=timeout)
        # Timeout is added because setting a timeout can raise ReadTimeout,
        # which RECOVERABLE_EXC does not cover.
        except RECOVERABLE_EXC + (requests.exceptions.Timeout,):
            return False
        if res.status_code == 200:
            # Wayback timestamps are expressed in UTC, so format the current
            # time from gmtime() rather than the machine's local time.
            date = time.strftime(ARCHIVE_ORG_FORMAT, time.gmtime())
            return 'https://web.archive.org/' + date + '/' + url
        if res.status_code == 403:
            return None
        return False
class MegalodonJPArchive(object):
    """Submit a URL to megalodon.jp and record the outcome.

    Attributes:
        archived: The URL the submission ended on, or ``False`` on failure.
        error_link: Always ``http://megalodon.jp``.
    """

    def __init__(self, url):
        """Archive ``url`` immediately and set the fixed error link."""
        self.archived = self.archive(url)
        self.error_link = "http://megalodon.jp"

    @staticmethod
    @ratelimit(0.5)
    def archive(url, timeout=120):
        """Post ``url`` to megalodon.jp's ``get_simple/decide`` endpoint.

        Args:
            url: The page to archive.
            timeout: Seconds passed to ``requests`` as the connect/read
                timeout, so a stalled server cannot block the caller forever.

        Returns:
            The final response URL when it differs from the submission
            endpoint, otherwise ``False``.  ``False`` is also returned when
            the request failed or timed out.
        """
        params = {'url': url}
        try:
            res = requests.post("http://megalodon.jp/pc/get_simple/decide",
                                data=params, timeout=timeout)
        # Timeout is added because setting a timeout can raise ReadTimeout,
        # which RECOVERABLE_EXC does not cover.
        except RECOVERABLE_EXC + (requests.exceptions.Timeout,):
            return False
        # Still sitting on the submission endpoint means no redirect to a
        # result page happened, which is treated as failure.
        if res.url == 'http://megalodon.jp/pc/get_simple/decide':
            return False
        return res.url
class ArchiveContainer(object):
    """Bundle a URL, its (possibly shortened) text and its archive attempts.

    Attributes:
        url: The original URL.
        text: ``text`` as given, or its first ``LEN_MAX`` characters followed
            by "..." when it is longer than ``LEN_MAX``.
        archives: One instance each of ArchiveIsArchive, ArchiveOrgArchive
            and MegalodonJPArchive, created for ``url`` in that order.
    """

    def __init__(self, url, text):
        """Store the URL, shorten the text and run every archiver on ``url``."""
        self.url = url
        if len(text) > LEN_MAX:
            text = text[:LEN_MAX] + "..."
        self.text = text
        archivers = (ArchiveIsArchive, ArchiveOrgArchive, MegalodonJPArchive)
        self.archives = [archiver(url) for archiver in archivers]

    def __iter__(self):
        """Yield the archive objects in creation order."""
        yield from self.archives
def main():
    """Script entry point; currently a placeholder that does nothing."""
    return None


if __name__ == "__main__":
    main()