-
Notifications
You must be signed in to change notification settings - Fork 0
/
collecte.py
95 lines (78 loc) · 2.91 KB
/
collecte.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import metmuseum
import time
import asyncio
import pandas as pd
from nettoyage import clean
from termcolor import cprint
# Number of object requests fetch_data() schedules before it awaits the
# batch and pauses; acts as the batch size for throttling.
REQUEST_LIMIT = 80
def compute_remaining_time(beginning_time, nrequests, max_requests):
    """Estimate the seconds left, extrapolating the average time per request.

    Args:
        beginning_time: ``time.time()`` timestamp taken when the run started.
        nrequests: Number of requests issued so far.
        max_requests: Total number of requests planned.

    Returns:
        Estimated remaining time in seconds, never negative. ``0.0`` is
        returned when no request has been issued yet, because there is no
        average to extrapolate from.
    """
    if nrequests <= 0:
        # Avoid dividing by zero before the first request has been counted.
        return 0.0
    avg_time = (time.time() - beginning_time) / nrequests
    # Clamp so callers can always pass the result to time.gmtime().
    return max(0.0, (max_requests - nrequests) * avg_time)
def _print_progress(beginning_time, sent, total):
    """Rewrite the current terminal line with the request count and the ETA."""
    print("\rRequete ", end="")
    cprint(f"{sent}", "light_yellow", end="")
    print("/", end="")
    cprint(f"{str(total)}", "light_magenta", end="")
    print(" | Temps restant estime ", end="")
    remaining_time = time.gmtime(
        compute_remaining_time(beginning_time, sent, total))
    cprint(f"{time.strftime('%M:%S', remaining_time)}", "light_red", end="")
    print(" min", end="")


async def fetch_data():
    """Download the matching objects in batches and export them to CSV.

    The object ids are obtained from ``metmuseum.fetch`` and then requested
    through ``metmuseum.fetch_object`` in batches of ``REQUEST_LIMIT``
    concurrent requests, with a 1.2 second pause between batches. The
    collected records are normalised with pandas, passed through ``clean``
    and written to ``./shiny/data/data.csv``.
    """
    beginning_time = time.time()
    res = []
    cprint(f"{time.strftime('%X')}", "light_cyan", end="")
    print(" Envoi des requetes")
    # Fetch the ids of paintings created in Europe.
    # The "European Paintings" department is deliberately not used as the
    # filter because some paintings belong to other departments.
    ids = await metmuseum.fetch(geoLocation="Europe", medium="Paintings", q="\"\"")
    requests_number = ids["total"]
    # NOTE(review): "objectIDs" is presumably null when nothing matches;
    # fall back to an empty list so the loop below is simply skipped.
    object_ids = list(ids["objectIDs"] or [])
    print("Requete ", end="")
    cprint("0", "light_yellow", end="")
    print("/", end="")
    cprint(f"{str(requests_number)}", "light_magenta", end="")
    for start in range(0, len(object_ids), REQUEST_LIMIT):
        if start:
            # Pause between batches without blocking the event loop.
            await asyncio.sleep(1.2)
        batch = object_ids[start:start + REQUEST_LIMIT]
        tasks = [asyncio.ensure_future(metmuseum.fetch_object(object_id))
                 for object_id in batch]
        _print_progress(beginning_time, start + len(batch), requests_number)
        # Every batch is awaited, including a final one that is exactly
        # REQUEST_LIMIT long, so no result is dropped.
        res += await asyncio.gather(*tasks)
    print()
    cprint(f"{time.strftime('%X')}", "light_cyan", end="")
    print(" Exportation des donnees vers shiny/data/data.csv")
    data_frame = pd.json_normalize(res)
    data_frame = clean(data_frame)
    data_frame.to_csv("./shiny/data/data.csv", index=False, encoding="utf-8")
async def main():
    """Run the collection, report the elapsed time and close the session.

    Prints a banner, awaits ``fetch_data()``, then prints the total run time
    as ``MM:SS``. ``metmuseum.close_session()`` is awaited in a ``finally``
    clause so it also runs when the collection raises.
    """
    beginning_time = time.time()
    print("===== Collecte de donnees sur ", end="")
    cprint("https://www.metmuseum.org", "light_green", end="")
    print(" =====")
    try:
        await fetch_data()
        elapsed_time = time.gmtime(time.time() - beginning_time)
        print("Temps d'execution : ", end="")
        cprint(f"{time.strftime('%M:%S', elapsed_time)}", "light_red")
    finally:
        # Always release the session, even if fetch_data() failed part-way.
        await metmuseum.close_session()
# Script entry point: run the collection only when executed directly,
# not when the module is imported.
if __name__ == "__main__":
    asyncio.run(main())