/
TutorialCrawler.java
115 lines (98 loc) · 3.59 KB
/
TutorialCrawler.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
package ai.preferred.crawler.tutorial.master;
import ai.preferred.crawler.tutorial.entity.Paper;
import ai.preferred.venom.Crawler;
import ai.preferred.venom.Session;
import ai.preferred.venom.fetcher.Fetcher;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
/**
 * Skeleton for the crawler tutorial exercises.
 * <p>
 * Every factory method below is an unfinished exercise: it currently returns
 * {@code null} and is meant to be replaced with a real implementation by the
 * person following the tutorial.
 * </p>
 */
public class TutorialCrawler {

  /** Logger for printing progress and results to the console. */
  private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(TutorialCrawler.class);

  /**
   * Session key for the list of papers that is shared between the main
   * method and the handler.
   */
  static final Session.Key<List<Paper>> PAPER_LIST_KEY = new Session.Key<>();

  /**
   * Exercise 1: Creating a crawler with default settings.
   * <p>
   * The crawler class is the "brain" behind the fetcher. It checks the
   * queue, tells the fetcher what to crawl, and decides where the response
   * should be processed. Creating a default crawler will automatically
   * create a default fetcher.
   * </p>
   *
   * @return a new instance of crawler; the unfinished exercise returns {@code null}
   */
  public static Crawler createCrawler() {
    // Exercise 1: replace this placeholder with a real crawler.
    return null;
  }

  /**
   * Exercise 2: Creating a fetcher that includes these (3) validators:
   * EmptyContentValidator, StatusOkValidator and TutorialValidator.
   * <p>
   * The fetcher class is like an internet browser: it allows you to fetch
   * a page given a request (URL). You can also constrain the validity of
   * the page by using validators.
   * </p>
   *
   * @return a new instance of fetcher; the unfinished exercise returns {@code null}
   */
  public static Fetcher createFetcher() {
    // Exercise 2: replace this placeholder with a real fetcher.
    return null;
  }

  /**
   * Exercise 3: Creating a session store with {@link #PAPER_LIST_KEY}.
   * <p>
   * A session store allows you to exchange information/objects between
   * your main method and your handler. As the crawler runs asynchronously,
   * you cannot pass parameters to the handler directly, nor return objects
   * from it directly. A session store overcomes this. The session key
   * {@link #PAPER_LIST_KEY} already exists; the task is to insert that key
   * into a session.
   * </p>
   *
   * @param papers the list intended to be stored under {@link #PAPER_LIST_KEY};
   *               not used while the exercise is unfinished
   * @return a new instance of session; the unfinished exercise returns {@code null}
   */
  public static Session createSession(List<Paper> papers) {
    // Exercise 3: replace this placeholder with a real session.
    return null;
  }

  /**
   * Exercise 4: Creating a crawler that uses a specified fetcher and session.
   * <p>
   * The crawler class is the "brain" behind the fetcher. It checks the
   * queue, tells the fetcher what to crawl, and decides where the response
   * should be processed. To use the fetcher and session you created, put
   * them into the crawler's builder.
   * </p>
   *
   * @param fetcher the fetcher the crawler should use; not used while the
   *                exercise is unfinished
   * @param session the session the crawler should use; not used while the
   *                exercise is unfinished
   * @return a new instance of crawler; the unfinished exercise returns {@code null}
   */
  public static Crawler createCrawler(Fetcher fetcher, Session session) {
    // Exercise 4: replace this placeholder with a real crawler.
    return null;
  }

  /**
   * Exercise 7: Putting it all together.
   * <p>
   * To run your crawler, you have to put it together into a main method.
   * In this exercise you will be crawling the page
   * {@literal https://preferred.ai/publications/}. Use this space to
   * initialise your crawler and schedule the request.
   * </p>
   *
   * @param args command-line arguments; not read by this method
   * @throws Exception declared so that failures while closing the crawler
   *                   resource can propagate
   */
  public static void main(String[] args) throws Exception {
    final List<Paper> papers = new ArrayList<>();

    // The try-with-resources block closes the crawler automatically on
    // completion. The resource is still a null placeholder, so nothing is
    // crawled (and nothing is closed) until the exercise is completed.
    try (final Crawler crawler = null) {
      // Exercise 7: schedule the request here.
    }

    LOGGER.info("You have crawled {} papers.", papers.size());
    for (final Paper paper : papers) {
      LOGGER.info("Name: {}, Url: {}", paper.getName(), paper.getUrl());
    }
  }

}