Skip to content
Permalink
Browse files

Added CSV writing

  • Loading branch information...
lwj5 committed Sep 28, 2018
1 parent bb6fd85 commit ee569bf39d06bd228fd18d2c44e9956818085a33
@@ -1,3 +1,4 @@
/target/
/.idea/
/*.iml
/*.iml
results.csv
@@ -76,6 +76,11 @@
<artifactId>venom</artifactId>
<version>[4.0,4.1)</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
<version>1.6</version>
</dependency>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
@@ -0,0 +1,60 @@
package ai.preferred.crawler.example;

import ai.preferred.crawler.example.entity.Listing;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileWriter;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.List;

/**
 * Writes {@link Listing} entities to a CSV file using Apache Commons CSV.
 *
 * <p>The header row is derived reflectively from the declared fields of
 * {@link Listing}; each call to {@link #append(Listing)} writes one record
 * whose columns are the field values in declaration order.
 *
 * <p>{@link #append(Listing)} is synchronized, so a single instance may be
 * shared by multiple handler threads. Close the instance (or use
 * try-with-resources) to flush and release the underlying file.
 */
public class EntityCSVStorage implements AutoCloseable {

  private static final Logger LOGGER = LoggerFactory.getLogger(EntityCSVStorage.class);

  /** Printer backing the CSV file; flushed after every record and closed in {@link #close()}. */
  private final CSVPrinter printer;

  /**
   * Opens {@code file} for writing and writes the header row.
   *
   * @param file path of the CSV file to create/overwrite
   * @throws IOException if the file cannot be opened or the header cannot be written
   */
  public EntityCSVStorage(String file) throws IOException {
    final FileWriter writer = new FileWriter(file);
    try {
      printer = new CSVPrinter(writer, CSVFormat.EXCEL);
      printer.printRecord(getHeaderList(Listing.class));
    } catch (IOException e) {
      // Fix: the original leaked the open FileWriter if writing the header
      // threw after the file was opened. Close it and keep the root cause.
      try {
        writer.close();
      } catch (IOException suppressed) {
        e.addSuppressed(suppressed);
      }
      throw e;
    }
  }

  /**
   * Returns the declared field names of {@code clazz}, in declaration order,
   * for use as the CSV header row.
   *
   * @param clazz class whose declared fields name the columns
   * @return list of column names
   */
  private static List<String> getHeaderList(Class<?> clazz) {
    // Fix: parameterize the raw Class type.
    final List<String> result = new ArrayList<>();
    for (final Field field : clazz.getDeclaredFields()) {
      result.add(field.getName());
    }
    return result;
  }

  /**
   * Converts {@code object} into a list of its declared field values, in the
   * same order as the header produced by {@link #getHeaderList(Class)}.
   *
   * @param object entity to convert
   * @return field values in declaration order
   * @throws IllegalAccessException if a field cannot be read reflectively
   */
  private List<Object> toList(Object object) throws IllegalAccessException {
    final Field[] fields = object.getClass().getDeclaredFields();
    final List<Object> result = new ArrayList<>();
    for (final Field field : fields) {
      // Private fields must be made accessible before reading.
      field.setAccessible(true);
      result.add(field.get(object));
    }
    return result;
  }

  /**
   * Appends one listing as a CSV record and flushes it to disk.
   *
   * @param listing entity to write
   * @return {@code true} on success, {@code false} if the record could not be written
   */
  public synchronized boolean append(Listing listing) {
    try {
      printer.printRecord(toList(listing));
      // Flush per record so partial results survive a crash of the crawler.
      printer.flush();
    } catch (IOException | IllegalAccessException e) {
      // Fix: message said "property" but this class stores listings.
      LOGGER.error("unable to store listing: ", e);
      return false;
    }
    return true;
  }

  /**
   * Flushes any buffered output and closes the underlying file.
   *
   * @throws IOException if flushing or closing fails
   */
  @Override
  public void close() throws IOException {
    // close(true) flushes before closing the underlying writer.
    printer.close(true);
  }

}
@@ -5,6 +5,7 @@
*/
package ai.preferred.crawler.example.master;

import ai.preferred.crawler.example.EntityCSVStorage;
import ai.preferred.crawler.example.entity.Listing;
import ai.preferred.venom.Crawler;
import ai.preferred.venom.Session;
@@ -29,28 +30,41 @@
// Create session keys for things you would like to retrieve from handler
static final Session.Key<ArrayList<Listing>> JOB_LIST_KEY = new Session.Key<>();

// Create session keys for CSV printer to print from handler
static final Session.Key<EntityCSVStorage> CSV_STORAGE_KEY = new Session.Key<>();

public static void main(String[] args) {

// Let's init the session, this allows us to retrieve the array list in the handler
final ArrayList<Listing> jobListing = new ArrayList<>();
final Session session = Session.builder()
.put(JOB_LIST_KEY, jobListing)
.build();
// Get project directory
String workingDir = System.getProperty("user.dir");

// Start CSV printer
try (final EntityCSVStorage printer = new EntityCSVStorage(workingDir + "/results.csv")) {

// Let's init the session, this allows us to retrieve the array list in the handler
final ArrayList<Listing> jobListing = new ArrayList<>();
final Session session = Session.builder()
.put(JOB_LIST_KEY, jobListing)
.put(CSV_STORAGE_KEY, printer)
.build();

// Start crawler
try (final Crawler crawler = crawler(fetcher(), session).start()) {
LOGGER.info("Starting crawler...");

final String startUrl = "https://stackoverflow.com/jobs?l=Singapore&d=20&u=Km";

// Start crawler
try (Crawler crawler = crawler(fetcher(), session).start()) {
LOGGER.info("Starting crawler...");
// pass in URL and handler or use a HandlerRouter
crawler.getScheduler().add(new VRequest(startUrl), new ListingHandler());
}

final String startUrl = "https://stackoverflow.com/jobs?l=Singapore&d=20&u=Km";
// We will retrieve all the listing here
LOGGER.info("We have found {} listings!", jobListing.size());

// pass in URL and handler or use a HandlerRouter
crawler.getScheduler().add(new VRequest(startUrl), new ListingHandler());
} catch (Exception e) {
LOGGER.error("Could not run crawler: ", e);
}

// We will retrieve all the listing here
LOGGER.info("We have found {} listings!", jobListing.size());
}


@@ -5,6 +5,7 @@
*/
package ai.preferred.crawler.example.master;

import ai.preferred.crawler.example.EntityCSVStorage;
import ai.preferred.crawler.example.entity.Listing;
import ai.preferred.venom.Handler;
import ai.preferred.venom.Session;
@@ -33,10 +34,13 @@ public void handle(Request request, VResponse response, Scheduler scheduler, Ses
// Get the job listing array list we created
final ArrayList<Listing> jobListing = session.get(ListingCrawler.JOB_LIST_KEY);

// Get the job listing array list we created
final EntityCSVStorage csvStorage = session.get(ListingCrawler.CSV_STORAGE_KEY);

// Get HTML
final String html = response.getHtml();

// JSOUP
// JSoup
final Document document = response.getJsoup();

// We will use a parser class
@@ -46,11 +50,16 @@ public void handle(Request request, VResponse response, Scheduler scheduler, Ses

// Add to the array list
jobListing.add(listing);

// Write record in CSV
csvStorage.append(listing);
});

// Crawl another page if there's a next page
if (finalResult.getNextPage() != null) {
final String nextPageURL = finalResult.getNextPage();

// Schedule the next page
scheduler.add(new VRequest(nextPageURL), this);
}

@@ -110,8 +110,6 @@ public static void main(String[] args) throws Exception {
}

LOGGER.info("You have crawled {} papers.", papers.size());
papers.forEach(paper -> {
LOGGER.info("Name: {}, Url: {}", paper.getName(), paper.getUrl());
});
papers.forEach(paper -> LOGGER.info("Name: {}, Url: {}", paper.getName(), paper.getUrl()));
}
}
@@ -63,7 +63,7 @@ public void testCreateFetcher() throws Exception {

final Field field2 = pipelineValidator.getClass().getDeclaredField("validators");
field2.setAccessible(true);
final List<Validator> validators = (List<Validator>) field2.get(pipelineValidator);
@SuppressWarnings("unchecked") final List<Validator> validators = (List<Validator>) field2.get(pipelineValidator);

Assertions.assertEquals(validators.size(), 3, "Incorrect number of validators found!");

0 comments on commit ee569bf

Please sign in to comment.
You can’t perform that action at this time.