Skip to content
Permalink
Browse files

Added stackoverflow job and single crawler

  • Loading branch information...
lwj5 committed Nov 2, 2018
1 parent 354c508 commit cd9dabf2d41d9d70e7938b2374f0a76c8f9011bf
@@ -1,4 +1,5 @@
# Compiled class file
/target/
*.class

# Log file
17 pom.xml
@@ -5,7 +5,7 @@
<modelVersion>4.0.0</modelVersion>

<groupId>ai.preferred</groupId>
<artifactId>iproperty-crawler-example</artifactId>
<artifactId>venom-examples</artifactId>
<version>1.0</version>

<properties>
@@ -56,21 +56,6 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.6.0</version>
<executions>
<execution>
<goals>
<goal>java</goal>
</goals>
</execution>
</executions>
<configuration>
<mainClass>ai.preferred.crawler.iproperty.master.ListingCrawler</mainClass>
</configuration>
</plugin>
</plugins>

<resources>
@@ -0,0 +1,54 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package ai.preferred.crawler.single;

import ai.preferred.venom.Crawler;
import ai.preferred.venom.Handler;
import ai.preferred.venom.fetcher.AsyncFetcher;
import ai.preferred.venom.fetcher.Fetcher;
import ai.preferred.venom.request.Request;
import ai.preferred.venom.request.VRequest;
import org.slf4j.LoggerFactory;

/**
 * Minimal Venom example: fetches a single page and delegates parsing to
 * {@link SingleHandler}.
 *
 * @author Ween Jiann Lee
 */
public class SingleCrawler {

  // Console logger for this class.
  private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(SingleCrawler.class);

  // The single page this example fetches.
  private static final String URL = "https://whoer.net";

  public static void main(String[] args) {
    try (Crawler crawler = crawler(httpFetcher()).start()) {
      LOGGER.info("Starting crawler...");

      // Pair the start URL with its handler and hand both to the scheduler.
      final Request startRequest = new VRequest(URL);
      final Handler pageHandler = new SingleHandler();
      crawler.getScheduler().add(startRequest, pageHandler);
    } catch (Exception e) {
      LOGGER.error("Could not run crawler: ", e);
    }
  }

  /** Builds a default asynchronous fetcher; see the builder for further options. */
  private static Fetcher httpFetcher() {
    return AsyncFetcher.builder().build();
  }

  /** Assembles a crawler around the given fetcher; see the builder for further options. */
  private static Crawler crawler(Fetcher fetcher) {
    return Crawler.builder().fetcher(fetcher).build();
  }
}
@@ -0,0 +1,44 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package ai.preferred.crawler.single;

import ai.preferred.venom.Handler;
import ai.preferred.venom.Session;
import ai.preferred.venom.Worker;
import ai.preferred.venom.job.Scheduler;
import ai.preferred.venom.request.Request;
import ai.preferred.venom.response.VResponse;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * Handles the response for {@link SingleCrawler}: logs the content type, the
 * raw HTML, and the IP address reported by the page.
 *
 * @author Ween Jiann Lee
 */
public class SingleHandler implements Handler {

  private static final Logger LOGGER = LoggerFactory.getLogger(SingleHandler.class);

  @Override
  public void handle(Request request, VResponse response, Scheduler scheduler, Session session, Worker worker) {
    LOGGER.info("Processing {}", request.getUrl());

    // Log through SLF4J instead of System.out so all output goes to one sink.
    LOGGER.info("Content type: {}", response.getContentType());

    // Get HTML
    final String html = response.getHtml();
    LOGGER.info("HTML:\n{}", html);

    // Get our IP. Elements.text() returns "" when the selector matches
    // nothing, unlike first().text() which would throw an NPE if the page
    // layout changes.
    final Document document = response.getJsoup();
    final String ip = document.select("#content > div.main-box > div > div.column > div > strong")
        .text();
    if (ip.isEmpty()) {
      LOGGER.warn("Could not locate IP element on {} - page structure may have changed", request.getUrl());
      return;
    }

    LOGGER.info("My IP is {}, let's go to {} to verify", ip, request.getUrl());
  }
}
@@ -0,0 +1,59 @@
package ai.preferred.crawler.stackoverflow;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileWriter;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.List;

/**
 * Writes entities of type {@code T} to a CSV file, one row per entity, using
 * reflection over the entity's declared fields. Thread-safe for appends
 * (synchronized) but assumes a single writer per file.
 *
 * @param <T> entity type whose declared fields define the CSV columns
 */
public class EntityCSVStorage<T> implements AutoCloseable {

  private static final Logger LOGGER = LoggerFactory.getLogger(EntityCSVStorage.class);

  private final CSVPrinter printer;

  /**
   * Opens {@code file} for writing and emits a header row derived from the
   * declared field names of {@code clazz}.
   *
   * @param file  path of the CSV file to create (overwritten if it exists)
   * @param clazz entity class whose declared fields define the columns
   * @throws IOException if the file cannot be opened or the header written
   */
  public EntityCSVStorage(String file, Class<T> clazz) throws IOException {
    printer = new CSVPrinter(new FileWriter(file), CSVFormat.EXCEL);
    printer.printRecord(getHeaderList(clazz));
  }

  // Column headers are the entity's declared field names, in declaration order.
  // Class<?> instead of the raw type keeps the compiler's generics checks on.
  private static List<String> getHeaderList(Class<?> clazz) {
    final List<String> result = new ArrayList<>();
    for (final Field field : clazz.getDeclaredFields()) {
      result.add(field.getName());
    }
    return result;
  }

  // Reflectively reads every declared field of the entity, in the same order
  // as the header produced by getHeaderList.
  private List<Object> toList(Object object) throws IllegalAccessException {
    final Field[] fields = object.getClass().getDeclaredFields();
    final List<Object> result = new ArrayList<>();
    for (final Field field : fields) {
      field.setAccessible(true);
      result.add(field.get(object));
    }
    return result;
  }

  /**
   * Appends one entity as a CSV row and flushes immediately so partial crawls
   * still leave usable output.
   *
   * @param object entity to store
   * @return true on success, false if the row could not be written (the error
   *     is logged, not rethrown)
   */
  public synchronized boolean append(T object) {
    try {
      printer.printRecord(toList(object));
      printer.flush();
    } catch (IOException | IllegalAccessException e) {
      LOGGER.error("unable to store property: ", e);
      return false;
    }
    return true;
  }

  /** Flushes (close(true)) and closes the underlying writer. */
  @Override
  public void close() throws IOException {
    printer.close(true);
  }

}
@@ -0,0 +1,13 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package ai.preferred.crawler.stackoverflow;

/**
 * Placeholder for shared Stack Overflow crawler utilities.
 *
 * <p>NOTE(review): currently empty — presumably reserved for future helper
 * methods; confirm whether it should be removed if it stays unused.
 *
 * @author Ween Jiann Lee
 */
public class Helper {

}
@@ -0,0 +1,88 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package ai.preferred.crawler.stackoverflow.master;

import ai.preferred.crawler.stackoverflow.EntityCSVStorage;
import ai.preferred.crawler.stackoverflow.master.entity.Listing;
import ai.preferred.venom.Crawler;
import ai.preferred.venom.Session;
import ai.preferred.venom.fetcher.AsyncFetcher;
import ai.preferred.venom.fetcher.Fetcher;
import ai.preferred.venom.request.VRequest;
import ai.preferred.venom.validator.EmptyContentValidator;
import ai.preferred.venom.validator.StatusOkValidator;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;

/**
 * Crawls Stack Overflow job listings (Singapore, 20 km radius) and stores each
 * listing in {@code data/results.csv} under the working directory.
 *
 * @author Ween Jiann Lee
 */
public class ListingCrawler {

  // You can use this to log to console
  private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(ListingCrawler.class);

  // Create session keys for things you would like to retrieve from handler
  static final Session.Key<ArrayList<Listing>> JOB_LIST_KEY = new Session.Key<>();

  // Create session keys for CSV printer to print from handler
  static final Session.Key<EntityCSVStorage<Listing>> CSV_STORAGE_KEY = new Session.Key<>();

  public static void main(String[] args) {

    // Get project directory
    String workingDir = System.getProperty("user.dir");

    // Start CSV printer.
    // BUGFIX: the original concatenated workingDir + "data/results.csv" with
    // no separator, producing e.g. "/home/userdata/results.csv". The forward
    // slash is accepted by the JDK on all platforms.
    try (final EntityCSVStorage<Listing> printer = new EntityCSVStorage<>(
        workingDir + "/data/results.csv", Listing.class)) {

      // Let's init the session, this allows us to retrieve the array list in the handler
      final ArrayList<Listing> jobListing = new ArrayList<>();
      final Session session = Session.builder()
          .put(JOB_LIST_KEY, jobListing)
          .put(CSV_STORAGE_KEY, printer)
          .build();

      // Start crawler; closing it waits for all scheduled jobs to finish.
      try (final Crawler crawler = crawler(fetcher(), session).start()) {
        LOGGER.info("Starting crawler...");

        final String startUrl = "https://stackoverflow.com/jobs?l=Singapore&d=20&u=Km";

        // pass in URL and handler or use a HandlerRouter
        crawler.getScheduler().add(new VRequest(startUrl), new ListingHandler());
      }

      // We will retrieve all the listing here
      LOGGER.info("We have found {} listings!", jobListing.size());

    } catch (Exception e) {
      LOGGER.error("Could not run crawler: ", e);
    }

  }

  /** Builds an async fetcher that rejects empty, non-200, and invalid listing pages. */
  private static Fetcher fetcher() {
    // You can look in builder the different things you can add
    return AsyncFetcher.builder()
        .validator(
            EmptyContentValidator.INSTANCE,
            StatusOkValidator.INSTANCE,
            new ListingValidator())
        .build();
  }

  /** Assembles a crawler with the given fetcher and shared session. */
  private static Crawler crawler(Fetcher fetcher, Session session) {
    // You can look in builder the different things you can add
    return Crawler.builder()
        .fetcher(fetcher)
        .session(session)
        .build();
  }
}
@@ -0,0 +1,67 @@
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package ai.preferred.crawler.stackoverflow.master;

import ai.preferred.crawler.stackoverflow.EntityCSVStorage;
import ai.preferred.crawler.stackoverflow.master.entity.Listing;
import ai.preferred.venom.Handler;
import ai.preferred.venom.Session;
import ai.preferred.venom.Worker;
import ai.preferred.venom.job.Scheduler;
import ai.preferred.venom.request.Request;
import ai.preferred.venom.request.VRequest;
import ai.preferred.venom.response.VResponse;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;

/**
 * Parses one Stack Overflow job-listing page: records each listing in the
 * session's list and CSV storage, then schedules the next page if present.
 *
 * @author Ween Jiann Lee
 */
public class ListingHandler implements Handler {

  private static final Logger LOGGER = LoggerFactory.getLogger(ListingHandler.class);

  @Override
  public void handle(Request request, VResponse response, Scheduler scheduler, Session session, Worker worker) {
    LOGGER.info("Processing {}", request.getUrl());

    // Get the job listing array list we created in ListingCrawler.
    // NOTE(review): assumes the session was built with both keys — confirm
    // this handler is only scheduled by ListingCrawler.
    final ArrayList<Listing> jobListing = session.get(ListingCrawler.JOB_LIST_KEY);

    // Get the CSV storage we created in ListingCrawler
    final EntityCSVStorage<Listing> csvStorage = session.get(ListingCrawler.CSV_STORAGE_KEY);

    // Parse the page with the dedicated parser class.
    // (Removed unused locals: the raw HTML string and jsoup Document were
    // fetched but never read — ListingParser works on the response directly.)
    final ListingParser.FinalResult finalResult = ListingParser.parse(response);
    finalResult.getListings().forEach(listing -> {
      LOGGER.info("Found job: {} in {} [{}]", listing.getName(), listing.getCompany(), listing.getUrl());

      // Add to the array list
      jobListing.add(listing);

      // Write record in CSV
      csvStorage.append(listing);
    });

    // Crawl another page if there's a next page
    if (finalResult.getNextPage() != null) {
      final String nextPageURL = finalResult.getNextPage();

      // Schedule the next page with this same handler.
      scheduler.add(new VRequest(nextPageURL), this);
    }

  }
}

0 comments on commit cd9dabf

Please sign in to comment.
You can’t perform that action at this time.