Skip to content
Permalink
Browse files

Implemented MysqlFileManager (#3)

* Added MysqlFileManager implementation

* Updated pom.xml and .travis.yml to resolve version-ranges

* Compatible with JAVA 8 +
  • Loading branch information...
lwj5 committed Feb 26, 2019
1 parent 806dff4 commit 0ad77a5f734ccaeb6ab5f4d169fc5b54144fb275
Showing with 2,803 additions and 213 deletions.
  1. +3 −1 .travis.yml
  2. +1 −1 README.md
  3. +77 −18 pom.xml
  4. +13 −8 src/main/java/ai/preferred/venom/Crawler.java
  5. +1 −1 src/main/java/ai/preferred/venom/ThreadedWorkerManager.java
  6. +4 −24 src/main/java/ai/preferred/venom/fetcher/AsyncFetcher.java
  7. +5 −5 src/main/java/ai/preferred/venom/fetcher/AsyncResponseConsumer.java
  8. +20 −0 src/main/java/ai/preferred/venom/fetcher/Callback.java
  9. +267 −0 src/main/java/ai/preferred/venom/fetcher/StorageFetcher.java
  10. +13 −24 src/main/java/ai/preferred/venom/job/AbstractQueueScheduler.java
  11. +10 −0 src/main/java/ai/preferred/venom/job/FIFOScheduler.java
  12. +10 −0 src/main/java/ai/preferred/venom/job/PriorityQueueScheduler.java
  13. +108 −0 src/main/java/ai/preferred/venom/request/StorageFetcherRequest.java
  14. +1 −61 src/main/java/ai/preferred/venom/response/BaseResponse.java
  15. +0 −9 src/main/java/ai/preferred/venom/response/Response.java
  16. +6 −8 src/main/java/ai/preferred/venom/response/Retrievable.java
  17. +85 −0 src/main/java/ai/preferred/venom/response/StorageResponse.java
  18. +1 −7 src/main/java/ai/preferred/venom/response/VResponse.java
  19. +146 −0 src/main/java/ai/preferred/venom/storage/DummyFileManager.java
  20. +7 −3 src/main/java/ai/preferred/venom/storage/FileManager.java
  21. +70 −0 src/main/java/ai/preferred/venom/storage/FileManagerCallback.java
  22. +519 −0 src/main/java/ai/preferred/venom/storage/MysqlFileManager.java
  23. +13 −20 src/main/java/ai/preferred/venom/storage/Record.java
  24. +370 −0 src/main/java/ai/preferred/venom/storage/StorageRecord.java
  25. +62 −0 src/main/java/ai/preferred/venom/storage/StorageUtil.java
  26. +51 −0 src/main/java/ai/preferred/venom/utils/UrlUtil.java
  27. +30 −0 src/test/java/ai/preferred/venom/CrawlerTest.java
  28. +1 −1 src/{main/java/ai/preferred/venom/utils → test/java/ai/preferred/venom}/InlineExecutorService.java
  29. +1 −1 src/test/java/ai/preferred/venom/SleepSchedulerTest.java
  30. +0 −1 src/test/java/ai/preferred/venom/ThreadedWorkerManagerTest.java
  31. +134 −0 src/test/java/ai/preferred/venom/fetcher/AsyncFetcherTest.java
  32. +3 −20 src/test/java/ai/preferred/venom/fetcher/FakeFetcher.java
  33. +261 −0 src/test/java/ai/preferred/venom/fetcher/StorageFetcherTest.java
  34. +108 −0 src/test/java/ai/preferred/venom/storage/DummyFileManagerTest.java
  35. +97 −0 src/test/java/ai/preferred/venom/storage/FakeFileManager.java
  36. +78 −0 src/test/java/ai/preferred/venom/storage/FileManagerCallbackTest.java
  37. +150 −0 src/test/java/ai/preferred/venom/storage/MysqlFileManagerTest.java
  38. +77 −0 src/test/java/ai/preferred/venom/storage/StorageRecordTest.java
@@ -1,10 +1,12 @@
language: java
jdk:
- oraclejdk8
- openjdk11
cache:
directories:
- ~/.m2/repository

before_install: mvn versions:resolve-ranges

install: mvn -s .mvn.xml install -DskipTests=true -Dgpg.skip -Dmaven.javadoc.skip=true -B -V

script: mvn clean test jacoco:report coveralls:report
@@ -4,7 +4,7 @@ Your preferred open source focused crawler for the Deep Web.
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/ai.preferred/venom/badge.svg)](https://maven-badges.herokuapp.com/maven-central/ai.preferred/venom)
[![Build Status](https://travis-ci.org/PreferredAI/venom.svg)](https://travis-ci.org/PreferredAI/venom)
[![Coverage Status](https://coveralls.io/repos/github/PreferredAI/venom/badge.svg)](https://coveralls.io/github/PreferredAI/venom)
[![Javadocs](https://www.javadoc.io/badge/ai.preferred/venom.svg)](https://www.javadoc.io/doc/ai.preferred/venom)
[![Javadoc](https://www.javadoc.io/badge/ai.preferred/venom.svg)](https://www.javadoc.io/doc/ai.preferred/venom)

## Overview
Our aim is to create a blazing fast, fully customizable and robust crawler that is simple and handy to use.
95 pom.xml
@@ -10,13 +10,18 @@

<groupId>ai.preferred</groupId>
<artifactId>venom</artifactId>
<version>4.1.2</version>
<version>4.1.3-SNAPSHOT</version>
<packaging>jar</packaging>

<name>${project.groupId}:${project.artifactId}</name>
<description>An open source focused crawler for the Deep Web built on Apache HttpAsyncClient</description>
<description>Your preferred open source focused crawler for the deep web.</description>
<url>https://venom.preferred.ai</url>

<organization>
<name>Preferred.AI</name>
<url>https://preferred.ai</url>
</organization>

<licenses>
<license>
<name>The Apache Software License, Version 2.0</name>
@@ -49,17 +54,17 @@
</developers>

<scm>
<connection>scm:git:git://github.com/PreferredAI/Venom.git</connection>
<developerConnection>scm:git:ssh://github.com:PreferredAI/Venom.git</developerConnection>
<url>http://github.com/PreferredAI/Venom/tree/master</url>
<connection>scm:git:git://github.com/PreferredAI/venom.git</connection>
<developerConnection>scm:git:ssh://git@github.com:PreferredAI/venom.git</developerConnection>
<url>http://github.com/PreferredAI/venom</url>
</scm>

<build>
<plugins>
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.7</version>
<version>1.6.8</version>
<extensions>true</extensions>
<configuration>
<serverId>ossrh</serverId>
@@ -70,12 +75,33 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.7.0</version>
<version>3.8.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-enforcer-plugin</artifactId>
<version>3.0.0-M2</version>
<executions>
<execution>
<id>enforce-no-snapshots</id>
<goals>
<goal>enforce</goal>
</goals>
<configuration>
<rules>
<requireReleaseDeps>
<message>No Snapshots Allowed!</message>
</requireReleaseDeps>
</rules>
<fail>true</fail>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.0</version>
@@ -88,7 +114,7 @@
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<version>5.2.0</version>
<version>5.4.0</version>
</dependency>
</dependencies>
</plugin>
@@ -149,7 +175,7 @@
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<version>0.8.1</version>
<version>0.8.3</version>
<executions>
<execution>
<id>prepare-agent</id>
@@ -163,6 +189,13 @@
<groupId>org.eluder.coveralls</groupId>
<artifactId>coveralls-maven-plugin</artifactId>
<version>4.3.0</version>
<dependencies>
<dependency>
<groupId>javax.xml.bind</groupId>
<artifactId>jaxb-api</artifactId>
<version>2.3.0</version>
</dependency>
</dependencies>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
@@ -185,6 +218,11 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>versions-maven-plugin</artifactId>
<version>2.7</version>
</plugin>
</plugins>

<resources>
@@ -205,7 +243,7 @@
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpasyncclient</artifactId>
<version>4.1.4</version>
<version>[4.1.4,4.2)</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
@@ -215,32 +253,47 @@
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>[2.28,3.0)</version>
<version>[2.33,3.0)</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>[1.16,2.0)</version>
<version>[1.20,2.0)</version>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>[62.0,63.0)</version>
<version>[63.0,64.0)</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>25.1-jre</version>
<version>27.0.1-jre</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>[3.6,4.0)</version>
<version>[3.8,4.0)</version>
</dependency>
<dependency>
<groupId>javax.validation</groupId>
<artifactId>validation-api</artifactId>
<version>2.0.0.Final</version>
<version>2.0.1.Final</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>[5.1,6.0)</version>
</dependency>
<dependency>
<groupId>com.zaxxer</groupId>
<artifactId>HikariCP</artifactId>
<version>[3.3,3.4)</version>
</dependency>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
<version>[20180000,)</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
@@ -250,13 +303,13 @@
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<version>5.2.0</version>
<version>5.4.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.github.tomakehurst</groupId>
<artifactId>wiremock-standalone</artifactId>
<version>[2.18,2.19)</version>
<version>[2.21,2.22)</version>
<scope>test</scope>
</dependency>
<dependency>
@@ -265,6 +318,12 @@
<version>[1.7,1.8)</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>ch.vorburger.mariaDB4j</groupId>
<artifactId>mariaDB4j</artifactId>
<version>[2.4,2.5)</version>
<scope>test</scope>
</dependency>
</dependencies>

<distributionManagement>
@@ -35,7 +35,10 @@
import javax.annotation.Nullable;
import javax.validation.constraints.NotNull;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinWorkerThread;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

/**
@@ -109,7 +112,7 @@
/**
* The sleep scheduler used.
*/
@NotNull
@Nullable
private final SleepScheduler sleepScheduler;

/**
@@ -193,11 +196,13 @@ public static Crawler buildDefault() {
private void sleep(final Job job, final long lastRequestTime) throws InterruptedException {
final long sleepTime;
if (job.getRequest().getSleepScheduler() == null) {
sleepTime = sleepScheduler.getSleepTime();
} else if (job.getRequest().getSleepScheduler() != null) {
sleepTime = job.getRequest().getSleepScheduler().getSleepTime();
if (sleepScheduler != null) {
sleepTime = sleepScheduler.getSleepTime();
} else {
sleepTime = 0;
}
} else {
sleepTime = 0;
sleepTime = job.getRequest().getSleepScheduler().getSleepTime();
}

final long timeElapsed = System.nanoTime() - lastRequestTime;
@@ -525,7 +530,7 @@ public Builder setFetcher(final @NotNull Fetcher fetcher) {
* @param parallelism the parallelism level.
* @return this
*/
public Builder setParallism(final int parallelism) {
public Builder setParallelism(final int parallelism) {
if (parallelism <= 0) {
LOGGER.warn("Attribute 'numThreads' not within range, defaulting to system default.");
} else {
@@ -616,7 +621,7 @@ public Builder setPropRetainProxy(final double propRetainProxy) {
* @param sleepScheduler sleepAndGetTime scheduler to be used.
* @return this
*/
public Builder setSleepScheduler(final @NotNull SleepScheduler sleepScheduler) {
public Builder setSleepScheduler(final SleepScheduler sleepScheduler) {
this.sleepScheduler = sleepScheduler;
return this;
}
@@ -49,7 +49,7 @@
*
* @param executor An executor service
*/
public ThreadedWorkerManager(final ExecutorService executor) {
public ThreadedWorkerManager(@Nullable final ExecutorService executor) {
this.executor = executor;
if (executor instanceof ForkJoinPool || executor == null) {
this.worker = new ForkJoinWorker();
@@ -60,40 +60,20 @@
import java.util.concurrent.ThreadFactory;

/**
* This class holds the implementation to provide how items are fetched, to fetch the item,
* This class holds the implementation to provide how items are fetched from the web,
* to validate the item and to store it if specified.
*
* @author Maksim Tkachenko
* @author Truong Quoc Tuan
* @author Ween Jiann Lee
*/
public class AsyncFetcher implements Fetcher {
public final class AsyncFetcher implements Fetcher {

/**
* Logger.
*/
private static final Logger LOGGER = LoggerFactory.getLogger(AsyncFetcher.class);

/**
* An instance of empty callback.
*/
private static final Callback EMPTY_CALLBACK = new Callback() {
@Override
public void completed(final @NotNull Request request, final @NotNull Response response) {

}

@Override
public void failed(final @NotNull Request request, final @NotNull Exception ex) {

}

@Override
public void cancelled(final @NotNull Request request) {

}
};

/**
* A list of callbacks to execute upon response.
*/
@@ -347,7 +327,7 @@ private HttpHost determineTarget(final HttpUriRequest request) throws ClientProt

@Override
public Future<Response> fetch(final Request request) {
return fetch(request, EMPTY_CALLBACK);
return fetch(request, Callback.EMPTY_CALLBACK);
}

@Override
@@ -430,7 +410,7 @@ public void close() throws IOException {
/**
* A builder for async fetcher class.
*/
public static class Builder {
public static final class Builder {

/**
* A list of callbacks to execute upon response.
@@ -21,6 +21,7 @@
import ai.preferred.venom.response.BaseResponse;
import ai.preferred.venom.response.Response;
import ai.preferred.venom.utils.ResponseDecompressor;
import ai.preferred.venom.utils.UrlUtil;
import ai.preferred.venom.validator.Validator;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
@@ -42,7 +43,6 @@
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Set;
@@ -157,14 +157,14 @@ private BaseResponse createVenomResponse(final boolean compressed) throws IOExce
final ContentType contentType = getContentType(entity);
final Header[] headers = httpResponse.getAllHeaders();

String baseUrl = request.getUrl();
String tryBaseUrl;
try {
final URI uri = new URI(request.getUrl());
final URI baseUri = new URI(uri.getScheme(), null, uri.getHost(), uri.getPort(), uri.getPath(), null, null);
baseUrl = baseUri.toString();
tryBaseUrl = UrlUtil.getBaseUrl(request);
} catch (URISyntaxException e) {
LOGGER.warn("Could not parse base URL: " + request.getUrl());
tryBaseUrl = request.getUrl();
}
final String baseUrl = tryBaseUrl;

return new BaseResponse(
httpResponse.getStatusLine().getStatusCode(),
Oops, something went wrong.

0 comments on commit 0ad77a5

Please sign in to comment.
You can’t perform that action at this time.