Permalink
Browse files

Initial commit

  • Loading branch information...
lwj5 committed Jun 5, 2018
0 parents commit 40b9fd18eda7dfc553be73cc72a8c1f274c3ef65
Showing with 4,951 additions and 0 deletions.
  1. +3 −0 .gitignore
  2. +204 −0 pom.xml
  3. +491 −0 src/main/java/ai/preferred/venom/Crawler.java
  4. +49 −0 src/main/java/ai/preferred/venom/Handleable.java
  5. +53 −0 src/main/java/ai/preferred/venom/Handler.java
  6. +43 −0 src/main/java/ai/preferred/venom/HandlerRouter.java
  7. +31 −0 src/main/java/ai/preferred/venom/Interruptible.java
  8. +100 −0 src/main/java/ai/preferred/venom/ProxyProvider.java
  9. +95 −0 src/main/java/ai/preferred/venom/Session.java
  10. +60 −0 src/main/java/ai/preferred/venom/SleepScheduler.java
  11. +87 −0 src/main/java/ai/preferred/venom/ThreadedWorkerManager.java
  12. +122 −0 src/main/java/ai/preferred/venom/UrlRouter.java
  13. +43 −0 src/main/java/ai/preferred/venom/ValidatorRouter.java
  14. +82 −0 src/main/java/ai/preferred/venom/Worker.java
  15. +34 −0 src/main/java/ai/preferred/venom/WorkerManager.java
  16. +573 −0 src/main/java/ai/preferred/venom/fetcher/AsyncFetcher.java
  17. +191 −0 src/main/java/ai/preferred/venom/fetcher/AsyncResponseConsumer.java
  18. +52 −0 src/main/java/ai/preferred/venom/fetcher/Callback.java
  19. +63 −0 src/main/java/ai/preferred/venom/fetcher/Fetcher.java
  20. +51 −0 src/main/java/ai/preferred/venom/fetcher/StopCodeException.java
  21. +68 −0 src/main/java/ai/preferred/venom/fetcher/ValidationException.java
  22. +119 −0 src/main/java/ai/preferred/venom/job/AbstractQueueScheduler.java
  23. +138 −0 src/main/java/ai/preferred/venom/job/BasicJob.java
  24. +121 −0 src/main/java/ai/preferred/venom/job/Job.java
  25. +106 −0 src/main/java/ai/preferred/venom/job/LazyScheduler.java
  26. +59 −0 src/main/java/ai/preferred/venom/job/Priority.java
  27. +60 −0 src/main/java/ai/preferred/venom/job/PriorityQueueScheduler.java
  28. +113 −0 src/main/java/ai/preferred/venom/job/Scheduler.java
  29. +77 −0 src/main/java/ai/preferred/venom/request/CrawlerRequest.java
  30. +93 −0 src/main/java/ai/preferred/venom/request/HttpFetcherRequest.java
  31. +95 −0 src/main/java/ai/preferred/venom/request/Request.java
  32. +52 −0 src/main/java/ai/preferred/venom/request/Unwrappable.java
  33. +296 −0 src/main/java/ai/preferred/venom/request/VRequest.java
  34. +112 −0 src/main/java/ai/preferred/venom/response/BaseResponse.java
  35. +90 −0 src/main/java/ai/preferred/venom/response/Response.java
  36. +35 −0 src/main/java/ai/preferred/venom/response/Retrievable.java
  37. +35 −0 src/main/java/ai/preferred/venom/response/Unwrappable.java
  38. +144 −0 src/main/java/ai/preferred/venom/response/VResponse.java
  39. +70 −0 src/main/java/ai/preferred/venom/storage/FileManager.java
  40. +98 −0 src/main/java/ai/preferred/venom/storage/Record.java
  41. +39 −0 src/main/java/ai/preferred/venom/storage/StorageException.java
  42. +34 −0 src/main/java/ai/preferred/venom/uagent/DefaultUserAgent.java
  43. +34 −0 src/main/java/ai/preferred/venom/uagent/UserAgent.java
  44. +61 −0 src/main/java/ai/preferred/venom/utils/InlineExecutorService.java
  45. +72 −0 src/main/java/ai/preferred/venom/utils/ResponseDecompressor.java
  46. +63 −0 src/main/java/ai/preferred/venom/utils/UrlUtils.java
  47. +38 −0 src/main/java/ai/preferred/venom/validator/EmptyContentValidator.java
  48. +48 −0 src/main/java/ai/preferred/venom/validator/MimeTypeValidator.java
  49. +60 −0 src/main/java/ai/preferred/venom/validator/PipelineValidator.java
  50. +42 −0 src/main/java/ai/preferred/venom/validator/StatusOkValidator.java
  51. +52 −0 src/main/java/ai/preferred/venom/validator/Validator.java
  52. 0 src/main/resources/.gitkeep
  53. 0 src/test/java/.gitkeep
  54. 0 src/test/resources/.gitkeep
@@ -0,0 +1,3 @@
# Build output directory at the repository root
/target/
# IDE project directory and module files at the repository root
/.idea
/*.iml
204 pom.xml
@@ -0,0 +1,204 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<!-- Build-wide properties: declares UTF-8 as the source file encoding. -->
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<!-- Artifact coordinates: ai.preferred:venom, a SNAPSHOT version packaged as a jar. -->
<groupId>ai.preferred</groupId>
<artifactId>venom</artifactId>
<version>4.0.1-SNAPSHOT</version>
<packaging>jar</packaging>

<!-- Descriptive metadata; the display name is assembled from groupId and artifactId. -->
<name>${project.groupId}:${project.artifactId}</name>
<description>An open source focused crawler for the Deep Web built on Apache HttpAsyncClient</description>
<url>https://venom.preferred.ai</url>

<!-- The project is published under the Apache Software License, Version 2.0. -->
<licenses>
<license>
<name>The Apache Software License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
</license>
</licenses>

<!--
  Developer list. All three entries share the same organization and a +8
  timezone; e-mail addresses are written with " -at- " in place of "@".
-->
<developers>
<developer>
<name>Maksim TKACHENKO</name>
<email>mtkachenko.2015 -at- smu.edu.sg</email>
<organization>Preferred.AI</organization>
<organizationUrl>https://preferred.ai/</organizationUrl>
<timezone>+8</timezone>
</developer>
<developer>
<name>TRUONG Quoc Tuan</name>
<email>qttruong.2017 -at- smu.edu.sg</email>
<organization>Preferred.AI</organization>
<organizationUrl>https://preferred.ai/</organizationUrl>
<timezone>+8</timezone>
</developer>
<developer>
<name>LEE Ween Jiann</name>
<email>weenjiannlee -at- smu.edu.sg</email>
<organization>Preferred.AI</organization>
<organizationUrl>https://preferred.ai/</organizationUrl>
<timezone>+8</timezone>
</developer>
</developers>

<!--
  Source-control locations for the GitHub repository PreferredAI/venom.
  connection          : read-only access over HTTPS (GitHub no longer serves
                        the unauthenticated git:// protocol).
  developerConnection : read/write access over SSH. In an ssh:// URL the host
                        is followed by "/", not ":", because a colon there
                        introduces a port number; the "git" user is required
                        by GitHub.
  url                 : browsable view of the master branch.
-->
<scm>
<connection>scm:git:https://github.com/PreferredAI/venom.git</connection>
<developerConnection>scm:git:ssh://git@github.com/PreferredAI/venom.git</developerConnection>
<url>https://github.com/PreferredAI/venom/tree/master</url>
</scm>

<build>
<plugins>
<!--
  Sonatype Nexus staging plugin, loaded as a build extension. It is pointed
  at https://oss.sonatype.org/ using the server id "ossrh" and is set to
  release automatically after the staging repository is closed.
  NOTE(review): the "ossrh" id presumably has to match a server entry in the
  user's settings.xml; confirm against the release setup.
-->
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.7</version>
<extensions>true</extensions>
<configuration>
<serverId>ossrh</serverId>
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>true</autoReleaseAfterClose>
</configuration>
</plugin>
<!-- Compiler settings: both the source level and the target bytecode level are Java 1.8. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<!--
  Attaches a sources jar to the build.
  The execution uses "jar-no-fork" rather than "jar": the plain "jar" goal
  forks an extra generate-sources lifecycle run, while "jar-no-fork" is the
  variant meant to be bound into the normal build lifecycle. The produced
  sources artifact is the same.
-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>3.0.1</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<!--
  Attaches a javadoc jar to the build (execution "attach-javadocs", goal "jar").
  Dependency sources are included in the generated documentation, restricted
  to artifacts matching ai.preferred:*.
-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>3.0.0-M1</version>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
<configuration>
<includeDependencySources>true</includeDependencySources>
<dependencySourceIncludes>
<dependencySourceInclude>ai.preferred:*</dependencySourceInclude>
</dependencySourceIncludes>
</configuration>
</plugin>
<!--
  Signs the build artifacts with GPG during the verify phase.
  The key name and passphrase are read from the gpg.keyname and
  gpg.passphrase properties, which are not defined in this POM.
  NOTE(review): presumably these are supplied via settings.xml or the
  command line; confirm they are never committed to the repository.
-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>1.6</version>
<executions>
<execution>
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
<configuration>
<keyname>${gpg.keyname}</keyname>
<passphrase>${gpg.passphrase}</passphrase>
</configuration>
</execution>
</executions>
</plugin>
</plugins>

<!-- Main resources are taken from src/main/resources. -->
<resources>
<resource>
<directory>src/main/resources</directory>
</resource>
</resources>

<!-- Test resources are taken from src/test/resources. -->
<testResources>
<testResource>
<directory>src/test/resources</directory>
</testResource>
</testResources>

</build>

<dependencies>
<!--
  Dependencies declared without an explicit scope.
  Most versions are half-open ranges: "[a,b)" accepts any version that is
  at least a and strictly below b. guava and validation-api are pinned to
  a single version instead.
-->
<!-- Apache HttpComponents: core and asynchronous HTTP client. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>[4.4,4.5)</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpasyncclient</artifactId>
<version>[4.1,4.2)</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>[1.10,1.11)</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>[2.28,3.0)</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>[1.16,2.0)</version>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>[59.0,60.0)</version>
</dependency>
<!-- Pinned to an exact version (no range). -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>23.0</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>[3.6,3.7)</version>
</dependency>
<!-- Pinned to an exact version (no range). -->
<dependency>
<groupId>javax.validation</groupId>
<artifactId>validation-api</artifactId>
<version>2.0.0.Final</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>[1.7,1.8)</version>
</dependency>
<!--
  JUnit, test scope only.
  Pinned to a fixed release instead of the open-ended range "[4.0,)": an
  unbounded range lets the resolved version change from one build to the
  next and can select pre-release versions, which makes builds
  non-reproducible.
-->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
</dependencies>

</project>
Oops, something went wrong.

0 comments on commit 40b9fd1

Please sign in to comment.