If the redirect is permanent and the source is not root and the
- * destination is root, keep the destination
- *
- *
a.com/xyz/index.html -> a.com*
- *
- *
If the redirect is permanent and neither the source nor the destination
- * is root, then keep the destination
- *
- *
a.com/xyz/index.html -> a.com/abc/page.html*
- *
- *
If the redirect is temporary and source is root and destination is not
- * root, then keep the source
- *
- *
*a.com -> a.com/xyz/index.html
- *
- *
If the redirect is temporary and source is not root and destination is
- * root, then keep the destination
- *
- *
a.com/xyz/index.html -> a.com*
- *
- *
If the redirect is temporary and neither the source or the destination
- * is root, then keep the shortest url. First check for the shortest host, and
- * if both are equal then check by path. Path is first by length then by the
- * number of / path separators.
If the redirect is permanent and the source is not root and the
+ * destination is root, keep the destination
+ *
+ *
+ *
a.com/xyz/index.html -> a.com*
+ *
+ *
+ *
If the redirect is permanent and neither the source nor the destination
+ * is root, then keep the destination
+ *
+ *
+ *
a.com/xyz/index.html -> a.com/abc/page.html*
+ *
+ *
+ *
If the redirect is temporary and source is root and destination is not
+ * root, then keep the source
+ *
+ *
+ *
*a.com -> a.com/xyz/index.html
+ *
+ *
+ *
If the redirect is temporary and source is not root and destination is
+ * root, then keep the destination
+ *
+ *
+ *
a.com/xyz/index.html -> a.com*
+ *
+ *
+ *
If the redirect is temporary and neither the source nor the destination
+ * is root, then keep the shortest url. First check for the shortest host, and
+ * if both are equal then check by path. Path is first by length then by the
+ * number of / path separators.
If the redirect is temporary and both the source and the destination
+ * are root, then keep the shortest sub-domain
+ *
+ *
+ *
*www.a.com -> www.news.a.com
+ *
+ *
+ *
*
* While not in this logic there is a further piece of representative url
* logic that occurs during indexing and after scoring. During creation of the
diff --git a/sparkler-core/sparkler-api/src/test/resources/domain-suffixes.xml b/sparkler-core/sparkler-api/src/test/resources/domain-suffixes.xml
new file mode 100644
index 00000000..1c64f367
--- /dev/null
+++ b/sparkler-core/sparkler-api/src/test/resources/domain-suffixes.xml
@@ -0,0 +1,155 @@
+
+
+
+
+
+
+
+
+
+
+
+ INFRASTRUCTURE
+
+ (from http://en.wikipedia.org/wiki/.root)
+ vrsn-end-of-zone-marker-dummy-record.root is a domain name
+ listed in the DNS root zone as a diagnostic marker, whose
+ presence demonstrates the root zone was not truncated upon
+ loading by a root nameserver. It could be argued it represents
+ a top-level domain of .root, although technically no such
+ delegation exists.
+
+
+
+
+ INFRASTRUCTURE
+
+ (from http://en.wikipedia.org/wiki/.arpa) .arpa is an Internet
+ top-level domain (TLD) used exclusively for
+ Internet-infrastructure purposes. It does not function as a
+ normal TLD where websites are registered, but rather as a
+ meta-TLD used to look up addresses, and for other purposes.
+
+
+
+
+
+
+ SPONSORED
+ for the air transport industry
+
+
+
+ UNSPONSORED
+ for business use
+
+
+
+ SPONSORED
+ for Catalan language/culture
+
+
+
+ UNSPONSORED
+
+ for commercial organizations, but unrestricted
+
+
+
+
+
+ Ascension Island
+
+
+
+ Andorra
+
+
+
+ United Arab Emirates
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/sparkler-core/sparkler-app/build.sbt b/sparkler-core/sparkler-app/build.sbt
deleted file mode 100644
index 65d5b6b1..00000000
--- a/sparkler-core/sparkler-app/build.sbt
+++ /dev/null
@@ -1,135 +0,0 @@
-cancelable := true
-
-developers := List( // TODO replace this with your information
- Developer("mslinn",
- "Mike Slinn",
- "mslinn@micronauticsresearch.com",
- url("https://github.com/mslinn")
- )
-)
-
-// define the statements initially evaluated when entering 'console', 'console-quick', but not 'console-project'
-initialCommands in console := """
- |""".stripMargin
-
-javacOptions ++= Seq(
- "-Xlint:deprecation",
- "-Xlint:unchecked",
- "-source", "1.8",
- "-target", "1.8",
- "-g:vars"
-)
-
-libraryDependencies ++= Seq(
- "edu.usc.irds.sparkler" % "sparkler-api" % "0.2.1-SNAPSHOT",
- "org.pf4j" % "pf4j" % "2.0.0",
- "org.apache.spark" % "spark-core_2.11" % "1.6.1",
- "org.apache.nutch" % "nutch" % "1.13",
- "org.apache.kafka" % "kafka-clients" % "0.10.0.0",
- "org.scala-lang" % "scala-library" % "2.11.8",
- "org.apache.solr" % "solr-solrj" % "6.4.0",
- "org.apache.solr" % "solr-core" % "6.4.0",
- "org.apache.tika" % "tika-parsers" % "1.14",
- "args4j" % "args4j" % "2.0.29",
- "org.slf4j" % "slf4j-api" % "1.7.25",
- "org.slf4j" % "slf4j-log4j12" % "1.7.25",
- "commons-validator" % "commons-validator" % "1.5.1",
- "junit" % "junit" % "4.12" % "test",
- "org.scalatest" %% "scalatest" % "3.0.3" % Test withSources(),
- "junit" % "junit" % "4.12" % Test
-)
-
-// If you want to apply a license, such as the Apache 2 license, uncomment the following:
-licenses += ("Apache-2.0", url("https://www.apache.org/licenses/LICENSE-2.0.html"))
-
-logLevel := Level.Warn
-
-// Only show warnings and errors on the screen for compilations.
-// This applies to both test:compile and compile and is Info by default
-logLevel in compile := Level.Warn
-
-// Level.INFO is needed to see detailed output when running tests
-logLevel in test := Level.Info
-
-name := "my-new-project" // TODO provide a short yet descriptive name
-
-organization := "com.micronautics"
-
-resolvers ++= Seq(
- "Typesafe" at "http://repo.typesafe.com/typesafe/releases/",
- "Java.net Maven2 Repository" at "http://download.java.net/maven/2/")
-resolvers += Resolver.mavenLocal
-
-scalacOptions ++= Seq( // From https://tpolecat.github.io/2017/04/25/scalac-flags.html
- "-deprecation", // Emit warning and location for usages of deprecated APIs.
- "-encoding", "utf-8", // Specify character encoding used by source files.
- "-explaintypes", // Explain type errors in more detail.
- "-feature", // Emit warning and location for usages of features that should be imported explicitly.
- "-language:existentials", // Existential types (besides wildcard types) can be written and inferred
- "-language:experimental.macros", // Allow macro definition (besides implementation and application)
- "-language:higherKinds", // Allow higher-kinded types
- "-language:implicitConversions", // Allow definition of implicit functions called views
- "-unchecked", // Enable additional warnings where generated code depends on assumptions.
- "-Xcheckinit", // Wrap field accessors to throw an exception on uninitialized access.
- //"-Xfatal-warnings", // Fail the compilation if there are any warnings.
- "-Xfuture", // Turn on future language features.
- "-Xlint:adapted-args", // Warn if an argument list is modified to match the receiver.
- "-Xlint:by-name-right-associative", // By-name parameter of right associative operator.
- "-Xlint:constant", // Evaluation of a constant arithmetic expression results in an error.
- "-Xlint:delayedinit-select", // Selecting member of DelayedInit.
- "-Xlint:doc-detached", // A Scaladoc comment appears to be detached from its element.
- "-Xlint:inaccessible", // Warn about inaccessible types in method signatures.
- "-Xlint:infer-any", // Warn when a type argument is inferred to be `Any`.
- "-Xlint:missing-interpolator", // A string literal appears to be missing an interpolator id.
- "-Xlint:nullary-override", // Warn when non-nullary `def f()' overrides nullary `def f'.
- "-Xlint:nullary-unit", // Warn when nullary methods return Unit.
- "-Xlint:option-implicit", // Option.apply used implicit view.
- "-Xlint:package-object-classes", // Class or object defined in package object.
- "-Xlint:poly-implicit-overload", // Parameterized overloaded implicit methods are not visible as view bounds.
- "-Xlint:private-shadow", // A private field (or class parameter) shadows a superclass field.
- "-Xlint:stars-align", // Pattern sequence wildcard must align with sequence component.
- "-Xlint:type-parameter-shadow", // A local type parameter shadows a type already in scope.
- "-Xlint:unsound-match", // Pattern match may not be typesafe.
- //"-Yno-adapted-args", // Do not adapt an argument list (either by inserting () or creating a tuple) to match the receiver.
- //"-Ywarn-dead-code", // Warn when dead code is identified.
- "-Ywarn-inaccessible", // Warn about inaccessible types in method signatures.
- "-Ywarn-infer-any", // Warn when a type argument is inferred to be `Any`.
- "-Ywarn-nullary-override", // Warn when non-nullary `def f()' overrides nullary `def f'.
- "-Ywarn-nullary-unit", // Warn when nullary methods return Unit.
- "-Ywarn-numeric-widen" // Warn when numerics are widened.
- //"-Ywarn-unused:implicits", // Warn if an implicit parameter is unused.
- //"-Ywarn-unused:imports", // Warn if an import selector is not referenced.
- //"-Ywarn-unused:locals", // Warn if a local definition is unused.
- //"-Ywarn-unused:params", // Warn if a value parameter is unused.
- //"-Ywarn-unused:patvars", // Warn if a variable bound in a pattern is unused.
- //"-Ywarn-unused:privates", // Warn if a private member is unused.
- //"-Ywarn-value-discard" // Warn when non-Unit expression results are unused.
-)
-
-// The REPL can’t cope with -Ywarn-unused:imports or -Xfatal-warnings so turn them off for the console
-scalacOptions in (Compile, console) --= Seq("-Ywarn-unused:imports", "-Xfatal-warnings")
-
-scalacOptions in (Compile, doc) ++= baseDirectory.map {
- (bd: File) => Seq[String](
- "-sourcepath", bd.getAbsolutePath, // todo replace my-new-project with the github project name, and replace mslinn with your github id
- "-doc-source-url", "https://github.com/mslinn/my-new-project/tree/master€{FILE_PATH}.scala"
- )
-}.value
-
-
-scalaVersion := "2.11.8"
-
-scmInfo := Some(
- ScmInfo( // TODO replace mslinn with your github id
- url(s"https://github.com/mslinn/$name"),
- s"git@github.com:mslinn/$name.git"
- )
-)
-
-
-
-
-
-
-version := "0.1.0"
-
diff --git a/sparkler-core/sparkler-app/pom.xml b/sparkler-core/sparkler-app/pom.xml
deleted file mode 100644
index d79d8d5c..00000000
--- a/sparkler-core/sparkler-app/pom.xml
+++ /dev/null
@@ -1,454 +0,0 @@
-
-
-
-
- sparkler-parent
- edu.usc.irds.sparkler
- 0.2.1-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- sparkler-app
- jar
-
- sparkler
- http://irds.usc.edu/sparkler/
-
-
- ${project.parent.basedir}${file.separator}${project.conf.dir}
- ${project.parent.basedir}${file.separator}${project.resources.dir}
-
- edu.usc.irds.sparkler.Main
-
-
-
-
-
-
- edu.usc.irds.sparkler
- sparkler-api
- ${project.version}
-
-
-
- org.pf4j
- pf4j
- ${pf4j.version}
-
-
-
- org.apache.spark
- spark-core_${version.scala.epoch}
- ${spark.version}
-
-
-
-
- org.apache.spark
- spark-sql_${version.scala.epoch}
- ${spark.version}
-
-
-
-
- org.apache.nutch
- nutch
- ${nutch.version}
-
-
- org.apache.tika
- tika-core
-
-
-
-
-
- org.apache.kafka
- kafka-clients
- ${kafka.version}
-
-
-
-
- org.apache.solr
- solr-solrj
- ${solr.version}
-
-
- org.apache.solr
- solr-core
- ${solr.version}
-
-
- org.apache.tika
- tika-parsers
- ${tika.version}
-
-
-
-
- com.fasterxml.jackson.core
- jackson-databind
- 2.10.0
-
-
-
- com.fasterxml.jackson.core
- jackson-core
- 2.10.0
-
-
-
- args4j
- args4j
- ${args4j.version}
-
-
- org.slf4j
- slf4j-api
- ${slf4j.version}
-
-
- org.slf4j
- slf4j-log4j12
- ${slf4j.version}
-
-
- commons-validator
- commons-validator
- ${commons.validator.version}
-
-
-
- com.googlecode.json-simple
- json-simple
- 1.1.1
-
-
-
-
-
-
- junit
- junit
- ${junit.version}
- test
-
-
-
-
-
-
-
- ${sparkler.resources.dir}
-
-
-
-
- org.apache.maven.plugins
- maven-shade-plugin
- 3.2.2
-
-
- package
-
- shade
-
-
- ${project.parent.basedir}${file.separator}build/${project.artifactId}-${project.version}.jar
- false
- false
-
-
-
- *
-
-
-
-
- *:*
-
- META-INF/*.SF
- META-INF/*.DSA
- META-INF/*.RSA
-
-
-
-
-
- reference.conf
-
-
-
- ${exec.mainClass}
-
-
-
-
-
-
-
- org.apache.http
- shaded.org.apache.http
-
-
-
-
-
-
-
- com.carrotgarden.maven
- scalor-maven-plugin_2.12
- 1.5.0.20190502185130
-
-
-
-
-
- org.scala-sbt
- compiler-bridge_${version.scala.epoch}
- ${version.scala.zinc}
-
-
-
-
-
-
- org.scala-lang
- scala-compiler
- ${version.scala.release}
-
-
-
-
-
-
- org.scalamacros
- paradise_${version.scala.release}
- ${version.scala.plugin.macro}
-
-
-
-
-
-
-
-
-
- setup-cross
- eclipse-config
-
- register-macro
- register-main
- register-test
-
- compile-macro
- compile-main
- compile-test
-
- scala-js-link-main
- scala-js-link-test
-
-
-
-
-
-
-
-
- org.scalastyle
- scalastyle-maven-plugin
- 0.8.0
-
- false
- true
- true
- false
- ${basedir}/src/main/scala
- ${basedir}/src/test/scala
- ${project.parent.basedir}/scalastyle_config.xml
- ${project.basedir}/target/scalastyle-output.xml
-
- UTF-8
-
-
-
-
- check
-
-
-
-
-
- maven-assembly-plugin
- 2.5.3
-
- src/assembly/dep.xml
- posix
-
-
-
-
-
-
- maven-resources-plugin
- 2.6
-
-
- copy-bins
- validate
-
- copy-resources
-
-
- ${project.parent.basedir}${file.separator}${project.bins.dir}
-
-
- ${project.parent.basedir}${file.separator}bin
- false
-
-
-
-
-
- copy-resources
- validate
-
- copy-resources
-
-
- ${project.parent.basedir}${file.separator}${project.resources.dir}
-
-
- ${project.parent.basedir}${file.separator}conf
- true
-
-
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-antrun-plugin
- 1.6
-
-
- fix-perms
- validate
-
-
-
-
-
-
- run
-
-
-
-
-
-
-
-
- sbt
-
-
-
-
- net.alchim31.maven
- scala-maven-plugin
-
-
- scala-compile-first
- none
-
- add-source
- compile
-
-
-
- scala-test-compile
- none
-
- testCompile
-
-
-
-
-
- org.codehaus.mojo
- exec-maven-plugin
- 1.6.0
-
-
- process-resources
-
- exec
-
-
-
-
- sbt
-
- .
-
- package
-
-
-
-
- maven-antrun-plugin
-
-
- process-resources
-
-
-
-
-
-
-
-
-
-
- run
-
-
-
-
-
-
-
-
-
-
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/pom.xml b/sparkler-core/sparkler-plugins/fetcher-chrome/pom.xml
deleted file mode 100644
index 628595cf..00000000
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/pom.xml
+++ /dev/null
@@ -1,82 +0,0 @@
-
-
-
-
- sparkler-plugins
- edu.usc.irds.sparkler.plugin
- 0.2.1-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- fetcher-chrome
- jar
-
- fetcher-chrome
- http://maven.apache.org
-
- UTF-8
- 1.0.1
- ${project.artifactId}
- edu.usc.irds.sparkler.plugin.FetcherChromeActivator
-
-
-
-
- org.seleniumhq.selenium
- selenium-chrome-driver
- 3.141.59
-
-
-
- org.seleniumhq.selenium
- selenium-java
- 3.141.59
-
-
-
-
-
- com.lihaoyi
- requests_2.12
- 0.1.7
-
-
- com.machinepublishers
- jbrowserdriver
- ${jbrowserdriver.version}
-
-
- slf4j-api
- org.slf4j
-
-
-
-
-
-
-
- maven-assembly-plugin
-
-
- maven-jar-plugin
-
-
-
-
diff --git a/sparkler-core/sparkler-plugins/fetcher-htmlunit/pom.xml b/sparkler-core/sparkler-plugins/fetcher-htmlunit/pom.xml
deleted file mode 100644
index d9ac630d..00000000
--- a/sparkler-core/sparkler-plugins/fetcher-htmlunit/pom.xml
+++ /dev/null
@@ -1,70 +0,0 @@
-
-
-
-
- sparkler-plugins
- edu.usc.irds.sparkler.plugin
- 0.2.1-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- fetcher-htmlunit
- jar
-
- fetcher-htmlunit
- http://maven.apache.org
-
- UTF-8
- 2.43.0
- ${project.artifactId}
- edu.usc.irds.sparkler.plugin.HtmlUnitFetcherActivator
-
-
-
- net.sourceforge.htmlunit
- htmlunit
- ${htmlunit.version}
-
-
-
-
-
- maven-assembly-plugin
-
-
-
- maven-jar-plugin
-
-
- org.apache.maven.plugins
- maven-surefire-plugin
- 3.0.0-M5
-
-
-
- listener
- edu.usc.irds.sparkler.test.WebServerRunListener
-
-
-
-
-
-
-
diff --git a/sparkler-core/sparkler-plugins/fetcher-htmlunit/src/test/java/edu/usc/irds/sparkler/plugin/HtmlUnitFetcherTest.java b/sparkler-core/sparkler-plugins/fetcher-htmlunit/src/test/java/edu/usc/irds/sparkler/plugin/HtmlUnitFetcherTest.java
index a6f391b1..a63cefb0 100644
--- a/sparkler-core/sparkler-plugins/fetcher-htmlunit/src/test/java/edu/usc/irds/sparkler/plugin/HtmlUnitFetcherTest.java
+++ b/sparkler-core/sparkler-plugins/fetcher-htmlunit/src/test/java/edu/usc/irds/sparkler/plugin/HtmlUnitFetcherTest.java
@@ -25,8 +25,6 @@
import static org.junit.Assert.assertEquals;
public class HtmlUnitFetcherTest {
-
-
@Test
public void testJBrowserImage() throws Exception {
HtmlUnitFetcher htmlUnitFetcher = TestUtils.newInstance(HtmlUnitFetcher.class, "fetcher.htmlunit");
diff --git a/sparkler-core/sparkler-plugins/fetcher-htmlunit/src/test/resources/log4j.properties b/sparkler-core/sparkler-plugins/fetcher-htmlunit/src/test/resources/log4j.properties
index 8c64dff1..87e82f9a 100644
--- a/sparkler-core/sparkler-plugins/fetcher-htmlunit/src/test/resources/log4j.properties
+++ b/sparkler-core/sparkler-plugins/fetcher-htmlunit/src/test/resources/log4j.properties
@@ -16,7 +16,7 @@
#
# Root logger option
-log4j.rootLogger=WARN, stdout
+log4j.rootLogger=ERROR, stdout
# Direct log messages to stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
@@ -24,4 +24,6 @@ log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L [%t] - %m%n
-log4j.logger.edu.usc.irds=DEBUG
\ No newline at end of file
+log4j.logger.edu.usc.irds=DEBUG
+log4j.logger.com.gargoylesoftware=FATAL
+log4j.logger.org.apache.commons.httpclient=OFF
\ No newline at end of file
diff --git a/sparkler-core/sparkler-plugins/fetcher-jbrowser/pom.xml b/sparkler-core/sparkler-plugins/fetcher-jbrowser/pom.xml
deleted file mode 100644
index 4343b394..00000000
--- a/sparkler-core/sparkler-plugins/fetcher-jbrowser/pom.xml
+++ /dev/null
@@ -1,62 +0,0 @@
-
-
-
-
- sparkler-plugins
- edu.usc.irds.sparkler.plugin
- 0.2.1-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- fetcher-jbrowser
- jar
-
- fetcher-jbrowser
- http://maven.apache.org
-
- UTF-8
- 0.16.4
- ${project.artifactId}
- edu.usc.irds.sparkler.plugin.FetcherJBrowserActivator
-
-
-
- com.machinepublishers
- jbrowserdriver
- ${jbrowserdriver.version}
-
-
- slf4j-api
- org.slf4j
-
-
-
-
-
-
-
- maven-assembly-plugin
-
-
- maven-jar-plugin
-
-
-
-
diff --git a/sparkler-core/sparkler-plugins/pom.xml b/sparkler-core/sparkler-plugins/pom.xml
deleted file mode 100644
index c54595a9..00000000
--- a/sparkler-core/sparkler-plugins/pom.xml
+++ /dev/null
@@ -1,133 +0,0 @@
-
-
-
-
-
-
- sparkler-parent
- edu.usc.irds.sparkler
- 0.2.1-SNAPSHOT
- ../pom.xml
-
- edu.usc.irds.sparkler.plugin
- sparkler-plugins
- 4.0.0
- pom
- http://irds.usc.edu/sparkler/
-
-
- urlfilter-regex
- scorer-dd-svn
- fetcher-jbrowser
- fetcher-htmlunit
- fetcher-chrome
- urlfilter-samehost
-
- template-plugin
-
-
-
- ${project.version}
- ${project.groupId}
-
-
-
-
-
-
-
- edu.usc.irds.sparkler
- sparkler-api
- ${project.parent.version}
- provided
-
-
-
-
-
- org.pf4j
- pf4j
- ${pf4j.version}
- provided
-
-
- edu.usc.irds.sparkler
- sparkler-tests-base
- ${project.version}
- test
-
-
-
-
-
-
-
- maven-assembly-plugin
-
- false
- ${project.parent.parent.basedir}${file.separator}build${file.separator}plugins
-
- jar-with-dependencies
-
-
-
- ${plugin.id}
- ${plugin.class}
- ${plugin.version}
- ${plugin.provider}
- ${plugin.dependencies}
-
-
-
-
-
- make-assembly
- package
-
- single
-
-
-
-
-
- org.apache.maven.plugins
- maven-jar-plugin
- 2.4
-
-
-
- ${plugin.id}
- ${plugin.class}
- ${plugin.version}
- ${plugin.provider}
- ${plugin.dependencies}
-
-
-
-
-
-
-
-
diff --git a/sparkler-core/sparkler-plugins/scorer-dd-svn/pom.xml b/sparkler-core/sparkler-plugins/scorer-dd-svn/pom.xml
deleted file mode 100644
index bd15a79c..00000000
--- a/sparkler-core/sparkler-plugins/scorer-dd-svn/pom.xml
+++ /dev/null
@@ -1,61 +0,0 @@
-
-
-
-
- sparkler-plugins
- edu.usc.irds.sparkler.plugin
- 0.2.1-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- scorer-dd-svn
- jar
-
- scorer-dd-svn
- http://maven.apache.org
-
- UTF-8
- ${project.artifactId}
- ${project.version}
- ${project.groupId}
- edu.usc.irds.sparkler.plugin.DdSvnScorerActivator
-
-
-
-
- org.apache.httpcomponents
- httpclient
- 4.3.6
-
-
-
-
-
-
-
-
- maven-assembly-plugin
-
-
- maven-jar-plugin
-
-
-
-
diff --git a/sparkler-core/sparkler-plugins/template-plugin/pom.xml b/sparkler-core/sparkler-plugins/template-plugin/pom.xml
deleted file mode 100644
index dbd87bba..00000000
--- a/sparkler-core/sparkler-plugins/template-plugin/pom.xml
+++ /dev/null
@@ -1,52 +0,0 @@
-
-
-
-
- sparkler-plugins
- edu.usc.irds.sparkler.plugin
- 0.2.1-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- template-plugin
- jar
-
- template-plugin
- http://maven.apache.org
-
- UTF-8
- ${project.artifactId}
- ${project.version}
- ${project.groupId}
- edu.usc.irds.sparkler.plugin.MyPluginActivator
-
-
-
-
-
-
- maven-assembly-plugin
-
-
- maven-jar-plugin
-
-
-
-
diff --git a/sparkler-core/sparkler-plugins/urlfilter-regex/pom.xml b/sparkler-core/sparkler-plugins/urlfilter-regex/pom.xml
deleted file mode 100644
index b4117d02..00000000
--- a/sparkler-core/sparkler-plugins/urlfilter-regex/pom.xml
+++ /dev/null
@@ -1,52 +0,0 @@
-
-
-
-
- sparkler-plugins
- edu.usc.irds.sparkler.plugin
- 0.2.1-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- urlfilter-regex
- jar
-
- urlfilter-regex
- http://maven.apache.org
-
- UTF-8
- ${project.artifactId}
- ${project.version}
- ${project.groupId}
- edu.usc.irds.sparkler.plugin.RegexURLFilterActivator
-
-
-
-
-
-
- maven-assembly-plugin
-
-
- maven-jar-plugin
-
-
-
-
diff --git a/sparkler-core/sparkler-plugins/urlfilter-regex/src/main/java/edu/usc/irds/sparkler/plugin/regex/RegexURLFilterBase.java b/sparkler-core/sparkler-plugins/urlfilter-regex/src/main/java/edu/usc/irds/sparkler/plugin/regex/RegexURLFilterBase.java
index e98cfb5a..869e4c47 100644
--- a/sparkler-core/sparkler-plugins/urlfilter-regex/src/main/java/edu/usc/irds/sparkler/plugin/regex/RegexURLFilterBase.java
+++ b/sparkler-core/sparkler-plugins/urlfilter-regex/src/main/java/edu/usc/irds/sparkler/plugin/regex/RegexURLFilterBase.java
@@ -45,10 +45,10 @@
* The regular expressions rules are expressed in a file.
*
*
- * The format of this file is made of many rules (one per line):
+ * The format of this file is made of many rules (one per line):
*
* [+-]<regex>
- *
+ *
* where plus (+)means go ahead and index it and minus (
* -)means no.
*
diff --git a/sparkler-core/sparkler-plugins/urlfilter-samehost/pom.xml b/sparkler-core/sparkler-plugins/urlfilter-samehost/pom.xml
deleted file mode 100644
index 0f19755a..00000000
--- a/sparkler-core/sparkler-plugins/urlfilter-samehost/pom.xml
+++ /dev/null
@@ -1,52 +0,0 @@
-
-
-
-
- sparkler-plugins
- edu.usc.irds.sparkler.plugin
- 0.2.1-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- urlfilter-samehost
- jar
-
- urlfilter-samehost
- http://maven.apache.org
-
- UTF-8
- ${project.artifactId}
- ${project.version}
- ${project.groupId}
- edu.usc.irds.sparkler.plugin.UrlFilterSameHostActivator
-
-
-
-
-
-
- maven-assembly-plugin
-
-
- maven-jar-plugin
-
-
-
-
diff --git a/sparkler-core/sparkler-tests-base/pom.xml b/sparkler-core/sparkler-tests-base/pom.xml
deleted file mode 100644
index 8d8c56f3..00000000
--- a/sparkler-core/sparkler-tests-base/pom.xml
+++ /dev/null
@@ -1,50 +0,0 @@
-
-
- sparkler-parent
- edu.usc.irds.sparkler
- 0.2.1-SNAPSHOT
-
- 4.0.0
-
- sparkler-tests-base
- jar
-
- sparkler-tests-base
- http://maven.apache.org
-
-
- UTF-8
- 9.4.0.v20161208
-
-
-
-
- org.eclipse.jetty
- jetty-server
- ${jetty.version}
-
-
- org.eclipse.jetty
- jetty-servlet
- ${jetty.version}
-
-
- junit
- junit
-
-
- org.slf4j
- slf4j-api
- ${slf4j.version}
- provided
-
-
- org.slf4j
- slf4j-log4j12
- ${slf4j.version}
- provided
-
-
-
diff --git a/sparkler-core/sparkler-tests-base/src/main/java/edu/usc/irds/sparkler/test/WebServer.java b/sparkler-core/sparkler-tests-base/src/main/java/edu/usc/irds/sparkler/test/WebServer.java
index ce80c1ff..aa70a68a 100644
--- a/sparkler-core/sparkler-tests-base/src/main/java/edu/usc/irds/sparkler/test/WebServer.java
+++ b/sparkler-core/sparkler-tests-base/src/main/java/edu/usc/irds/sparkler/test/WebServer.java
@@ -32,9 +32,10 @@
public class WebServer extends Server {
private static final Logger LOG = LoggerFactory.getLogger(WebServer.class);
-
public static final int DEFAULT_PORT = 8080;
+ private int port;
+
public static String getDefaultPath(){
return WebServer.class.getClassLoader().getResource("webapp").toExternalForm();
}
@@ -45,6 +46,7 @@ public WebServer(){
public WebServer(int port, String resRoot){
super(port);
+ this.port = port;
LOG.info("Port:{}, Resources Root:{}", port, resRoot);
ResourceHandler rh0 = new ResourceHandler();
ContextHandler context0 = new ContextHandler();
@@ -67,6 +69,9 @@ public WebServer(int port, String resRoot){
this.setHandler(contexts);
}
+ public int getPort() {
+ return port;
+ }
public static void main(String[] args) throws Exception {
WebServer server = new WebServer();
diff --git a/sparkler-core/sparkler-tests-base/src/main/java/edu/usc/irds/sparkler/test/WebServerRunListener.java b/sparkler-core/sparkler-tests-base/src/main/java/edu/usc/irds/sparkler/test/WebServerRunListener.java
index 1496f89e..2ffae98d 100644
--- a/sparkler-core/sparkler-tests-base/src/main/java/edu/usc/irds/sparkler/test/WebServerRunListener.java
+++ b/sparkler-core/sparkler-tests-base/src/main/java/edu/usc/irds/sparkler/test/WebServerRunListener.java
@@ -21,25 +21,57 @@
import org.junit.runner.Result;
import org.junit.runner.notification.RunListener;
+import java.io.IOException;
+import java.net.Socket;
+
/**
* This implementation of {@link RunListener} starts a web server before the tests
* and gracefully stops it when tests end.
+ *
+ * Check port availability for tests that depend on the listener and run in parallel
+ *
+ * TODO: Improve this to run from Scala / SBT
*/
public class WebServerRunListener extends RunListener {
+ private static final int MAX_RETRIES = 5;
+ private static final int WAIT_FOR_PORT_MS = 5000;
+
private WebServer server = new WebServer();
+ private boolean isPortInUse(int port) {
+ // Assume the port is not in use
+ boolean isInUse = false;
+ try {
+ (new Socket("localhost", port)).close();
+ isInUse = true;
+ }
+ catch(IOException e) {
+ // Could not connect. Pass
+ }
+ return isInUse;
+ }
+
@Override
public void testRunStarted(Description description) throws Exception {
super.testRunStarted(description);
- System.out.println("STARTING...");
+ int numRetries = 0;
+ while (isPortInUse(server.getPort())) {
+ if (numRetries > MAX_RETRIES) {
+ throw new Exception("WebServerRunListener: Port " + server.getPort() + " is already in use!");
+ }
+ System.out.println("WebServerRunListener: Waiting for port: " + server.getPort());
+ Thread.sleep(WAIT_FOR_PORT_MS);
+ numRetries += 1;
+ }
server.start();
+ System.out.println("WebServerRunListener: STARTED");
}
@Override
public void testRunFinished(Result result) throws Exception {
super.testRunFinished(result);
- System.out.println("STOPPING...");
server.stop();
+ System.out.println("WebServerRunListener: STOPPED");
}
}
diff --git a/sparkler-core/sparkler-ui/pom.xml b/sparkler-core/sparkler-ui/pom.xml
deleted file mode 100644
index 58d4e8f9..00000000
--- a/sparkler-core/sparkler-ui/pom.xml
+++ /dev/null
@@ -1,78 +0,0 @@
-
-
-
-
- sparkler-parent
- edu.usc.irds.sparkler
- 0.2.1-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- sparkler-ui
- war
-
- sparkler-ui
- http://irds.usc.edu/sparkler/
-
-
- UTF-8
- ${project.basedir}${file.separator}sparkler-dashboard
-
-
-
-
- com.lucidworks
- banana
- 1.5.1
- war
-
-
-
-
-
-
- maven-clean-plugin
- ${maven.clean.plugin.version}
-
-
-
- ${banana.output.directory}
-
-
-
-
-
- maven-war-plugin
- ${maven.war.plugin.version}
-
- ${banana.output.directory}
-
-
- com.lucidworks
- banana
-
-
- WEB-INF/web.xml
-
-
-
-
-
-
diff --git a/sparkler-core/sparkler-ui/src/banana b/sparkler-core/sparkler-ui/src/banana
new file mode 160000
index 00000000..45eb3e07
--- /dev/null
+++ b/sparkler-core/sparkler-ui/src/banana
@@ -0,0 +1 @@
+Subproject commit 45eb3e07439985641432a69721b5ca80d0ee0762
diff --git a/sparkler-core/sparkler-ui/WEB-INF/web.xml b/sparkler-core/sparkler-ui/src/main/webapp/WEB-INF/web.xml
similarity index 100%
rename from sparkler-core/sparkler-ui/WEB-INF/web.xml
rename to sparkler-core/sparkler-ui/src/main/webapp/WEB-INF/web.xml
diff --git a/sparkler-core/version.sbt b/sparkler-core/version.sbt
new file mode 100644
index 00000000..ed63e13c
--- /dev/null
+++ b/sparkler-core/version.sbt
@@ -0,0 +1 @@
+version in ThisBuild := "0.3.1-SNAPSHOT"
From 72d6b3ebcbb23f70d78e606f6c2e0f92d51a96a0 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 10:29:48 +0100
Subject: [PATCH 002/335] add build props
---
sparkler-core/project/build.properties | 1 +
1 file changed, 1 insertion(+)
create mode 100644 sparkler-core/project/build.properties
diff --git a/sparkler-core/project/build.properties b/sparkler-core/project/build.properties
new file mode 100644
index 00000000..dbae93bc
--- /dev/null
+++ b/sparkler-core/project/build.properties
@@ -0,0 +1 @@
+sbt.version=1.4.9
From 8aa185648855a33a2d37fd344a1434213788f941 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 14:24:46 +0100
Subject: [PATCH 003/335] fix gson dependency
---
sparkler-core/build.sbt | 1 +
sparkler-core/project/Dependencies.scala | 1 +
.../java/edu/usc/irds/sparkler/SparklerConfiguration.java | 5 +----
3 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index 0940e76a..ba2ec481 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -76,6 +76,7 @@ lazy val api = (project in file("sparkler-api"))
Dependencies.nutch exclude("*", "*"),
Dependencies.snakeYaml,
Dependencies.Solr.solrj,
+ Dependencies.gson,
// Test
Dependencies.jUnit % Test,
diff --git a/sparkler-core/project/Dependencies.scala b/sparkler-core/project/Dependencies.scala
index fe344e1f..0997135f 100644
--- a/sparkler-core/project/Dependencies.scala
+++ b/sparkler-core/project/Dependencies.scala
@@ -22,6 +22,7 @@ object Dependencies {
lazy val banana = "com.lucidworks" % "banana" % "1.5.1" artifacts(Artifact("banana", "war", "war"))
lazy val commonsValidator = "commons-validator" % "commons-validator" % "1.5.1"
lazy val httpClient = "org.apache.httpcomponents" % "httpclient" % "4.5.2"
+ lazy val gson = "com.google.code.gson" % "gson" % "2.8.7"
object Jackson {
private val group = "com.fasterxml.jackson.core"
private val version = "2.6.5"
diff --git a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/SparklerConfiguration.java b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/SparklerConfiguration.java
index b8b19950..efdb5187 100644
--- a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/SparklerConfiguration.java
+++ b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/SparklerConfiguration.java
@@ -17,17 +17,14 @@
package edu.usc.irds.sparkler;
+import com.google.gson.Gson;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
-import org.json.simple.parser.ParseException;
import java.util.HashMap;
-import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
-import com.google.gson.Gson;
-
public class SparklerConfiguration extends JSONObject {
public SparklerConfiguration() {
From 7c89519119306cfdd03a0dd47f7fbf0a0644334f Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 15:16:47 +0100
Subject: [PATCH 004/335] fix gson dependency
---
sparkler-core/build.sbt | 6 ++++++
sparkler-core/plugins.build.sbt | 1 +
sparkler-core/project/Dependencies.scala | 1 +
sparkler-core/project/PluginDependencies.scala | 2 ++
4 files changed, 10 insertions(+)
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index ba2ec481..12d19f6a 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -56,6 +56,12 @@ developers := List(
// Apologies if we missed you. Please add yourself here..
)
+
+
+resolvers += "Gitlab Spicule" at "https://gitlab.com/api/v4/projects/23300400/packages/maven"
+
+
+
lazy val plugins = ProjectRef(file("./"), "plugins")
lazy val root = (project in file("."))
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index d8e52b93..978dff9c 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -66,6 +66,7 @@ lazy val fetcherChrome = (project in file(s"$sparklerPlugins/fetcher-chrome"))
libraryDependencies ++= Seq(
//FetcherChrome.Selenium.chromeDriver,
FetcherChrome.Selenium.java,
+ FetcherChrome.browserup,
),
Settings.pluginManifest(
id = "fetcher-chrome",
diff --git a/sparkler-core/project/Dependencies.scala b/sparkler-core/project/Dependencies.scala
index 0997135f..0803421b 100644
--- a/sparkler-core/project/Dependencies.scala
+++ b/sparkler-core/project/Dependencies.scala
@@ -23,6 +23,7 @@ object Dependencies {
lazy val commonsValidator = "commons-validator" % "commons-validator" % "1.5.1"
lazy val httpClient = "org.apache.httpcomponents" % "httpclient" % "4.5.2"
lazy val gson = "com.google.code.gson" % "gson" % "2.8.7"
+
object Jackson {
private val group = "com.fasterxml.jackson.core"
private val version = "2.6.5"
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index f2308b84..96642fc8 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -27,6 +27,8 @@ object FetcherChrome {
lazy val chromeDriver = group % "selenium-chrome-driver" % version
lazy val java = group % "selenium-java" % version
}
+ lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
+
}
object FetcherHtmlUnit {
From 1ebd5d05272e6edb7f3aaa8e72df6f5e78b5138c Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 15:27:31 +0100
Subject: [PATCH 005/335] fix gson dependency
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 96642fc8..a7bc611a 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
-
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.0-SNAPSHOT"
}
object FetcherHtmlUnit {
From 55513808911bb901e178322e458805967c3f83ee Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 15:51:42 +0100
Subject: [PATCH 006/335] fix gson dependency
---
sparkler-core/plugins.build.sbt | 1 +
1 file changed, 1 insertion(+)
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index 978dff9c..71b4ffdc 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -67,6 +67,7 @@ lazy val fetcherChrome = (project in file(s"$sparklerPlugins/fetcher-chrome"))
//FetcherChrome.Selenium.chromeDriver,
FetcherChrome.Selenium.java,
FetcherChrome.browserup,
+ FetcherChrome.seleniumscripter,
),
Settings.pluginManifest(
id = "fetcher-chrome",
From e640666d800205e280aeb975ecc31e17eae050f0 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 15:53:55 +0100
Subject: [PATCH 007/335] fix gson dependency
---
sparkler-core/build.sbt | 1 +
1 file changed, 1 insertion(+)
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index 12d19f6a..e630ab47 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -59,6 +59,7 @@ developers := List(
resolvers += "Gitlab Spicule" at "https://gitlab.com/api/v4/projects/23300400/packages/maven"
+resolvers += "Gitlab Spicule 2" at "https://gitlab.com/api/v4/projects/26391218/packages/maven"
From aa76ee2fbf7fb530c76d7fe2d5a46fd4a8ee73a6 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 15:56:15 +0100
Subject: [PATCH 008/335] fix gson dependency
---
sparkler-core/build.sbt | 7 -------
sparkler-core/project/Settings.scala | 4 +++-
2 files changed, 3 insertions(+), 8 deletions(-)
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index e630ab47..ba2ec481 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -56,13 +56,6 @@ developers := List(
// Apologies if we missed you. Please add yourself here..
)
-
-
-resolvers += "Gitlab Spicule" at "https://gitlab.com/api/v4/projects/23300400/packages/maven"
-resolvers += "Gitlab Spicule 2" at "https://gitlab.com/api/v4/projects/26391218/packages/maven"
-
-
-
lazy val plugins = ProjectRef(file("./"), "plugins")
lazy val root = (project in file("."))
diff --git a/sparkler-core/project/Settings.scala b/sparkler-core/project/Settings.scala
index ead9f140..df030ed5 100644
--- a/sparkler-core/project/Settings.scala
+++ b/sparkler-core/project/Settings.scala
@@ -61,7 +61,9 @@ object Settings {
"Restlet Repository" at "https://maven.restlet.com/",
"JBoss Repository" at "https://repository.jboss.org/nexus/content/repositories/",
"Sonatype Snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/",
- "Scala-Tools Snapshots" at "https://scala-tools.org/repo-snapshots/"
+ "Scala-Tools Snapshots" at "https://scala-tools.org/repo-snapshots/",
+ "Gitlab Spicule 2" at "https://gitlab.com/api/v4/projects/26391218/packages/maven",
+ "Gitlab Spicule" at "https://gitlab.com/api/v4/projects/23300400/packages/maven"
)
)
lazy val assemblyProject = common ++ baseAssemblySettings ++ Seq(
From af1489e0ec14d4e3ba9e6a2b1c928d903a8307da Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 16:01:51 +0100
Subject: [PATCH 009/335] fix gson dependency
---
sparkler-core/plugins.build.sbt | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index 71b4ffdc..4b8c62a9 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -77,7 +77,7 @@ lazy val fetcherChrome = (project in file(s"$sparklerPlugins/fetcher-chrome"))
)
.dependsOn(api)
-lazy val fetcherHtmlUnit = (project in file(s"$sparklerPlugins/fetcher-htmlunit"))
+/*lazy val fetcherHtmlUnit = (project in file(s"$sparklerPlugins/fetcher-htmlunit"))
.enablePlugins(JavaAppPackaging)
.settings(
Settings.plugin,
@@ -110,7 +110,7 @@ lazy val fetcherJBrowser = (project in file(s"$sparklerPlugins/fetcher-jbrowser"
dependencies = List.empty
)
)
- .dependsOn(api)
+ .dependsOn(api)*/
lazy val scorerDdSvn = (project in file(s"$sparklerPlugins/scorer-dd-svn"))
.enablePlugins(JavaAppPackaging)
From aca6ca4384ad6f1bd9b44f0fb10fc5612828c097 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 16:02:45 +0100
Subject: [PATCH 010/335] fix gson dependency
---
sparkler-core/plugins.build.sbt | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index 4b8c62a9..0513a49d 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -27,8 +27,8 @@ lazy val plugins = (project in file(s"$sparklerPlugins"))
)
.aggregate(
fetcherChrome,
- fetcherHtmlUnit,
- fetcherJBrowser,
+ /*fetcherHtmlUnit,
+ fetcherJBrowser,*/
scorerDdSvn,
urlFilterRegex,
urlFilterSameHost,
From f44e491f3940cd60003ed33c755a3f0a397bba36 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 16:08:16 +0100
Subject: [PATCH 011/335] fix gson dependency
---
sparkler-core/plugins.build.sbt | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index 0513a49d..6b07bf70 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -127,7 +127,10 @@ lazy val scorerDdSvn = (project in file(s"$sparklerPlugins/scorer-dd-svn"))
)
)
.dependsOn(api)
-
+assemblyMergeStrategy in assembly := {
+ case PathList("META-INF", xs @ _*) => MergeStrategy.discard
+ case x => MergeStrategy.first
+}
lazy val urlFilterRegex = (project in file(s"$sparklerPlugins/urlfilter-regex"))
.enablePlugins(JavaAppPackaging)
.settings(
From 071e33f555463528b09fff8ead7076740d645799 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 16:10:10 +0100
Subject: [PATCH 012/335] fix gson dependency
---
sparkler-core/plugins.build.sbt | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index 6b07bf70..66e6845c 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -116,6 +116,10 @@ lazy val scorerDdSvn = (project in file(s"$sparklerPlugins/scorer-dd-svn"))
.enablePlugins(JavaAppPackaging)
.settings(
Settings.plugin,
+ assemblyMergeStrategy in assembly := {
+ case PathList("META-INF", xs @ _*) => MergeStrategy.discard
+ case x => MergeStrategy.first
+ },
name := "scorer-dd-svn",
libraryDependencies ++= Seq(
ScorerDdSvn.httpClient
@@ -127,10 +131,7 @@ lazy val scorerDdSvn = (project in file(s"$sparklerPlugins/scorer-dd-svn"))
)
)
.dependsOn(api)
-assemblyMergeStrategy in assembly := {
- case PathList("META-INF", xs @ _*) => MergeStrategy.discard
- case x => MergeStrategy.first
-}
+
lazy val urlFilterRegex = (project in file(s"$sparklerPlugins/urlfilter-regex"))
.enablePlugins(JavaAppPackaging)
.settings(
From ae1c8b6d1c1e48f546d9b46af6916ca79cbb51dc Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 16:11:13 +0100
Subject: [PATCH 013/335] fix gson dependency
---
sparkler-core/plugins.build.sbt | 4 ----
sparkler-core/project/Settings.scala | 4 ++++
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index 66e6845c..0513a49d 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -116,10 +116,6 @@ lazy val scorerDdSvn = (project in file(s"$sparklerPlugins/scorer-dd-svn"))
.enablePlugins(JavaAppPackaging)
.settings(
Settings.plugin,
- assemblyMergeStrategy in assembly := {
- case PathList("META-INF", xs @ _*) => MergeStrategy.discard
- case x => MergeStrategy.first
- },
name := "scorer-dd-svn",
libraryDependencies ++= Seq(
ScorerDdSvn.httpClient
diff --git a/sparkler-core/project/Settings.scala b/sparkler-core/project/Settings.scala
index df030ed5..2efc54d1 100644
--- a/sparkler-core/project/Settings.scala
+++ b/sparkler-core/project/Settings.scala
@@ -76,6 +76,10 @@ object Settings {
)
lazy val plugin = assemblyProject ++ Seq(
autoScalaLibrary := false,
+ assemblyMergeStrategy in assembly := {
+ case PathList("META-INF", xs @ _*) => MergeStrategy.discard
+ case x => MergeStrategy.first
+ },
assemblyOutputPath in assembly := file(".") / buildDir / pluginsDir / s"${name.value}-${(version in ThisBuild).value}.jar"
)
From 73c0967e3f50aebb2d733ff4b3d017aa673b2111 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 16:12:12 +0100
Subject: [PATCH 014/335] fix gson dependency
---
sparkler-core/plugins.build.sbt | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index 0513a49d..71b4ffdc 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -27,8 +27,8 @@ lazy val plugins = (project in file(s"$sparklerPlugins"))
)
.aggregate(
fetcherChrome,
- /*fetcherHtmlUnit,
- fetcherJBrowser,*/
+ fetcherHtmlUnit,
+ fetcherJBrowser,
scorerDdSvn,
urlFilterRegex,
urlFilterSameHost,
@@ -77,7 +77,7 @@ lazy val fetcherChrome = (project in file(s"$sparklerPlugins/fetcher-chrome"))
)
.dependsOn(api)
-/*lazy val fetcherHtmlUnit = (project in file(s"$sparklerPlugins/fetcher-htmlunit"))
+lazy val fetcherHtmlUnit = (project in file(s"$sparklerPlugins/fetcher-htmlunit"))
.enablePlugins(JavaAppPackaging)
.settings(
Settings.plugin,
@@ -110,7 +110,7 @@ lazy val fetcherJBrowser = (project in file(s"$sparklerPlugins/fetcher-jbrowser"
dependencies = List.empty
)
)
- .dependsOn(api)*/
+ .dependsOn(api)
lazy val scorerDdSvn = (project in file(s"$sparklerPlugins/scorer-dd-svn"))
.enablePlugins(JavaAppPackaging)
From c257e6e9d651d42392ef92ecda47184424e34c43 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 16:30:03 +0100
Subject: [PATCH 015/335] fix gson dependency
---
sparkler-core/build.sbt | 10 ++++++++--
sparkler-core/project/build.properties | 4 ----
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index ba2ec481..c28e39b0 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -25,7 +25,7 @@ maintainer := Settings.projectMaintainer
scalaVersion in ThisBuild := "2.12.12"
javacOptions in (Compile, doc) in ThisBuild ++= Seq("-source", "13")
javacOptions in (Compile, compile) ++= Seq("-target", "13")
-
+addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "(version)")
// Common dependencies
libraryDependencies in ThisBuild ++= Seq(
Dependencies.pf4j % "provided",
@@ -173,4 +173,10 @@ lazy val ui = (project in file("sparkler-ui"))
IO.move(packageFile, buildLocation)
buildLocation
}
- )
\ No newline at end of file
+ )
+
+
+enablePlugins(PackPlugin)
+
+name := "myprog"
+base := file(".")
\ No newline at end of file
diff --git a/sparkler-core/project/build.properties b/sparkler-core/project/build.properties
index 6bb55841..b3663160 100644
--- a/sparkler-core/project/build.properties
+++ b/sparkler-core/project/build.properties
@@ -1,5 +1 @@
-<<<<<<< HEAD
sbt.version = 1.5.0
-=======
-sbt.version=1.4.9
->>>>>>> master
From 034d59e65533c25e994ac4d3894b17b2df02fcd1 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 16:36:41 +0100
Subject: [PATCH 016/335] fix gson dependency
---
sparkler-core/build.sbt | 1 -
sparkler-core/project/plugins.sbt | 3 ++-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index c28e39b0..878339ca 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -25,7 +25,6 @@ maintainer := Settings.projectMaintainer
scalaVersion in ThisBuild := "2.12.12"
javacOptions in (Compile, doc) in ThisBuild ++= Seq("-source", "13")
javacOptions in (Compile, compile) ++= Seq("-target", "13")
-addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "(version)")
// Common dependencies
libraryDependencies in ThisBuild ++= Seq(
Dependencies.pf4j % "provided",
diff --git a/sparkler-core/project/plugins.sbt b/sparkler-core/project/plugins.sbt
index a8c859f1..cf7817d0 100644
--- a/sparkler-core/project/plugins.sbt
+++ b/sparkler-core/project/plugins.sbt
@@ -18,4 +18,5 @@
addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.7.4")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
-addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.13")
\ No newline at end of file
+addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.13")
+addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.13")
From 2e37189fedb24be1612a41427d9ec4be27a114e7 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 16:37:37 +0100
Subject: [PATCH 017/335] fix gson dependency
---
sparkler-core/build.sbt | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index 878339ca..09264ed3 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -175,7 +175,4 @@ lazy val ui = (project in file("sparkler-ui"))
)
-enablePlugins(PackPlugin)
-
-name := "myprog"
-base := file(".")
\ No newline at end of file
+enablePlugins(PackPlugin)
\ No newline at end of file
From 1c3a9f3a040d878996f48b9c68d78366599b6b15 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 16:43:26 +0100
Subject: [PATCH 018/335] fix gson dependency
---
sparkler-core/build.sbt | 4 +++-
.../main/scala/edu/usc/irds/sparkler/service/Injector.scala | 2 --
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index 09264ed3..5b1c6064 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -175,4 +175,6 @@ lazy val ui = (project in file("sparkler-ui"))
)
-enablePlugins(PackPlugin)
\ No newline at end of file
+enablePlugins(PackPlugin)
+
+packMain := Map("inject" -> "edu.usc.irds.sparkler.service.Injector")
\ No newline at end of file
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala
index ef6643be..0dc85089 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala
@@ -16,9 +16,7 @@
*/
package edu.usc.irds.sparkler.service
-import scala.collection.JavaConversions._
import java.io.File
-import java.nio.file.NotDirectoryException
import java.util
import edu.usc.irds.sparkler.{Constants, SparklerConfiguration}
From a968a86bd011b28af140c26dac9b42b4d43d95a2 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 16:59:25 +0100
Subject: [PATCH 019/335] fix gson dependency
---
sparkler-core/bin/sparkler.sh | 14 +-------------
sparkler-core/build.sbt | 4 ++--
2 files changed, 3 insertions(+), 15 deletions(-)
diff --git a/sparkler-core/bin/sparkler.sh b/sparkler-core/bin/sparkler.sh
index f8e1d43d..c6623da3 100755
--- a/sparkler-core/bin/sparkler.sh
+++ b/sparkler-core/bin/sparkler.sh
@@ -9,26 +9,14 @@ done
DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )"
SPARKLER_CORE_DIR="$DIR/.."
-<<<<<<< HEAD
-LIB_DIR=`echo $DIR/sparkler-app-*-SNAPSHOT/lib`
-if [ ! -d "$LIB_DIR" ]
- then
- echo "ERROR: Can't find Sparkler Lib directory at $LIB_DIR.
- Looks like Sparkler is not built. Please refer to build instructions"
-=======
JAR=`echo $SPARKLER_CORE_DIR/build/sparkler-app-*-SNAPSHOT.jar`
if [ ! -f "$JAR" ]
then
echo "ERROR: Can't find Sparkler Jar at $JAR.
Looks like the jar is not built. Please refer to build instructions. Or see ./dockler.sh"
->>>>>>> master
exit 2
fi
# run
# -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005
-<<<<<<< HEAD
-java -Xms1g -cp $DIR/conf:${LIB_DIR}/* -Dpf4j.pluginsDir=$DIR/plugins edu.usc.irds.sparkler.Main $@
-=======
-java -Xms1g -cp $DIR/conf:$JAR -Dpf4j.pluginsDir=$SPARKLER_CORE_DIR/build/plugins edu.usc.irds.sparkler.Main $@
->>>>>>> master
+java -Xms1g -cp $DIR/conf:$JAR -Dpf4j.pluginsDir=$SPARKLER_CORE_DIR/build/plugins edu.usc.irds.sparkler.Main $@
\ No newline at end of file
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index 5b1c6064..b38cc23f 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -175,6 +175,6 @@ lazy val ui = (project in file("sparkler-ui"))
)
-enablePlugins(PackPlugin)
+/*enablePlugins(PackPlugin)
-packMain := Map("inject" -> "edu.usc.irds.sparkler.service.Injector")
\ No newline at end of file
+packMain := Map("inject" -> "edu.usc.irds.sparkler.service.Injector")*/
\ No newline at end of file
From 36f0c011c8dd6ceecec8d27354a53fb94b3214cf Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 2 Jun 2021 17:16:02 +0100
Subject: [PATCH 020/335] fix shell script
---
sparkler-core/bin/sparkler.sh | 39 +++++++++++++++++++++++++----------
1 file changed, 28 insertions(+), 11 deletions(-)
diff --git a/sparkler-core/bin/sparkler.sh b/sparkler-core/bin/sparkler.sh
index c6623da3..0e6f47fb 100755
--- a/sparkler-core/bin/sparkler.sh
+++ b/sparkler-core/bin/sparkler.sh
@@ -1,5 +1,23 @@
#!/usr/bin/env bash
+# Attempt to resolve the sparkler jar using relative paths
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+DIR="$DIR/.."
+
+JAR=`echo $DIR/sparkler-app-*-SNAPSHOT.jar`
+if [ -f "$JAR" ]
+ then
+ # run
+ # -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005
+ java -Xms1g -cp $DIR/conf:$JAR -Dpf4j.pluginsDir=$DIR/plugins edu.usc.irds.sparkler.Main $@
+ exit 0
+fi
+
+# Attempt to resolve the sparkler jar using absolute paths
+# We do this because in the elastic-search deployment we add sparkler.sh to /usr/bin
+# In that case the Sparkler jar cannot be resolved via relative paths.
+# The followig code block resolves the absolute location of this script on disk
+# We assume that it is located in sparkler-core/bin/
SOURCE="${BASH_SOURCE[0]}"
while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )"
@@ -7,16 +25,15 @@ while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symli
[[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located
done
DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )"
-SPARKLER_CORE_DIR="$DIR/.."
-
-JAR=`echo $SPARKLER_CORE_DIR/build/sparkler-app-*-SNAPSHOT.jar`
-if [ ! -f "$JAR" ]
- then
- echo "ERROR: Can't find Sparkler Jar at $JAR.
- Looks like the jar is not built. Please refer to build instructions. Or see ./dockler.sh"
- exit 2
-fi
-
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+SPARKLER_BUILD_DIR="$DIR/../build"
+JAR=`echo $DIR/../sparkler-app-*/lib`
+#if [ ! -f "$JAR" ]
+# then
+# echo "ERROR: Can't find Sparkler Jar at $JAR.
+# Looks like the jar is not built. Please refer to build instructions. Or see ./dockler.sh"
+# exit 2
+#fi
# run
# -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005
-java -Xms1g -cp $DIR/conf:$JAR -Dpf4j.pluginsDir=$SPARKLER_CORE_DIR/build/plugins edu.usc.irds.sparkler.Main $@
\ No newline at end of file
+java -Xms1g -cp $DIR/../conf:$JAR/* -Dpf4j.pluginsDir=$DIR/../plugins edu.usc.irds.sparkler.Main $@
\ No newline at end of file
From a735832d6511b3736d83cacf505090a169578fd4 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Tue, 22 Jun 2021 16:53:44 +0100
Subject: [PATCH 021/335] add sbt build
---
.github/workflows/build-sbt.yaml | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
create mode 100644 .github/workflows/build-sbt.yaml
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
new file mode 100644
index 00000000..dba64a31
--- /dev/null
+++ b/.github/workflows/build-sbt.yaml
@@ -0,0 +1,23 @@
+name: Scala CI
+
+on:
+ push:
+ branches: [ mvn2sbt ]
+ pull_request:
+ branches: [ mvn2sbt ]
+
+jobs:
+ build:
+
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up JDK 8
+ uses: actions/setup-java@v2
+ with:
+ java-version: '8'
+ distribution: 'adopt'
+ - name: Run package
+ run: sbt package
+
From 5aba7d0cc0c6f0a6d875126d81bfc528a1d5f1da Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Tue, 22 Jun 2021 16:56:20 +0100
Subject: [PATCH 022/335] add sbt build
---
.github/workflows/build-sbt.yaml | 2 +-
.github/workflows/build.yaml | 5 ++++-
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index dba64a31..81cf95de 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -20,4 +20,4 @@ jobs:
distribution: 'adopt'
- name: Run package
run: sbt package
-
+ working-directory: sparkler-core
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index bc6b2e7a..95164510 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -1,6 +1,9 @@
name: Sparkler Build
-on: [push]
+on:
+ push:
+ branches:
+ - '!mvn2sbt'
jobs:
build:
From caf4fcec1fbdd8ca4c4b209579bfb0aeb11009cb Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Tue, 22 Jun 2021 18:13:20 +0100
Subject: [PATCH 023/335] add to build
---
.github/workflows/build-sbt.yaml | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index 81cf95de..07857737 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -21,3 +21,12 @@ jobs:
- name: Run package
run: sbt package
working-directory: sparkler-core
+ - name: Install databricks
+ run: pip install databricks-cli
+ - name: Upload
+ run: /home/runner/.local/bin/databricks fs cp --overwrite build/ dbfs:/FileStore/sparkler-submit/
+ env:
+ DATABRICKS_HOST: https://kli-mmit.cloud.databricks.com
+ DATABRICKS_TOKEN: dapi686259bd2b151be0cb4a63c1f63cfd95
+ working-directory: sparkler-core
+
From 7865dfedf145b00b940e3458784c228537866fd1 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Tue, 22 Jun 2021 18:16:05 +0100
Subject: [PATCH 024/335] add to build
---
.github/workflows/build-sbt.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index 07857737..87fa8592 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -24,7 +24,7 @@ jobs:
- name: Install databricks
run: pip install databricks-cli
- name: Upload
- run: /home/runner/.local/bin/databricks fs cp --overwrite build/ dbfs:/FileStore/sparkler-submit/
+ run: /home/runner/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
env:
DATABRICKS_HOST: https://kli-mmit.cloud.databricks.com
DATABRICKS_TOKEN: dapi686259bd2b151be0cb4a63c1f63cfd95
From 139549ba6225dc1b1b71146a2bce6a9ac768a10e Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Tue, 22 Jun 2021 18:28:20 +0100
Subject: [PATCH 025/335] add to build
---
.github/workflows/build-sbt.yaml | 7 ++-----
sparkler-core/deploy.sh | 5 +++++
2 files changed, 7 insertions(+), 5 deletions(-)
create mode 100755 sparkler-core/deploy.sh
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index 87fa8592..f9f466c3 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -22,11 +22,8 @@ jobs:
run: sbt package
working-directory: sparkler-core
- name: Install databricks
- run: pip install databricks-cli
- - name: Upload
- run: /home/runner/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
+ run: ./deploy.sh
+ working-directory: sparkler-core
env:
DATABRICKS_HOST: https://kli-mmit.cloud.databricks.com
DATABRICKS_TOKEN: dapi686259bd2b151be0cb4a63c1f63cfd95
- working-directory: sparkler-core
-
diff --git a/sparkler-core/deploy.sh b/sparkler-core/deploy.sh
new file mode 100755
index 00000000..7f0ed31d
--- /dev/null
+++ b/sparkler-core/deploy.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+pip install databricks-cli
+
+/home/runner/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
From 32c3c31bcbafabf81b11491b66f0e42bd1bc53c0 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Tue, 22 Jun 2021 18:40:27 +0100
Subject: [PATCH 026/335] update token
---
.github/workflows/build-sbt.yaml | 2 +-
sparkler-core/deploy.sh | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index f9f466c3..de13b2a5 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -26,4 +26,4 @@ jobs:
working-directory: sparkler-core
env:
DATABRICKS_HOST: https://kli-mmit.cloud.databricks.com
- DATABRICKS_TOKEN: dapi686259bd2b151be0cb4a63c1f63cfd95
+ DATABRICKS_TOKEN: dapi88de017d8ab1fe80afe5691c1e786185
diff --git a/sparkler-core/deploy.sh b/sparkler-core/deploy.sh
index 7f0ed31d..4d24c1b5 100755
--- a/sparkler-core/deploy.sh
+++ b/sparkler-core/deploy.sh
@@ -2,4 +2,4 @@
pip install databricks-cli
-/home/runner/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
+~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
From e2b273ee0668c67964077e2abce43767e505b299 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Tue, 22 Jun 2021 22:08:41 +0100
Subject: [PATCH 027/335] update sbt stuff
---
sparkler-core/README.md | 3 +
sparkler-core/build.sbt | 26 +++
sparkler-core/conf/sparkler-default.yaml | 2 +-
sparkler-core/project/Dependencies.scala | 6 +-
.../src/main/resources/sparkler-default.yaml | 178 ++++++++++++++++++
5 files changed, 211 insertions(+), 4 deletions(-)
create mode 100644 sparkler-core/README.md
create mode 100644 sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
diff --git a/sparkler-core/README.md b/sparkler-core/README.md
new file mode 100644
index 00000000..1b2242cd
--- /dev/null
+++ b/sparkler-core/README.md
@@ -0,0 +1,3 @@
+Sample spark submit
+
+ ~/Projects/spark-3.0.2-bin-hadoop2.7/bin/spark-submit --class edu.usc.irds.sparkler.Main --master spark://localhost:7077 --driver-java-options '-Dpf4j.pluginsDir=/home/bugg/Projects/sparkler-fork/sparkler-core/build/plugins/' build/sparkler-app-0.3.1-SNAPSHOT.jar inject -su https://news.bbc.co.uk
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index b38cc23f..5170c2ea 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -81,6 +81,18 @@ lazy val api = (project in file("sparkler-api"))
Dependencies.jUnit % Test,
Dependencies.jUnitInterface % Test
),
+ assemblyMergeStrategy in assembly := {
+ case x if x.contains("io.netty.versions.properties") => MergeStrategy.first
+ case x if x.contains("Log4j2Plugins.dat") => MergeStrategy.first
+ case x if x.contains("module-info.class") => MergeStrategy.first
+ case PathList("org", "apache", "logging", xs@_*) => MergeStrategy.first
+ case PathList("org", "apache", "logging", "log4j", xs@_*) => MergeStrategy.first
+ case PathList("org", "apache", "commons", "logging", xs@_*) => MergeStrategy.first
+ case PathList("org", "apache", "log4j", xs@_*) => MergeStrategy.first
+ case PathList("org", "slf4j", "impl", xs@_*) => MergeStrategy.first
+ case PathList("org", "cliffc", "high_scale_lib", xs@_*) => MergeStrategy.first
+ case x => (assemblyMergeStrategy in assembly).value.apply(x)
+ },
testOptions += Tests.Argument(TestFrameworks.JUnit,
"--verbosity=1",
"--run-listener=edu.usc.irds.sparkler.test.WebServerRunListener")
@@ -106,6 +118,20 @@ lazy val app = (project in file("sparkler-app"))
Dependencies.Spark.sql,
Dependencies.tikaParsers,
),
+ assemblyMergeStrategy in assembly := {
+ case x if x.contains("io.netty.versions.properties") => MergeStrategy.first
+ case x if x.contains("Log4j2Plugins.dat") => MergeStrategy.first
+ case x if x.contains("module-info.class") => MergeStrategy.first
+ case PathList("org", "apache", "logging", "log4j", xs@_*) => MergeStrategy.first
+ case PathList("org", "apache", "logging", xs@_*) => MergeStrategy.first
+ case PathList("org", "apache", "log4j", xs@_*) => MergeStrategy.first
+ case PathList("org", "apache", "commons", "logging", xs@_*) => MergeStrategy.first
+ case PathList("org", "slf4j", "impl", xs@_*) => MergeStrategy.first
+ case PathList("org", "cliffc", "high_scale_lib", xs@_*) => MergeStrategy.first
+ case x => (assemblyMergeStrategy in assembly).value.apply(x)
+ },
+ //assembly / assemblyJarName := "something.jar",
+ assemblyOutputPath in assembly := file(".") / "build" / s"${name.value}-${(version in ThisBuild).value}.jar",
packageBin in Universal := {
// Move sparkler-app & its dependencies to {Settings.buildDir}
val fileMappings = (mappings in Universal).value
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index aa1034c4..4bf5b8f0 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -27,7 +27,7 @@ crawldb.backend: solr # "solr" is default until "elasticsearch" becomes usable.
# solr.uri: file://conf/solr/crawldb
# For cloudmode with zookeepers; Format = collectionName::zkhost1:port1,zkhost2:port2,zkhost3:port3
# solr.uri: crawldb::localhost:9983
-solr.uri: http://localhost:8983/solr/crawldb
+solr.uri: http://ec2-35-174-200-133.compute-1.amazonaws.com:8983/solr/crawldb
# elasticsearch settings
elasticsearch.uri: http://localhost:9200
diff --git a/sparkler-core/project/Dependencies.scala b/sparkler-core/project/Dependencies.scala
index 0803421b..0b239b50 100644
--- a/sparkler-core/project/Dependencies.scala
+++ b/sparkler-core/project/Dependencies.scala
@@ -60,8 +60,8 @@ object Dependencies {
object Spark {
private val group = "org.apache.spark"
private val version = "3.0.1" // pre-built version available @ https://spark.apache.org/downloads.html
- lazy val core = group %% "spark-core" % version
- lazy val sql = group %% "spark-sql" % version
+ lazy val core = group %% "spark-core" % version % "provided"
+ lazy val sql = group %% "spark-sql" % version % "provided"
}
- lazy val tikaParsers = "org.apache.tika" % "tika-parsers" % "1.24"
+ lazy val tikaParsers = "org.apache.tika" % "tika-parsers" % "1.24" % "provided"
}
\ No newline at end of file
diff --git a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
new file mode 100644
index 00000000..4bf5b8f0
--- /dev/null
+++ b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
@@ -0,0 +1,178 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+
+##################### General Properties ################################
+
+# uri - Crawl Database URL. Stores crawl metadata and status updates.
+
+crawldb.backend: solr # "solr" is default until "elasticsearch" becomes usable.
+
+# Type: String. Default: http://localhost:8983/solr/crawldb
+# for standalone server
+# For quick test crawls using embedded solr
+# solr.uri: file://conf/solr/crawldb
+# For cloudmode with zookeepers; Format = collectionName::zkhost1:port1,zkhost2:port2,zkhost3:port3
+# solr.uri: crawldb::localhost:9983
+solr.uri: http://ec2-35-174-200-133.compute-1.amazonaws.com:8983/solr/crawldb
+
+# elasticsearch settings
+elasticsearch.uri: http://localhost:9200
+
+
+##################### Apache Spark Properties ###########################
+
+# URL on which Apache Spark is running.
+# Type: String. Default is "local[*]" for local mode.
+spark.master: local[*]
+databricks.enable: false
+
+##################### Apache Kafka Properties ###########################
+# Enable Kafka Dump
+# Type: Boolean. Default is "false"
+kafka.enable: false
+# Kafka Listeners
+# Type: String. Default is "localhost:9092" for local mode.
+kafka.listeners: localhost:9092
+# Kafka topic to send dumps to
+# Type: String. Default is "sparkler/".
+kafka.topic: sparkler_%s
+
+##################### Generate Properties ###############################
+
+# Generates the top N URLs for fetching.
+# Type: Int. Default: 1000
+generate.topn: 1000
+
+# Generates URLs from top N groups for fetching.
+# Type: Int. Default: 256
+generate.top.groups: 256
+
+# Define criteria for sorting the top N urls
+# Note: The name of the field to sort by should exactly match the one used in the SOLR schema
+# Type: String. Default: discover_depth asc, score asc
+generate.sortby: "discover_depth asc, score asc"
+
+
+# Specify field to use for grouping partitions in RDD
+# Default is the "group" field which represent the hostnames of the URLs being fethced
+# Note: This field should match exactly the one specified in the SOLR schema
+# Type: String. Default: group
+generate.groupby: "group"
+
+##################### Fetcher Properties ################################
+
+# Delay (in milliseconds) between two fetch requests for the same host.
+# Type: Long. Default: 1000
+fetcher.server.delay: 2000
+
+# list of headers to be included for each outgoing request
+fetcher.headers:
+ User-Agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Sparkler/${project.version}"
+ Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
+ Accept-Language: "en-US,en"
+
+# Rotating agents file.
+# File should contain a list of agents which will be used to override the default agent string
+# This is an unbounded list, it can take any number of agents you wish.
+# for every request the agents are used one after the other in sequence
+#fetcher.user.agents: user-agents.txt
+
+##################### Plugins ###########################################
+
+# Plugins Bundle directory. Configured through Maven.
+# Discouraged to Modify unless specifically required.
+# To set plugins directory, use system property -Dpf4j.pluginsDir=$DIR/plugins
+# by default it looks up 'plugins' directory from current working directory
+
+
+# List of activated plugins
+plugins.active:
+
+ - urlfilter-regex
+ - urlfilter-samehost
+# - url-injector
+# - scorer-dd-svn
+# - fetcher-jbrowser
+# - fetcher-htmlunit
+# - fetcher-chrome
+
+# All Plugins are listed under this tree
+plugins:
+ # Regex URL Filter - Filters outlinks from a web page based on Regex
+ # expressions.
+ urlfilter.regex:
+ #
+ # File with Regex Filter Rules
+ urlfilter.regex.file: regex-urlfilter.txt
+ scorer.dd.svn:
+ scorer.dd.svn.url: http://domain-discovery:5000/classify/predict
+ scorer.dd.svn.fallback: 0
+ scorer.dd.svn.key: svn_score
+ # Fetcher jBrowser - Headless browser to fetch javascript and AJAX
+ # based document/content.
+ fetcher.jbrowser:
+ # Configuration Properties
+ #socket.timeout: 3000
+ #connect.timeout: 3000
+ fetcher.chrome:
+ #Set timeout to > -1 to enable the wait for element visibility for some ajax sites.
+ chrome.wait.timeout: -1
+ #Element name
+ chrome.wait.element: "some element"
+ #What type of element, class, name, id
+ chrome.wait.type: "class"
+ chrome.dns: "http://localhost:3000/webdriver"
+ #chrome.selenium.enabled: "true"
+ #chrome.selenium.script.click: "id:txtName"
+ #chrome.selenium.script.keys: "COR"
+ #chrome.selenium.script.click: "id:btnSearch"
+ #chrome.proxy.address: 127.0.0.1:9998
+ url.injector:
+ mode: selenium # currently only compatible with the fetcher-chrome plugin
+ #mode: replace
+ #mode: json
+ #mode: form
+ #values: #escaped for json
+ # - "\"COR\""
+ # - "\"VEN\""
+ # - "\"SOM\""
+ values:
+ - Acitretin
+ - Adempas
+ - Actiq
+ selenium:
+ 1:
+ operation: click
+ value: id:some-id
+ 2:
+ operation: keys
+ value: "id:some-input-id:${token}"
+ 3:
+ operation: click
+ value: "id:some-id"
+ json: "{ \"name\":\"John\", \"age\":${token}, \"car\":null }"
+ form:
+ hdnField: "submit"
+ txtRequired: ""
+ radSearchBy: "drugname"
+ txtName: "${token}"
+ selTC: ""
+ selProgram: "MA"
+ txtDateOfService: "12/01/2020"
+##################### Custom properties for MEMEX ###########################################
+ memex.webpage.mimetype: "text/html"
+
From 0e3547acb6166d0abfeef53aaaed0c9d9e2d5367 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 01:55:55 +0100
Subject: [PATCH 028/335] spark submit fixes
---
sparkler-core/build.sbt | 9 ++++--
sparkler-core/conf/sparkler-default.yaml | 2 +-
sparkler-core/project/Dependencies.scala | 2 +-
.../src/main/resources/solr-schema-map.yaml | 30 +++++++++++++++++++
.../src/main/resources/sparkler-default.yaml | 3 +-
.../usc/irds/sparkler/pipeline/Crawler.scala | 2 +-
.../usc/irds/sparkler/service/Injector.scala | 11 +++++--
7 files changed, 51 insertions(+), 8 deletions(-)
create mode 100644 sparkler-core/sparkler-app/src/main/resources/solr-schema-map.yaml
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index 5170c2ea..79f17888 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -93,6 +93,7 @@ lazy val api = (project in file("sparkler-api"))
case PathList("org", "cliffc", "high_scale_lib", xs@_*) => MergeStrategy.first
case x => (assemblyMergeStrategy in assembly).value.apply(x)
},
+ test in assembly := {},
testOptions += Tests.Argument(TestFrameworks.JUnit,
"--verbosity=1",
"--run-listener=edu.usc.irds.sparkler.test.WebServerRunListener")
@@ -122,15 +123,19 @@ lazy val app = (project in file("sparkler-app"))
case x if x.contains("io.netty.versions.properties") => MergeStrategy.first
case x if x.contains("Log4j2Plugins.dat") => MergeStrategy.first
case x if x.contains("module-info.class") => MergeStrategy.first
+ case x if x.contains("public-suffix-list.txt") => MergeStrategy.first
+ case x if x.contains("bus-extensions.txt") => MergeStrategy.first
+ case x if x.contains("blueprint.handlers") => MergeStrategy.first
case PathList("org", "apache", "logging", "log4j", xs@_*) => MergeStrategy.first
case PathList("org", "apache", "logging", xs@_*) => MergeStrategy.first
case PathList("org", "apache", "log4j", xs@_*) => MergeStrategy.first
case PathList("org", "apache", "commons", "logging", xs@_*) => MergeStrategy.first
case PathList("org", "slf4j", "impl", xs@_*) => MergeStrategy.first
+ case PathList("com", "ctc", "wstx", xs@_*) => MergeStrategy.first
case PathList("org", "cliffc", "high_scale_lib", xs@_*) => MergeStrategy.first
case x => (assemblyMergeStrategy in assembly).value.apply(x)
},
- //assembly / assemblyJarName := "something.jar",
+ test in assembly := {},
assemblyOutputPath in assembly := file(".") / "build" / s"${name.value}-${(version in ThisBuild).value}.jar",
packageBin in Universal := {
// Move sparkler-app & its dependencies to {Settings.buildDir}
@@ -203,4 +208,4 @@ lazy val ui = (project in file("sparkler-ui"))
/*enablePlugins(PackPlugin)
-packMain := Map("inject" -> "edu.usc.irds.sparkler.service.Injector")*/
\ No newline at end of file
+packMain := Map("inject" -> "edu.usc.irds.sparkler.service.Injector")*/
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index 4bf5b8f0..01527305 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -37,7 +37,7 @@ elasticsearch.uri: http://localhost:9200
# URL on which Apache Spark is running.
# Type: String. Default is "local[*]" for local mode.
-spark.master: local[*]
+spark.master: spark://DESKTOP-JIMKO29.localdomain:7077
databricks.enable: false
##################### Apache Kafka Properties ###########################
diff --git a/sparkler-core/project/Dependencies.scala b/sparkler-core/project/Dependencies.scala
index 0b239b50..a233f49c 100644
--- a/sparkler-core/project/Dependencies.scala
+++ b/sparkler-core/project/Dependencies.scala
@@ -63,5 +63,5 @@ object Dependencies {
lazy val core = group %% "spark-core" % version % "provided"
lazy val sql = group %% "spark-sql" % version % "provided"
}
- lazy val tikaParsers = "org.apache.tika" % "tika-parsers" % "1.24" % "provided"
+ lazy val tikaParsers = "org.apache.tika" % "tika-parsers" % "1.24"
}
\ No newline at end of file
diff --git a/sparkler-core/sparkler-app/src/main/resources/solr-schema-map.yaml b/sparkler-core/sparkler-app/src/main/resources/solr-schema-map.yaml
new file mode 100644
index 00000000..3f4d1747
--- /dev/null
+++ b/sparkler-core/sparkler-app/src/main/resources/solr-schema-map.yaml
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+################## Solr Schema Map Properties #######################
+
+#overrides:
+# id: id
+
+typeSuffix:
+ java.lang.String: _t
+ java.lang.Integer: _i
+ java.lang.Long: _l
+ java.lang.Boolean: _b
+ java.lang.Float: _f
+ java.lang.Double: _d
+ java.util.Date: _dt
+
+multiValSuffix: s
\ No newline at end of file
diff --git a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
index 4bf5b8f0..797c481e 100644
--- a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
+++ b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
@@ -37,7 +37,8 @@ elasticsearch.uri: http://localhost:9200
# URL on which Apache Spark is running.
# Type: String. Default is "local[*]" for local mode.
-spark.master: local[*]
+spark.master:
+ #local[*]
databricks.enable: false
##################### Apache Kafka Properties ###########################
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index 9e903b45..6674812f 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -136,7 +136,7 @@ class Crawler extends CliTool {
this.outputPath = jobId
}
val conf = new SparkConf().setAppName(jobId)
- if (!sparkMaster.isEmpty) {
+ if (sparkMaster != null && !sparkMaster.isEmpty) {
conf.setMaster(sparkMaster)
}
if (!sparkStorage.isEmpty){
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala
index 0dc85089..dc435a21 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala
@@ -18,7 +18,6 @@
package edu.usc.irds.sparkler.service
import java.io.File
import java.util
-
import edu.usc.irds.sparkler.{Constants, SparklerConfiguration}
import edu.usc.irds.sparkler.base.{CliTool, Loggable}
import edu.usc.irds.sparkler.model.{Resource, ResourceStatus, SparklerJob}
@@ -30,8 +29,8 @@ import org.kohsuke.args4j.spi.StringArrayOptionHandler
import scala.collection.JavaConversions._
import scala.io.Source
import java.nio.file.NotDirectoryException
-
import org.apache.commons.validator.routines.UrlValidator
+import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.Stack
import scala.collection.mutable.ArrayBuffer
@@ -72,6 +71,14 @@ class Injector extends CliTool {
var configOverride: Array[Any] = Array()
override def run(): Unit = {
+ val sconf = new SparkConf().setAppName("sparkler-job")
+ val sc = new SparkContext(sconf)
+ val logFile = "/home/bugg/Projects/spark-3.0.2-bin-hadoop2.7/README.md"
+ val logData = sc.textFile(logFile, 2).cache()
+ val numAs = logData.filter(line => line.contains("a")).count()
+ val numBs = logData.filter(line => line.contains("b")).count()
+ println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))
+ println("SU: " + seedUrls.mkString(","))
if (configOverride != ""){
conf.overloadConfig(configOverride.mkString(" "));
}
From ed9c29160a7afabe7237895085d71e34bf2afb91 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 02:04:26 +0100
Subject: [PATCH 029/335] spark submit fixes
---
sparkler-core/deploy.sh | 2 ++
1 file changed, 2 insertions(+)
diff --git a/sparkler-core/deploy.sh b/sparkler-core/deploy.sh
index 4d24c1b5..e5f054ff 100755
--- a/sparkler-core/deploy.sh
+++ b/sparkler-core/deploy.sh
@@ -2,4 +2,6 @@
pip install databricks-cli
+rm -rf build/sparkler-app-0.3.1-SNAPSHOT
+
~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
From c7650425b755c16484f6e5e0d0e203e09f18d9c3 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 02:09:05 +0100
Subject: [PATCH 030/335] spark submit fixes
---
.github/workflows/build-sbt.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index de13b2a5..b50f6346 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -19,7 +19,7 @@ jobs:
java-version: '8'
distribution: 'adopt'
- name: Run package
- run: sbt package
+ run: sbt assembly
working-directory: sparkler-core
- name: Install databricks
run: ./deploy.sh
From 1f421aa13ecd5cd9e9f9fc0b9aa7cbdc9b23062a Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 03:01:58 +0100
Subject: [PATCH 031/335] spark submit fixes
---
.../src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index 6674812f..57600dbc 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -222,7 +222,7 @@ class Crawler extends CliTool {
val fetchedRdd = rdd.map(r => (r.getGroup, r))
.groupByKey()
.flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
- FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer) })
+ FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer) }).repartition(50)
.persist()
if (kafkaEnable) {
From 9b90b5207d3579d927b73315d1a92e44365f5a85 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 03:13:02 +0100
Subject: [PATCH 032/335] spark submit fixes
---
.../main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index 57600dbc..7380d415 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -220,9 +220,9 @@ class Crawler extends CliTool {
val rdd = new MemexCrawlDbRDD(sc, job, maxGroups = topG, topN = topN)
val fetchedRdd = rdd.map(r => (r.getGroup, r))
- .groupByKey()
+ .groupByKey().repartition(50)
.flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
- FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer) }).repartition(50)
+ FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer) })
.persist()
if (kafkaEnable) {
From 41c7ca9215fd45384ae9ae8acd4bd28b4df7d2c2 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 10:32:43 +0100
Subject: [PATCH 033/335] spark submit fixes
---
.../src/main/resources/domain-suffixes.xml | 4428 +++++++++++++++++
.../src/main/resources/regex-urlfilter.txt | 42 +
.../src/main/resources/user-agents.txt | 21 +
3 files changed, 4491 insertions(+)
create mode 100644 sparkler-core/sparkler-app/src/main/resources/domain-suffixes.xml
create mode 100644 sparkler-core/sparkler-app/src/main/resources/regex-urlfilter.txt
create mode 100644 sparkler-core/sparkler-app/src/main/resources/user-agents.txt
diff --git a/sparkler-core/sparkler-app/src/main/resources/domain-suffixes.xml b/sparkler-core/sparkler-app/src/main/resources/domain-suffixes.xml
new file mode 100644
index 00000000..3541eb51
--- /dev/null
+++ b/sparkler-core/sparkler-app/src/main/resources/domain-suffixes.xml
@@ -0,0 +1,4428 @@
+
+
+
+
+
+
+
+
+
+
+
+ INFRASTRUCTURE
+
+ (from http://en.wikipedia.org/wiki/.root)
+ vrsn-end-of-zone-marker-dummy-record.root is a domain name
+ listed in the DNS root zone as a diagnostic marker, whose
+ presence demonstrates the root zone was not truncated upon
+ loading by a root nameserver. It could be argued it represents
+ a top-level domain of .root, although technically no such
+ delegation exists.
+
+
+
+
+ INFRASTRUCTURE
+
+ (from http://en.wikipedia.org/wiki/.arpa) .arpa is an Internet
+ top-level domain (TLD) used exclusively for
+ Internet-infrastructure purposes. It does not function as a
+ normal TLD where websites are registered, but rather as a
+ meta-TLD used to look up addresses, and for other purposes.
+
+
+
+
+
+
+
+ SPONSORED
+ for the air transport industry
+
+
+
+ UNSPONSORED
+ for business use
+
+
+
+ SPONSORED
+ for Catalan language/culture
+
+
+
+ UNSPONSORED
+
+ for commercial organizations, but unrestricted
+
+
+
+
+ SPONSORED
+ for cooperatives
+
+
+
+ UNSPONSORED
+ 1.0
+
+ for post-secondary educational establishments
+
+
+
+
+ UNSPONSORED
+
+ for governments and their agencies in the United States
+
+
+
+
+ UNSPONSORED
+
+ for informational sites, but unrestricted
+
+
+
+
+ UNSPONSORED
+
+ for international organizations established by treaty
+
+
+
+
+ SPONSORED
+ for employment-related sites
+
+
+
+ UNSPONSORED
+ for the US military
+
+
+
+ SPONSORED
+ for sites catering to mobile devices
+
+
+
+ SPONSORED
+ for museums
+
+
+
+ UNSPONSORED
+ for families and individuals
+
+
+
+ UNSPONSORED
+
+ originally for network infrastructures, now unrestricted
+
+
+
+
+ UNSPONSORED
+
+ originally for organizations not clearly falling within the
+ other gTLDs, now unrestricted
+
+
+
+
+ SPONSORED
+ for certain professions
+
+
+
+ SPONSORED
+
+ for travel agents, airlines, hoteliers, tourism bureaus, etc.
+
+
+
+
+
+
+ STARTUP
+ for the Asian community
+
+
+
+ PROPOSED
+ for postal services
+
+
+
+ STARTUP
+
+ for services involving connections between the telephone
+ network and the Internet
+
+
+
+
+ PROPOSED
+ for geographically related sites
+
+
+
+ PROPOSED
+ for Galicia, a country within Spain
+
+
+
+ PROPOSED
+ for Wales, a country within the UK
+
+
+
+ PROPOSED
+ for Scotland, a country within the UK
+
+
+
+ PROPOSED
+ for websites designed for children
+
+
+
+ PROPOSED
+ for websites designed for children
+
+
+
+ PROPOSED
+ http://en.wikipedia.org/wiki/.mail
+
+
+
+ PROPOSED
+ For Web sites of all sorts
+
+
+
+ PROPOSED
+ For Adult entertainment sites
+
+
+
+
+ DELETED
+
+ for NATO sites and operations. Replaced by .int
+
+
+
+
+
+ PSEUDO_DOMAIN
+
+ identifying a hostname not connected directly to the Internet,
+ but a bitnet network
+
+
+
+
+ PSEUDO_DOMAIN
+
+ identifying a hostname not connected directly to the Internet,
+ but a csnet network
+
+
+
+
+ PSEUDO_DOMAIN
+
+ identifying a hostname not connected directly to the Internet,
+ but a bitnet network
+
+
+
+
+ PSEUDO_DOMAIN
+
+ .local is a pseudo top-level domain used by Apple, Inc.'s
+ Bonjour protocol.
+
+
+
+
+ PSEUDO_DOMAIN
+ alias of .local
+
+
+
+ PSEUDO_DOMAIN
+
+ designates an anonymous or pseudonymous address reachable via
+ the Tor network.
+
+
+
+
+
+
+ Ascension Island
+
+
+
+ Andorra
+
+
+
+ United Arab Emirates
+
+
+
+ Afghanistan
+
+
+
+ Antigua and Barbuda
+
+
+
+ Anguilla
+
+
+
+ Albania
+
+
+
+ Armenia
+
+
+
+ Netherlands Antilles
+
+
+
+ Angola
+
+
+
+ Antarctica
+
+
+
+ Argentina
+
+
+
+ American Samoa
+
+
+
+ Austria
+
+
+
+ Australia
+
+
+
+ Aruba
+
+
+
+ Aland Islands
+
+
+
+ Azerbaijan
+
+
+
+ Bosnia and Herzegovina
+
+
+
+ Barbados
+
+
+
+ Bangladesh
+
+
+
+ Belgium
+
+
+
+ Burkina Faso
+
+
+
+ Bulgaria
+
+
+
+ Bahrain
+
+
+
+ Burundi
+
+
+
+ Benin
+
+
+
+ Bermuda
+
+
+
+ Brunei
+
+
+
+ Bolivia
+
+
+
+ Brazil
+
+
+
+ Bahamas
+
+
+
+ Bhutan
+
+
+
+ Burma
+ NOT_IN_USE
+
+ not in use since re-naming of country to Myanmar, see .mm
+
+
+
+
+ Bouvet Island
+ NOT_IN_USE
+ not in use; no registrations
+
+
+
+ Botswana
+
+
+
+ Belarus
+
+
+
+ Belize
+
+
+
+ Canada
+
+
+
+ Cocos Keeling Islands
+
+
+
+ Democratic Republic of the Congo
+ formerly .zr - Zaire
+
+
+
+ Central African Republic
+
+
+
+ Republic of the Congo
+
+
+
+ Switzerland
+
+
+
+ Côte d'Ivoire
+ Ivory Coast
+
+
+
+ Cook Islands
+
+
+
+ Chile
+
+
+
+ Cameroon
+
+
+
+ People s Republic of China
+
+
+
+ Colombia
+
+
+
+ Costa Rica
+
+
+
+ Serbia and Montenegro
+ DELETED
+
+ formerly .yu - Yugoslavia; description: on June 3, 2006,
+ Montenegro declared independence, thus dissolving the state
+ union) (.cs code not assigned; no DNS) (.cs code previously
+ used for Czechoslovakia
+
+
+
+
+ Cuba
+
+
+
+ Cape Verde
+
+
+
+ Christmas Island
+
+
+
+ Cyprus
+
+
+
+ Czech Republic
+
+
+
+ German Democratic Republic(East Germany)
+ DELETED
+ deleted in 1990
+
+
+
+ Germany
+
+
+
+ Djibouti
+
+
+
+ Denmark
+
+
+
+ Dominica
+
+
+
+ Dominican Republic
+
+
+
+ Algeria
+
+
+
+ Ecuador
+
+
+
+ Estonia
+
+
+
+ Egypt
+
+
+
+ Western Sahara
+ NOT_IN_USE
+ not assigned; no DNS
+
+
+
+ Eritrea
+
+
+
+ Spain
+
+
+
+ Ethiopia
+
+
+
+ European Union
+
+ code "exceptionally reserved" by ISO 3166-1
+
+
+
+
+ Finland
+
+
+
+ Fiji
+
+
+
+ Falkland Islands
+
+
+
+ Federated States of Micronesia
+
+
+
+ Faroe Islands
+
+
+
+ France
+
+
+
+ Gabon
+
+
+
+ United Kingdom
+
+ Reserved domain by IANA; deprecated – see .uk
+
+
+
+
+ Grenada
+
+
+
+ Georgia
+
+
+
+ French Guiana
+
+
+
+ Guernsey
+
+
+
+ Ghana
+
+
+
+ Gibraltar
+
+
+
+ Greenland
+
+
+
+ Gambia
+
+
+
+ Guinea
+
+
+
+ Guadeloupe
+
+
+
+ Equatorial Guinea
+
+
+
+ Greece
+
+
+
+ South Georgia and the South Sandwich Islands
+
+
+
+ Guatemala
+
+
+
+ Guam
+
+
+
+ Guinea Bissau
+
+
+
+ Guyana
+
+
+
+ Hong Kong
+
+
+
+ Heard Island and McDonald Islands
+
+
+
+ Honduras
+
+
+
+ Croatia
+
+
+
+ Haiti
+
+
+
+ Hungary
+
+
+
+ Indonesia
+
+
+
+ Ireland
+
+
+
+ Israel
+
+
+
+ Isle of Man
+
+
+
+ India
+
+
+
+ British Indian Ocean Territory
+
+
+
+ Iraq
+
+
+
+ Iran
+
+
+
+ Iceland
+
+
+
+ Italy
+
+
+
+ Jersey
+
+
+
+ Jamaica
+
+
+
+ Jordan
+
+
+
+ Japan
+
+
+
+ Kenya
+
+
+
+ Kyrgyzstan
+
+
+
+ Cambodia
+
+
+
+ Kiribati
+
+
+
+ Comoros
+
+
+
+ Saint Kitts and Nevis
+
+
+
+ North Korea
+ NOT_IN_USE
+
+ not assigned; no DNS
+
+
+
+ South Korea
+
+
+
+ Kuwait
+
+
+
+ Cayman Islands
+
+
+
+ Kazakhstan
+
+
+
+ Laos
+
+
+
+ Lebanon
+
+
+
+ Saint Lucia
+
+
+
+ Liechtenstein
+
+
+
+ Sri Lanka
+
+
+
+ Liberia
+
+
+
+ Lesotho
+
+
+
+ Lithuania
+
+
+
+ Luxembourg
+
+
+
+ Latvia
+
+
+
+ Libya
+
+
+
+ Morocco
+
+
+
+ Monaco
+
+
+
+ Moldova
+
+
+
+ Montenegro
+
+
+
+ Madagascar
+
+
+
+ Marshall Islands
+
+
+
+ Republic of Macedonia
+
+
+
+ Mali
+
+
+
+ Myanmar
+ formerly .bu - Burma
+
+
+
+ Mongolia
+
+
+
+ Macau
+
+
+
+ Northern Mariana Islands
+
+
+
+ Martinique
+
+
+
+ Mauritania
+
+
+
+ Montserrat
+
+
+
+ Malta
+
+
+
+ Mauritius
+
+
+
+ Maldives
+
+
+
+ Malawi
+
+
+
+ Mexico
+
+
+
+ Malaysia
+
+
+
+ Mozambique
+
+
+
+ Namibia
+
+
+
+ New Caledonia
+
+
+
+ Niger
+
+
+
+ Norfolk Island
+
+
+
+ Nigeria
+
+
+
+ Nicaragua
+
+
+
+ Netherlands
+
+
+
+ Norway
+
+
+
+ Nepal
+
+
+
+ Nauru
+
+
+
+ Niue
+
+
+
+ New Zealand
+
+
+
+ Oman
+
+
+
+ Panama
+
+
+
+ Peru
+
+
+
+ French Polynesia
+
+
+
+ Papua New Guinea
+
+
+
+ Philippines
+
+
+
+ Pakistan
+
+
+
+ Poland
+
+
+
+ Saint Pierre and Miquelon
+
+
+
+ Pitcairn Islands
+
+
+
+ Puerto Rico
+
+
+
+ Palestinian territories
+
+
+
+ Portugal
+
+
+
+ Palau
+
+
+
+ Paraguay
+
+
+
+ Qatar
+
+
+
+ Réunion
+
+
+
+ Romania
+
+
+
+ Serbia
+
+
+
+ Russia
+
+
+
+ Rwanda
+
+
+
+ Saudi Arabia
+
+
+
+ Solomon Islands
+
+
+
+ Seychelles
+
+
+
+ Sudan
+
+
+
+ Sweden
+
+
+
+ Singapore
+
+
+
+ Saint Helena
+
+
+
+ Slovenia
+
+
+
+ Svalbard and Jan Mayen Islands
+ NOT_IN_USE
+ not in use; no registrations
+
+
+
+ Slovakia
+
+
+
+ Sierra Leone
+
+
+
+ San Marino
+
+
+
+ Senegal
+
+
+
+ Somalia
+
+
+
+ Suriname
+
+
+
+ São Tomé and Príncipe
+
+
+
+ Soviet Union
+ DELETED
+
+ deprecated; being phased out; code "transitionally reserved"
+ by ISO 3166-1
+
+
+
+
+ El Salvador
+
+
+
+ Syria
+
+
+
+ Swaziland
+
+
+
+ Turks and Caicos Islands
+
+
+
+ Chad
+
+
+
+ French Southern Territories
+
+
+
+ Togo
+
+
+
+ Thailand
+
+
+
+ Tajikistan
+
+
+
+ Tokelau
+
+
+
+ East Timor
+ formerly .tp
+
+
+
+ Turkmenistan
+
+
+
+ Tunisia
+
+
+
+ Tonga
+
+
+
+ East Timor
+ DELETED
+
+ deprecated - use .tl; code "transitionally reserved" by ISO
+ 3166-1
+
+
+
+
+ Turkey
+
+
+
+ Trinidad and Tobago
+
+
+
+ Tuvalu
+
+
+
+ Republic of China
+ Taiwan
+
+
+
+ Tanzania
+
+
+
+ Ukraine
+
+
+
+ Uganda
+
+
+
+ United Kingdom
+
+ code "exceptionally reserved" by ISO 3166-1 (see also .gb)
+
+
+
+
+ United States Minor Outlying Islands
+ DELETED
+ see http://en.wikipedia.org/wiki/.um
+
+
+
+ United States
+
+
+
+ Uruguay
+
+
+
+ Uzbekistan
+
+
+
+ Vatican City
+
+
+
+ Saint Vincent and the Grenadines
+
+
+
+ Venezuela
+
+
+
+ British Virgin Islands
+
+
+
+ United States Virgin Islands
+
+
+
+ Vietnam
+
+
+
+ Vanuatu
+
+
+
+ Wallis and Futuna
+
+
+
+ Samoa
+ formerly Western Samoa
+
+
+
+ Yemen
+
+
+
+ Mayotte
+
+
+
+ Yugoslavia
+
+ subsequently renamed Serbia and Montenegro (code officially
+ replaced by .cs (see above) but still used; code
+ "transitionally reserved" by ISO 3166-1)
+
+
+
+
+ South Africa
+
+
+
+ Zambia
+
+
+
+ Zaire
+ DELETED
+ replaced by .cd
+
+
+
+ Zimbabwe
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ DELETED
+
+
+ DELETED
+
+
+ DELETED
+
+
+ DELETED
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/sparkler-core/sparkler-app/src/main/resources/regex-urlfilter.txt b/sparkler-core/sparkler-app/src/main/resources/regex-urlfilter.txt
new file mode 100644
index 00000000..fd8ba2f3
--- /dev/null
+++ b/sparkler-core/sparkler-app/src/main/resources/regex-urlfilter.txt
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# The default url filter.
+# Better for whole-internet crawling.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'. The first matching pattern in the file
+# determines whether a URL is included or ignored. If no pattern
+# matches, the URL is ignored.
+
+# skip file: ftp: and mailto: urls
+-^(file|ftp|mailto):
+
+# Default: skip image and other suffixes which produces large content
+# for a more extensive coverage use the urlfilter-suffix plugin
+-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS|svg|SVG|mp3|MP3|mp4|MP4|pdf|PDF)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
+-.*(/[^/]+)/[^/]+\1/[^/]+\1/
+
+# accept any HTTP URL
++^https?://
+
+# reject the rest
+-.
diff --git a/sparkler-core/sparkler-app/src/main/resources/user-agents.txt b/sparkler-core/sparkler-app/src/main/resources/user-agents.txt
new file mode 100644
index 00000000..60ab611f
--- /dev/null
+++ b/sparkler-core/sparkler-app/src/main/resources/user-agents.txt
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# User agents to be used
+# Each line contains an agent
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Sparkler/${project.version} client1
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Sparkler/${project.version} client2
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Sparkler/${project.version} client3
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Sparkler/${project.version} client4
\ No newline at end of file
From b09a054179e4277d39ea5478591bd4d86ad04ece Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 10:50:48 +0100
Subject: [PATCH 034/335] spark submit fixes
---
.../main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index 7380d415..e9605361 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -220,7 +220,7 @@ class Crawler extends CliTool {
val rdd = new MemexCrawlDbRDD(sc, job, maxGroups = topG, topN = topN)
val fetchedRdd = rdd.map(r => (r.getGroup, r))
- .groupByKey().repartition(50)
+ .groupByKey()
.flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer) })
.persist()
@@ -249,7 +249,7 @@ class Crawler extends CliTool {
val scoredRdd = fetchedRdd.map(d => ScoreFunction(job, d))
- val scoreUpdateRdd: RDD[SolrInputDocument] = scoredRdd.map(d => ScoreUpdateSolrTransformer(d))
+ val scoreUpdateRdd: RDD[SolrInputDocument] = scoredRdd.repartition(50).map(d => ScoreUpdateSolrTransformer(d))
val scoreUpdateFunc = new SolrStatusUpdate(job)
sc.runJob(scoreUpdateRdd, scoreUpdateFunc)
From 6f17ca56b7e1f9387f35007cadc1da1d9212cff2 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 11:09:05 +0100
Subject: [PATCH 035/335] spark submit fixes
---
.../edu/usc/irds/sparkler/pipeline/Crawler.scala | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index e9605361..15a307bc 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -219,12 +219,16 @@ class Crawler extends CliTool {
LOG.info(s"Starting the job:$jobId, task:$taskId")
val rdd = new MemexCrawlDbRDD(sc, job, maxGroups = topG, topN = topN)
- val fetchedRdd = rdd.map(r => (r.getGroup, r))
- .groupByKey()
- .flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
+ val f = rdd.map(r => (r.getGroup, r))
+ .groupByKey().repartition(50);
+
+ val c = f.getNumPartitions
+ val fetchedRdd = f.flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer) })
.persist()
+ val d = fetchedRdd.getNumPartitions
+
if (kafkaEnable) {
storeContentKafka(kafkaListeners, kafkaTopic.format(jobId), fetchedRdd)
}
@@ -249,7 +253,7 @@ class Crawler extends CliTool {
val scoredRdd = fetchedRdd.map(d => ScoreFunction(job, d))
- val scoreUpdateRdd: RDD[SolrInputDocument] = scoredRdd.repartition(50).map(d => ScoreUpdateSolrTransformer(d))
+ val scoreUpdateRdd: RDD[SolrInputDocument] = scoredRdd.map(d => ScoreUpdateSolrTransformer(d))
val scoreUpdateFunc = new SolrStatusUpdate(job)
sc.runJob(scoreUpdateRdd, scoreUpdateFunc)
From 6d5841e8e5e6c9ad48ba7c4f096a533325c09ff4 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 11:50:54 +0100
Subject: [PATCH 036/335] spark submit fixes
---
.../src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index 15a307bc..265ef3a1 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -253,7 +253,7 @@ class Crawler extends CliTool {
val scoredRdd = fetchedRdd.map(d => ScoreFunction(job, d))
- val scoreUpdateRdd: RDD[SolrInputDocument] = scoredRdd.map(d => ScoreUpdateSolrTransformer(d))
+ val scoreUpdateRdd: RDD[SolrInputDocument] = scoredRdd.repartition(50).map(d => ScoreUpdateSolrTransformer(d))
val scoreUpdateFunc = new SolrStatusUpdate(job)
sc.runJob(scoreUpdateRdd, scoreUpdateFunc)
From ac937fccc6f0b0dd65a36af69ab75f880a627083 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 14:03:11 +0100
Subject: [PATCH 037/335] try map partition
---
.../usc/irds/sparkler/pipeline/Crawler.scala | 20 +++++++++++++++----
.../usc/irds/sparkler/pipeline/RunCrawl.scala | 20 +++++++++++++++++++
2 files changed, 36 insertions(+), 4 deletions(-)
create mode 100644 sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/RunCrawl.scala
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index 265ef3a1..81cc1b15 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -47,7 +47,8 @@ import scala.io.Source
*
* @since 5/28/16
*/
-class Crawler extends CliTool {
+@SerialVersionUID(100L)
+class Crawler extends CliTool with Serializable {
import Crawler._
@@ -165,6 +166,12 @@ class Crawler extends CliTool {
//TODO: URL normalizers
//TODO: Robots.txt
+ def mapCrawl(x: Iterator[(String, Iterable[Resource])]): Iterator[CrawlData] = {
+ val m = 1000
+ x.flatMap({case (grp, rs) => new FairFetcher(job, rs.iterator, m,
+ FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer)})
+ }
+
override def run(): Unit = {
//STEP : Initialize environment
@@ -223,10 +230,15 @@ class Crawler extends CliTool {
.groupByKey().repartition(50);
val c = f.getNumPartitions
- val fetchedRdd = f.flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
- FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer) })
- .persist()
+ //val fetchedRdd = f.mapPartitions( x => mapCrawl(x))
+ val rc = new RunCrawl
+ val fetchedRdd = rc.runCrawl(f, job)
+ /*val fetchedRdd = f.flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
+ FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq })
+ .persist()*/
+
+ val coll = fetchedRdd.collect()
val d = fetchedRdd.getNumPartitions
if (kafkaEnable) {
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/RunCrawl.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/RunCrawl.scala
new file mode 100644
index 00000000..4af8de6c
--- /dev/null
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/RunCrawl.scala
@@ -0,0 +1,20 @@
+package edu.usc.irds.sparkler.pipeline
+
+import edu.usc.irds.sparkler.model.{CrawlData, Resource, SparklerJob}
+import edu.usc.irds.sparkler.storage.solr.StatusUpdateSolrTransformer
+import org.apache.spark.rdd.RDD
+
+@SerialVersionUID(100L)
+class RunCrawl extends Serializable{
+ def mapCrawl(x: Iterator[(String, Iterable[Resource])], job: SparklerJob): Iterator[CrawlData] = {
+ val m = 1000
+ x.flatMap({case (grp, rs) => new FairFetcher(job, rs.iterator, m,
+ FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer)})
+ }
+
+ def runCrawl(f: RDD[(String, Iterable[Resource])], job: SparklerJob): RDD[CrawlData] = {
+ f.mapPartitions( x => mapCrawl(x, job))
+
+ }
+
+}
From cbe1a0cd417f154fd43080d7f483e98321eb65a6 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 14:23:09 +0100
Subject: [PATCH 038/335] try map partition
---
.../scala/edu/usc/irds/sparkler/pipeline/Crawler.scala | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index 81cc1b15..2bce43b2 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -229,7 +229,7 @@ class Crawler extends CliTool with Serializable {
val f = rdd.map(r => (r.getGroup, r))
.groupByKey().repartition(50);
- val c = f.getNumPartitions
+ //val c = f.getNumPartitions
//val fetchedRdd = f.mapPartitions( x => mapCrawl(x))
val rc = new RunCrawl
@@ -238,8 +238,8 @@ class Crawler extends CliTool with Serializable {
FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq })
.persist()*/
- val coll = fetchedRdd.collect()
- val d = fetchedRdd.getNumPartitions
+ //val coll = fetchedRdd.collect()
+ //val d = fetchedRdd.getNumPartitions
if (kafkaEnable) {
storeContentKafka(kafkaListeners, kafkaTopic.format(jobId), fetchedRdd)
@@ -265,7 +265,7 @@ class Crawler extends CliTool with Serializable {
val scoredRdd = fetchedRdd.map(d => ScoreFunction(job, d))
- val scoreUpdateRdd: RDD[SolrInputDocument] = scoredRdd.repartition(50).map(d => ScoreUpdateSolrTransformer(d))
+ val scoreUpdateRdd: RDD[SolrInputDocument] = scoredRdd.map(d => ScoreUpdateSolrTransformer(d))
val scoreUpdateFunc = new SolrStatusUpdate(job)
sc.runJob(scoreUpdateRdd, scoreUpdateFunc)
From 752d9db1b2a9784f8ac40e437fec5acf267426e0 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 16:15:14 +0100
Subject: [PATCH 039/335] try map partition
---
.../scala/edu/usc/irds/sparkler/pipeline/Crawler.scala | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index 2bce43b2..7bd112f5 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -226,17 +226,17 @@ class Crawler extends CliTool with Serializable {
LOG.info(s"Starting the job:$jobId, task:$taskId")
val rdd = new MemexCrawlDbRDD(sc, job, maxGroups = topG, topN = topN)
- val f = rdd.map(r => (r.getGroup, r))
- .groupByKey().repartition(50);
+ val f = rdd.map(r => ("id", r))
+ .groupByKey();
//val c = f.getNumPartitions
//val fetchedRdd = f.mapPartitions( x => mapCrawl(x))
val rc = new RunCrawl
- val fetchedRdd = rc.runCrawl(f, job)
- /*val fetchedRdd = f.flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
+ //val fetchedRdd = rc.runCrawl(f, job)
+ val fetchedRdd = f.flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq })
- .persist()*/
+ .persist()
//val coll = fetchedRdd.collect()
//val d = fetchedRdd.getNumPartitions
From 4bb3e016bf20285cc3f4871ca7209c2dfac009bb Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 16:46:28 +0100
Subject: [PATCH 040/335] try map partition
---
.../src/main/resources/sparkler-default.yaml | 2 +-
.../usc/irds/sparkler/pipeline/Crawler.scala | 22 ++++++++++---------
2 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
index 797c481e..19c873cb 100644
--- a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
+++ b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
@@ -72,7 +72,7 @@ generate.sortby: "discover_depth asc, score asc"
# Default is the "group" field which represent the hostnames of the URLs being fethced
# Note: This field should match exactly the one specified in the SOLR schema
# Type: String. Default: group
-generate.groupby: "group"
+generate.groupby: "id"
##################### Fetcher Properties ################################
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index 7bd112f5..5a360355 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -19,13 +19,11 @@ package edu.usc.irds.sparkler.pipeline
import java.io.File
import java.util
-
import edu.usc.irds.sparkler._
import edu.usc.irds.sparkler.base.{CliTool, Loggable}
import edu.usc.irds.sparkler.model.ResourceStatus._
import edu.usc.irds.sparkler.model.{CrawlData, Resource, ResourceStatus, SparklerJob}
-
-import edu.usc.irds.sparkler.storage.solr.{SolrProxy, SolrStatusUpdate, SolrUpsert, StatusUpdateSolrTransformer, ScoreUpdateSolrTransformer}
+import edu.usc.irds.sparkler.storage.solr.{ScoreUpdateSolrTransformer, SolrProxy, SolrStatusUpdate, SolrUpsert, StatusUpdateSolrTransformer}
import edu.usc.irds.sparkler.util.{JobUtil, NutchBridge}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.Text
@@ -35,10 +33,10 @@ import org.apache.solr.common.SolrInputDocument
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
-
import org.kohsuke.args4j.Option
import org.kohsuke.args4j.spi.StringArrayOptionHandler
+import java.util.UUID
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import scala.io.Source
@@ -47,8 +45,7 @@ import scala.io.Source
*
* @since 5/28/16
*/
-@SerialVersionUID(100L)
-class Crawler extends CliTool with Serializable {
+class Crawler extends CliTool {
import Crawler._
@@ -172,6 +169,11 @@ class Crawler extends CliTool with Serializable {
FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer)})
}
+ def goup: String = {
+ val uuid = UUID.randomUUID
+ val uuidAsString = uuid.toString
+ uuidAsString
+ }
override def run(): Unit = {
//STEP : Initialize environment
@@ -184,7 +186,7 @@ class Crawler extends CliTool with Serializable {
val job = this.job // local variable to bypass serialization
for (_ <- 1 to iterations) {
- var deepCrawlHosts: mutable.Set[String] = new mutable.HashSet[String]()
+ var deepCrawlHosts = new mutable.HashSet[String]()
if(deepCrawlHostFile != null) {
if(deepCrawlHostFile.isFile) {
deepCrawlHosts ++= Source.fromFile(deepCrawlHostFile).getLines().toSet
@@ -226,10 +228,10 @@ class Crawler extends CliTool with Serializable {
LOG.info(s"Starting the job:$jobId, task:$taskId")
val rdd = new MemexCrawlDbRDD(sc, job, maxGroups = topG, topN = topN)
- val f = rdd.map(r => ("id", r))
- .groupByKey();
+ val f = rdd.map(r => (r.getDedupeId, r))
+ .groupByKey(100);
- //val c = f.getNumPartitions
+ val c = f.getNumPartitions
//val fetchedRdd = f.mapPartitions( x => mapCrawl(x))
val rc = new RunCrawl
From 7b9762f4c5a8d7ce75834517b9d745035282a6f8 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 17:28:32 +0100
Subject: [PATCH 041/335] try map partition
---
.../scala/edu/usc/irds/sparkler/pipeline/Crawler.scala | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index 5a360355..8b1c6a04 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -228,8 +228,11 @@ class Crawler extends CliTool {
LOG.info(s"Starting the job:$jobId, task:$taskId")
val rdd = new MemexCrawlDbRDD(sc, job, maxGroups = topG, topN = topN)
+ //TODO RESTORE THIS HACK
val f = rdd.map(r => (r.getDedupeId, r))
- .groupByKey(100);
+ .groupByKey(500)
+
+
val c = f.getNumPartitions
@@ -237,7 +240,7 @@ class Crawler extends CliTool {
val rc = new RunCrawl
//val fetchedRdd = rc.runCrawl(f, job)
val fetchedRdd = f.flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
- FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq })
+ FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq }).repartition(100)
.persist()
//val coll = fetchedRdd.collect()
From ced3014332cafc408605201b294f157bf62637ac Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 17:33:06 +0100
Subject: [PATCH 042/335] try map partition
---
.../src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index 8b1c6a04..a602471b 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -240,7 +240,7 @@ class Crawler extends CliTool {
val rc = new RunCrawl
//val fetchedRdd = rc.runCrawl(f, job)
val fetchedRdd = f.flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
- FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq }).repartition(100)
+ FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq }).repartition(500)
.persist()
//val coll = fetchedRdd.collect()
From d735190273495cd9e475ed09cad3f1f07b3dbe73 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 17:57:08 +0100
Subject: [PATCH 043/335] try map partition
---
.../edu/usc/irds/sparkler/pipeline/Crawler.scala | 14 +++++++++++---
.../edu/usc/irds/sparkler/pipeline/RunCrawl.scala | 12 ++++++++++++
2 files changed, 23 insertions(+), 3 deletions(-)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index a602471b..0d365e1f 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -174,6 +174,10 @@ class Crawler extends CliTool {
val uuidAsString = uuid.toString
uuidAsString
}
+
+ def maplogic: Unit = {
+
+ }
override def run(): Unit = {
//STEP : Initialize environment
@@ -226,18 +230,22 @@ class Crawler extends CliTool {
var taskId = JobUtil.newSegmentId(true)
job.currentTask = taskId
LOG.info(s"Starting the job:$jobId, task:$taskId")
+ val rc = new RunCrawl
val rdd = new MemexCrawlDbRDD(sc, job, maxGroups = topG, topN = topN)
//TODO RESTORE THIS HACK
- val f = rdd.map(r => (r.getDedupeId, r))
- .groupByKey(500)
+ val f = rc.map(rdd)
+ /*val f = rdd.map(r => (r.getDedupeId, r))
+ .groupByKey()*/
+
+ val l = f.glom().map(_.length).collect()
+ print(l.min, l.max, l.sum/l.length, l.length)
val c = f.getNumPartitions
//val fetchedRdd = f.mapPartitions( x => mapCrawl(x))
- val rc = new RunCrawl
//val fetchedRdd = rc.runCrawl(f, job)
val fetchedRdd = f.flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq }).repartition(500)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/RunCrawl.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/RunCrawl.scala
index 4af8de6c..78853bb3 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/RunCrawl.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/RunCrawl.scala
@@ -1,11 +1,13 @@
package edu.usc.irds.sparkler.pipeline
+import edu.usc.irds.sparkler.MemexCrawlDbRDD
import edu.usc.irds.sparkler.model.{CrawlData, Resource, SparklerJob}
import edu.usc.irds.sparkler.storage.solr.StatusUpdateSolrTransformer
import org.apache.spark.rdd.RDD
@SerialVersionUID(100L)
class RunCrawl extends Serializable{
+ var i = 0
def mapCrawl(x: Iterator[(String, Iterable[Resource])], job: SparklerJob): Iterator[CrawlData] = {
val m = 1000
x.flatMap({case (grp, rs) => new FairFetcher(job, rs.iterator, m,
@@ -17,4 +19,14 @@ class RunCrawl extends Serializable{
}
+ def maplogic(r: Resource): (String, Resource) = {
+ print("loop: "+i)
+ i = i +1
+ (r.getId, r)
+ }
+
+ def map(rdd: MemexCrawlDbRDD): RDD[(String, Iterable[Resource])] = {
+ rdd.map(r => maplogic(r))
+ .groupByKey()
+ }
}
From 5b66bf7e623b1a49f7bc3c837708f681d52614f7 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 23 Jun 2021 18:07:24 +0100
Subject: [PATCH 044/335] try map partition
---
.../src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index 0d365e1f..cdcc4fb7 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -247,7 +247,7 @@ class Crawler extends CliTool {
//val fetchedRdd = f.mapPartitions( x => mapCrawl(x))
//val fetchedRdd = rc.runCrawl(f, job)
- val fetchedRdd = f.flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
+ val fetchedRdd = f.repartition(500).flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq }).repartition(500)
.persist()
From 9b06f12450b94fcd7df2e2f49bea2d5d1da83367 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 28 Jun 2021 21:11:14 +0100
Subject: [PATCH 045/335] update readme
---
sparkler-core/README.md | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/sparkler-core/README.md b/sparkler-core/README.md
index 1b2242cd..f0076437 100644
--- a/sparkler-core/README.md
+++ b/sparkler-core/README.md
@@ -1,3 +1,8 @@
Sample spark submit
~/Projects/spark-3.0.2-bin-hadoop2.7/bin/spark-submit --class edu.usc.irds.sparkler.Main --master spark://localhost:7077 --driver-java-options '-Dpf4j.pluginsDir=/home/bugg/Projects/sparkler-fork/sparkler-core/build/plugins/' build/sparkler-app-0.3.1-SNAPSHOT.jar inject -su https://news.bbc.co.uk
+
+
+Databricks API
+
+curl -vvv -n -H 'Content-Type:application/json' -H "Authorization: Bearer xxx" https://kli-mmit.cloud.databricks.com/api/2.0/jobs/runs/submit -d '{"new_cluster":{"spark_conf":{"spark.locality.wait.node":"0","spark.executor.extraJavaOptions":"-Dpf4j.pluginsDir=/dbfs/FileStore/sparkler-submit/plugins/", "spark.task.cpus":"8"},"spark_version":"8.3.x-scala2.12","aws_attributes":{"availability":"SPOT_WITH_FALLBACK","first_on_demand":1,"zone_id":"us-west-2c"},"node_type_id":"c5d.4xlarge","init_scripts":[{"dbfs":{"destination":"dbfs:/FileStore/KLI/crawlinit.sh"}}],"num_workers":10, "cluster_log_conf":{ "dbfs" : { "destination" : "dbfs:/FileStore/logs" } }},"spark_submit_task":{"parameters":["--driver-java-options","-Dpf4j.pluginsDir=/dbfs/FileStore/sparkler-submit/plugins/","--driver-memory","10g","--executor-memory","10g","--class","edu.usc.irds.sparkler.Main","dbfs:/FileStore/sparkler-submit/sparkler-app-0.3.1-SNAPSHOT.jar","crawl","-id","testclustercrawl7", "-tn", "4000","-co","{\"plugins.active\":[\"urlfilter-regex\",\"urlfilter-samehost\",\"fetcher-chrome\"],\"plugins\":{\"fetcher.chrome\":{\"chrome.dns\":\"local\"}}}"]},"run_name":"testsubmi4t"}'
From 8238e7c56a78e3da6b53d557b20dc071b62be161 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 28 Jun 2021 21:18:27 +0100
Subject: [PATCH 046/335] update scripter
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index a7bc611a..796e37db 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.0-SNAPSHOT"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.2"
}
object FetcherHtmlUnit {
From 9efec783378b823e55cda63d4f075475b9109ead Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 28 Jun 2021 21:25:02 +0100
Subject: [PATCH 047/335] update scripter
---
.../main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index a8fb9662..846590c8 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -226,7 +226,7 @@ public FetchedData fetch(Resource resource) throws Exception {
if(pluginConfig.get("chrome.selenium.script") != null && pluginConfig.get("chrome.selenium.script") instanceof Map) {
Map map = (Map) pluginConfig.get("chrome.selenium.script");
try {
- scripter.runScript(map, null, null);
+ scripter.runScript(map);
} catch (Exception ignored){
}
@@ -237,7 +237,7 @@ public FetchedData fetch(Resource resource) throws Exception {
if(json != null && json.containsKey("selenium")){
if(json.get("selenium") != null && json.get("selenium") instanceof Map) {
try {
- scripter.runScript((Map) json.get("selenium"), null, null);
+ scripter.runScript((Map) json.get("selenium"));
} catch (Exception e){
Map tempmap = new HashMap<>();
tempmap.put("type", "file");
From d1371c03a8f4e83e6aaba6d421ea3881a3eb6da9 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 28 Jun 2021 22:18:34 +0100
Subject: [PATCH 048/335] add more dependencies
---
sparkler-core/build.sbt | 12 ++++++++++++
sparkler-core/project/Dependencies.scala | 8 ++++++++
.../sparkler-plugins/fetcher-chrome/pom.xml | 2 +-
3 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index 79f17888..9fedef88 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -118,6 +118,10 @@ lazy val app = (project in file("sparkler-app"))
Dependencies.Spark.core,
Dependencies.Spark.sql,
Dependencies.tikaParsers,
+ Dependencies.seleniumscripter,
+ Dependencies.browserup,
+ Dependencies.Selenium.java,
+ Dependencies.Selenium.chromeDriver
),
assemblyMergeStrategy in assembly := {
case x if x.contains("io.netty.versions.properties") => MergeStrategy.first
@@ -126,6 +130,7 @@ lazy val app = (project in file("sparkler-app"))
case x if x.contains("public-suffix-list.txt") => MergeStrategy.first
case x if x.contains("bus-extensions.txt") => MergeStrategy.first
case x if x.contains("blueprint.handlers") => MergeStrategy.first
+ case x if x.contains("META-INF/versions/9/javax/xml/bind/") => MergeStrategy.first
case PathList("org", "apache", "logging", "log4j", xs@_*) => MergeStrategy.first
case PathList("org", "apache", "logging", xs@_*) => MergeStrategy.first
case PathList("org", "apache", "log4j", xs@_*) => MergeStrategy.first
@@ -133,6 +138,12 @@ lazy val app = (project in file("sparkler-app"))
case PathList("org", "slf4j", "impl", xs@_*) => MergeStrategy.first
case PathList("com", "ctc", "wstx", xs@_*) => MergeStrategy.first
case PathList("org", "cliffc", "high_scale_lib", xs@_*) => MergeStrategy.first
+ case PathList("javax.xml.bind", "jaxb-api", xs@_*) => MergeStrategy.first
+ case PathList("org", "hamcrest", xs@_*) => MergeStrategy.first
+ case PathList("javax", "xml", xs@_*) => MergeStrategy.first
+ case PathList("javax", "activation", xs@_*) => MergeStrategy.first
+ case PathList("io", "netty", xs@_*) => MergeStrategy.first
+
case x => (assemblyMergeStrategy in assembly).value.apply(x)
},
test in assembly := {},
@@ -209,3 +220,4 @@ lazy val ui = (project in file("sparkler-ui"))
/*enablePlugins(PackPlugin)
packMain := Map("inject" -> "edu.usc.irds.sparkler.service.Injector")*/
+
diff --git a/sparkler-core/project/Dependencies.scala b/sparkler-core/project/Dependencies.scala
index a233f49c..b718c263 100644
--- a/sparkler-core/project/Dependencies.scala
+++ b/sparkler-core/project/Dependencies.scala
@@ -64,4 +64,12 @@ object Dependencies {
lazy val sql = group %% "spark-sql" % version % "provided"
}
lazy val tikaParsers = "org.apache.tika" % "tika-parsers" % "1.24"
+ object Selenium {
+ private val group = "org.seleniumhq.selenium"
+ private val version = "3.141.59"
+ lazy val chromeDriver = group % "selenium-chrome-driver" % version
+ lazy val java = group % "selenium-java" % version
+ }
+ lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.2"
}
\ No newline at end of file
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/pom.xml b/sparkler-core/sparkler-plugins/fetcher-chrome/pom.xml
index 974cd371..5a5269d5 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/pom.xml
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/pom.xml
@@ -112,7 +112,7 @@
com.kytheralabsseleniumscripter
- 1.0-SNAPSHOT
+ 1.2
From dfcf2224cdcaff6acc80727e97776f587611d66d Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 28 Jun 2021 22:33:05 +0100
Subject: [PATCH 049/335] add more dependencies
---
sparkler-core/build.sbt | 3 ++-
sparkler-core/project/Dependencies.scala | 2 ++
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index 9fedef88..97e2d00e 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -121,7 +121,8 @@ lazy val app = (project in file("sparkler-app"))
Dependencies.seleniumscripter,
Dependencies.browserup,
Dependencies.Selenium.java,
- Dependencies.Selenium.chromeDriver
+ Dependencies.Selenium.chromeDriver,
+ Dependencies.Selenium.guava
),
assemblyMergeStrategy in assembly := {
case x if x.contains("io.netty.versions.properties") => MergeStrategy.first
diff --git a/sparkler-core/project/Dependencies.scala b/sparkler-core/project/Dependencies.scala
index b718c263..48787d65 100644
--- a/sparkler-core/project/Dependencies.scala
+++ b/sparkler-core/project/Dependencies.scala
@@ -69,7 +69,9 @@ object Dependencies {
private val version = "3.141.59"
lazy val chromeDriver = group % "selenium-chrome-driver" % version
lazy val java = group % "selenium-java" % version
+ lazy val guava = "com.google.guava" % "guava" % "30.1.1-jre"
}
+
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.2"
}
\ No newline at end of file
From 0106162cb48535c45602543b6fae4b6ea2079cc2 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 28 Jun 2021 22:41:31 +0100
Subject: [PATCH 050/335] add more dependencies
---
sparkler-core/deploy.sh | 1 +
sparkler-core/project/Dependencies.scala | 2 +-
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/sparkler-core/deploy.sh b/sparkler-core/deploy.sh
index e5f054ff..00edc5a8 100755
--- a/sparkler-core/deploy.sh
+++ b/sparkler-core/deploy.sh
@@ -3,5 +3,6 @@
pip install databricks-cli
rm -rf build/sparkler-app-0.3.1-SNAPSHOT
+rm -rf build/plugins
~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
diff --git a/sparkler-core/project/Dependencies.scala b/sparkler-core/project/Dependencies.scala
index 48787d65..fceaee70 100644
--- a/sparkler-core/project/Dependencies.scala
+++ b/sparkler-core/project/Dependencies.scala
@@ -69,7 +69,7 @@ object Dependencies {
private val version = "3.141.59"
lazy val chromeDriver = group % "selenium-chrome-driver" % version
lazy val java = group % "selenium-java" % version
- lazy val guava = "com.google.guava" % "guava" % "30.1.1-jre"
+ lazy val guava = "com.google.guava" % "guava" % "25.0-jre"
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
From 2b54c1b953e7d0cde2214898a598aa5cb333accd Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Tue, 29 Jun 2021 12:58:04 +0100
Subject: [PATCH 051/335] new sbt changes
---
sparkler-core/build.sbt | 30 ++++++++++++++-----
sparkler-core/conf/log4j.properties | 2 ++
sparkler-core/conf/log4j2.properties | 2 ++
sparkler-core/conf/sparkler-default.yaml | 3 +-
sparkler-core/plugins.build.sbt | 11 +++----
sparkler-core/project/Dependencies.scala | 10 +++++--
sparkler-core/project/Settings.scala | 14 +++++++--
.../src/main/resources/log4j2.properties | 2 ++
.../src/main/resources/sparkler-default.yaml | 2 +-
.../usc/irds/sparkler/service/Injector.scala | 16 +++++-----
10 files changed, 66 insertions(+), 26 deletions(-)
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index 97e2d00e..d99f0435 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -100,12 +100,23 @@ lazy val api = (project in file("sparkler-api"))
)
.dependsOn(testsBase)
+val sparkprovided = System.getProperty("sparkprovided", "")
+
lazy val app = (project in file("sparkler-app"))
.enablePlugins(JavaAppPackaging)
.settings(
Settings.common,
name := "sparkler-app",
mainClass in (Compile, packageBin) := Some("edu.usc.irds.sparkler.Main"),
+ libraryDependencies ++= (
+ if(sparkprovided == "true") {
+ ("org.apache.spark" %% "spark-core" % "3.0.1" % "provided") :: Nil
+ ("org.apache.spark" %% "spark-sql" % "3.0.1" % "provided") :: Nil
+ } else {
+ ("org.apache.spark" %% "spark-core" % "3.0.1") :: Nil
+ ("org.apache.spark" %% "spark-sql" % "3.0.1") :: Nil
+ }
+ ),
libraryDependencies ++= Seq(
// TODO: Only keep necessary dependencies. Rest all should be included as plugin. Eg: extractors
Dependencies.args4j,
@@ -115,14 +126,8 @@ lazy val app = (project in file("sparkler-app"))
Dependencies.kafkaClients exclude("org.slf4j", "slf4j-api"),
Dependencies.pf4j,
Dependencies.Solr.core,
- Dependencies.Spark.core,
- Dependencies.Spark.sql,
Dependencies.tikaParsers,
- Dependencies.seleniumscripter,
- Dependencies.browserup,
- Dependencies.Selenium.java,
- Dependencies.Selenium.chromeDriver,
- Dependencies.Selenium.guava
+
),
assemblyMergeStrategy in assembly := {
case x if x.contains("io.netty.versions.properties") => MergeStrategy.first
@@ -131,6 +136,8 @@ lazy val app = (project in file("sparkler-app"))
case x if x.contains("public-suffix-list.txt") => MergeStrategy.first
case x if x.contains("bus-extensions.txt") => MergeStrategy.first
case x if x.contains("blueprint.handlers") => MergeStrategy.first
+ case x if x.contains("git.properties") => MergeStrategy.first
+ case x if x.contains("config.fmpp") => MergeStrategy.first
case x if x.contains("META-INF/versions/9/javax/xml/bind/") => MergeStrategy.first
case PathList("org", "apache", "logging", "log4j", xs@_*) => MergeStrategy.first
case PathList("org", "apache", "logging", xs@_*) => MergeStrategy.first
@@ -144,6 +151,15 @@ lazy val app = (project in file("sparkler-app"))
case PathList("javax", "xml", xs@_*) => MergeStrategy.first
case PathList("javax", "activation", xs@_*) => MergeStrategy.first
case PathList("io", "netty", xs@_*) => MergeStrategy.first
+ case PathList("org", "aopalliance", "intercept", xs@_*) => MergeStrategy.first
+ case PathList("org", "aopalliance", "aop", xs@_*) => MergeStrategy.first
+ case PathList("org", "apache", "spark", xs@_*) => MergeStrategy.first
+ case PathList("org", "apache", "hadoop", xs@_*) => MergeStrategy.first
+ case PathList("net", "jpountz", xs@_*) => MergeStrategy.last
+ case PathList("net", "jcip", xs@_*) => MergeStrategy.first
+ case PathList("javax", "inject", xs@_*) => MergeStrategy.first
+ case PathList("javax", "annotation", xs@_*) => MergeStrategy.first
+ case PathList("com", "sun", xs@_*) => MergeStrategy.first
case x => (assemblyMergeStrategy in assembly).value.apply(x)
},
diff --git a/sparkler-core/conf/log4j.properties b/sparkler-core/conf/log4j.properties
index 4a8e9da3..e9e59c06 100644
--- a/sparkler-core/conf/log4j.properties
+++ b/sparkler-core/conf/log4j.properties
@@ -35,3 +35,5 @@ rootLogger.appenderRefs = stdout
rootLogger.appenderRef.stdout.ref = STDOUT
logger.irds.name = edu.usc.irds
logger.irds.level=DEBUG
+logger.kythera.name = com.kytheralabs
+logger.kythera.level = DEBUG
\ No newline at end of file
diff --git a/sparkler-core/conf/log4j2.properties b/sparkler-core/conf/log4j2.properties
index 900d9bb2..df071d87 100644
--- a/sparkler-core/conf/log4j2.properties
+++ b/sparkler-core/conf/log4j2.properties
@@ -36,3 +36,5 @@ rootLogger.appenderRefs = stdout
rootLogger.appenderRef.stdout.ref = STDOUT
logger.irds.name = edu.usc.irds
logger.irds.level=DEBUG
+logger.kythera.name = com.kytheralabs
+logger.kythera.level = DEBUG
\ No newline at end of file
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index 01527305..797c481e 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -37,7 +37,8 @@ elasticsearch.uri: http://localhost:9200
# URL on which Apache Spark is running.
# Type: String. Default is "local[*]" for local mode.
-spark.master: spark://DESKTOP-JIMKO29.localdomain:7077
+spark.master:
+ #local[*]
databricks.enable: false
##################### Apache Kafka Properties ###########################
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index 71b4ffdc..abc205a8 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -64,10 +64,11 @@ lazy val fetcherChrome = (project in file(s"$sparklerPlugins/fetcher-chrome"))
Settings.plugin,
name := "fetcher-chrome",
libraryDependencies ++= Seq(
- //FetcherChrome.Selenium.chromeDriver,
- FetcherChrome.Selenium.java,
- FetcherChrome.browserup,
- FetcherChrome.seleniumscripter,
+ FetcherChrome.Selenium.java exclude("org.slf4j", "slf4j-api"),
+ FetcherChrome.browserup exclude("com.fasterxml.jackson.core", "jackson-databind") exclude("org.slf4j", "slf4j-api"),
+ Dependencies.seleniumscripter exclude("org.slf4j", "slf4j-api"),
+ Dependencies.Selenium.chromeDriver exclude("org.slf4j", "slf4j-api"),
+ Dependencies.Selenium.guava exclude("org.slf4j", "slf4j-api")
),
Settings.pluginManifest(
id = "fetcher-chrome",
@@ -83,7 +84,7 @@ lazy val fetcherHtmlUnit = (project in file(s"$sparklerPlugins/fetcher-htmlunit"
Settings.plugin,
name := "fetcher-htmlunit",
libraryDependencies ++= Seq(
- FetcherHtmlUnit.htmlUnit,
+ FetcherHtmlUnit.htmlUnit exclude("org.slf4j", "slf4j-api"),
),
Settings.pluginManifest(
id = "fetcher-htmlunit",
diff --git a/sparkler-core/project/Dependencies.scala b/sparkler-core/project/Dependencies.scala
index fceaee70..e3d69732 100644
--- a/sparkler-core/project/Dependencies.scala
+++ b/sparkler-core/project/Dependencies.scala
@@ -26,7 +26,7 @@ object Dependencies {
object Jackson {
private val group = "com.fasterxml.jackson.core"
- private val version = "2.6.5"
+ private val version = "2.10.0"
lazy val core = group % "jackson-core" % version
lazy val databind = group % "jackson-databind" % version
}
@@ -42,7 +42,7 @@ object Dependencies {
lazy val jUnitInterface = "com.novocode" % "junit-interface" % "0.11"
lazy val kafkaClients = "org.apache.kafka" % "kafka-clients" % "0.10.0.0"
lazy val nutch = "org.apache.nutch" % "nutch" % "1.16"
- lazy val pf4j = "org.pf4j" % "pf4j" % "2.6.0"
+ lazy val pf4j = "org.pf4j" % "pf4j" % "3.6.0"
lazy val scalaMacrosParadise = "org.scalamacros" %% "paradise" % "2.1.1"
object Slf4j {
private val group = "org.slf4j"
@@ -58,6 +58,12 @@ object Dependencies {
lazy val solrj = group % "solr-solrj" % version
}
object Spark {
+ private val group = "org.apache.spark"
+ private val version = "3.0.1" // pre-built version available @ https://spark.apache.org/downloads.html
+ lazy val core = group %% "spark-core" % version //% "provided"
+ lazy val sql = group %% "spark-sql" % version //% "provided"
+ }
+ object SparkProvided {
private val group = "org.apache.spark"
private val version = "3.0.1" // pre-built version available @ https://spark.apache.org/downloads.html
lazy val core = group %% "spark-core" % version % "provided"
diff --git a/sparkler-core/project/Settings.scala b/sparkler-core/project/Settings.scala
index 2efc54d1..72e59d9b 100644
--- a/sparkler-core/project/Settings.scala
+++ b/sparkler-core/project/Settings.scala
@@ -77,8 +77,18 @@ object Settings {
lazy val plugin = assemblyProject ++ Seq(
autoScalaLibrary := false,
assemblyMergeStrategy in assembly := {
- case PathList("META-INF", xs @ _*) => MergeStrategy.discard
- case x => MergeStrategy.first
+ case x if x.contains("io.netty.versions.properties") => MergeStrategy.first
+ case x if x.contains("Log4j2Plugins.dat") => MergeStrategy.first
+ case x if x.contains("module-info.class") => MergeStrategy.first
+ case x if x.contains("public-suffix-list.txt") => MergeStrategy.first
+ case x if x.contains("bus-extensions.txt") => MergeStrategy.first
+ case x if x.contains("blueprint.handlers") => MergeStrategy.first
+ case x if x.contains("git.properties") => MergeStrategy.first
+ case x if x.contains("config.fmpp") => MergeStrategy.first
+ case x if x.contains("META-INF/versions/9/javax/xml/bind/") => MergeStrategy.first
+ case x if x.contains("MANIFEST.MF") => MergeStrategy.discard
+ case PathList("META-INF", "MANIFEST.MF") => MergeStrategy.discard
+ case x => (assemblyMergeStrategy in assembly).value.apply(x)
},
assemblyOutputPath in assembly := file(".") / buildDir / pluginsDir / s"${name.value}-${(version in ThisBuild).value}.jar"
)
diff --git a/sparkler-core/sparkler-app/src/main/resources/log4j2.properties b/sparkler-core/sparkler-app/src/main/resources/log4j2.properties
index 3c05fb77..d01ac13c 100644
--- a/sparkler-core/sparkler-app/src/main/resources/log4j2.properties
+++ b/sparkler-core/sparkler-app/src/main/resources/log4j2.properties
@@ -44,3 +44,5 @@ appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
rootLogger.level = debug
rootLogger.appenderRefs = stdout
rootLogger.appenderRef.stdout.ref = STDOUT
+logger.kythera.name = com.kytheralabs
+logger.kythera.level = DEBUG
\ No newline at end of file
diff --git a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
index 19c873cb..4e51cba9 100644
--- a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
+++ b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
@@ -38,7 +38,7 @@ elasticsearch.uri: http://localhost:9200
# URL on which Apache Spark is running.
# Type: String. Default is "local[*]" for local mode.
spark.master:
- #local[*]
+ #local[1]
databricks.enable: false
##################### Apache Kafka Properties ###########################
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala
index dc435a21..1c6325d9 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala
@@ -71,14 +71,14 @@ class Injector extends CliTool {
var configOverride: Array[Any] = Array()
override def run(): Unit = {
- val sconf = new SparkConf().setAppName("sparkler-job")
- val sc = new SparkContext(sconf)
- val logFile = "/home/bugg/Projects/spark-3.0.2-bin-hadoop2.7/README.md"
- val logData = sc.textFile(logFile, 2).cache()
- val numAs = logData.filter(line => line.contains("a")).count()
- val numBs = logData.filter(line => line.contains("b")).count()
- println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))
- println("SU: " + seedUrls.mkString(","))
+ //val sconf = new SparkConf().setAppName("sparkler-job")
+ //val sc = new SparkContext(sconf)
+ //val logFile = "/home/bugg/Projects/spark-3.0.2-bin-hadoop2.7/README.md"
+ //val logData = sc.textFile(logFile, 2).cache()
+ //val numAs = logData.filter(line => line.contains("a")).count()
+ //val numBs = logData.filter(line => line.contains("b")).count()
+ //println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))
+ //println("SU: " + seedUrls.mkString(","))
if (configOverride != ""){
conf.overloadConfig(configOverride.mkString(" "));
}
From b8287277c46d1c88bb281e7963239cfb796dc56c Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Tue, 29 Jun 2021 12:59:48 +0100
Subject: [PATCH 052/335] new sbt changes
---
sparkler-core/deploy.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/deploy.sh b/sparkler-core/deploy.sh
index 00edc5a8..06f8845f 100755
--- a/sparkler-core/deploy.sh
+++ b/sparkler-core/deploy.sh
@@ -3,6 +3,6 @@
pip install databricks-cli
rm -rf build/sparkler-app-0.3.1-SNAPSHOT
-rm -rf build/plugins
+#rm -rf build/plugins
~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
From 10c8dff88a00bad4fb79c717b86d7497df874e69 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Tue, 29 Jun 2021 13:03:39 +0100
Subject: [PATCH 053/335] new sbt changes
---
.github/workflows/build-sbt.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index b50f6346..b31bef9f 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -19,7 +19,7 @@ jobs:
java-version: '8'
distribution: 'adopt'
- name: Run package
- run: sbt assembly
+ run: sbt assembly -Dsparkprovided=true
working-directory: sparkler-core
- name: Install databricks
run: ./deploy.sh
From 04c297c27ffc0c175b7a3135b9066b366fe6fb79 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 30 Jun 2021 01:52:39 +0100
Subject: [PATCH 054/335] update scripter
---
sparkler-core/project/PluginDependencies.scala | 2 +-
.../java/edu/usc/irds/sparkler/plugin/FetcherChrome.java | 5 ++++-
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 796e37db..343a97b3 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.2"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-SNAPSHOT"
}
object FetcherHtmlUnit {
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index 846590c8..14f00133 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -47,6 +47,7 @@
import java.net.CookieHandler;
import java.net.CookieManager;
import java.net.CookiePolicy;
+import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
import com.browserup.bup.BrowserUpProxy;
@@ -237,7 +238,9 @@ public FetchedData fetch(Resource resource) throws Exception {
if(json != null && json.containsKey("selenium")){
if(json.get("selenium") != null && json.get("selenium") instanceof Map) {
try {
- scripter.runScript((Map) json.get("selenium"));
+ Map m = (Map) json.get("selenium");
+ Map json = new TreeMap(m);
+ scripter.runScript(json);
} catch (Exception e){
Map tempmap = new HashMap<>();
tempmap.put("type", "file");
From 3f1670752306ef545d83309a3f429ecf152bc7aa Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 30 Jun 2021 01:57:00 +0100
Subject: [PATCH 055/335] update scripter
---
.../main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index 14f00133..c37bebec 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -239,8 +239,8 @@ public FetchedData fetch(Resource resource) throws Exception {
if(json.get("selenium") != null && json.get("selenium") instanceof Map) {
try {
Map m = (Map) json.get("selenium");
- Map json = new TreeMap(m);
- scripter.runScript(json);
+ Map jsonmap = new TreeMap(m);
+ scripter.runScript(jsonmap);
} catch (Exception e){
Map tempmap = new HashMap<>();
tempmap.put("type", "file");
From 47971779e64f312ff2eea29a47a6d4040582612b Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 30 Jun 2021 11:55:33 +0100
Subject: [PATCH 056/335] force build
---
sparkler-core/README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/sparkler-core/README.md b/sparkler-core/README.md
index f0076437..d100a859 100644
--- a/sparkler-core/README.md
+++ b/sparkler-core/README.md
@@ -6,3 +6,4 @@ Sample spark submit
Databricks API
curl -vvv -n -H 'Content-Type:application/json' -H "Authorization: Bearer xxx" https://kli-mmit.cloud.databricks.com/api/2.0/jobs/runs/submit -d '{"new_cluster":{"spark_conf":{"spark.locality.wait.node":"0","spark.executor.extraJavaOptions":"-Dpf4j.pluginsDir=/dbfs/FileStore/sparkler-submit/plugins/", "spark.task.cpus":"8"},"spark_version":"8.3.x-scala2.12","aws_attributes":{"availability":"SPOT_WITH_FALLBACK","first_on_demand":1,"zone_id":"us-west-2c"},"node_type_id":"c5d.4xlarge","init_scripts":[{"dbfs":{"destination":"dbfs:/FileStore/KLI/crawlinit.sh"}}],"num_workers":10, "cluster_log_conf":{ "dbfs" : { "destination" : "dbfs:/FileStore/logs" } }},"spark_submit_task":{"parameters":["--driver-java-options","-Dpf4j.pluginsDir=/dbfs/FileStore/sparkler-submit/plugins/","--driver-memory","10g","--executor-memory","10g","--class","edu.usc.irds.sparkler.Main","dbfs:/FileStore/sparkler-submit/sparkler-app-0.3.1-SNAPSHOT.jar","crawl","-id","testclustercrawl7", "-tn", "4000","-co","{\"plugins.active\":[\"urlfilter-regex\",\"urlfilter-samehost\",\"fetcher-chrome\"],\"plugins\":{\"fetcher.chrome\":{\"chrome.dns\":\"local\"}}}"]},"run_name":"testsubmi4t"}'
+
From 38639b42c0912903b6be453a437cac25e6cb3881 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 30 Jun 2021 12:01:16 +0100
Subject: [PATCH 057/335] force build
---
sparkler-core/project/Dependencies.scala | 3 ---
1 file changed, 3 deletions(-)
diff --git a/sparkler-core/project/Dependencies.scala b/sparkler-core/project/Dependencies.scala
index e3d69732..34503fee 100644
--- a/sparkler-core/project/Dependencies.scala
+++ b/sparkler-core/project/Dependencies.scala
@@ -77,7 +77,4 @@ object Dependencies {
lazy val java = group % "selenium-java" % version
lazy val guava = "com.google.guava" % "guava" % "25.0-jre"
}
-
- lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.2"
}
\ No newline at end of file
From 9a212c74b8aaf9518f15596019028cc99233c32a Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 30 Jun 2021 12:04:33 +0100
Subject: [PATCH 058/335] force build
---
sparkler-core/plugins.build.sbt | 6 +++---
sparkler-core/project/Dependencies.scala | 7 -------
2 files changed, 3 insertions(+), 10 deletions(-)
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index abc205a8..ba4152fa 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -66,9 +66,9 @@ lazy val fetcherChrome = (project in file(s"$sparklerPlugins/fetcher-chrome"))
libraryDependencies ++= Seq(
FetcherChrome.Selenium.java exclude("org.slf4j", "slf4j-api"),
FetcherChrome.browserup exclude("com.fasterxml.jackson.core", "jackson-databind") exclude("org.slf4j", "slf4j-api"),
- Dependencies.seleniumscripter exclude("org.slf4j", "slf4j-api"),
- Dependencies.Selenium.chromeDriver exclude("org.slf4j", "slf4j-api"),
- Dependencies.Selenium.guava exclude("org.slf4j", "slf4j-api")
+ FetcherChrome.seleniumscripter exclude("org.slf4j", "slf4j-api"),
+ FetcherChrome.Selenium.chromeDriver exclude("org.slf4j", "slf4j-api"),
+ //FetcherChrome.Selenium.guava exclude("org.slf4j", "slf4j-api")
),
Settings.pluginManifest(
id = "fetcher-chrome",
diff --git a/sparkler-core/project/Dependencies.scala b/sparkler-core/project/Dependencies.scala
index 34503fee..c6a3f8e8 100644
--- a/sparkler-core/project/Dependencies.scala
+++ b/sparkler-core/project/Dependencies.scala
@@ -70,11 +70,4 @@ object Dependencies {
lazy val sql = group %% "spark-sql" % version % "provided"
}
lazy val tikaParsers = "org.apache.tika" % "tika-parsers" % "1.24"
- object Selenium {
- private val group = "org.seleniumhq.selenium"
- private val version = "3.141.59"
- lazy val chromeDriver = group % "selenium-chrome-driver" % version
- lazy val java = group % "selenium-java" % version
- lazy val guava = "com.google.guava" % "guava" % "25.0-jre"
- }
}
\ No newline at end of file
From fc5a020384547092c14b97be9ced7d6300c1ab2b Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 30 Jun 2021 21:09:13 +0100
Subject: [PATCH 059/335] try extended chrome args
---
.../usc/irds/sparkler/plugin/FetcherChrome.java | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index c37bebec..62ac6463 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -144,6 +144,20 @@ public void filterResponse(HttpResponse response, HttpMessageContents contents,
chromeOptions.addArguments("--ignore-certificate-errors");
chromeOptions.addArguments("--incognito");
chromeOptions.addArguments("--window-size=1920,1080");
+ chromeOptions.addArguments("--proxy-server='direct://");
+ chromeOptions.addArguments("--proxy-bypass-list=*");
+ chromeOptions.addArguments("--disable-background-networking");
+ chromeOptions.addArguments("--safebrowsing-disable-auto-update");
+ chromeOptions.addArguments("--disable-sync");
+ chromeOptions.addArguments("--metrics-recording-only");
+ chromeOptions.addArguments("--disable-default-apps");
+ chromeOptions.addArguments("--no-first-run");
+ chromeOptions.addArguments("--disable-setuid-sandbox");
+ chromeOptions.addArguments("--hide-scrollbars");
+ chromeOptions.addArguments("--no-zygote");
+ chromeOptions.addArguments("--disable-notifications");
+ chromeOptions.addArguments("--disable-logging");
+ chromeOptions.addArguments("--disable-permissions-api");
chromeOptions.setPageLoadStrategy(PageLoadStrategy.NORMAL);
//capabilities.setCapability(CapabilityType.PROXY, seleniumProxy);
From f2d8c8ad6dc482085c9026c51c0c060d43087e6d Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 30 Jun 2021 21:15:53 +0100
Subject: [PATCH 060/335] build both versions
---
.github/workflows/build-sbt.yaml | 9 +++++++++
sparkler-core/deploy.sh | 11 ++++++++---
2 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index b31bef9f..e9f95a2c 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -27,3 +27,12 @@ jobs:
env:
DATABRICKS_HOST: https://kli-mmit.cloud.databricks.com
DATABRICKS_TOKEN: dapi88de017d8ab1fe80afe5691c1e786185
+ - name: Run package
+ run: sbt assembly -Dsparkprovided=false
+ working-directory: sparkler-core
+ - name: Install databricks
+ run: rm -rf build && ./deploy.sh standalone
+ working-directory: sparkler-core
+ env:
+ DATABRICKS_HOST: https://kli-mmit.cloud.databricks.com
+ DATABRICKS_TOKEN: dapi88de017d8ab1fe80afe5691c1e786185
diff --git a/sparkler-core/deploy.sh b/sparkler-core/deploy.sh
index 06f8845f..36980adb 100755
--- a/sparkler-core/deploy.sh
+++ b/sparkler-core/deploy.sh
@@ -1,8 +1,13 @@
#!/bin/bash
+standalone=$1
+
pip install databricks-cli
-rm -rf build/sparkler-app-0.3.1-SNAPSHOT
-#rm -rf build/plugins
+if [ "$standalone" = "true" ]; then
+ ~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-standalone/
+else
+ rm -rf build/sparkler-app-0.3.1-SNAPSHOT
-~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
+ ~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
+fi
From 577daa7eeeff640039b0511b9396e3c2cb2e9daf Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 30 Jun 2021 21:23:11 +0100
Subject: [PATCH 061/335] build both versions
---
.github/workflows/build-sbt.yaml | 2 +-
sparkler-core/deploy.sh | 1 +
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index e9f95a2c..f56d169a 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -28,7 +28,7 @@ jobs:
DATABRICKS_HOST: https://kli-mmit.cloud.databricks.com
DATABRICKS_TOKEN: dapi88de017d8ab1fe80afe5691c1e786185
- name: Run package
- run: sbt assembly -Dsparkprovided=false
+ run: sbt clean assembly -Dsparkprovided=false
working-directory: sparkler-core
- name: Install databricks
run: rm -rf build && ./deploy.sh standalone
diff --git a/sparkler-core/deploy.sh b/sparkler-core/deploy.sh
index 36980adb..9c50a99a 100755
--- a/sparkler-core/deploy.sh
+++ b/sparkler-core/deploy.sh
@@ -5,6 +5,7 @@ standalone=$1
pip install databricks-cli
if [ "$standalone" = "true" ]; then
+ ls
~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-standalone/
else
rm -rf build/sparkler-app-0.3.1-SNAPSHOT
From ab3ddcb0982d602bcd38b9f29241e917faa2cb6f Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 30 Jun 2021 21:31:59 +0100
Subject: [PATCH 062/335] build both versions
---
.github/workflows/build-sbt.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index f56d169a..8291644e 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -31,7 +31,7 @@ jobs:
run: sbt clean assembly -Dsparkprovided=false
working-directory: sparkler-core
- name: Install databricks
- run: rm -rf build && ./deploy.sh standalone
+ run: ./deploy.sh standalone
working-directory: sparkler-core
env:
DATABRICKS_HOST: https://kli-mmit.cloud.databricks.com
From a5c5d8b965b7c02a333cb96e3078c3500f0fbde7 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 30 Jun 2021 21:37:32 +0100
Subject: [PATCH 063/335] build both versions
---
.github/workflows/build-sbt.yaml | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index 8291644e..1f9525aa 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -18,19 +18,19 @@ jobs:
with:
java-version: '8'
distribution: 'adopt'
- - name: Run package
+ - name: Run submit package
run: sbt assembly -Dsparkprovided=true
working-directory: sparkler-core
- - name: Install databricks
+ - name: Deploy Submit to Databricks
run: ./deploy.sh
working-directory: sparkler-core
env:
DATABRICKS_HOST: https://kli-mmit.cloud.databricks.com
DATABRICKS_TOKEN: dapi88de017d8ab1fe80afe5691c1e786185
- - name: Run package
+ - name: Run full package
run: sbt clean assembly -Dsparkprovided=false
working-directory: sparkler-core
- - name: Install databricks
+ - name: Deploy to databricks
run: ./deploy.sh standalone
working-directory: sparkler-core
env:
From 2bdca9d5d00f86fb1c365fcf6e2e4a00cb2894c4 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 30 Jun 2021 21:38:37 +0100
Subject: [PATCH 064/335] build both versions
---
sparkler-core/deploy.sh | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/deploy.sh b/sparkler-core/deploy.sh
index 9c50a99a..b2305d29 100755
--- a/sparkler-core/deploy.sh
+++ b/sparkler-core/deploy.sh
@@ -1,10 +1,10 @@
#!/bin/bash
-standalone=$1
+method=$1
pip install databricks-cli
-if [ "$standalone" = "true" ]; then
+if [ "$method" = "standalone" ]; then
ls
~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-standalone/
else
From 86b10143b97e1e8d88f30961d7d0a4d51bd02f08 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 30 Jun 2021 22:33:58 +0100
Subject: [PATCH 065/335] allow params
---
sparkler-core/conf/regex-urlfilter.txt | 4 ++--
.../sparkler-app/src/main/resources/regex-urlfilter.txt | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/sparkler-core/conf/regex-urlfilter.txt b/sparkler-core/conf/regex-urlfilter.txt
index fd8ba2f3..ae50a213 100644
--- a/sparkler-core/conf/regex-urlfilter.txt
+++ b/sparkler-core/conf/regex-urlfilter.txt
@@ -27,10 +27,10 @@
# Default: skip image and other suffixes which produces large content
# for a more extensive coverage use the urlfilter-suffix plugin
--\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS|svg|SVG|mp3|MP3|mp4|MP4|pdf|PDF)$
+-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS|svg|SVG|mp3|MP3|mp4|MP4)$
# skip URLs containing certain characters as probable queries, etc.
--[?*!@=]
+#-[?*!@=]
# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
-.*(/[^/]+)/[^/]+\1/[^/]+\1/
diff --git a/sparkler-core/sparkler-app/src/main/resources/regex-urlfilter.txt b/sparkler-core/sparkler-app/src/main/resources/regex-urlfilter.txt
index fd8ba2f3..ae50a213 100644
--- a/sparkler-core/sparkler-app/src/main/resources/regex-urlfilter.txt
+++ b/sparkler-core/sparkler-app/src/main/resources/regex-urlfilter.txt
@@ -27,10 +27,10 @@
# Default: skip image and other suffixes which produces large content
# for a more extensive coverage use the urlfilter-suffix plugin
--\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS|svg|SVG|mp3|MP3|mp4|MP4|pdf|PDF)$
+-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS|svg|SVG|mp3|MP3|mp4|MP4)$
# skip URLs containing certain characters as probable queries, etc.
--[?*!@=]
+#-[?*!@=]
# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
-.*(/[^/]+)/[^/]+\1/[^/]+\1/
From 49e9189ab3085077ca985c85aaa1b5df781462d8 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Thu, 1 Jul 2021 13:00:12 +0100
Subject: [PATCH 066/335] fix screenshot pointer
---
.../main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index 62ac6463..d273c5b0 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -259,7 +259,7 @@ public FetchedData fetch(Resource resource) throws Exception {
Map tempmap = new HashMap<>();
tempmap.put("type", "file");
tempmap.put("targetdir", pluginConfig.getOrDefault("chrome.selenium.screenshotdir","/dbfs/FileStore/screenshots/")+resource.getCrawlId()+System.currentTimeMillis());
- scripter.screenshot(tempmap);
+ scripter.screenshotOperation(tempmap);
e.printStackTrace();
}
List snapshots = scripter.getSnapshots();
From 8fa00af8c4a5f0325712ffe299b3714ca94fa371 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Thu, 1 Jul 2021 19:55:18 +0100
Subject: [PATCH 067/335] add more logs
---
sparkler-core/project/PluginDependencies.scala | 2 +-
.../java/edu/usc/irds/sparkler/plugin/FetcherChrome.java | 7 ++++---
2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 343a97b3..2cc7b99f 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-SNAPSHOT"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-SNAPSHOT" changing()
}
object FetcherHtmlUnit {
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index d273c5b0..78dada48 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -188,7 +188,7 @@ public FetchedData fetch(Resource resource) throws Exception {
* using default Fetcher
*/
if (!isWebPage(resource.getUrl())) {
- LOG.debug("{} not a html. Falling back to default fetcher.", resource.getUrl());
+ LOG.info("{} not a html. Falling back to default fetcher.", resource.getUrl());
// This should be true for all URLS ending with 4 character file extension
// return new FetchedData("".getBytes(), "application/html", ERROR_CODE) ;
return super.fetch(resource);
@@ -252,6 +252,7 @@ public FetchedData fetch(Resource resource) throws Exception {
if(json != null && json.containsKey("selenium")){
if(json.get("selenium") != null && json.get("selenium") instanceof Map) {
try {
+ LOG.info("Running Selenium Script");
Map m = (Map) json.get("selenium");
Map jsonmap = new TreeMap(m);
scripter.runScript(jsonmap);
@@ -260,7 +261,7 @@ public FetchedData fetch(Resource resource) throws Exception {
tempmap.put("type", "file");
tempmap.put("targetdir", pluginConfig.getOrDefault("chrome.selenium.screenshotdir","/dbfs/FileStore/screenshots/")+resource.getCrawlId()+System.currentTimeMillis());
scripter.screenshotOperation(tempmap);
- e.printStackTrace();
+ LOG.info(e.getMessage());
}
List snapshots = scripter.getSnapshots();
html = String.join(",", snapshots);
@@ -273,7 +274,7 @@ public FetchedData fetch(Resource resource) throws Exception {
LOG.debug("Time taken to load {} - {} ", resource.getUrl(), (System.currentTimeMillis() - start));
- System.out.println("LATEST STATUS: "+latestStatus);
+ LOG.info("LATEST STATUS: "+latestStatus);
/*if (!(latestStatus >= 200 && latestStatus < 300) && latestStatus != 0) {
// If not fetched through plugin successfully
// Falling back to default fetcher
From 18c4e66370bb891538568a7ea05288f0f5833634 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 2 Jul 2021 13:52:29 +0100
Subject: [PATCH 068/335] adjust for more parameters
---
sparkler-core/conf/sparkler-default.yaml | 23 +++++++++++
.../project/PluginDependencies.scala | 2 +-
sparkler-core/project/plugins.sbt | 3 +-
.../usc/irds/sparkler/pipeline/Crawler.scala | 18 +++++++--
.../irds/sparkler/plugin/FetcherChrome.java | 38 ++++++-------------
5 files changed, 52 insertions(+), 32 deletions(-)
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index 797c481e..922e6cf0 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -41,6 +41,7 @@ spark.master:
#local[*]
databricks.enable: false
+crawl.repartition: 500
##################### Apache Kafka Properties ###########################
# Enable Kafka Dump
# Type: Boolean. Default is "false"
@@ -137,6 +138,28 @@ plugins:
#What type of element, class, name, id
chrome.wait.type: "class"
chrome.dns: "http://localhost:3000/webdriver"
+ chrome.options:
+ - "--no-sandbox"
+ - "--headless"
+ - "--disable-gpu"
+ - "--disable-extensions"
+ - "--ignore-certificate-errors"
+ - "--incognito"
+ - "--window-size=1920,1080"
+ - "--proxy-server='direct://"
+ - "--proxy-bypass-list=*"
+ - "--disable-background-networking"
+ - "--safebrowsing-disable-auto-update"
+ - "--disable-sync"
+ - "--metrics-recording-only"
+ - "--disable-default-apps"
+ - "--no-first-run"
+ - "--disable-setuid-sandbox"
+ - "--hide-scrollbars"
+ - "--no-zygote"
+ - "--disable-notifications"
+ - "--disable-logging"
+ - "--disable-permissions-api"
#chrome.selenium.enabled: "true"
#chrome.selenium.script.click: "id:txtName"
#chrome.selenium.script.keys: "COR"
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 2cc7b99f..8716bfd0 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-SNAPSHOT" changing()
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210701.125932-13"
}
object FetcherHtmlUnit {
diff --git a/sparkler-core/project/plugins.sbt b/sparkler-core/project/plugins.sbt
index cf7817d0..7225838f 100644
--- a/sparkler-core/project/plugins.sbt
+++ b/sparkler-core/project/plugins.sbt
@@ -14,9 +14,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.7.4")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.13")
-addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.13")
+addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.13")
\ No newline at end of file
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index cdcc4fb7..f7872f64 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -247,9 +247,21 @@ class Crawler extends CliTool {
//val fetchedRdd = f.mapPartitions( x => mapCrawl(x))
//val fetchedRdd = rc.runCrawl(f, job)
- val fetchedRdd = f.repartition(500).flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
- FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq }).repartition(500)
- .persist()
+
+
+ var fetchedRdd: RDD[CrawlData] = null
+ val rep: Int = sparklerConf.get("crawl.repartition").asInstanceOf[Int]
+ if (rep > 0) {
+ fetchedRdd = f.repartition(rep).flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
+ FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq
+ }).repartition(rep)
+ .persist()
+ } else {
+ fetchedRdd = f.repartition(1).flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
+ FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq
+ }).repartition(1)
+ .persist()
+ }
//val coll = fetchedRdd.collect()
//val d = fetchedRdd.getNumPartitions
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index 78dada48..f34a5e78 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -41,13 +41,10 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.*;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
import java.net.CookieHandler;
import java.net.CookieManager;
import java.net.CookiePolicy;
-import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
import com.browserup.bup.BrowserUpProxy;
@@ -137,27 +134,16 @@ public void filterResponse(HttpResponse response, HttpMessageContents contents,
DesiredCapabilities capabilities = DesiredCapabilities.chrome();
final ChromeOptions chromeOptions = new ChromeOptions();
- chromeOptions.addArguments("--no-sandbox");
- chromeOptions.addArguments("--headless");
- chromeOptions.addArguments("--disable-gpu");
- chromeOptions.addArguments("--disable-extensions");
- chromeOptions.addArguments("--ignore-certificate-errors");
- chromeOptions.addArguments("--incognito");
- chromeOptions.addArguments("--window-size=1920,1080");
- chromeOptions.addArguments("--proxy-server='direct://");
- chromeOptions.addArguments("--proxy-bypass-list=*");
- chromeOptions.addArguments("--disable-background-networking");
- chromeOptions.addArguments("--safebrowsing-disable-auto-update");
- chromeOptions.addArguments("--disable-sync");
- chromeOptions.addArguments("--metrics-recording-only");
- chromeOptions.addArguments("--disable-default-apps");
- chromeOptions.addArguments("--no-first-run");
- chromeOptions.addArguments("--disable-setuid-sandbox");
- chromeOptions.addArguments("--hide-scrollbars");
- chromeOptions.addArguments("--no-zygote");
- chromeOptions.addArguments("--disable-notifications");
- chromeOptions.addArguments("--disable-logging");
- chromeOptions.addArguments("--disable-permissions-api");
+
+ List chromedefaults = Arrays.asList("--no-sandbox", "--headless", "--disable-gpu", "--disable-extensions",
+ "--ignore-certificate-errors", "--incognito", "--window-size=1920,1080", "--proxy-server='direct://",
+ "--proxy-bypass-list=*", "--disable-background-networking", "--safebrowsing-disable-auto-update",
+ "--disable-sync", "--metrics-recording-only", "--disable-default-apps", "--no-first-run",
+ "--disable-setuid-sandbox", "--hide-scrollbars", "--no-zygote", "--disable-notifications",
+ "--disable-logging", "--disable-permissions-api");
+
+ List vals = (List) (pluginConfig.getOrDefault("chrome.options", chromedefaults));
+ chromeOptions.addArguments(vals);
chromeOptions.setPageLoadStrategy(PageLoadStrategy.NORMAL);
//capabilities.setCapability(CapabilityType.PROXY, seleniumProxy);
@@ -314,7 +300,7 @@ private boolean isWebPage(String webUrl) {
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
HttpURLConnection conn = (HttpURLConnection)url.openConnection();
String contentType = conn.getHeaderField("Content-Type");
- return contentType.contains("text") || contentType.contains("ml") || conn.getResponseCode() == 302;
+ return contentType.contains("json") || contentType.contains("text") || contentType.contains("ml") || conn.getResponseCode() == 302;
} catch (Exception e) {
LOG.debug(e.getMessage(), e);
}
From 74df42cdcb5dd729180846da7ff1c424ec4ecad1 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 2 Jul 2021 14:08:09 +0100
Subject: [PATCH 069/335] update internal conf!
---
.../src/main/resources/sparkler-default.yaml | 23 +++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
index 4e51cba9..6fb89a3a 100644
--- a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
+++ b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
@@ -32,6 +32,7 @@ solr.uri: http://ec2-35-174-200-133.compute-1.amazonaws.com:8983/solr/crawldb
# elasticsearch settings
elasticsearch.uri: http://localhost:9200
+crawl.repartition: 500
##################### Apache Spark Properties ###########################
@@ -142,6 +143,28 @@ plugins:
#chrome.selenium.script.keys: "COR"
#chrome.selenium.script.click: "id:btnSearch"
#chrome.proxy.address: 127.0.0.1:9998
+ chrome.options:
+ - "--no-sandbox"
+ - "--headless"
+ - "--disable-gpu"
+ - "--disable-extensions"
+ - "--ignore-certificate-errors"
+ - "--incognito"
+ - "--window-size=1920,1080"
+ - "--proxy-server='direct://"
+ - "--proxy-bypass-list=*"
+ - "--disable-background-networking"
+ - "--safebrowsing-disable-auto-update"
+ - "--disable-sync"
+ - "--metrics-recording-only"
+ - "--disable-default-apps"
+ - "--no-first-run"
+ - "--disable-setuid-sandbox"
+ - "--hide-scrollbars"
+ - "--no-zygote"
+ - "--disable-notifications"
+ - "--disable-logging"
+ - "--disable-permissions-api"
url.injector:
mode: selenium # currently only compatible with the fetcher-chrome plugin
#mode: replace
From 10fc14f763c76d9e567fb9b5798648c4e7952e68 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sat, 3 Jul 2021 00:36:31 +0100
Subject: [PATCH 070/335] update scripter
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 8716bfd0..37ebef7c 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210701.125932-13"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210702.233545-14"
}
object FetcherHtmlUnit {
From 3bde421b2e56317d4258d0bf9c0e1f551d7bc457 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sat, 3 Jul 2021 00:51:09 +0100
Subject: [PATCH 071/335] update scripter
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 37ebef7c..025b7f58 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210702.233545-14"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210702.235015-15"
}
object FetcherHtmlUnit {
From b95d5e1e47ee35281246f0d258bc642be2acc277 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sat, 3 Jul 2021 01:05:10 +0100
Subject: [PATCH 072/335] update scripter
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 025b7f58..064381e8 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210702.235015-15"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210703.000420-16"
}
object FetcherHtmlUnit {
From 5a5fc606ef1a30f32ff0f897c4c010fbe922d84a Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sat, 3 Jul 2021 01:28:44 +0100
Subject: [PATCH 073/335] update scripter
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 064381e8..1cb10127 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210703.000420-16"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210703.002821-17"
}
object FetcherHtmlUnit {
From 5a311eb6c6c5b487d7b3a93639405b1f8e45d279 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sat, 3 Jul 2021 02:21:01 +0100
Subject: [PATCH 074/335] update scripter
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 1cb10127..86db2bb1 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210703.002821-17"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210703.011941-18"
}
object FetcherHtmlUnit {
From 4ab2f75cf954d2d1dab68a356cdfe3b97cbe5e28 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sat, 3 Jul 2021 02:36:30 +0100
Subject: [PATCH 075/335] update scripter
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 86db2bb1..763eca04 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210703.011941-18"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210703.013542-19"
}
object FetcherHtmlUnit {
From 72bab644f9982c9e50b5ac7357909c4c5a8f5d4d Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sat, 3 Jul 2021 03:38:33 +0100
Subject: [PATCH 076/335] update scripter
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 763eca04..131b7bb1 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210703.013542-19"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210703.023751-21"
}
object FetcherHtmlUnit {
From cd2766925078cc67a5442e9beff34c3f5fc125b7 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 5 Jul 2021 12:17:47 +0100
Subject: [PATCH 077/335] update scripter version
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 131b7bb1..69984a3b 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210703.023751-21"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210705.111436-22"
}
object FetcherHtmlUnit {
From d629f257f6b9c592691124113704d188e549b502 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 5 Jul 2021 12:32:40 +0100
Subject: [PATCH 078/335] update scripter version
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 69984a3b..669c05a7 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210705.111436-22"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210705.113006-23"
}
object FetcherHtmlUnit {
From 65e7773cd4fe7e49234ce042e9a2b7444fa58274 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 5 Jul 2021 13:41:06 +0100
Subject: [PATCH 079/335] update scripter version
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 669c05a7..e7509367 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210705.113006-23"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210705.124039-24"
}
object FetcherHtmlUnit {
From 26c67c63e983b0ca37c4ed90c40c75b392f0d0d3 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 5 Jul 2021 14:04:10 +0100
Subject: [PATCH 080/335] update scripter version
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index e7509367..052b15d0 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210705.124039-24"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210705.130333-25"
}
object FetcherHtmlUnit {
From e9302911bcb28b38d74768103dc71653d1f9b3cb Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 5 Jul 2021 14:10:56 +0100
Subject: [PATCH 081/335] update scripter version
---
sparkler-core/project/PluginDependencies.scala | 2 +-
.../main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 052b15d0..6491c087 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210705.130333-25"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210705.130945-26"
}
object FetcherHtmlUnit {
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index f34a5e78..baba13ec 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -247,7 +247,7 @@ public FetchedData fetch(Resource resource) throws Exception {
tempmap.put("type", "file");
tempmap.put("targetdir", pluginConfig.getOrDefault("chrome.selenium.screenshotdir","/dbfs/FileStore/screenshots/")+resource.getCrawlId()+System.currentTimeMillis());
scripter.screenshotOperation(tempmap);
- LOG.info(e.getMessage());
+ LOG.error("Scripter Exception", e);
}
List snapshots = scripter.getSnapshots();
html = String.join(",", snapshots);
From 04f32ef37e88dd53c0a76fd4e104f20a27a37339 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 5 Jul 2021 22:48:46 +0100
Subject: [PATCH 082/335] update scripter version
---
sparkler-core/project/PluginDependencies.scala | 4 ++--
.../usc/irds/sparkler/plugin/FetcherChrome.java | 16 ++++++----------
2 files changed, 8 insertions(+), 12 deletions(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 6491c087..9bf17b0f 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -23,12 +23,12 @@ object PluginDependencies {}
object FetcherChrome {
object Selenium {
private val group = "org.seleniumhq.selenium"
- private val version = "3.141.59"
+ private val version = "4.0.0-beta-4"
lazy val chromeDriver = group % "selenium-chrome-driver" % version
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210705.130945-26"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210705.194205-28"
}
object FetcherHtmlUnit {
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index baba13ec..04ab115d 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -35,8 +35,6 @@
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
-import org.openqa.selenium.support.ui.ExpectedConditions;
-import org.openqa.selenium.support.ui.WebDriverWait;
import org.pf4j.Extension;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -132,7 +130,6 @@ public void filterResponse(HttpResponse response, HttpMessageContents contents,
// seleniumProxy.setHttpProxy("172.17.146.238:"+Integer.toString(port));
// seleniumProxy.setSslProxy("172.17.146.238:"+Integer.toString(port));
- DesiredCapabilities capabilities = DesiredCapabilities.chrome();
final ChromeOptions chromeOptions = new ChromeOptions();
List chromedefaults = Arrays.asList("--no-sandbox", "--headless", "--disable-gpu", "--disable-extensions",
@@ -147,13 +144,12 @@ public void filterResponse(HttpResponse response, HttpMessageContents contents,
chromeOptions.setPageLoadStrategy(PageLoadStrategy.NORMAL);
//capabilities.setCapability(CapabilityType.PROXY, seleniumProxy);
- capabilities.setCapability(ChromeOptions.CAPABILITY, chromeOptions);
if(loc.equals("local")){
- driver = new ChromeDriver(capabilities);
+ driver = new ChromeDriver(chromeOptions);
driver.manage().timeouts().pageLoadTimeout(3600, TimeUnit.SECONDS);
} else{
- driver = new RemoteWebDriver(new URL(loc), capabilities);
+ driver = new RemoteWebDriver(new URL(loc), chromeOptions);
}
driver.manage().window().setSize(new Dimension(1920, 1080));
@@ -197,7 +193,7 @@ public FetchedData fetch(Resource resource) throws Exception {
}
driver.get(resource.getUrl());
- int waittimeout = (int) pluginConfig.getOrDefault("chrome.wait.timeout", "-1");
+/* int waittimeout = (int) pluginConfig.getOrDefault("chrome.wait.timeout", "-1");
String waittype = (String) pluginConfig.getOrDefault("chrome.wait.type", "");
String waitelement = (String) pluginConfig.getOrDefault("chrome.wait.element", "");
@@ -219,11 +215,11 @@ public FetchedData fetch(Resource resource) throws Exception {
wait.until(ExpectedConditions.visibilityOfElementLocated(By.id(waitelement)));
break;
}
- }
+ }*/
SeleniumScripter scripter = new SeleniumScripter(driver);
String seleniumenabled = (String) pluginConfig.getOrDefault("chrome.selenium.enabled", "false");
String html = null;
- if (seleniumenabled.equals("true")) {
+/* if (seleniumenabled.equals("true")) {
if(pluginConfig.get("chrome.selenium.script") != null && pluginConfig.get("chrome.selenium.script") instanceof Map) {
Map map = (Map) pluginConfig.get("chrome.selenium.script");
try {
@@ -234,7 +230,7 @@ public FetchedData fetch(Resource resource) throws Exception {
List snapshots = scripter.getSnapshots();
html = String.join(",", snapshots);
}
- }
+ }*/
if(json != null && json.containsKey("selenium")){
if(json.get("selenium") != null && json.get("selenium") instanceof Map) {
try {
From 702696fe58b62a543ef63cb7ff60e6fd8d96d8a0 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 5 Jul 2021 22:58:52 +0100
Subject: [PATCH 083/335] update scripter version
---
sparkler-core/build.sbt | 1 +
sparkler-core/project/Settings.scala | 3 +++
2 files changed, 4 insertions(+)
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index d99f0435..258c12f8 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -160,6 +160,7 @@ lazy val app = (project in file("sparkler-app"))
case PathList("javax", "inject", xs@_*) => MergeStrategy.first
case PathList("javax", "annotation", xs@_*) => MergeStrategy.first
case PathList("com", "sun", xs@_*) => MergeStrategy.first
+ case PathList("javax", "activation", xs@_*) => MergeStrategy.first
case x => (assemblyMergeStrategy in assembly).value.apply(x)
},
diff --git a/sparkler-core/project/Settings.scala b/sparkler-core/project/Settings.scala
index 72e59d9b..0f9e9cb5 100644
--- a/sparkler-core/project/Settings.scala
+++ b/sparkler-core/project/Settings.scala
@@ -88,6 +88,9 @@ object Settings {
case x if x.contains("META-INF/versions/9/javax/xml/bind/") => MergeStrategy.first
case x if x.contains("MANIFEST.MF") => MergeStrategy.discard
case PathList("META-INF", "MANIFEST.MF") => MergeStrategy.discard
+ case PathList("javax", "activation", xs@_*) => MergeStrategy.first
+ case PathList("io", "netty", xs@_*) => MergeStrategy.first
+
case x => (assemblyMergeStrategy in assembly).value.apply(x)
},
assemblyOutputPath in assembly := file(".") / buildDir / pluginsDir / s"${name.value}-${(version in ThisBuild).value}.jar"
From 58275db8227c9d3dde34a05be901c7fc5ac1365d Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Tue, 6 Jul 2021 10:45:44 +0100
Subject: [PATCH 084/335] revert to selenium 3
---
sparkler-core/build.sbt | 6 +----
sparkler-core/plugins.build.sbt | 13 +++++----
.../project/PluginDependencies.scala | 4 +--
sparkler-core/project/Settings.scala | 4 +--
.../src/main/resources/log4j2.properties | 11 +++++++-
.../service/CustomerPluginManager.java | 19 +++++++++++++
.../sparkler/service/PluginManagerLoader.java | 27 +++++++++++++++++++
.../irds/sparkler/service/PluginService.scala | 3 ++-
.../irds/sparkler/plugin/FetcherChrome.java | 10 +++----
9 files changed, 73 insertions(+), 24 deletions(-)
create mode 100644 sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/CustomerPluginManager.java
create mode 100644 sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginManagerLoader.java
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index 258c12f8..801a1d58 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -139,6 +139,7 @@ lazy val app = (project in file("sparkler-app"))
case x if x.contains("git.properties") => MergeStrategy.first
case x if x.contains("config.fmpp") => MergeStrategy.first
case x if x.contains("META-INF/versions/9/javax/xml/bind/") => MergeStrategy.first
+ case x if x.contains("META-INF/native-image/io.netty") => MergeStrategy.first
case PathList("org", "apache", "logging", "log4j", xs@_*) => MergeStrategy.first
case PathList("org", "apache", "logging", xs@_*) => MergeStrategy.first
case PathList("org", "apache", "log4j", xs@_*) => MergeStrategy.first
@@ -160,7 +161,6 @@ lazy val app = (project in file("sparkler-app"))
case PathList("javax", "inject", xs@_*) => MergeStrategy.first
case PathList("javax", "annotation", xs@_*) => MergeStrategy.first
case PathList("com", "sun", xs@_*) => MergeStrategy.first
- case PathList("javax", "activation", xs@_*) => MergeStrategy.first
case x => (assemblyMergeStrategy in assembly).value.apply(x)
},
@@ -235,7 +235,3 @@ lazy val ui = (project in file("sparkler-ui"))
)
-/*enablePlugins(PackPlugin)
-
-packMain := Map("inject" -> "edu.usc.irds.sparkler.service.Injector")*/
-
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index ba4152fa..9d8bf5f0 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -28,12 +28,12 @@ lazy val plugins = (project in file(s"$sparklerPlugins"))
.aggregate(
fetcherChrome,
fetcherHtmlUnit,
- fetcherJBrowser,
+
scorerDdSvn,
urlFilterRegex,
urlFilterSameHost,
)
-
+//fetcherJBrowser,
/**
* ================ PLUGINS ================
*/
@@ -65,10 +65,9 @@ lazy val fetcherChrome = (project in file(s"$sparklerPlugins/fetcher-chrome"))
name := "fetcher-chrome",
libraryDependencies ++= Seq(
FetcherChrome.Selenium.java exclude("org.slf4j", "slf4j-api"),
- FetcherChrome.browserup exclude("com.fasterxml.jackson.core", "jackson-databind") exclude("org.slf4j", "slf4j-api"),
- FetcherChrome.seleniumscripter exclude("org.slf4j", "slf4j-api"),
FetcherChrome.Selenium.chromeDriver exclude("org.slf4j", "slf4j-api"),
- //FetcherChrome.Selenium.guava exclude("org.slf4j", "slf4j-api")
+ //FetcherChrome.browserup exclude("com.fasterxml.jackson.core", "jackson-databind") exclude("org.slf4j", "slf4j-api") exclude("io.netty", "netty-all"),
+ FetcherChrome.seleniumscripter exclude("org.slf4j", "slf4j-api"),
),
Settings.pluginManifest(
id = "fetcher-chrome",
@@ -97,7 +96,7 @@ lazy val fetcherHtmlUnit = (project in file(s"$sparklerPlugins/fetcher-htmlunit"
)
.dependsOn(api)
-lazy val fetcherJBrowser = (project in file(s"$sparklerPlugins/fetcher-jbrowser"))
+/*lazy val fetcherJBrowser = (project in file(s"$sparklerPlugins/fetcher-jbrowser"))
.enablePlugins(JavaAppPackaging)
.settings(
Settings.plugin,
@@ -111,7 +110,7 @@ lazy val fetcherJBrowser = (project in file(s"$sparklerPlugins/fetcher-jbrowser"
dependencies = List.empty
)
)
- .dependsOn(api)
+ .dependsOn(api)*/
lazy val scorerDdSvn = (project in file(s"$sparklerPlugins/scorer-dd-svn"))
.enablePlugins(JavaAppPackaging)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 9bf17b0f..aeb324c3 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -23,12 +23,12 @@ object PluginDependencies {}
object FetcherChrome {
object Selenium {
private val group = "org.seleniumhq.selenium"
- private val version = "4.0.0-beta-4"
+ private val version = "3.141.59"
lazy val chromeDriver = group % "selenium-chrome-driver" % version
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210705.194205-28"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210706.094407-29"
}
object FetcherHtmlUnit {
diff --git a/sparkler-core/project/Settings.scala b/sparkler-core/project/Settings.scala
index 0f9e9cb5..1176d757 100644
--- a/sparkler-core/project/Settings.scala
+++ b/sparkler-core/project/Settings.scala
@@ -77,7 +77,7 @@ object Settings {
lazy val plugin = assemblyProject ++ Seq(
autoScalaLibrary := false,
assemblyMergeStrategy in assembly := {
- case x if x.contains("io.netty.versions.properties") => MergeStrategy.first
+ case x if x.contains("io.netty.versions.properties") => MergeStrategy.last
case x if x.contains("Log4j2Plugins.dat") => MergeStrategy.first
case x if x.contains("module-info.class") => MergeStrategy.first
case x if x.contains("public-suffix-list.txt") => MergeStrategy.first
@@ -89,7 +89,7 @@ object Settings {
case x if x.contains("MANIFEST.MF") => MergeStrategy.discard
case PathList("META-INF", "MANIFEST.MF") => MergeStrategy.discard
case PathList("javax", "activation", xs@_*) => MergeStrategy.first
- case PathList("io", "netty", xs@_*) => MergeStrategy.first
+ //case PathList("io", "netty", xs@_*) => MergeStrategy.last
case x => (assemblyMergeStrategy in assembly).value.apply(x)
},
diff --git a/sparkler-core/sparkler-app/src/main/resources/log4j2.properties b/sparkler-core/sparkler-app/src/main/resources/log4j2.properties
index d01ac13c..3f07c420 100644
--- a/sparkler-core/sparkler-app/src/main/resources/log4j2.properties
+++ b/sparkler-core/sparkler-app/src/main/resources/log4j2.properties
@@ -45,4 +45,13 @@ rootLogger.level = debug
rootLogger.appenderRefs = stdout
rootLogger.appenderRef.stdout.ref = STDOUT
logger.kythera.name = com.kytheralabs
-logger.kythera.level = DEBUG
\ No newline at end of file
+logger.kythera.level = DEBUG
+
+logger.pf4j.name = org.pf4j
+logger.pf4j.level = debug
+logger.pf4j.additivity = false
+logger.pf4j.appenderRef.console.ref = console
+#logger.loader.name = org.pf4j.PluginClassLoader
+#logger.loader.level = trace
+#logger.finder.name = org.pf4j.AbstractExtensionFinder
+#logger.finder.level = trace
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/CustomerPluginManager.java b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/CustomerPluginManager.java
new file mode 100644
index 00000000..8921925e
--- /dev/null
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/CustomerPluginManager.java
@@ -0,0 +1,19 @@
+package edu.usc.irds.sparkler.service;
+
+import org.pf4j.*;
+
+public class CustomerPluginManager {
+
+ public static DefaultPluginManager getPluginManager(){
+ return new DefaultPluginManager(){
+ @Override
+ protected PluginLoader createPluginLoader() {
+ return new CompoundPluginLoader()
+ .add(new PluginManagerLoader(this), this::isNotDevelopment);
+ //.add(new JarPluginLoader(this), this::isNotDevelopment)
+ //.add(new DefaultPluginLoader(this), this::isNotDevelopment);
+
+ }
+ };
+ }
+}
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginManagerLoader.java b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginManagerLoader.java
new file mode 100644
index 00000000..6f13595b
--- /dev/null
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginManagerLoader.java
@@ -0,0 +1,27 @@
+package edu.usc.irds.sparkler.service;
+
+import org.pf4j.*;
+
+import java.nio.file.Path;
+
+public class PluginManagerLoader extends JarPluginLoader {
+ public PluginManagerLoader(PluginManager pluginManager) {
+ super(pluginManager);
+ }
+
+ @Override
+ public ClassLoader loadPlugin(Path pluginPath, PluginDescriptor pluginDescriptor) {
+ if(pluginPath.toString().contains("fetcher-chrome")) {
+ PluginClassLoader pluginClassLoader = new PluginClassLoader(pluginManager, pluginDescriptor, getClass().getClassLoader(), ClassLoadingStrategy.PDA);
+ pluginClassLoader.addFile(pluginPath.toFile());
+ return pluginClassLoader;
+ } else{
+ PluginClassLoader pluginClassLoader = new PluginClassLoader(pluginManager, pluginDescriptor, getClass().getClassLoader());
+ pluginClassLoader.addFile(pluginPath.toFile());
+ return pluginClassLoader;
+ }
+
+
+ }
+
+}
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginService.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginService.scala
index 36cadf2a..0bae207b 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginService.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginService.scala
@@ -36,7 +36,8 @@ import scala.collection.JavaConversions._
class PluginService(job:SparklerJob) {
import PluginService._
- val pluginManager = new DefaultPluginManager()
+ val pluginManager = CustomerPluginManager.getPluginManager
+
// This map keeps cache of all active instances
val registry = new mutable.HashMap[Class[_ <: ExtensionPoint], ExtensionPoint]
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index 04ab115d..7d5f4626 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -33,7 +33,6 @@
import org.openqa.selenium.Proxy;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
-import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.pf4j.Extension;
import org.slf4j.Logger;
@@ -45,14 +44,13 @@
import java.net.CookiePolicy;
import java.util.concurrent.TimeUnit;
-import com.browserup.bup.BrowserUpProxy;
+/*import com.browserup.bup.BrowserUpProxy;
import com.browserup.bup.BrowserUpProxyServer;
import com.browserup.bup.client.ClientUtil;
import com.browserup.bup.filters.ResponseFilter;
import com.browserup.bup.proxy.CaptureType;
import com.browserup.bup.util.HttpMessageContents;
-import com.browserup.bup.util.HttpMessageInfo;
-import io.netty.handler.codec.http.HttpResponse;
+import com.browserup.bup.util.HttpMessageInfo;*/
@Extension
public class FetcherChrome extends FetcherDefault {
@@ -100,7 +98,7 @@ private void startDriver(Boolean restartproxy) throws UnknownHostException, Malf
if (loc.equals("")) {
driver = new ChromeDriver();
} else {
- BrowserUpProxy proxy = new BrowserUpProxyServer();
+ /*BrowserUpProxy proxy = new BrowserUpProxyServer();
proxy.setTrustAllServers(true);
proxy.enableHarCaptureTypes(CaptureType.REQUEST_CONTENT, CaptureType.RESPONSE_CONTENT);
@@ -125,7 +123,7 @@ public void filterResponse(HttpResponse response, HttpMessageContents contents,
InetSocketAddress addr = new InetSocketAddress(InetAddress.getByName(s[0]), Integer.parseInt(s[1]));
seleniumProxy = ClientUtil.createSeleniumProxy(addr);
}
- }
+ }*/
// seleniumProxy.setHttpProxy("172.17.146.238:"+Integer.toString(port));
// seleniumProxy.setSslProxy("172.17.146.238:"+Integer.toString(port));
From bca507c6769a9beed50e3e0d0000e3d9e893cb33 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Tue, 6 Jul 2021 11:32:31 +0100
Subject: [PATCH 085/335] fixes for seleniumscripter
---
sparkler-core/project/Settings.scala | 2 ++
.../main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java | 2 +-
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/sparkler-core/project/Settings.scala b/sparkler-core/project/Settings.scala
index 1176d757..9f2557a5 100644
--- a/sparkler-core/project/Settings.scala
+++ b/sparkler-core/project/Settings.scala
@@ -87,8 +87,10 @@ object Settings {
case x if x.contains("config.fmpp") => MergeStrategy.first
case x if x.contains("META-INF/versions/9/javax/xml/bind/") => MergeStrategy.first
case x if x.contains("MANIFEST.MF") => MergeStrategy.discard
+ case x if x.contains("ExtensionModule") => MergeStrategy.first
case PathList("META-INF", "MANIFEST.MF") => MergeStrategy.discard
case PathList("javax", "activation", xs@_*) => MergeStrategy.first
+
//case PathList("io", "netty", xs@_*) => MergeStrategy.last
case x => (assemblyMergeStrategy in assembly).value.apply(x)
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index 7d5f4626..18307129 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -130,7 +130,7 @@ public void filterResponse(HttpResponse response, HttpMessageContents contents,
final ChromeOptions chromeOptions = new ChromeOptions();
- List chromedefaults = Arrays.asList("--no-sandbox", "--headless", "--disable-gpu", "--disable-extensions",
+ List chromedefaults = Arrays.asList("--no-sandbox", "--disable-gpu", "--disable-extensions",
"--ignore-certificate-errors", "--incognito", "--window-size=1920,1080", "--proxy-server='direct://",
"--proxy-bypass-list=*", "--disable-background-networking", "--safebrowsing-disable-auto-update",
"--disable-sync", "--metrics-recording-only", "--disable-default-apps", "--no-first-run",
From 4eb10141cb6f91fb221bb500a60a75c38bc53bee Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Tue, 6 Jul 2021 11:39:16 +0100
Subject: [PATCH 086/335] fixes for seleniumscripter
---
.../irds/sparkler/plugin/FetcherChrome.java | 18 ++++++++++--------
1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index 18307129..60c8604d 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -69,13 +69,13 @@ public void init(JobContext context, String pluginId) throws SparklerException {
// TODO should change everywhere
pluginConfig = config.getPluginConfiguration(pluginId);
- try {
+ /*try {
System.out.println("Initializing Chrome Driver");
startDriver(true);
} catch (UnknownHostException | MalformedURLException e) {
e.printStackTrace();
System.out.println("Failed to init Chrome Session");
- }
+ }*/
}
private void checkSession() {
@@ -130,7 +130,7 @@ public void filterResponse(HttpResponse response, HttpMessageContents contents,
final ChromeOptions chromeOptions = new ChromeOptions();
- List chromedefaults = Arrays.asList("--no-sandbox", "--disable-gpu", "--disable-extensions",
+ List chromedefaults = Arrays.asList("--headless", "--no-sandbox", "--disable-gpu", "--disable-extensions",
"--ignore-certificate-errors", "--incognito", "--window-size=1920,1080", "--proxy-server='direct://",
"--proxy-bypass-list=*", "--disable-background-networking", "--safebrowsing-disable-auto-update",
"--disable-sync", "--metrics-recording-only", "--disable-default-apps", "--no-first-run",
@@ -162,6 +162,12 @@ public FetchedData fetch(Resource resource) throws Exception {
LOG.info("Chrome FETCHER {}", resource.getUrl());
FetchedData fetchedData;
JSONObject json = null;
+ try {
+ checkSession();
+ } catch (Exception e){
+ System.out.println("failed to start selenium session");
+ }
+
/*
* In this plugin we will work on only HTML data If data is of any other data
* type like image, pdf etc plugin will return client error so it can be fetched
@@ -184,11 +190,7 @@ public FetchedData fetch(Resource resource) throws Exception {
// This will block for the page load and any
// associated AJAX requests
- try {
- checkSession();
- } catch (Exception e){
- System.out.println("failed to start selenium session");
- }
+
driver.get(resource.getUrl());
/* int waittimeout = (int) pluginConfig.getOrDefault("chrome.wait.timeout", "-1");
From c307d93c0ca49aea9453e995efcba2d5ec339359 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 7 Jul 2021 00:58:54 +0100
Subject: [PATCH 087/335] unpick hard coded screenshot path
---
sparkler-core/conf/sparkler-default.yaml | 1 +
.../sparkler-app/src/main/resources/sparkler-default.yaml | 1 +
.../java/edu/usc/irds/sparkler/plugin/FetcherChrome.java | 8 +++++---
3 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index 922e6cf0..7bb9c39b 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -138,6 +138,7 @@ plugins:
#What type of element, class, name, id
chrome.wait.type: "class"
chrome.dns: "http://localhost:3000/webdriver"
+ chrome.selenium.screenshotdir: "/dbfs/FileStore/screenshots/"
chrome.options:
- "--no-sandbox"
- "--headless"
diff --git a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
index 6fb89a3a..209654a2 100644
--- a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
+++ b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
@@ -138,6 +138,7 @@ plugins:
#What type of element, class, name, id
chrome.wait.type: "class"
chrome.dns: "http://localhost:3000/webdriver"
+ chrome.selenium.screenshotdir: "/dbfs/FileStore/screenshots/"
#chrome.selenium.enabled: "true"
#chrome.selenium.script.click: "id:txtName"
#chrome.selenium.script.keys: "COR"
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index 60c8604d..1f01bd5b 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -239,10 +239,12 @@ public FetchedData fetch(Resource resource) throws Exception {
Map jsonmap = new TreeMap(m);
scripter.runScript(jsonmap);
} catch (Exception e){
+ if(pluginConfig.containsKey("chrome.selenium.screenshotdir")) {
Map tempmap = new HashMap<>();
- tempmap.put("type", "file");
- tempmap.put("targetdir", pluginConfig.getOrDefault("chrome.selenium.screenshotdir","/dbfs/FileStore/screenshots/")+resource.getCrawlId()+System.currentTimeMillis());
- scripter.screenshotOperation(tempmap);
+ tempmap.put("type", "file");
+ tempmap.put("targetdir", pluginConfig.get("chrome.selenium.screenshotdir")+resource.getCrawlId()+System.currentTimeMillis());
+ scripter.screenshotOperation(tempmap);
+ }
LOG.error("Scripter Exception", e);
}
List snapshots = scripter.getSnapshots();
From bcf83324192a8515c37b09eb1a971b3fc49ca6e3 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 7 Jul 2021 13:49:50 +0100
Subject: [PATCH 088/335] add storage options for screenshots and downloaded
content
---
sparkler-core/conf/sparkler-default.yaml | 11 +------
.../irds/sparkler/util/FetcherDefault.java | 12 ++++++-
.../src/main/resources/sparkler-default.yaml | 11 +------
.../irds/sparkler/plugin/FetcherChrome.java | 33 +++++--------------
4 files changed, 22 insertions(+), 45 deletions(-)
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index 7bb9c39b..6e42a8b7 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -87,6 +87,7 @@ fetcher.headers:
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
Accept-Language: "en-US,en"
+fetcher.persist.content.location: /dbfs/FileStore/bcf/content/
# Rotating agents file.
# File should contain a list of agents which will be used to override the default agent string
# This is an unbounded list, it can take any number of agents you wish.
@@ -131,12 +132,6 @@ plugins:
#socket.timeout: 3000
#connect.timeout: 3000
fetcher.chrome:
- #Set timeout to > -1 to enable the wait for element visibility for some ajax sites.
- chrome.wait.timeout: -1
- #Element name
- chrome.wait.element: "some element"
- #What type of element, class, name, id
- chrome.wait.type: "class"
chrome.dns: "http://localhost:3000/webdriver"
chrome.selenium.screenshotdir: "/dbfs/FileStore/screenshots/"
chrome.options:
@@ -161,10 +156,6 @@ plugins:
- "--disable-notifications"
- "--disable-logging"
- "--disable-permissions-api"
- #chrome.selenium.enabled: "true"
- #chrome.selenium.script.click: "id:txtName"
- #chrome.selenium.script.keys: "COR"
- #chrome.selenium.script.click: "id:btnSearch"
#chrome.proxy.address: 127.0.0.1:9998
url.injector:
mode: selenium # currently only compatible with the fetcher-chrome plugin
diff --git a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
index f99ac214..b044de2f 100644
--- a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
+++ b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
@@ -9,6 +9,7 @@
import edu.usc.irds.sparkler.model.FetchedData;
import edu.usc.irds.sparkler.model.Resource;
import edu.usc.irds.sparkler.model.ResourceStatus;
+import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
@@ -21,9 +22,9 @@
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
+import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
@@ -169,6 +170,15 @@ public FetchedData fetch(Resource resource) throws Exception {
}
bufferOutStream.flush();
byte[] rawData = bufferOutStream.toByteArray();
+ if(jobContext.getConfiguration().containsKey("fetcher.persist.content.location")){
+ File outputDirectory = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId()).toFile();
+ File outputFile = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId(), FilenameUtils.getName(resource.getUrl())).toFile();
+ outputDirectory.mkdirs();
+ try (FileOutputStream outputStream = new FileOutputStream(outputFile)) {
+ outputStream.write(rawData);
+ }
+ }
+
IOUtils.closeQuietly(bufferOutStream);
FetchedData fetchedData = new FetchedData(rawData, urlConn.getContentType(), responseCode);
resource.setStatus(ResourceStatus.FETCHED.toString());
diff --git a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
index 209654a2..0b824a43 100644
--- a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
+++ b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
@@ -87,6 +87,7 @@ fetcher.headers:
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
Accept-Language: "en-US,en"
+fetcher.persist.content.location: /dbfs/FileStore/bcf/content/
# Rotating agents file.
# File should contain a list of agents which will be used to override the default agent string
# This is an unbounded list, it can take any number of agents you wish.
@@ -131,18 +132,8 @@ plugins:
#socket.timeout: 3000
#connect.timeout: 3000
fetcher.chrome:
- #Set timeout to > -1 to enable the wait for element visibility for some ajax sites.
- chrome.wait.timeout: -1
- #Element name
- chrome.wait.element: "some element"
- #What type of element, class, name, id
- chrome.wait.type: "class"
chrome.dns: "http://localhost:3000/webdriver"
chrome.selenium.screenshotdir: "/dbfs/FileStore/screenshots/"
- #chrome.selenium.enabled: "true"
- #chrome.selenium.script.click: "id:txtName"
- #chrome.selenium.script.keys: "COR"
- #chrome.selenium.script.click: "id:btnSearch"
#chrome.proxy.address: 127.0.0.1:9998
chrome.options:
- "--no-sandbox"
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index 1f01bd5b..526238a7 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -37,7 +37,11 @@
import org.pf4j.Extension;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+
+import java.io.File;
import java.net.*;
+import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.*;
import java.net.CookieHandler;
import java.net.CookieManager;
@@ -193,29 +197,6 @@ public FetchedData fetch(Resource resource) throws Exception {
driver.get(resource.getUrl());
-/* int waittimeout = (int) pluginConfig.getOrDefault("chrome.wait.timeout", "-1");
- String waittype = (String) pluginConfig.getOrDefault("chrome.wait.type", "");
- String waitelement = (String) pluginConfig.getOrDefault("chrome.wait.element", "");
-
- if (waittimeout > -1) {
- LOG.debug("Waiting {} seconds for element {} of type {} to become visible", waittimeout, waitelement,
- waittype);
- WebDriverWait wait = new WebDriverWait(driver, waittimeout);
- switch (waittype) {
- case "class":
- LOG.debug("waiting for class...");
- wait.until(ExpectedConditions.visibilityOfElementLocated(By.className(waitelement)));
- break;
- case "name":
- LOG.debug("waiting for name...");
- wait.until(ExpectedConditions.visibilityOfElementLocated(By.name(waitelement)));
- break;
- case "id":
- LOG.debug("waiting for id...");
- wait.until(ExpectedConditions.visibilityOfElementLocated(By.id(waitelement)));
- break;
- }
- }*/
SeleniumScripter scripter = new SeleniumScripter(driver);
String seleniumenabled = (String) pluginConfig.getOrDefault("chrome.selenium.enabled", "false");
String html = null;
@@ -242,7 +223,11 @@ public FetchedData fetch(Resource resource) throws Exception {
if(pluginConfig.containsKey("chrome.selenium.screenshotdir")) {
Map tempmap = new HashMap<>();
tempmap.put("type", "file");
- tempmap.put("targetdir", pluginConfig.get("chrome.selenium.screenshotdir")+resource.getCrawlId()+System.currentTimeMillis());
+ Path path = Paths.get(pluginConfig.get("chrome.selenium.screenshotdir").toString(), jobContext.getId());
+ File f = path.toFile();
+ f.mkdirs();
+ Path filepath = Paths.get(pluginConfig.get("chrome.selenium.screenshotdir").toString(), jobContext.getId(), resource.getCrawlId()+System.currentTimeMillis()+".png");
+ tempmap.put("targetdir", filepath.toString());
scripter.screenshotOperation(tempmap);
}
LOG.error("Scripter Exception", e);
From 333fb6b7adc82736a4dd3b8af01e8aaf60ef1074 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 7 Jul 2021 17:00:30 +0100
Subject: [PATCH 089/335] try additional logging
---
.../java/edu/usc/irds/sparkler/plugin/FetcherChrome.java | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index 526238a7..d8bd4610 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -33,6 +33,9 @@
import org.openqa.selenium.Proxy;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
+import org.openqa.selenium.logging.LogEntries;
+import org.openqa.selenium.logging.LogEntry;
+import org.openqa.selenium.logging.LogType;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.pf4j.Extension;
import org.slf4j.Logger;
@@ -134,7 +137,7 @@ public void filterResponse(HttpResponse response, HttpMessageContents contents,
final ChromeOptions chromeOptions = new ChromeOptions();
- List chromedefaults = Arrays.asList("--headless", "--no-sandbox", "--disable-gpu", "--disable-extensions",
+ List chromedefaults = Arrays.asList("--auto-open-devtools-for-tabs", "--headless", "--no-sandbox", "--disable-gpu", "--disable-extensions",
"--ignore-certificate-errors", "--incognito", "--window-size=1920,1080", "--proxy-server='direct://",
"--proxy-bypass-list=*", "--disable-background-networking", "--safebrowsing-disable-auto-update",
"--disable-sync", "--metrics-recording-only", "--disable-default-apps", "--no-first-run",
@@ -219,6 +222,7 @@ public FetchedData fetch(Resource resource) throws Exception {
Map m = (Map) json.get("selenium");
Map jsonmap = new TreeMap(m);
scripter.runScript(jsonmap);
+
} catch (Exception e){
if(pluginConfig.containsKey("chrome.selenium.screenshotdir")) {
Map tempmap = new HashMap<>();
From dbbd3d03a9c9b18e27bbc2a6b213ae7ddffa971a Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 7 Jul 2021 17:01:54 +0100
Subject: [PATCH 090/335] try additional logging
---
.../java/edu/usc/irds/sparkler/plugin/FetcherChrome.java | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index d8bd4610..cd4beaed 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -222,8 +222,12 @@ public FetchedData fetch(Resource resource) throws Exception {
Map m = (Map) json.get("selenium");
Map jsonmap = new TreeMap(m);
scripter.runScript(jsonmap);
-
} catch (Exception e){
+ LogEntries logs = driver.manage().logs().get(LogType.BROWSER);
+ List alllogs = logs.getAll();
+ for(LogEntry logentry: alllogs){
+ LOG.info(logentry.getMessage());
+ }
if(pluginConfig.containsKey("chrome.selenium.screenshotdir")) {
Map tempmap = new HashMap<>();
tempmap.put("type", "file");
From 2d1f2f72758f7be4eaaa0397c27323a2b95f57f4 Mon Sep 17 00:00:00 2001
From: dmitri-mcguckin
Date: Wed, 7 Jul 2021 18:16:56 -0400
Subject: [PATCH 091/335] Update status marking post-crawl
---
.../sparkler-plugins/fetcher-chrome/pom.xml | 2 +-
.../usc/irds/sparkler/plugin/FetcherChrome.java | 17 +++++++++++------
2 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/pom.xml b/sparkler-core/sparkler-plugins/fetcher-chrome/pom.xml
index 5a5269d5..ceb1533b 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/pom.xml
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/pom.xml
@@ -112,7 +112,7 @@
com.kytheralabsseleniumscripter
- 1.2
+ 1.4-SNAPSHOT
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index cd4beaed..717626ac 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -222,6 +222,8 @@ public FetchedData fetch(Resource resource) throws Exception {
Map m = (Map) json.get("selenium");
Map jsonmap = new TreeMap(m);
scripter.runScript(jsonmap);
+
+ resource.setStatus(ResourceStatus.FETCHED.toString());
} catch (Exception e){
LogEntries logs = driver.manage().logs().get(LogType.BROWSER);
List alllogs = logs.getAll();
@@ -229,7 +231,7 @@ public FetchedData fetch(Resource resource) throws Exception {
LOG.info(logentry.getMessage());
}
if(pluginConfig.containsKey("chrome.selenium.screenshotdir")) {
- Map tempmap = new HashMap<>();
+ Map tempmap = new HashMap<>();
tempmap.put("type", "file");
Path path = Paths.get(pluginConfig.get("chrome.selenium.screenshotdir").toString(), jobContext.getId());
File f = path.toFile();
@@ -238,7 +240,9 @@ public FetchedData fetch(Resource resource) throws Exception {
tempmap.put("targetdir", filepath.toString());
scripter.screenshotOperation(tempmap);
}
- LOG.error("Scripter Exception", e);
+ LOG.error("Caught an exception in Selenium Scripter: " + e);
+
+ resource.setStatus(ResourceStatus.ERROR.toString());
}
List snapshots = scripter.getSnapshots();
html = String.join(",", snapshots);
@@ -249,9 +253,12 @@ public FetchedData fetch(Resource resource) throws Exception {
html = driver.getPageSource();
}
+ fetchedData = new FetchedData(html.getBytes(), "text/html", latestStatus);
+ fetchedData.setResource(resource);
+
LOG.debug("Time taken to load {} - {} ", resource.getUrl(), (System.currentTimeMillis() - start));
- LOG.info("LATEST STATUS: "+latestStatus);
+ LOG.info("LATEST STATUS: " + latestStatus);
/*if (!(latestStatus >= 200 && latestStatus < 300) && latestStatus != 0) {
// If not fetched through plugin successfully
// Falling back to default fetcher
@@ -259,11 +266,9 @@ public FetchedData fetch(Resource resource) throws Exception {
return super.fetch(resource);
}*/
- fetchedData = new FetchedData(html.getBytes(), "text/html", latestStatus);
- resource.setStatus(ResourceStatus.FETCHED.toString());
- fetchedData.setResource(resource);
driver.quit();
driver = null;
+
return fetchedData;
}
From 04ee856c7734042101d6bd6a2ae2ffbca651976d Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Wed, 7 Jul 2021 23:39:09 +0100
Subject: [PATCH 092/335] double to int
---
.../scala/edu/usc/irds/sparkler/pipeline/Crawler.scala | 8 ++++----
.../java/edu/usc/irds/sparkler/plugin/FetcherChrome.java | 2 +-
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index f7872f64..c8ac9699 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -239,18 +239,18 @@ class Crawler extends CliTool {
.groupByKey()*/
- val l = f.glom().map(_.length).collect()
+ //val l = f.glom().map(_.length).collect()
- print(l.min, l.max, l.sum/l.length, l.length)
+ //print(l.min, l.max, l.sum/l.length, l.length)
- val c = f.getNumPartitions
+ //val c = f.getNumPartitions
//val fetchedRdd = f.mapPartitions( x => mapCrawl(x))
//val fetchedRdd = rc.runCrawl(f, job)
var fetchedRdd: RDD[CrawlData] = null
- val rep: Int = sparklerConf.get("crawl.repartition").asInstanceOf[Int]
+ val rep: Int = sparklerConf.get("crawl.repartition").asInstanceOf[Double].toInt
if (rep > 0) {
fetchedRdd = f.repartition(rep).flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index cd4beaed..c076def9 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -229,7 +229,7 @@ public FetchedData fetch(Resource resource) throws Exception {
LOG.info(logentry.getMessage());
}
if(pluginConfig.containsKey("chrome.selenium.screenshotdir")) {
- Map tempmap = new HashMap<>();
+ Map tempmap = new HashMap<>();
tempmap.put("type", "file");
Path path = Paths.get(pluginConfig.get("chrome.selenium.screenshotdir").toString(), jobContext.getId());
File f = path.toFile();
From a85aa1ab3bc1a28ca06ba11db3d1f4de551e9319 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Thu, 8 Jul 2021 00:51:02 +0100
Subject: [PATCH 093/335] update scripter
---
sparkler-core/project/PluginDependencies.scala | 2 +-
.../scala/edu/usc/irds/sparkler/pipeline/Crawler.scala | 2 +-
.../java/edu/usc/irds/sparkler/plugin/FetcherChrome.java | 7 +++++++
3 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index aeb324c3..94146d4a 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.3-20210706.094407-29"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.4-20210707.233519-5"
}
object FetcherHtmlUnit {
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index c8ac9699..b20faa6e 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -250,7 +250,7 @@ class Crawler extends CliTool {
var fetchedRdd: RDD[CrawlData] = null
- val rep: Int = sparklerConf.get("crawl.repartition").asInstanceOf[Double].toInt
+ val rep: Int = sparklerConf.get("crawl.repartition").asInstanceOf[Number].intValue()
if (rep > 0) {
fetchedRdd = f.repartition(rep).flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index 717626ac..14e6a672 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -36,6 +36,8 @@
import org.openqa.selenium.logging.LogEntries;
import org.openqa.selenium.logging.LogEntry;
import org.openqa.selenium.logging.LogType;
+import org.openqa.selenium.logging.LoggingPreferences;
+import org.openqa.selenium.remote.CapabilityType;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.pf4j.Extension;
import org.slf4j.Logger;
@@ -50,6 +52,7 @@
import java.net.CookieManager;
import java.net.CookiePolicy;
import java.util.concurrent.TimeUnit;
+import java.util.logging.Level;
/*import com.browserup.bup.BrowserUpProxy;
import com.browserup.bup.BrowserUpProxyServer;
@@ -144,6 +147,10 @@ public void filterResponse(HttpResponse response, HttpMessageContents contents,
"--disable-setuid-sandbox", "--hide-scrollbars", "--no-zygote", "--disable-notifications",
"--disable-logging", "--disable-permissions-api");
+ LoggingPreferences logPrefs = new LoggingPreferences();
+ logPrefs.enable(LogType.BROWSER, Level.ALL);
+ chromeOptions.setCapability(CapabilityType.LOGGING_PREFS, logPrefs);
+ chromeOptions.setCapability("goog:loggingPrefs", logPrefs);
List vals = (List) (pluginConfig.getOrDefault("chrome.options", chromedefaults));
chromeOptions.addArguments(vals);
From 8f0bd2b6c3eedf0457fc85756575e70ee578c8d0 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Thu, 8 Jul 2021 01:32:49 +0100
Subject: [PATCH 094/335] add more logs
---
.../edu/usc/irds/sparkler/plugin/FetcherChrome.java | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index 14e6a672..528c55b9 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -237,6 +237,16 @@ public FetchedData fetch(Resource resource) throws Exception {
for(LogEntry logentry: alllogs){
LOG.info(logentry.getMessage());
}
+ logs = driver.manage().logs().get(LogType.PERFORMANCE);
+ alllogs = logs.getAll();
+ for(LogEntry logentry: alllogs){
+ LOG.info(logentry.getMessage());
+ }
+ logs = driver.manage().logs().get(LogType.PROFILER);
+ alllogs = logs.getAll();
+ for(LogEntry logentry: alllogs){
+ LOG.info(logentry.getMessage());
+ }
if(pluginConfig.containsKey("chrome.selenium.screenshotdir")) {
Map tempmap = new HashMap<>();
tempmap.put("type", "file");
From 683c2ae24f5c13c8210350c51263694d7b042240 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Thu, 8 Jul 2021 01:48:45 +0100
Subject: [PATCH 095/335] add more logs
---
.../main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java | 2 ++
1 file changed, 2 insertions(+)
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index 528c55b9..f4080a80 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -149,6 +149,8 @@ public void filterResponse(HttpResponse response, HttpMessageContents contents,
LoggingPreferences logPrefs = new LoggingPreferences();
logPrefs.enable(LogType.BROWSER, Level.ALL);
+ logPrefs.enable(LogType.PROFILER, Level.ALL);
+ logPrefs.enable(LogType.PERFORMANCE, Level.ALL);
chromeOptions.setCapability(CapabilityType.LOGGING_PREFS, logPrefs);
chromeOptions.setCapability("goog:loggingPrefs", logPrefs);
List vals = (List) (pluginConfig.getOrDefault("chrome.options", chromedefaults));
From bd6bf9f2c5b6aebadc1c751f890af7d106b2d802 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Thu, 8 Jul 2021 02:24:28 +0100
Subject: [PATCH 096/335] force user agent
---
.../main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java | 1 +
1 file changed, 1 insertion(+)
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index f4080a80..1a79083e 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -154,6 +154,7 @@ public void filterResponse(HttpResponse response, HttpMessageContents contents,
chromeOptions.setCapability(CapabilityType.LOGGING_PREFS, logPrefs);
chromeOptions.setCapability("goog:loggingPrefs", logPrefs);
List vals = (List) (pluginConfig.getOrDefault("chrome.options", chromedefaults));
+ vals.add("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
chromeOptions.addArguments(vals);
chromeOptions.setPageLoadStrategy(PageLoadStrategy.NORMAL);
From 8908fc3eaaddd3f89b1cb10882ed778476175530 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Thu, 8 Jul 2021 14:27:23 +0100
Subject: [PATCH 097/335] add chrome
---
.../irds/sparkler/plugin/FetcherChrome.java | 24 ++++++++-----------
1 file changed, 10 insertions(+), 14 deletions(-)
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index 1a79083e..12551d68 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -39,6 +39,7 @@
import org.openqa.selenium.logging.LoggingPreferences;
import org.openqa.selenium.remote.CapabilityType;
import org.openqa.selenium.remote.RemoteWebDriver;
+import org.openqa.selenium.support.ui.WebDriverWait;
import org.pf4j.Extension;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -209,22 +210,17 @@ public FetchedData fetch(Resource resource) throws Exception {
driver.get(resource.getUrl());
-
SeleniumScripter scripter = new SeleniumScripter(driver);
- String seleniumenabled = (String) pluginConfig.getOrDefault("chrome.selenium.enabled", "false");
- String html = null;
-/* if (seleniumenabled.equals("true")) {
- if(pluginConfig.get("chrome.selenium.script") != null && pluginConfig.get("chrome.selenium.script") instanceof Map) {
- Map map = (Map) pluginConfig.get("chrome.selenium.script");
- try {
- scripter.runScript(map);
- } catch (Exception ignored){
+ String waitforready = pluginConfig.getOrDefault("chrome.selenium.javascriptready", "false").toString();
- }
- List snapshots = scripter.getSnapshots();
- html = String.join(",", snapshots);
- }
- }*/
+ if(waitforready.equals("true")){
+ new WebDriverWait(driver, 60)
+ .until((driver) -> ((JavascriptExecutor) driver).executeScript("return document.readyState")
+ .toString()
+ .equals("complete"));
+ }
+
+ String html = null;
if(json != null && json.containsKey("selenium")){
if(json.get("selenium") != null && json.get("selenium") instanceof Map) {
try {
From 7920373dc625348a01f77bf2a48657211b203088 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Thu, 8 Jul 2021 19:41:59 +0100
Subject: [PATCH 098/335] update token parser
---
.../src/main/java/edu/usc/irds/sparkler/plugin/UrlInjector.java | 1 +
1 file changed, 1 insertion(+)
diff --git a/sparkler-core/sparkler-plugins/url-injector/src/main/java/edu/usc/irds/sparkler/plugin/UrlInjector.java b/sparkler-core/sparkler-plugins/url-injector/src/main/java/edu/usc/irds/sparkler/plugin/UrlInjector.java
index 369e7cbd..8b3f3426 100644
--- a/sparkler-core/sparkler-plugins/url-injector/src/main/java/edu/usc/irds/sparkler/plugin/UrlInjector.java
+++ b/sparkler-core/sparkler-plugins/url-injector/src/main/java/edu/usc/irds/sparkler/plugin/UrlInjector.java
@@ -77,6 +77,7 @@ private List appendForm(Collection urls, List to
root.put("TAG", temp);
String json = root.toString();
json = json.replace("${token}", temp);
+ json = json.replace("__token__", temp);
UrlInjectorObj o = new UrlInjectorObj(u, json, method);
fixedUrls.add(o);
}
From 48324811624631302fc5199eb218d970d1d7263c Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Thu, 8 Jul 2021 19:43:38 +0100
Subject: [PATCH 099/335] update token parser
---
.../src/main/java/edu/usc/irds/sparkler/plugin/UrlInjector.java | 2 ++
1 file changed, 2 insertions(+)
diff --git a/sparkler-core/sparkler-plugins/url-injector/src/main/java/edu/usc/irds/sparkler/plugin/UrlInjector.java b/sparkler-core/sparkler-plugins/url-injector/src/main/java/edu/usc/irds/sparkler/plugin/UrlInjector.java
index 8b3f3426..6b59fcbd 100644
--- a/sparkler-core/sparkler-plugins/url-injector/src/main/java/edu/usc/irds/sparkler/plugin/UrlInjector.java
+++ b/sparkler-core/sparkler-plugins/url-injector/src/main/java/edu/usc/irds/sparkler/plugin/UrlInjector.java
@@ -100,6 +100,7 @@ private List replaceURLToken(Collection urls, List appendSelenium(Collection urls, List
Date: Fri, 9 Jul 2021 02:04:49 +0100
Subject: [PATCH 100/335] add databricks plugin
---
.github/workflows/build-sbt.yaml | 2 +
sparkler-core/build.sbt | 21 ++++-
sparkler-core/conf/sparkler-default.yaml | 18 +++++
sparkler-core/plugins.build.sbt | 46 +++++++++--
.../project/PluginDependencies.scala | 5 ++
sparkler-core/project/Settings.scala | 16 +++-
sparkler-core/project/plugins.sbt | 3 +-
.../edu/usc/irds/sparkler/GenericProcess.java | 11 +++
.../usc/irds/sparkler/pipeline/Crawler.scala | 1 +
.../sparkler/pipeline/GenericFunction.scala | 31 ++++++++
.../kytheralabs/databricks/DatabricksAPI.java | 76 +++++++++++++++++++
.../databricks/DatabricksAPIActivator.java | 10 +++
12 files changed, 227 insertions(+), 13 deletions(-)
create mode 100644 sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/GenericProcess.java
create mode 100644 sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/GenericFunction.scala
create mode 100644 sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
create mode 100644 sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPIActivator.java
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index 1f9525aa..def64e1c 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -21,6 +21,8 @@ jobs:
- name: Run submit package
run: sbt assembly -Dsparkprovided=true
working-directory: sparkler-core
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Deploy Submit to Databricks
run: ./deploy.sh
working-directory: sparkler-core
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index 801a1d58..613ef4c2 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -30,6 +30,7 @@ libraryDependencies in ThisBuild ++= Seq(
Dependencies.pf4j % "provided",
)
+
developers := List(
// In alphabetic order
Developer("chrismattmann",
@@ -63,6 +64,8 @@ lazy val root = (project in file("."))
Settings.common,
name := "sparkler",
mainClass in Compile := Some("edu.usc.irds.sparkler.Main"),
+ githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN")
+
)
.aggregate(api, app, plugins, ui)
@@ -96,10 +99,14 @@ lazy val api = (project in file("sparkler-api"))
test in assembly := {},
testOptions += Tests.Argument(TestFrameworks.JUnit,
"--verbosity=1",
- "--run-listener=edu.usc.irds.sparkler.test.WebServerRunListener")
+ "--run-listener=edu.usc.irds.sparkler.test.WebServerRunListener"),
+ githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN")
+
+
)
.dependsOn(testsBase)
+
val sparkprovided = System.getProperty("sparkprovided", "")
lazy val app = (project in file("sparkler-app"))
@@ -179,7 +186,9 @@ lazy val app = (project in file("sparkler-app"))
IO.copyDirectory(file(".") / Settings.binDir, file(".") / Settings.buildDir / Settings.binDir)
buildLocation
- }
+ },
+ githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN")
+
)
.dependsOn(api)
@@ -193,7 +202,9 @@ lazy val testsBase = (project in file("sparkler-tests-base"))
Dependencies.jUnit,
Dependencies.Slf4j.api,
Dependencies.Slf4j.log4j12,
- )
+ ),
+ githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
+
)
@@ -231,7 +242,9 @@ lazy val ui = (project in file("sparkler-ui"))
val packageFile: File = (packageBin in Universal).value
IO.move(packageFile, buildLocation)
buildLocation
- }
+ },
+ githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN")
+
)
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index 6e42a8b7..7d2e85ce 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -189,6 +189,24 @@ plugins:
selTC: ""
selProgram: "MA"
txtDateOfService: "12/01/2020"
+ databricks.api.events:
+ startup:
+ updateeventlog:
+ sql: xxx
+ iteration_complete:
+ updateeventlog:
+ sql: xxx
+ shutdown:
+ triggerjob:
+ notebook: xxx
+ sparkversion: xxx
+ clustertype: xxx
+ clustersize: xxx
+ parameters:
+ abc: xxx
+ def: xxx
+
+
##################### Custom properties for MEMEX ###########################################
memex.webpage.mimetype: "text/html"
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index 9d8bf5f0..933c0911 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -23,8 +23,11 @@ lazy val plugins = (project in file(s"$sparklerPlugins"))
.enablePlugins(JavaAppPackaging)
.settings(
Settings.common,
- name := "sparkler-plugins"
+ name := "sparkler-plugins",
+ githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
+
)
+
.aggregate(
fetcherChrome,
fetcherHtmlUnit,
@@ -32,6 +35,7 @@ lazy val plugins = (project in file(s"$sparklerPlugins"))
scorerDdSvn,
urlFilterRegex,
urlFilterSameHost,
+ databricks,
)
//fetcherJBrowser,
/**
@@ -49,7 +53,9 @@ lazy val templatePlugin = (project in file(s"$sparklerPlugins/template-plugin"))
id = "template-plugin",
className = "edu.usc.irds.sparkler.plugin.MyPluginActivator",
dependencies = List.empty
- )
+ ),
+ githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
+
)
.dependsOn(api)
@@ -74,6 +80,26 @@ lazy val fetcherChrome = (project in file(s"$sparklerPlugins/fetcher-chrome"))
className = "edu.usc.irds.sparkler.plugin.FetcherChromeActivator",
dependencies = List.empty
),
+ githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
+
+ )
+ .dependsOn(api)
+
+lazy val databricks = (project in file(s"$sparklerPlugins/databricks-api-plugin"))
+ .enablePlugins(JavaAppPackaging)
+ .settings(
+ Settings.plugin,
+ name := "databricks-api",
+ libraryDependencies ++= Seq(
+ Databricks.wrapper
+ ),
+ Settings.pluginManifest(
+ id = "databricks-api",
+ className = "com.kytheralabs.databricks.DatabricksAPIActivator",
+ dependencies = List.empty
+ ),
+ githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
+
)
.dependsOn(api)
@@ -92,7 +118,9 @@ lazy val fetcherHtmlUnit = (project in file(s"$sparklerPlugins/fetcher-htmlunit"
),
testOptions += Tests.Argument(TestFrameworks.JUnit,
"--verbosity=1",
- "--run-listener=edu.usc.irds.sparkler.test.WebServerRunListener")
+ "--run-listener=edu.usc.irds.sparkler.test.WebServerRunListener"),
+ githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
+
)
.dependsOn(api)
@@ -124,7 +152,9 @@ lazy val scorerDdSvn = (project in file(s"$sparklerPlugins/scorer-dd-svn"))
id = "scorer-dd-svn",
className = "edu.usc.irds.sparkler.plugin.DdSvnScorerActivator",
dependencies = List.empty
- )
+ ),
+ githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
+
)
.dependsOn(api)
@@ -137,7 +167,9 @@ lazy val urlFilterRegex = (project in file(s"$sparklerPlugins/urlfilter-regex"))
id = "urlfilter-regex",
className = "edu.usc.irds.sparkler.plugin.RegexURLFilterActivator",
dependencies = List.empty
- )
+ ),
+ githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
+
)
.dependsOn(api)
@@ -150,6 +182,8 @@ lazy val urlFilterSameHost = (project in file(s"$sparklerPlugins/urlfilter-sameh
id = "urlfilter-samehost",
className = "edu.usc.irds.sparkler.plugin.UrlFilterSameHostActivator",
dependencies = List.empty
- )
+ ),
+ githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
+
)
.dependsOn(api)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 94146d4a..c711156c 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -42,3 +42,8 @@ object FetcherJBrowser {
object ScorerDdSvn {
lazy val httpClient = "org.apache.httpcomponents" % "httpclient" % "4.3.6"
}
+
+
+object Databricks {
+ lazy val wrapper = "default" % "webcrawlerwrapper_2.12" % "0.1"
+}
\ No newline at end of file
diff --git a/sparkler-core/project/Settings.scala b/sparkler-core/project/Settings.scala
index 9f2557a5..238b0751 100644
--- a/sparkler-core/project/Settings.scala
+++ b/sparkler-core/project/Settings.scala
@@ -63,8 +63,11 @@ object Settings {
"Sonatype Snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/",
"Scala-Tools Snapshots" at "https://scala-tools.org/repo-snapshots/",
"Gitlab Spicule 2" at "https://gitlab.com/api/v4/projects/26391218/packages/maven",
- "Gitlab Spicule" at "https://gitlab.com/api/v4/projects/23300400/packages/maven"
+ "Gitlab Spicule" at "https://gitlab.com/api/v4/projects/23300400/packages/maven",
+ "Private Github" at "https://maven.pkg.github.com/spicule-kythera/webcrawlerwrapper/"
)
+
+
)
lazy val assemblyProject = common ++ baseAssemblySettings ++ Seq(
test in assembly := {},
@@ -80,6 +83,7 @@ object Settings {
case x if x.contains("io.netty.versions.properties") => MergeStrategy.last
case x if x.contains("Log4j2Plugins.dat") => MergeStrategy.first
case x if x.contains("module-info.class") => MergeStrategy.first
+ case x if x.contains("jetty-dir.css") => MergeStrategy.first
case x if x.contains("public-suffix-list.txt") => MergeStrategy.first
case x if x.contains("bus-extensions.txt") => MergeStrategy.first
case x if x.contains("blueprint.handlers") => MergeStrategy.first
@@ -90,12 +94,20 @@ object Settings {
case x if x.contains("ExtensionModule") => MergeStrategy.first
case PathList("META-INF", "MANIFEST.MF") => MergeStrategy.discard
case PathList("javax", "activation", xs@_*) => MergeStrategy.first
+ case PathList("javax", "inject", xs@_*) => MergeStrategy.first
+ case PathList("javax", "xml", xs@_*) => MergeStrategy.first
+ case PathList("javax", "servlet", xs@_*) => MergeStrategy.first
+ case PathList("com", "sun", xs@_*) => MergeStrategy.first
+ case PathList("org", "aopalliance", xs@_*) => MergeStrategy.first
+
+ case PathList("org", "apache", "spark", "unused", xs@_*) => MergeStrategy.first
+ case PathList("org", "apache", "commons", xs@_*) => MergeStrategy.first
//case PathList("io", "netty", xs@_*) => MergeStrategy.last
case x => (assemblyMergeStrategy in assembly).value.apply(x)
},
- assemblyOutputPath in assembly := file(".") / buildDir / pluginsDir / s"${name.value}-${(version in ThisBuild).value}.jar"
+ assemblyOutputPath in assembly := file(".") / buildDir / pluginsDir / s"${name.value}-${(version in ThisBuild).value}.jar"
)
def pluginManifest(id: String, className: String,
diff --git a/sparkler-core/project/plugins.sbt b/sparkler-core/project/plugins.sbt
index 7225838f..a9779901 100644
--- a/sparkler-core/project/plugins.sbt
+++ b/sparkler-core/project/plugins.sbt
@@ -18,4 +18,5 @@ addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.7.4")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.13")
-addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.13")
\ No newline at end of file
+addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.13")
+addSbtPlugin("com.codecommit" % "sbt-github-packages" % "0.5.3")
\ No newline at end of file
diff --git a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/GenericProcess.java b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/GenericProcess.java
new file mode 100644
index 00000000..eb7503dc
--- /dev/null
+++ b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/GenericProcess.java
@@ -0,0 +1,11 @@
+package edu.usc.irds.sparkler;
+
+public interface GenericProcess extends ExtensionPoint{
+
+ enum Event {
+ SHUTDOWN,
+ STARTUP,
+ ITERATION_COMPLETE,
+ }
+ void executeProcess(Event e) throws Exception;
+}
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index b20faa6e..fcde6862 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -280,6 +280,7 @@ class Crawler extends CliTool {
}
storageProxy.close()
//PluginService.shutdown(job)
+ GenericFunction(job, GenericProcess.Event.SHUTDOWN)
LOG.info("Shutting down Spark CTX..")
sc.stop()
}
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/GenericFunction.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/GenericFunction.scala
new file mode 100644
index 00000000..308a141b
--- /dev/null
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/GenericFunction.scala
@@ -0,0 +1,31 @@
+package edu.usc.irds.sparkler.pipeline
+
+import edu.usc.irds.sparkler.{GenericProcess, Scorer}
+import edu.usc.irds.sparkler.base.Loggable
+import edu.usc.irds.sparkler.model.{CrawlData, SparklerJob}
+import edu.usc.irds.sparkler.service.PluginService
+
+import java.io.Serializable
+
+object GenericFunction
+ extends ((SparklerJob, GenericProcess.Event) => GenericProcess.Event) with Serializable with Loggable {
+
+ override def apply(job: SparklerJob, event: GenericProcess.Event) : GenericProcess.Event = {
+ val genericProc:scala.Option[GenericProcess] = PluginService.getExtension(classOf[GenericProcess], job)
+ try {
+ genericProc match {
+ case Some(genericProc) =>
+ genericProc.executeProcess(event)
+ LOG.info(s"Executing Event Process $event")
+ event
+ case None =>
+ LOG.info("Event processing is not performed")
+ event
+ }
+ } catch {
+ case e: Exception =>
+ LOG.error(e.getMessage, e)
+ event
+ }
+ }
+}
diff --git a/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
new file mode 100644
index 00000000..dcb3fcd9
--- /dev/null
+++ b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
@@ -0,0 +1,76 @@
+package com.kytheralabs.databricks;
+
+import edu.usc.irds.sparkler.AbstractExtensionPoint;
+import edu.usc.irds.sparkler.GenericProcess;
+import edu.usc.irds.sparkler.SparklerConfiguration;
+import com.kytheralabs.management.jobutils.JobAPI;
+import org.json.simple.JSONObject;
+
+import java.text.ParseException;
+import java.util.Map;
+import java.util.Scanner;
+
+public class DatabricksAPI extends AbstractExtensionPoint implements GenericProcess {
+ SparklerConfiguration pluginConfig = this.jobContext.getConfiguration();
+
+ @Override
+ public void executeProcess(GenericProcess.Event event) throws Exception {
+
+
+
+
+ if(event == Event.SHUTDOWN){
+ if(pluginConfig.containsKey("databricks.api.events.shutdown")){
+ Map m = (Map) pluginConfig.get("databricks.api.events.shutdown");
+ for (Map.Entry entry : m.entrySet()) {
+ if(entry.getKey().equals("triggerjob")){
+ triggerJob((Map) entry.getValue());
+ } else if(entry.getKey().equals("updateeventlog")){
+ updateEventLog((Map) entry.getValue());
+ }
+ }
+ }
+ }
+ }
+
+ private void triggerJob(Map map){
+ String crawlid = this.pluginId;
+ String notebook = map.get("notebook").toString();
+ String sparkversion = map.getOrDefault("sparkversion", "7.3.x-scala2.12").toString();
+ String clusterType = map.getOrDefault("instancetype", "i3.xlarge").toString();
+ Number clusterSize = 0;
+ try {
+ clusterSize = parseNumber(map.getOrDefault("clustersize", 1).toString());
+ } catch (ParseException e) {
+ e.printStackTrace();
+ }
+
+ String params = "{}";
+ if(map.containsKey("params")){
+ JSONObject j = new JSONObject((Map) map.get("params"));
+ params = j.toJSONString();
+ }
+
+ String environment = "";
+ JobAPI.runSingle(notebook, sparkversion, clusterType, clusterSize.intValue(), crawlid, params, environment);
+
+ }
+
+ private void updateEventLog(Map map){
+
+ }
+
+ private Number parseNumber(String number) throws ParseException {
+ Scanner scan = new Scanner(number);
+ if(scan.hasNextInt()){
+ return Integer.parseInt(number);
+ }
+ else if(scan.hasNextDouble()) {
+ return Double.parseDouble(number);
+ }
+ else {
+ throw new ParseException("Invalid numeric type: \"" + number + "\"", 0);
+ }
+ }
+
+}
diff --git a/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPIActivator.java b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPIActivator.java
new file mode 100644
index 00000000..360f4ecd
--- /dev/null
+++ b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPIActivator.java
@@ -0,0 +1,10 @@
+package com.kytheralabs.databricks;
+
+import org.pf4j.Plugin;
+import org.pf4j.PluginWrapper;
+public class DatabricksAPIActivator extends Plugin {
+
+ public DatabricksAPIActivator(PluginWrapper wrapper) {
+ super(wrapper);
+ }
+}
From 90cf997af5e3b3536aa58f30e891b0f876c3d809 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 02:13:48 +0100
Subject: [PATCH 101/335] add databricks plugin
---
.../java/com/kytheralabs/databricks/DatabricksAPI.java | 8 +-------
1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
index dcb3fcd9..fe37e45f 100644
--- a/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
+++ b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
@@ -15,12 +15,7 @@ public class DatabricksAPI extends AbstractExtensionPoint implements GenericProc
@Override
public void executeProcess(GenericProcess.Event event) throws Exception {
-
-
-
-
- if(event == Event.SHUTDOWN){
- if(pluginConfig.containsKey("databricks.api.events.shutdown")){
+ if(pluginConfig.containsKey("databricks.api.events."+event.toString().toLowerCase())){
Map m = (Map) pluginConfig.get("databricks.api.events.shutdown");
for (Map.Entry entry : m.entrySet()) {
if(entry.getKey().equals("triggerjob")){
@@ -30,7 +25,6 @@ public void executeProcess(GenericProcess.Event event) throws Exception {
}
}
}
- }
}
private void triggerJob(Map map){
From c02045ce18be688123429b2e71cf5bb101bb0599 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 19:04:15 +0100
Subject: [PATCH 102/335] ignore persistence for now
---
sparkler-core/build.sbt | 50 ++++++-
sparkler-core/plugins.build.sbt | 2 +-
.../project/PluginDependencies.scala | 2 +-
sparkler-core/project/Settings.scala | 1 +
.../edu/usc/irds/sparkler/GenericProcess.java | 3 +-
.../usc/irds/sparkler/pipeline/Crawler.scala | 3 +-
.../sparkler/pipeline/GenericFunction.scala | 7 +-
.../service/CustomerPluginManager.java | 2 +
.../irds/sparkler/service/PluginService.scala | 2 +-
.../kytheralabs/databricks/DatabricksAPI.java | 43 +++++-
.../kytheralabs/databricks/Persistence.java | 12 ++
.../sparkler-plugins/fetcher-chrome/pom.xml | 128 ------------------
.../sparkler-plugins/scorer-dd-svn/pom.xml | 61 ---------
.../sparkler-plugins/urlfilter-regex/pom.xml | 52 -------
14 files changed, 105 insertions(+), 263 deletions(-)
create mode 100644 sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/Persistence.java
delete mode 100644 sparkler-core/sparkler-plugins/fetcher-chrome/pom.xml
delete mode 100644 sparkler-core/sparkler-plugins/scorer-dd-svn/pom.xml
delete mode 100644 sparkler-core/sparkler-plugins/urlfilter-regex/pom.xml
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index 613ef4c2..f8185126 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -73,11 +73,20 @@ lazy val api = (project in file("sparkler-api"))
.settings(
Settings.common,
name := "sparkler-api",
+ /*libraryDependencies ++= (
+ if(sparkprovided == "true") {
+ ("org.apache.spark" %% "spark-core" % "3.1.0" % "provided") :: Nil
+ ("org.apache.spark" %% "spark-sql" % "3.1.0" % "provided") :: Nil
+ } else {
+ ("org.apache.spark" %% "spark-core" % "3.1.0" % "provided") :: Nil
+ ("org.apache.spark" %% "spark-sql" % "3.1.0" % "provided") :: Nil
+ }
+ ),*/
libraryDependencies ++= Seq(
Dependencies.jsonSimple exclude("junit", "junit"),
Dependencies.nutch exclude("*", "*"),
Dependencies.snakeYaml,
- Dependencies.Solr.solrj,
+ Dependencies.Solr.solrj exclude("org.apache.spark", "spark-sql"),
Dependencies.gson,
// Test
@@ -88,12 +97,38 @@ lazy val api = (project in file("sparkler-api"))
case x if x.contains("io.netty.versions.properties") => MergeStrategy.first
case x if x.contains("Log4j2Plugins.dat") => MergeStrategy.first
case x if x.contains("module-info.class") => MergeStrategy.first
- case PathList("org", "apache", "logging", xs@_*) => MergeStrategy.first
+ case x if x.contains("public-suffix-list.txt") => MergeStrategy.first
+ case x if x.contains("bus-extensions.txt") => MergeStrategy.first
+ case x if x.contains("blueprint.handlers") => MergeStrategy.first
+ case x if x.contains("git.properties") => MergeStrategy.first
+ case x if x.contains("overview.html") => MergeStrategy.first
+ case x if x.contains("config.fmpp") => MergeStrategy.first
+ case x if x.contains("META-INF/versions/9/javax/xml/bind/") => MergeStrategy.first
+ case x if x.contains("META-INF/native-image/io.netty") => MergeStrategy.first
case PathList("org", "apache", "logging", "log4j", xs@_*) => MergeStrategy.first
- case PathList("org", "apache", "commons", "logging", xs@_*) => MergeStrategy.first
+ case PathList("org", "apache", "logging", xs@_*) => MergeStrategy.first
case PathList("org", "apache", "log4j", xs@_*) => MergeStrategy.first
+ case PathList("org", "apache", "commons", "logging", xs@_*) => MergeStrategy.first
case PathList("org", "slf4j", "impl", xs@_*) => MergeStrategy.first
+ case PathList("com", "ctc", "wstx", xs@_*) => MergeStrategy.first
case PathList("org", "cliffc", "high_scale_lib", xs@_*) => MergeStrategy.first
+ case PathList("javax.xml.bind", "jaxb-api", xs@_*) => MergeStrategy.first
+ case PathList("org", "hamcrest", xs@_*) => MergeStrategy.first
+ case PathList("javax", "xml", xs@_*) => MergeStrategy.first
+ case PathList("javax", "activation", xs@_*) => MergeStrategy.first
+ case PathList("io", "netty", xs@_*) => MergeStrategy.first
+ case PathList("org", "aopalliance", "intercept", xs@_*) => MergeStrategy.first
+ case PathList("org", "aopalliance", "aop", xs@_*) => MergeStrategy.first
+ case PathList("org", "apache", "spark", xs@_*) => MergeStrategy.first
+ case PathList("org", "apache", "hadoop", xs@_*) => MergeStrategy.first
+ case PathList("net", "jpountz", xs@_*) => MergeStrategy.last
+ case PathList("net", "jcip", xs@_*) => MergeStrategy.first
+ case PathList("javax", "inject", xs@_*) => MergeStrategy.first
+ case PathList("javax", "annotation", xs@_*) => MergeStrategy.first
+ case PathList("com", "sun", xs@_*) => MergeStrategy.first
+ case PathList("org", "apache", "commons", xs@_*) => MergeStrategy.first
+ case PathList("javax", "servlet", xs@_*) => MergeStrategy.first
+
case x => (assemblyMergeStrategy in assembly).value.apply(x)
},
test in assembly := {},
@@ -117,11 +152,11 @@ lazy val app = (project in file("sparkler-app"))
mainClass in (Compile, packageBin) := Some("edu.usc.irds.sparkler.Main"),
libraryDependencies ++= (
if(sparkprovided == "true") {
- ("org.apache.spark" %% "spark-core" % "3.0.1" % "provided") :: Nil
- ("org.apache.spark" %% "spark-sql" % "3.0.1" % "provided") :: Nil
+ ("org.apache.spark" %% "spark-core" % "3.1.0" % "provided") :: Nil
+ ("org.apache.spark" %% "spark-sql" % "3.1.0" % "provided") :: Nil
} else {
- ("org.apache.spark" %% "spark-core" % "3.0.1") :: Nil
- ("org.apache.spark" %% "spark-sql" % "3.0.1") :: Nil
+ ("org.apache.spark" %% "spark-core" % "3.1.0") :: Nil
+ ("org.apache.spark" %% "spark-sql" % "3.1.0") :: Nil
}
),
libraryDependencies ++= Seq(
@@ -168,6 +203,7 @@ lazy val app = (project in file("sparkler-app"))
case PathList("javax", "inject", xs@_*) => MergeStrategy.first
case PathList("javax", "annotation", xs@_*) => MergeStrategy.first
case PathList("com", "sun", xs@_*) => MergeStrategy.first
+ case PathList("javax", "servlet", xs@_*) => MergeStrategy.first
case x => (assemblyMergeStrategy in assembly).value.apply(x)
},
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index 933c0911..d8ce7533 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -91,7 +91,7 @@ lazy val databricks = (project in file(s"$sparklerPlugins/databricks-api-plugin"
Settings.plugin,
name := "databricks-api",
libraryDependencies ++= Seq(
- Databricks.wrapper
+ Databricks.wrapper exclude("org.apache", "spark-core") exclude("org.apache", "spark-sql")
),
Settings.pluginManifest(
id = "databricks-api",
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index c711156c..1ad4ed92 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -45,5 +45,5 @@ object ScorerDdSvn {
object Databricks {
- lazy val wrapper = "default" % "webcrawlerwrapper_2.12" % "0.1"
+ lazy val wrapper = "default" % "webcrawlerwrapper_2.12" % "0.1-SNAPSHOT"
}
\ No newline at end of file
diff --git a/sparkler-core/project/Settings.scala b/sparkler-core/project/Settings.scala
index 238b0751..86c2535e 100644
--- a/sparkler-core/project/Settings.scala
+++ b/sparkler-core/project/Settings.scala
@@ -101,6 +101,7 @@ object Settings {
case PathList("org", "aopalliance", xs@_*) => MergeStrategy.first
case PathList("org", "apache", "spark", "unused", xs@_*) => MergeStrategy.first
+ case PathList("org", "apache", "spark", xs@_*) => MergeStrategy.discard
case PathList("org", "apache", "commons", xs@_*) => MergeStrategy.first
//case PathList("io", "netty", xs@_*) => MergeStrategy.last
diff --git a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/GenericProcess.java b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/GenericProcess.java
index eb7503dc..83ae26a8 100644
--- a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/GenericProcess.java
+++ b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/GenericProcess.java
@@ -1,5 +1,6 @@
package edu.usc.irds.sparkler;
+
public interface GenericProcess extends ExtensionPoint{
enum Event {
@@ -7,5 +8,5 @@ enum Event {
STARTUP,
ITERATION_COMPLETE,
}
- void executeProcess(Event e) throws Exception;
+ void executeProcess(Event e, Object spark) throws Exception;
}
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index fcde6862..cd58ef74 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -280,7 +280,8 @@ class Crawler extends CliTool {
}
storageProxy.close()
//PluginService.shutdown(job)
- GenericFunction(job, GenericProcess.Event.SHUTDOWN)
+ import org.apache.spark.sql.SQLContext
+ GenericFunction(job, GenericProcess.Event.SHUTDOWN,new SQLContext(sc).sparkSession)
LOG.info("Shutting down Spark CTX..")
sc.stop()
}
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/GenericFunction.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/GenericFunction.scala
index 308a141b..1071c5c2 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/GenericFunction.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/GenericFunction.scala
@@ -4,18 +4,19 @@ import edu.usc.irds.sparkler.{GenericProcess, Scorer}
import edu.usc.irds.sparkler.base.Loggable
import edu.usc.irds.sparkler.model.{CrawlData, SparklerJob}
import edu.usc.irds.sparkler.service.PluginService
+import org.apache.spark.sql.SparkSession
import java.io.Serializable
object GenericFunction
- extends ((SparklerJob, GenericProcess.Event) => GenericProcess.Event) with Serializable with Loggable {
+ extends ((SparklerJob, GenericProcess.Event, SparkSession) => GenericProcess.Event) with Serializable with Loggable {
- override def apply(job: SparklerJob, event: GenericProcess.Event) : GenericProcess.Event = {
+ override def apply(job: SparklerJob, event: GenericProcess.Event, spark: SparkSession) : GenericProcess.Event = {
val genericProc:scala.Option[GenericProcess] = PluginService.getExtension(classOf[GenericProcess], job)
try {
genericProc match {
case Some(genericProc) =>
- genericProc.executeProcess(event)
+ genericProc.executeProcess(event, spark)
LOG.info(s"Executing Event Process $event")
event
case None =>
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/CustomerPluginManager.java b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/CustomerPluginManager.java
index 8921925e..bcdc4e36 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/CustomerPluginManager.java
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/CustomerPluginManager.java
@@ -1,3 +1,4 @@
+/*
package edu.usc.irds.sparkler.service;
import org.pf4j.*;
@@ -17,3 +18,4 @@ protected PluginLoader createPluginLoader() {
};
}
}
+*/
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginService.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginService.scala
index 0bae207b..3afb1272 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginService.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginService.scala
@@ -36,7 +36,7 @@ import scala.collection.JavaConversions._
class PluginService(job:SparklerJob) {
import PluginService._
- val pluginManager = CustomerPluginManager.getPluginManager
+ val pluginManager = new DefaultPluginManager
// This map keeps cache of all active instances
diff --git a/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
index fe37e45f..ccd1c4c8 100644
--- a/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
+++ b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
@@ -1,34 +1,46 @@
package com.kytheralabs.databricks;
+
import edu.usc.irds.sparkler.AbstractExtensionPoint;
import edu.usc.irds.sparkler.GenericProcess;
import edu.usc.irds.sparkler.SparklerConfiguration;
import com.kytheralabs.management.jobutils.JobAPI;
+import org.apache.spark.sql.SparkSession;
import org.json.simple.JSONObject;
+import org.pf4j.Extension;
+import scala.Option;
import java.text.ParseException;
import java.util.Map;
import java.util.Scanner;
+@Extension
public class DatabricksAPI extends AbstractExtensionPoint implements GenericProcess {
- SparklerConfiguration pluginConfig = this.jobContext.getConfiguration();
@Override
- public void executeProcess(GenericProcess.Event event) throws Exception {
- if(pluginConfig.containsKey("databricks.api.events."+event.toString().toLowerCase())){
- Map m = (Map) pluginConfig.get("databricks.api.events.shutdown");
+ public void executeProcess(GenericProcess.Event event, Object spark) throws Exception {
+ SparklerConfiguration config = this.jobContext.getConfiguration();
+ Map pluginConfig = config.getPluginConfiguration(pluginId);
+
+ if(pluginConfig.containsKey("events")) {
+ Map o = (Map) pluginConfig.get("events");
+ if(o.containsKey(event.toString().toLowerCase())){
+ Map m = (Map) o.get(event.toString().toLowerCase());
for (Map.Entry entry : m.entrySet()) {
if(entry.getKey().equals("triggerjob")){
triggerJob((Map) entry.getValue());
} else if(entry.getKey().equals("updateeventlog")){
updateEventLog((Map) entry.getValue());
+ } else if(entry.getKey().equals("persistdata")){
+ persistData((Map) entry.getValue(), (SparkSession) spark);
}
}
}
+ }
}
private void triggerJob(Map map){
- String crawlid = this.pluginId;
+ String crawlid = this.jobContext.getId();
String notebook = map.get("notebook").toString();
String sparkversion = map.getOrDefault("sparkversion", "7.3.x-scala2.12").toString();
String clusterType = map.getOrDefault("instancetype", "i3.xlarge").toString();
@@ -45,11 +57,28 @@ private void triggerJob(Map map){
params = j.toJSONString();
}
- String environment = "";
- JobAPI.runSingle(notebook, sparkversion, clusterType, clusterSize.intValue(), crawlid, params, environment);
+ Option urlValue = scala.Option.apply(null);;
+ if(map.containsKey("url")){
+ urlValue = Option.apply(map.get("url").toString());
+ }
+
+ Option keyValue = scala.Option.apply(null);;
+ if(map.containsKey("key")){
+ keyValue = Option.apply(map.get("key").toString());
+ }
+
+ JobAPI.runSingle(notebook, sparkversion, clusterType, clusterSize.intValue(), crawlid, params, urlValue, keyValue);
}
+ private void persistData(Map map, SparkSession spark){
+ Persistence p = new Persistence();
+
+ p.persistResults(this.jobContext.getId(), this.jobContext.getId(), "warehouse_sandbox",
+ spark, "http://ec2-35-174-200-133.compute-1.amazonaws.com:8983/solr/crawldb");
+ }
+
+
private void updateEventLog(Map map){
}
diff --git a/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/Persistence.java b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/Persistence.java
new file mode 100644
index 00000000..6a0af237
--- /dev/null
+++ b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/Persistence.java
@@ -0,0 +1,12 @@
+package com.kytheralabs.databricks;
+
+import com.google.gson.Gson;
+import org.apache.spark.sql.SparkSession;
+
+public class Persistence {
+ Gson gson = new Gson();
+
+ public void persistResults(String crawlId, String pTableName, String warehousename, SparkSession spark, String s) {
+
+ }
+}
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/pom.xml b/sparkler-core/sparkler-plugins/fetcher-chrome/pom.xml
deleted file mode 100644
index ceb1533b..00000000
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/pom.xml
+++ /dev/null
@@ -1,128 +0,0 @@
-
-
-
-
- sparkler-plugins
- edu.usc.irds.sparkler.plugin
- 0.2.2-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- fetcher-chrome
- jar
-
- fetcher-chrome
- http://maven.apache.org
-
- UTF-8
- 1.0.1
- ${project.artifactId}
- edu.usc.irds.sparkler.plugin.FetcherChromeActivator
-
-
-
- gitlab-maven
- https://gitlab.com/api/v4/projects/26391218/packages/maven
-
-
-
-
-
- org.seleniumhq.selenium
- selenium-chrome-driver
- 3.141.59
-
-
-
- org.seleniumhq.selenium
- selenium-java
- 3.141.59
-
-
- org.seleniumhq.selenium
- selenium-api
- 3.141.59
-
-
-
- org.seleniumhq.selenium
- selenium-support
- 3.141.59
-
-
-
- org.seleniumhq.selenium
- selenium-remote-driver
- 3.141.59
-
-
-
- com.lihaoyi
- requests_2.12
- 0.1.7
-
-
- com.machinepublishers
- jbrowserdriver
- ${jbrowserdriver.version}
-
-
- slf4j-api
- org.slf4j
-
-
-
-
- com.browserup
- browserup-proxy-core
- 3.0.0-SNAPSHOT
-
-
- org.slf4j
- slf4j-api
- ${slf4j.version}
-
-
- org.slf4j
- slf4j-log4j12
- ${slf4j.version}
-
-
- io.netty
- netty-codec-http
- 4.0.15.Final
-
-
- com.kytheralabs
- seleniumscripter
- 1.4-SNAPSHOT
-
-
-
-
-
- maven-assembly-plugin
-
-
- maven-jar-plugin
-
-
-
-
diff --git a/sparkler-core/sparkler-plugins/scorer-dd-svn/pom.xml b/sparkler-core/sparkler-plugins/scorer-dd-svn/pom.xml
deleted file mode 100644
index 693f7ce1..00000000
--- a/sparkler-core/sparkler-plugins/scorer-dd-svn/pom.xml
+++ /dev/null
@@ -1,61 +0,0 @@
-
-
-
-
- sparkler-plugins
- edu.usc.irds.sparkler.plugin
- 0.2.2-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- scorer-dd-svn
- jar
-
- scorer-dd-svn
- http://maven.apache.org
-
- UTF-8
- ${project.artifactId}
- ${project.version}
- ${project.groupId}
- edu.usc.irds.sparkler.plugin.DdSvnScorerActivator
-
-
-
-
- org.apache.httpcomponents
- httpclient
- 4.3.6
-
-
-
-
-
-
-
-
- maven-assembly-plugin
-
-
- maven-jar-plugin
-
-
-
-
diff --git a/sparkler-core/sparkler-plugins/urlfilter-regex/pom.xml b/sparkler-core/sparkler-plugins/urlfilter-regex/pom.xml
deleted file mode 100644
index bd902d61..00000000
--- a/sparkler-core/sparkler-plugins/urlfilter-regex/pom.xml
+++ /dev/null
@@ -1,52 +0,0 @@
-
-
-
-
- sparkler-plugins
- edu.usc.irds.sparkler.plugin
- 0.2.2-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- urlfilter-regex
- jar
-
- urlfilter-regex
- http://maven.apache.org
-
- UTF-8
- ${project.artifactId}
- ${project.version}
- ${project.groupId}
- edu.usc.irds.sparkler.plugin.RegexURLFilterActivator
-
-
-
-
-
-
- maven-assembly-plugin
-
-
- maven-jar-plugin
-
-
-
-
From 81526875689cccf3b09308309d511cb949a0c9e3 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 19:55:49 +0100
Subject: [PATCH 103/335] update sbt
---
.github/workflows/build-sbt.yaml | 2 ++
1 file changed, 2 insertions(+)
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index def64e1c..c4c6450a 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -32,6 +32,8 @@ jobs:
- name: Run full package
run: sbt clean assembly -Dsparkprovided=false
working-directory: sparkler-core
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Deploy to databricks
run: ./deploy.sh standalone
working-directory: sparkler-core
From c005a5bc5767ab7dd03a0a215eabd47e7d7a5b4c Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 20:00:53 +0100
Subject: [PATCH 104/335] update sbt
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 1ad4ed92..fe999c65 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -45,5 +45,5 @@ object ScorerDdSvn {
object Databricks {
- lazy val wrapper = "default" % "webcrawlerwrapper_2.12" % "0.1-SNAPSHOT"
+ lazy val wrapper = "default.webcrawlerwrapper_2" % "12" % "0.1-SNAPSHOT"
}
\ No newline at end of file
From 4a1f04ecdc092c567ae06cd5ad72db54243f3ecf Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 20:14:07 +0100
Subject: [PATCH 105/335] update path
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index fe999c65..f77863d2 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -45,5 +45,5 @@ object ScorerDdSvn {
object Databricks {
- lazy val wrapper = "default.webcrawlerwrapper_2" % "12" % "0.1-SNAPSHOT"
+ lazy val wrapper = "com.kytheralabs" % "webcrawlerwrapper" % "2_12" % "0.1-SNAPSHOT"
}
\ No newline at end of file
From 6deaba8c85585607ca92ca1cb9f8c321ce4dbb9e Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 21:09:58 +0100
Subject: [PATCH 106/335] updates
---
sparkler-core/build.sbt | 19 +++----------------
sparkler-core/plugins.build.sbt | 12 ------------
.../project/PluginDependencies.scala | 2 +-
sparkler-core/project/Settings.scala | 2 +-
sparkler-core/project/plugins.sbt | 2 +-
.../kytheralabs/databricks/DatabricksAPI.java | 9 ++-------
.../kytheralabs/databricks/Persistence.java | 3 ---
7 files changed, 8 insertions(+), 41 deletions(-)
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index f8185126..afd22bfa 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -16,7 +16,7 @@
*/
import scala.sys.process._
-
+import com.gilcloud.sbt.gitlab.{GitlabCredentials,GitlabPlugin}
organization := Settings.projectOrganization
maintainer := Settings.projectMaintainer
@@ -29,7 +29,8 @@ javacOptions in (Compile, compile) ++= Seq("-target", "13")
libraryDependencies in ThisBuild ++= Seq(
Dependencies.pf4j % "provided",
)
-
+ThisBuild / useCoursier := false
+GitlabPlugin.autoImport.gitlabCredentials := Some(GitlabCredentials("Private-Token","_5w57W4QPjWFeKezV91y"))
developers := List(
// In alphabetic order
@@ -64,7 +65,6 @@ lazy val root = (project in file("."))
Settings.common,
name := "sparkler",
mainClass in Compile := Some("edu.usc.irds.sparkler.Main"),
- githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN")
)
.aggregate(api, app, plugins, ui)
@@ -73,15 +73,6 @@ lazy val api = (project in file("sparkler-api"))
.settings(
Settings.common,
name := "sparkler-api",
- /*libraryDependencies ++= (
- if(sparkprovided == "true") {
- ("org.apache.spark" %% "spark-core" % "3.1.0" % "provided") :: Nil
- ("org.apache.spark" %% "spark-sql" % "3.1.0" % "provided") :: Nil
- } else {
- ("org.apache.spark" %% "spark-core" % "3.1.0" % "provided") :: Nil
- ("org.apache.spark" %% "spark-sql" % "3.1.0" % "provided") :: Nil
- }
- ),*/
libraryDependencies ++= Seq(
Dependencies.jsonSimple exclude("junit", "junit"),
Dependencies.nutch exclude("*", "*"),
@@ -135,7 +126,6 @@ lazy val api = (project in file("sparkler-api"))
testOptions += Tests.Argument(TestFrameworks.JUnit,
"--verbosity=1",
"--run-listener=edu.usc.irds.sparkler.test.WebServerRunListener"),
- githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN")
)
@@ -223,7 +213,6 @@ lazy val app = (project in file("sparkler-app"))
buildLocation
},
- githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN")
)
.dependsOn(api)
@@ -239,7 +228,6 @@ lazy val testsBase = (project in file("sparkler-tests-base"))
Dependencies.Slf4j.api,
Dependencies.Slf4j.log4j12,
),
- githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
)
@@ -279,7 +267,6 @@ lazy val ui = (project in file("sparkler-ui"))
IO.move(packageFile, buildLocation)
buildLocation
},
- githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN")
)
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index d8ce7533..a2b50850 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -24,8 +24,6 @@ lazy val plugins = (project in file(s"$sparklerPlugins"))
.settings(
Settings.common,
name := "sparkler-plugins",
- githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
-
)
.aggregate(
@@ -54,7 +52,6 @@ lazy val templatePlugin = (project in file(s"$sparklerPlugins/template-plugin"))
className = "edu.usc.irds.sparkler.plugin.MyPluginActivator",
dependencies = List.empty
),
- githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
)
.dependsOn(api)
@@ -80,7 +77,6 @@ lazy val fetcherChrome = (project in file(s"$sparklerPlugins/fetcher-chrome"))
className = "edu.usc.irds.sparkler.plugin.FetcherChromeActivator",
dependencies = List.empty
),
- githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
)
.dependsOn(api)
@@ -98,7 +94,6 @@ lazy val databricks = (project in file(s"$sparklerPlugins/databricks-api-plugin"
className = "com.kytheralabs.databricks.DatabricksAPIActivator",
dependencies = List.empty
),
- githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
)
.dependsOn(api)
@@ -119,7 +114,6 @@ lazy val fetcherHtmlUnit = (project in file(s"$sparklerPlugins/fetcher-htmlunit"
testOptions += Tests.Argument(TestFrameworks.JUnit,
"--verbosity=1",
"--run-listener=edu.usc.irds.sparkler.test.WebServerRunListener"),
- githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
)
.dependsOn(api)
@@ -153,8 +147,6 @@ lazy val scorerDdSvn = (project in file(s"$sparklerPlugins/scorer-dd-svn"))
className = "edu.usc.irds.sparkler.plugin.DdSvnScorerActivator",
dependencies = List.empty
),
- githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
-
)
.dependsOn(api)
@@ -168,8 +160,6 @@ lazy val urlFilterRegex = (project in file(s"$sparklerPlugins/urlfilter-regex"))
className = "edu.usc.irds.sparkler.plugin.RegexURLFilterActivator",
dependencies = List.empty
),
- githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
-
)
.dependsOn(api)
@@ -183,7 +173,5 @@ lazy val urlFilterSameHost = (project in file(s"$sparklerPlugins/urlfilter-sameh
className = "edu.usc.irds.sparkler.plugin.UrlFilterSameHostActivator",
dependencies = List.empty
),
- githubTokenSource := TokenSource.GitConfig("github.token") || TokenSource.Environment("GITHUB_TOKEN"),
-
)
.dependsOn(api)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index f77863d2..9e231c6b 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -45,5 +45,5 @@ object ScorerDdSvn {
object Databricks {
- lazy val wrapper = "com.kytheralabs" % "webcrawlerwrapper" % "2_12" % "0.1-SNAPSHOT"
+ lazy val wrapper = "com.kytheralabs" % "webcrawlerwrapper_2.12" % "0.1-SNAPSHOT"
}
\ No newline at end of file
diff --git a/sparkler-core/project/Settings.scala b/sparkler-core/project/Settings.scala
index 86c2535e..1f5bb69e 100644
--- a/sparkler-core/project/Settings.scala
+++ b/sparkler-core/project/Settings.scala
@@ -64,7 +64,7 @@ object Settings {
"Scala-Tools Snapshots" at "https://scala-tools.org/repo-snapshots/",
"Gitlab Spicule 2" at "https://gitlab.com/api/v4/projects/26391218/packages/maven",
"Gitlab Spicule" at "https://gitlab.com/api/v4/projects/23300400/packages/maven",
- "Private Github" at "https://maven.pkg.github.com/spicule-kythera/webcrawlerwrapper/"
+ "Private Github" at "https://gitlab.com/api/v4/projects/28025579/packages/maven"
)
diff --git a/sparkler-core/project/plugins.sbt b/sparkler-core/project/plugins.sbt
index a9779901..3ce74dcf 100644
--- a/sparkler-core/project/plugins.sbt
+++ b/sparkler-core/project/plugins.sbt
@@ -19,4 +19,4 @@ addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.7.4")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.13")
addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.13")
-addSbtPlugin("com.codecommit" % "sbt-github-packages" % "0.5.3")
\ No newline at end of file
+addSbtPlugin("com.gilcloud" % "sbt-gitlab" % "0.0.6")
\ No newline at end of file
diff --git a/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
index ccd1c4c8..ac8bb837 100644
--- a/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
+++ b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
@@ -4,8 +4,6 @@
import edu.usc.irds.sparkler.AbstractExtensionPoint;
import edu.usc.irds.sparkler.GenericProcess;
import edu.usc.irds.sparkler.SparklerConfiguration;
-import com.kytheralabs.management.jobutils.JobAPI;
-import org.apache.spark.sql.SparkSession;
import org.json.simple.JSONObject;
import org.pf4j.Extension;
import scala.Option;
@@ -32,7 +30,7 @@ public void executeProcess(GenericProcess.Event event, Object spark) throws Exce
} else if(entry.getKey().equals("updateeventlog")){
updateEventLog((Map) entry.getValue());
} else if(entry.getKey().equals("persistdata")){
- persistData((Map) entry.getValue(), (SparkSession) spark);
+ persistData((Map) entry.getValue());
}
}
}
@@ -71,11 +69,8 @@ private void triggerJob(Map map){
}
- private void persistData(Map map, SparkSession spark){
- Persistence p = new Persistence();
+ private void persistData(Map map){
- p.persistResults(this.jobContext.getId(), this.jobContext.getId(), "warehouse_sandbox",
- spark, "http://ec2-35-174-200-133.compute-1.amazonaws.com:8983/solr/crawldb");
}
diff --git a/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/Persistence.java b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/Persistence.java
index 6a0af237..a901e48b 100644
--- a/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/Persistence.java
+++ b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/Persistence.java
@@ -1,12 +1,9 @@
package com.kytheralabs.databricks;
import com.google.gson.Gson;
-import org.apache.spark.sql.SparkSession;
public class Persistence {
Gson gson = new Gson();
- public void persistResults(String crawlId, String pTableName, String warehousename, SparkSession spark, String s) {
- }
}
From 731694818518e91b04cdf33a6ecc5a0f670f1c97 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 21:14:02 +0100
Subject: [PATCH 107/335] updates
---
sparkler-core/build.sbt | 3 ---
1 file changed, 3 deletions(-)
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index afd22bfa..8546b039 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -16,7 +16,6 @@
*/
import scala.sys.process._
-import com.gilcloud.sbt.gitlab.{GitlabCredentials,GitlabPlugin}
organization := Settings.projectOrganization
maintainer := Settings.projectMaintainer
@@ -29,8 +28,6 @@ javacOptions in (Compile, compile) ++= Seq("-target", "13")
libraryDependencies in ThisBuild ++= Seq(
Dependencies.pf4j % "provided",
)
-ThisBuild / useCoursier := false
-GitlabPlugin.autoImport.gitlabCredentials := Some(GitlabCredentials("Private-Token","_5w57W4QPjWFeKezV91y"))
developers := List(
// In alphabetic order
From 9397063d297166fe2516a087139ee80441fc500f Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 21:17:15 +0100
Subject: [PATCH 108/335] updates
---
.../src/main/java/com/kytheralabs/databricks/DatabricksAPI.java | 1 +
1 file changed, 1 insertion(+)
diff --git a/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
index ac8bb837..7b2b63e8 100644
--- a/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
+++ b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
@@ -1,6 +1,7 @@
package com.kytheralabs.databricks;
+import com.kytheralabs.management.jobutils.JobAPI;
import edu.usc.irds.sparkler.AbstractExtensionPoint;
import edu.usc.irds.sparkler.GenericProcess;
import edu.usc.irds.sparkler.SparklerConfiguration;
From 05df3910fd1910adac94350cacbde8db1b9d030c Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 21:35:37 +0100
Subject: [PATCH 109/335] remove poms
---
sparkler-core/build.sbt | 2 +-
sparkler-core/sparkler-api/pom.xml | 118 -----
sparkler-core/sparkler-app/pom.xml | 464 ------------------
.../usc/irds/sparkler/service/Injector.scala | 9 -
.../sparkler-plugins/fetcher-htmlunit/pom.xml | 70 ---
.../sparkler-plugins/fetcher-jbrowser/pom.xml | 72 ---
sparkler-core/sparkler-plugins/pom.xml | 148 ------
.../sparkler-plugins/template-plugin/pom.xml | 52 --
.../sparkler-plugins/url-injector/pom.xml | 52 --
.../urlfilter-samehost/pom.xml | 52 --
sparkler-core/sparkler-tests-base/pom.xml | 50 --
11 files changed, 1 insertion(+), 1088 deletions(-)
delete mode 100644 sparkler-core/sparkler-api/pom.xml
delete mode 100644 sparkler-core/sparkler-app/pom.xml
delete mode 100644 sparkler-core/sparkler-plugins/fetcher-htmlunit/pom.xml
delete mode 100644 sparkler-core/sparkler-plugins/fetcher-jbrowser/pom.xml
delete mode 100644 sparkler-core/sparkler-plugins/pom.xml
delete mode 100644 sparkler-core/sparkler-plugins/template-plugin/pom.xml
delete mode 100644 sparkler-core/sparkler-plugins/url-injector/pom.xml
delete mode 100644 sparkler-core/sparkler-plugins/urlfilter-samehost/pom.xml
delete mode 100644 sparkler-core/sparkler-tests-base/pom.xml
diff --git a/sparkler-core/build.sbt b/sparkler-core/build.sbt
index 8546b039..06e1730c 100644
--- a/sparkler-core/build.sbt
+++ b/sparkler-core/build.sbt
@@ -156,7 +156,7 @@ lazy val app = (project in file("sparkler-app"))
Dependencies.pf4j,
Dependencies.Solr.core,
Dependencies.tikaParsers,
-
+ "org.scala-lang.modules" %% "scala-collection-compat" % "2.5.0"
),
assemblyMergeStrategy in assembly := {
case x if x.contains("io.netty.versions.properties") => MergeStrategy.first
diff --git a/sparkler-core/sparkler-api/pom.xml b/sparkler-core/sparkler-api/pom.xml
deleted file mode 100644
index 6a0dc625..00000000
--- a/sparkler-core/sparkler-api/pom.xml
+++ /dev/null
@@ -1,118 +0,0 @@
-
-
-
-
- sparkler-parent
- edu.usc.irds.sparkler
- 0.2.2-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- sparkler-api
- jar
-
- sparkler-api
- http://irds.usc.edu/sparkler/
-
- UTF-8
-
-
-
-
- ..${file.separator}${project.conf.dir}
-
-
-
-
- maven-compiler-plugin
-
-
-
- org.apache.maven.plugins
- maven-surefire-plugin
- 3.0.0-M5
-
-
-
- listener
- edu.usc.irds.sparkler.test.WebServerRunListener
-
-
-
-
-
-
-
-
-
- org.slf4j
- slf4j-api
- ${slf4j.version}
-
-
- org.slf4j
- slf4j-log4j12
- ${slf4j.version}
-
-
- org.apache.nutch
- nutch
- ${nutch.version}
-
-
- *
- *
-
-
-
-
- org.yaml
- snakeyaml
- ${snakeyaml.version}
-
-
- com.googlecode.json-simple
- json-simple
- ${json.simple.version}
-
-
- org.apache.solr
- solr-solrj
- ${solr.version}
-
-
- edu.usc.irds.sparkler
- sparkler-tests-base
- ${project.version}
- test
-
-
- org.pf4j
- pf4j
- provided
-
-
- com.google.code.gson
- gson
- 2.8.6
-
-
-
-
diff --git a/sparkler-core/sparkler-app/pom.xml b/sparkler-core/sparkler-app/pom.xml
deleted file mode 100644
index 802a4bbd..00000000
--- a/sparkler-core/sparkler-app/pom.xml
+++ /dev/null
@@ -1,464 +0,0 @@
-
-
-
-
- sparkler-parent
- edu.usc.irds.sparkler
- 0.2.2-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- sparkler-app
- jar
-
- sparkler
- http://irds.usc.edu/sparkler/
-
-
- ${project.parent.basedir}${file.separator}${project.conf.dir}
- ${project.parent.basedir}${file.separator}${project.resources.dir}
-
- edu.usc.irds.sparkler.Main
-
-
-
-
-
-
- org.seleniumhq.selenium
- selenium-java
- 3.141.59
-
-
- org.seleniumhq.selenium
- selenium-api
- 3.141.59
-
-
-
-
- edu.usc.irds.sparkler
- sparkler-api
- ${project.version}
-
-
-
- org.pf4j
- pf4j
- ${pf4j.version}
-
-
-
- org.apache.spark
- spark-core_${version.scala.epoch}
- ${spark.version}
-
-
-
-
- org.apache.spark
- spark-sql_${version.scala.epoch}
- ${spark.version}
-
-
-
-
- org.apache.nutch
- nutch
- ${nutch.version}
-
-
- org.apache.tika
- tika-core
-
-
-
-
-
- org.apache.kafka
- kafka-clients
- ${kafka.version}
-
-
-
-
- org.apache.solr
- solr-solrj
- ${solr.version}
-
-
- org.apache.solr
- solr-core
- ${solr.version}
-
-
- org.apache.tika
- tika-parsers
- ${tika.version}
-
-
-
-
- com.fasterxml.jackson.core
- jackson-databind
- 2.10.0
-
-
-
- com.fasterxml.jackson.core
- jackson-core
- 2.10.0
-
-
-
- args4j
- args4j
- ${args4j.version}
-
-
- org.slf4j
- slf4j-api
- ${slf4j.version}
-
-
- org.slf4j
- slf4j-log4j12
- ${slf4j.version}
-
-
- commons-validator
- commons-validator
- ${commons.validator.version}
-
-
-
- com.googlecode.json-simple
- json-simple
- 1.1.1
-
-
-
-
-
-
- junit
- junit
- ${junit.version}
- test
-
-
-
-
-
-
-
- ${sparkler.resources.dir}
-
-
-
-
- org.apache.maven.plugins
- maven-shade-plugin
- 3.2.2
-
-
- package
-
- shade
-
-
- ${project.parent.basedir}${file.separator}build/${project.artifactId}-${project.version}.jar
- false
- false
-
-
-
- *
-
-
-
-
- *:*
-
- META-INF/*.SF
- META-INF/*.DSA
- META-INF/*.RSA
-
-
-
-
-
- reference.conf
-
-
-
- ${exec.mainClass}
-
-
-
-
-
-
-
- org.apache.http
- shaded.org.apache.http
-
-
-
-
-
-
-
- com.carrotgarden.maven
- scalor-maven-plugin_2.12
- 1.5.0.20190502185130
-
-
-
-
-
- org.scala-sbt
- compiler-bridge_${version.scala.epoch}
- ${version.scala.zinc}
-
-
-
-
-
-
- org.scala-lang
- scala-compiler
- ${version.scala.release}
-
-
-
-
-
-
- org.scalamacros
- paradise_${version.scala.release}
- ${version.scala.plugin.macro}
-
-
-
-
-
-
-
-
-
- setup-cross
- eclipse-config
-
- register-macro
- register-main
- register-test
-
- compile-macro
- compile-main
- compile-test
-
- scala-js-link-main
- scala-js-link-test
-
-
-
-
-
-
-
-
- org.scalastyle
- scalastyle-maven-plugin
- 0.8.0
-
- false
- true
- true
- false
- ${basedir}/src/main/scala
- ${basedir}/src/test/scala
- ${project.parent.basedir}/scalastyle_config.xml
- ${project.basedir}/target/scalastyle-output.xml
-
- UTF-8
-
-
-
-
- check
-
-
-
-
-
- maven-assembly-plugin
- 2.5.3
-
- src/assembly/dep.xml
- posix
-
-
-
-
-
-
- maven-resources-plugin
- 2.6
-
-
- copy-bins
- validate
-
- copy-resources
-
-
- ${project.parent.basedir}${file.separator}${project.bins.dir}
-
-
- ${project.parent.basedir}${file.separator}bin
- false
-
-
-
-
-
- copy-resources
- validate
-
- copy-resources
-
-
- ${project.parent.basedir}${file.separator}${project.resources.dir}
-
-
- ${project.parent.basedir}${file.separator}conf
- true
-
-
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-antrun-plugin
- 1.6
-
-
- fix-perms
- validate
-
-
-
-
-
-
- run
-
-
-
-
-
-
-
-
- sbt
-
-
-
-
- net.alchim31.maven
- scala-maven-plugin
-
-
- scala-compile-first
- none
-
- add-source
- compile
-
-
-
- scala-test-compile
- none
-
- testCompile
-
-
-
-
-
- org.codehaus.mojo
- exec-maven-plugin
- 1.6.0
-
-
- process-resources
-
- exec
-
-
-
-
- sbt
-
- .
-
- package
-
-
-
-
- maven-antrun-plugin
-
-
- process-resources
-
-
-
-
-
-
-
-
- run
-
-
-
-
-
-
-
-
-
-
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala
index 1c6325d9..0ccdc81f 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala
@@ -30,7 +30,6 @@ import scala.collection.JavaConversions._
import scala.io.Source
import java.nio.file.NotDirectoryException
import org.apache.commons.validator.routines.UrlValidator
-import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.Stack
import scala.collection.mutable.ArrayBuffer
@@ -71,14 +70,6 @@ class Injector extends CliTool {
var configOverride: Array[Any] = Array()
override def run(): Unit = {
- //val sconf = new SparkConf().setAppName("sparkler-job")
- //val sc = new SparkContext(sconf)
- //val logFile = "/home/bugg/Projects/spark-3.0.2-bin-hadoop2.7/README.md"
- //val logData = sc.textFile(logFile, 2).cache()
- //val numAs = logData.filter(line => line.contains("a")).count()
- //val numBs = logData.filter(line => line.contains("b")).count()
- //println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))
- //println("SU: " + seedUrls.mkString(","))
if (configOverride != ""){
conf.overloadConfig(configOverride.mkString(" "));
}
diff --git a/sparkler-core/sparkler-plugins/fetcher-htmlunit/pom.xml b/sparkler-core/sparkler-plugins/fetcher-htmlunit/pom.xml
deleted file mode 100644
index 30e83203..00000000
--- a/sparkler-core/sparkler-plugins/fetcher-htmlunit/pom.xml
+++ /dev/null
@@ -1,70 +0,0 @@
-
-
-
-
- sparkler-plugins
- edu.usc.irds.sparkler.plugin
- 0.2.2-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- fetcher-htmlunit
- jar
-
- fetcher-htmlunit
- http://maven.apache.org
-
- UTF-8
- 2.43.0
- ${project.artifactId}
- edu.usc.irds.sparkler.plugin.HtmlUnitFetcherActivator
-
-
-
- net.sourceforge.htmlunit
- htmlunit
- ${htmlunit.version}
-
-
-
-
-
- maven-assembly-plugin
-
-
-
- maven-jar-plugin
-
-
- org.apache.maven.plugins
- maven-surefire-plugin
- 3.0.0-M5
-
-
-
- listener
- edu.usc.irds.sparkler.test.WebServerRunListener
-
-
-
-
-
-
-
diff --git a/sparkler-core/sparkler-plugins/fetcher-jbrowser/pom.xml b/sparkler-core/sparkler-plugins/fetcher-jbrowser/pom.xml
deleted file mode 100644
index 78805f8c..00000000
--- a/sparkler-core/sparkler-plugins/fetcher-jbrowser/pom.xml
+++ /dev/null
@@ -1,72 +0,0 @@
-
-
-
-
- sparkler-plugins
- edu.usc.irds.sparkler.plugin
- 0.2.2-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- fetcher-jbrowser
- jar
-
- fetcher-jbrowser
- http://maven.apache.org
-
- UTF-8
- 1.1.1
- ${project.artifactId}
- edu.usc.irds.sparkler.plugin.FetcherJBrowserActivator
-
-
-
- com.machinepublishers
- jbrowserdriver
- ${jbrowserdriver.version}
-
-
- slf4j-api
- org.slf4j
-
-
-
-
- org.seleniumhq.selenium
- selenium-java
- 3.141.59
-
-
- org.seleniumhq.selenium
- selenium-api
- 3.141.59
-
-
-
-
-
- maven-assembly-plugin
-
-
- maven-jar-plugin
-
-
-
-
diff --git a/sparkler-core/sparkler-plugins/pom.xml b/sparkler-core/sparkler-plugins/pom.xml
deleted file mode 100644
index 165d93c4..00000000
--- a/sparkler-core/sparkler-plugins/pom.xml
+++ /dev/null
@@ -1,148 +0,0 @@
-
-
-
-
-
-
- sparkler-parent
- edu.usc.irds.sparkler
- 0.2.2-SNAPSHOT
- ../pom.xml
-
- edu.usc.irds.sparkler.plugin
- sparkler-plugins
- 4.0.0
- pom
- http://irds.usc.edu/sparkler/
-
-
- urlfilter-regex
- scorer-dd-svn
- fetcher-jbrowser
- fetcher-htmlunit
- fetcher-chrome
- urlfilter-samehost
- url-injector
-
- template-plugin
-
-
-
- include-chrome
-
- urlfilter-regex
- scorer-dd-svn
- fetcher-jbrowser
- fetcher-htmlunit
- fetcher-chrome
- urlfilter-samehost
- url-injector
- template-plugin
-
-
-
-
- ${project.version}
- ${project.groupId}
-
-
-
-
-
-
-
- edu.usc.irds.sparkler
- sparkler-api
- ${project.parent.version}
- provided
-
-
-
-
-
- org.pf4j
- pf4j
- ${pf4j.version}
- provided
-
-
- edu.usc.irds.sparkler
- sparkler-tests-base
- ${project.version}
- test
-
-
-
-
-
-
-
- maven-assembly-plugin
-
- false
- ${project.parent.parent.basedir}${file.separator}build${file.separator}plugins
-
- jar-with-dependencies
-
-
-
- ${plugin.id}
- ${plugin.class}
- ${plugin.version}
- ${plugin.provider}
- ${plugin.dependencies}
-
-
-
-
-
- make-assembly
- package
-
- single
-
-
-
-
-
- org.apache.maven.plugins
- maven-jar-plugin
- 2.4
-
-
-
- ${plugin.id}
- ${plugin.class}
- ${plugin.version}
- ${plugin.provider}
- ${plugin.dependencies}
-
-
-
-
-
-
-
-
diff --git a/sparkler-core/sparkler-plugins/template-plugin/pom.xml b/sparkler-core/sparkler-plugins/template-plugin/pom.xml
deleted file mode 100644
index bfbb6fa9..00000000
--- a/sparkler-core/sparkler-plugins/template-plugin/pom.xml
+++ /dev/null
@@ -1,52 +0,0 @@
-
-
-
-
- sparkler-plugins
- edu.usc.irds.sparkler.plugin
- 0.2.2-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- template-plugin
- jar
-
- template-plugin
- http://maven.apache.org
-
- UTF-8
- ${project.artifactId}
- ${project.version}
- ${project.groupId}
- edu.usc.irds.sparkler.plugin.MyPluginActivator
-
-
-
-
-
-
- maven-assembly-plugin
-
-
- maven-jar-plugin
-
-
-
-
diff --git a/sparkler-core/sparkler-plugins/url-injector/pom.xml b/sparkler-core/sparkler-plugins/url-injector/pom.xml
deleted file mode 100644
index 0e09d4b4..00000000
--- a/sparkler-core/sparkler-plugins/url-injector/pom.xml
+++ /dev/null
@@ -1,52 +0,0 @@
-
-
-
-
- sparkler-plugins
- edu.usc.irds.sparkler.plugin
- 0.2.2-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- url-injector
- jar
-
- url-injector
- http://maven.apache.org
-
- UTF-8
- ${project.artifactId}
- ${project.version}
- ${project.groupId}
- edu.usc.irds.sparkler.plugin.UrlInjectorActivator
-
-
-
-
-
-
- maven-assembly-plugin
-
-
- maven-jar-plugin
-
-
-
-
diff --git a/sparkler-core/sparkler-plugins/urlfilter-samehost/pom.xml b/sparkler-core/sparkler-plugins/urlfilter-samehost/pom.xml
deleted file mode 100644
index c7592cbc..00000000
--- a/sparkler-core/sparkler-plugins/urlfilter-samehost/pom.xml
+++ /dev/null
@@ -1,52 +0,0 @@
-
-
-
-
- sparkler-plugins
- edu.usc.irds.sparkler.plugin
- 0.2.2-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- urlfilter-samehost
- jar
-
- urlfilter-samehost
- http://maven.apache.org
-
- UTF-8
- ${project.artifactId}
- ${project.version}
- ${project.groupId}
- edu.usc.irds.sparkler.plugin.UrlFilterSameHostActivator
-
-
-
-
-
-
- maven-assembly-plugin
-
-
- maven-jar-plugin
-
-
-
-
diff --git a/sparkler-core/sparkler-tests-base/pom.xml b/sparkler-core/sparkler-tests-base/pom.xml
deleted file mode 100644
index efb1b17b..00000000
--- a/sparkler-core/sparkler-tests-base/pom.xml
+++ /dev/null
@@ -1,50 +0,0 @@
-
-
- sparkler-parent
- edu.usc.irds.sparkler
- 0.2.2-SNAPSHOT
-
- 4.0.0
-
- sparkler-tests-base
- jar
-
- sparkler-tests-base
- http://maven.apache.org
-
-
- UTF-8
- 9.4.0.v20161208
-
-
-
-
- org.eclipse.jetty
- jetty-server
- ${jetty.version}
-
-
- org.eclipse.jetty
- jetty-servlet
- ${jetty.version}
-
-
- junit
- junit
-
-
- org.slf4j
- slf4j-api
- ${slf4j.version}
- provided
-
-
- org.slf4j
- slf4j-log4j12
- ${slf4j.version}
- provided
-
-
-
From 4c4be20cd177f67bf7c8810cb9519987cf3c641c Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 21:42:54 +0100
Subject: [PATCH 110/335] fix linebreaks
---
sparkler-core/bin/sparkler.sh | 1 +
1 file changed, 1 insertion(+)
diff --git a/sparkler-core/bin/sparkler.sh b/sparkler-core/bin/sparkler.sh
index 0e6f47fb..8f83401c 100755
--- a/sparkler-core/bin/sparkler.sh
+++ b/sparkler-core/bin/sparkler.sh
@@ -35,5 +35,6 @@ JAR=`echo $DIR/../sparkler-app-*/lib`
# exit 2
#fi
# run
+# debugging lines
# -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005
java -Xms1g -cp $DIR/../conf:$JAR/* -Dpf4j.pluginsDir=$DIR/../plugins edu.usc.irds.sparkler.Main $@
\ No newline at end of file
From 9aed461c170c62bffd8c91a0986d57ec528e6c16 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 21:48:40 +0100
Subject: [PATCH 111/335] update packaging
---
sparkler-core/deploy.sh | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/deploy.sh b/sparkler-core/deploy.sh
index b2305d29..728e4308 100755
--- a/sparkler-core/deploy.sh
+++ b/sparkler-core/deploy.sh
@@ -5,10 +5,11 @@ method=$1
pip install databricks-cli
if [ "$method" = "standalone" ]; then
- ls
+ rm -rf build/sparkler-app-0.3.1-SNAPSHOT
+ mv build sparkler
+ zip -r sparkler.zip sparkler
~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-standalone/
else
rm -rf build/sparkler-app-0.3.1-SNAPSHOT
-
~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
fi
From ad4864d4076c56008d71e80fae5f32489889d9e7 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 21:56:04 +0100
Subject: [PATCH 112/335] update packaging
---
sparkler-core/deploy.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/deploy.sh b/sparkler-core/deploy.sh
index 728e4308..2c403a17 100755
--- a/sparkler-core/deploy.sh
+++ b/sparkler-core/deploy.sh
@@ -8,7 +8,7 @@ if [ "$method" = "standalone" ]; then
rm -rf build/sparkler-app-0.3.1-SNAPSHOT
mv build sparkler
zip -r sparkler.zip sparkler
- ~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-standalone/
+ ~/.local/bin/databricks fs cp --recursive --overwrite sparkler.zip dbfs:/FileStore/sparkler-standalone/
else
rm -rf build/sparkler-app-0.3.1-SNAPSHOT
~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
From a2e3dcf01caeaa9ae8f079c0d5655c2e8bff5ba1 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 22:12:35 +0100
Subject: [PATCH 113/335] package
---
.github/workflows/build-sbt.yaml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index c4c6450a..aec8ae1e 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -19,7 +19,7 @@ jobs:
java-version: '8'
distribution: 'adopt'
- name: Run submit package
- run: sbt assembly -Dsparkprovided=true
+ run: sbt package assembly -Dsparkprovided=true
working-directory: sparkler-core
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -30,7 +30,7 @@ jobs:
DATABRICKS_HOST: https://kli-mmit.cloud.databricks.com
DATABRICKS_TOKEN: dapi88de017d8ab1fe80afe5691c1e786185
- name: Run full package
- run: sbt clean assembly -Dsparkprovided=false
+ run: sbt clean package assembly -Dsparkprovided=false
working-directory: sparkler-core
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
From 6e5b15bfa6c079633989d66406744341dd54ffc9 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 22:43:57 +0100
Subject: [PATCH 114/335] add missing url injector stuff
---
sparkler-core/plugins.build.sbt | 14 +++
sparkler-core/pom.xml | 191 --------------------------------
2 files changed, 14 insertions(+), 191 deletions(-)
delete mode 100644 sparkler-core/pom.xml
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index a2b50850..c449b211 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -33,6 +33,7 @@ lazy val plugins = (project in file(s"$sparklerPlugins"))
scorerDdSvn,
urlFilterRegex,
urlFilterSameHost,
+ urlInjector,
databricks,
)
//fetcherJBrowser,
@@ -150,6 +151,19 @@ lazy val scorerDdSvn = (project in file(s"$sparklerPlugins/scorer-dd-svn"))
)
.dependsOn(api)
+lazy val urlInjector = (project in file(s"$sparklerPlugins/url-injector"))
+ .enablePlugins(JavaAppPackaging)
+ .settings(
+ Settings.plugin,
+ name := "url-injector",
+ Settings.pluginManifest(
+ id = "url-injector",
+ className = "edu.usc.irds.sparkler.plugin.UrlInjector",
+ dependencies = List.empty
+ ),
+ )
+ .dependsOn(api)
+
lazy val urlFilterRegex = (project in file(s"$sparklerPlugins/urlfilter-regex"))
.enablePlugins(JavaAppPackaging)
.settings(
diff --git a/sparkler-core/pom.xml b/sparkler-core/pom.xml
deleted file mode 100644
index 20047d93..00000000
--- a/sparkler-core/pom.xml
+++ /dev/null
@@ -1,191 +0,0 @@
-
-
-
-
-
- sparkler-parent
- edu.usc.irds.sparkler
- 0.2.2-SNAPSHOT
- 4.0.0
- pom
-
- UTF-8
- conf
- build${file.separator}conf
- build${file.separator}bin
-
- 3.0.1
- 2.12
- ${version.scala.epoch}.12
- 0.6
- 0.6.27
- sjs${version.scalajs.epoch}_${version.scala.epoch}
-
-
- 1.2.5
- 2.1.1
- 1.16
- 0.10.0.0
- 1.2.1
- 8.5.0
- 1.7.30
- 1.26
- 1.1.1
- 1.5.1
- 0.16.4
- 4.5.2
-
-
- 3.6.1
- 2.5.0
- 2.1.1
- 2.4.1
-
-
- 4.13.1
- 2.0.29
- 1.7.12
- 1.24
- 2.6.0
-
-
- sparkler-api
- sparkler-tests-base
-
-
-
- gitlab-maven
- https://gitlab.com/api/v4/projects/23300400/packages/maven
-
- true
-
-
-
- central
- Maven Central
- default
- https://repo1.maven.org/maven2
-
- false
-
-
-
-
-
-
- junit
- junit
- ${junit.version}
-
-
- org.pf4j
- pf4j
- ${pf4j.version}
-
-
-
-
-
-
- maven-compiler-plugin
-
-
- maven-release-plugin
-
-
- maven-clean-plugin
- ${maven.clean.plugin.version}
-
-
-
- build
-
-
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-compiler-plugin
- 3.7.0
-
-
- 1.8
-
-
-
- net.alchim31.maven
- scala-maven-plugin
- 4.3.1
-
-
- org.apache.maven.plugins
- maven-release-plugin
- 3.0.0-M1
-
-
-
-
-
-
- all
-
- true
-
-
- sparkler-api
- sparkler-app
- sparkler-plugins
- sparkler-tests-base
- sparkler-ui
-
-
-
- core
-
- sparkler-app
-
-
-
- plugins
-
- sparkler-plugins
-
-
-
- http://irds.usc.edu/sparkler/
-
- scm:git:https://github.com/USCDataScience/sparkler.git
- scm:git:git@github.com:USCDataScience/sparkler.git
- https://github.com/USCDataScience/sparkler
- HEAD
-
-
-
- Thamme Gowda
- University of Southern California
- http://irds.usc.edu/
- http://github.com/thammegowda/
-
-
-
-
From 91882b945b801cd71d3d8ac16a1b87f506e8179e Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 23:17:13 +0100
Subject: [PATCH 115/335] add missing url injector stuff
---
sparkler-core/plugins.build.sbt | 2 +-
.../src/main/java/edu/usc/irds/sparkler/plugin/UrlInjector.java | 1 -
2 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/sparkler-core/plugins.build.sbt b/sparkler-core/plugins.build.sbt
index c449b211..472d4e23 100644
--- a/sparkler-core/plugins.build.sbt
+++ b/sparkler-core/plugins.build.sbt
@@ -158,7 +158,7 @@ lazy val urlInjector = (project in file(s"$sparklerPlugins/url-injector"))
name := "url-injector",
Settings.pluginManifest(
id = "url-injector",
- className = "edu.usc.irds.sparkler.plugin.UrlInjector",
+ className = "edu.usc.irds.sparkler.plugin.UrlInjectorActivator",
dependencies = List.empty
),
)
diff --git a/sparkler-core/sparkler-plugins/url-injector/src/main/java/edu/usc/irds/sparkler/plugin/UrlInjector.java b/sparkler-core/sparkler-plugins/url-injector/src/main/java/edu/usc/irds/sparkler/plugin/UrlInjector.java
index 6b59fcbd..89318d94 100644
--- a/sparkler-core/sparkler-plugins/url-injector/src/main/java/edu/usc/irds/sparkler/plugin/UrlInjector.java
+++ b/sparkler-core/sparkler-plugins/url-injector/src/main/java/edu/usc/irds/sparkler/plugin/UrlInjector.java
@@ -5,7 +5,6 @@
import edu.usc.irds.sparkler.SparklerConfiguration;
import edu.usc.irds.sparkler.SparklerException;
import edu.usc.irds.sparkler.UrlInjectorObj;
-import io.netty.handler.codec.json.JsonObjectDecoder;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
From fd5e17aa070a2d9bd9a361820d32c07613e9a806 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 23:17:53 +0100
Subject: [PATCH 116/335] add missing url injector stuff
---
sparkler-core/conf/sparkler-default.yaml | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index 7d2e85ce..37042c17 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -37,8 +37,7 @@ elasticsearch.uri: http://localhost:9200
# URL on which Apache Spark is running.
# Type: String. Default is "local[*]" for local mode.
-spark.master:
- #local[*]
+spark.master: local[*]
databricks.enable: false
crawl.repartition: 500
From 1030f22cacba5720596ada593a9054f461aca4e4 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 23:33:45 +0100
Subject: [PATCH 117/335] add missing url injector stuff
---
sparkler-core/conf/sparkler-default.yaml | 78 ++++++++++++------------
1 file changed, 39 insertions(+), 39 deletions(-)
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index 37042c17..c68d95fe 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -165,45 +165,45 @@ plugins:
# - "\"COR\""
# - "\"VEN\""
# - "\"SOM\""
- values:
- - Acitretin
- - Adempas
- - Actiq
- selenium:
- 1:
- operation: click
- value: id:some-id
- 2:
- operation: keys
- value: "id:some-input-id:${token}"
- 3:
- operation: click
- value: "id:some-id"
- json: "{ \"name\":\"John\", \"age\":${token}, \"car\":null }"
- form:
- hdnField: "submit"
- txtRequired: ""
- radSearchBy: "drugname"
- txtName: "${token}"
- selTC: ""
- selProgram: "MA"
- txtDateOfService: "12/01/2020"
- databricks.api.events:
- startup:
- updateeventlog:
- sql: xxx
- iteration_complete:
- updateeventlog:
- sql: xxx
- shutdown:
- triggerjob:
- notebook: xxx
- sparkversion: xxx
- clustertype: xxx
- clustersize: xxx
- parameters:
- abc: xxx
- def: xxx
+# values:
+# - Acitretin
+# - Adempas
+# - Actiq
+# selenium:
+# 1:
+# operation: click
+# value: id:some-id
+# 2:
+# operation: keys
+# value: "id:some-input-id:${token}"
+# 3:
+# operation: click
+# value: "id:some-id"
+# json: "{ \"name\":\"John\", \"age\":${token}, \"car\":null }"
+# form:
+# hdnField: "submit"
+# txtRequired: ""
+# radSearchBy: "drugname"
+# txtName: "${token}"
+# selTC: ""
+# selProgram: "MA"
+# txtDateOfService: "12/01/2020"
+# databricks.api.events:
+# startup:
+# updateeventlog:
+# sql: xxx
+# iteration_complete:
+# updateeventlog:
+# sql: xxx
+# shutdown:
+# triggerjob:
+# notebook: xxx
+# sparkversion: xxx
+# clustertype: xxx
+# clustersize: xxx
+# parameters:
+# abc: xxx
+# def: xxx
##################### Custom properties for MEMEX ###########################################
From 7d687e22e32d2b624b06397f3e0a4394f5db673c Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 9 Jul 2021 23:34:06 +0100
Subject: [PATCH 118/335] add missing url injector stuff
---
sparkler-core/conf/sparkler-default.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index c68d95fe..c0d8181e 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -157,7 +157,7 @@ plugins:
- "--disable-permissions-api"
#chrome.proxy.address: 127.0.0.1:9998
url.injector:
- mode: selenium # currently only compatible with the fetcher-chrome plugin
+ # mode: selenium # currently only compatible with the fetcher-chrome plugin
#mode: replace
#mode: json
#mode: form
From d10b0b6984cf370a689ae5782a77811db309b8e3 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sat, 10 Jul 2021 02:20:33 +0100
Subject: [PATCH 119/335] update samples
---
sparkler-core/conf/sparkler-default.yaml | 33 ++++++++++++------------
1 file changed, 17 insertions(+), 16 deletions(-)
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index c0d8181e..c87f17e3 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -188,22 +188,23 @@ plugins:
# selTC: ""
# selProgram: "MA"
# txtDateOfService: "12/01/2020"
-# databricks.api.events:
-# startup:
-# updateeventlog:
-# sql: xxx
-# iteration_complete:
-# updateeventlog:
-# sql: xxx
-# shutdown:
-# triggerjob:
-# notebook: xxx
-# sparkversion: xxx
-# clustertype: xxx
-# clustersize: xxx
-# parameters:
-# abc: xxx
-# def: xxx
+# databricks.api:
+# events:
+# startup:
+# updateeventlog:
+# sql: xxx
+# iteration_complete:
+# updateeventlog:
+# sql: xxx
+# shutdown:
+# triggerjob:
+# notebook: xxx
+# sparkversion: xxx
+# instancetype: xxx
+# clustersize: xxx
+# parameters:
+# abc: xxx
+# def: xxx
##################### Custom properties for MEMEX ###########################################
From b367f307ef5378c6edda11278af59ee3b4e4cfe9 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sat, 10 Jul 2021 02:55:20 +0100
Subject: [PATCH 120/335] deploy to other envs
---
.github/workflows/build-sbt.yaml | 27 ++++++++++++++++++---------
sparkler-core/deploy.sh | 16 ++++++++++++++++
2 files changed, 34 insertions(+), 9 deletions(-)
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index aec8ae1e..d0121dc7 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -1,10 +1,10 @@
name: Scala CI
-on:
- push:
- branches: [ mvn2sbt ]
- pull_request:
- branches: [ mvn2sbt ]
+#on:
+# push:
+# branches: [ mvn2sbt ]
+# pull_request:
+# branches: [ mvn2sbt ]
jobs:
build:
@@ -27,8 +27,13 @@ jobs:
run: ./deploy.sh
working-directory: sparkler-core
env:
- DATABRICKS_HOST: https://kli-mmit.cloud.databricks.com
- DATABRICKS_TOKEN: dapi88de017d8ab1fe80afe5691c1e786185
+ DEV_DATABRICKS_HOST: https://kli-mmit.cloud.databricks.com
+ DEV_DATABRICKS_TOKEN: dapi88de017d8ab1fe80afe5691c1e786185
+ NEWDEV_DATABRICKS_HOST: https://wayfinder-mmit-dev.cloud.databricks.com
+ NEWDEV_DATABRICKS_TOKEN: dapi37db0d0c0a9997dd315af28eb82f6acc
+ TEST_DATABRICKS_HOST: https://wayfinder-mmit-test.cloud.databricks.com
+ TEST_DATABRICKS_TOKEN: dapiad20608bca91d571d58e06c4f7dba4d9
+
- name: Run full package
run: sbt clean package assembly -Dsparkprovided=false
working-directory: sparkler-core
@@ -38,5 +43,9 @@ jobs:
run: ./deploy.sh standalone
working-directory: sparkler-core
env:
- DATABRICKS_HOST: https://kli-mmit.cloud.databricks.com
- DATABRICKS_TOKEN: dapi88de017d8ab1fe80afe5691c1e786185
+ DEV_DATABRICKS_HOST: https://kli-mmit.cloud.databricks.com
+ DEV_DATABRICKS_TOKEN: dapi88de017d8ab1fe80afe5691c1e786185
+ NEWDEV_DATABRICKS_HOST: https://wayfinder-mmit-dev.cloud.databricks.com
+ NEWDEV_DATABRICKS_TOKEN: dapi37db0d0c0a9997dd315af28eb82f6acc
+ TEST_DATABRICKS_HOST: https://wayfinder-mmit-test.cloud.databricks.com
+ TEST_DATABRICKS_TOKEN: dapiad20608bca91d571d58e06c4f7dba4d9
diff --git a/sparkler-core/deploy.sh b/sparkler-core/deploy.sh
index 2c403a17..b5a4c8c8 100755
--- a/sparkler-core/deploy.sh
+++ b/sparkler-core/deploy.sh
@@ -8,8 +8,24 @@ if [ "$method" = "standalone" ]; then
rm -rf build/sparkler-app-0.3.1-SNAPSHOT
mv build sparkler
zip -r sparkler.zip sparkler
+ export DATABRICKS_HOST=$DEV_DATABRICKS_HOST
+ export DATABRICKS_KEY=$DEV_DATABRICKS_KEY
+ ~/.local/bin/databricks fs cp --recursive --overwrite sparkler.zip dbfs:/FileStore/sparkler-standalone/
+ export DATABRICKS_HOST=$NEWDEV_DATABRICKS_HOST
+ export DATABRICKS_KEY=$NEWDEV_DATABRICKS_TOKEN
+ ~/.local/bin/databricks fs cp --recursive --overwrite sparkler.zip dbfs:/FileStore/sparkler-standalone/
+ export DATABRICKS_HOST=$TEST_DATABRICKS_HOST
+ export DATABRICKS_KEY=$TEST_DATABRICKS_TOKEN
~/.local/bin/databricks fs cp --recursive --overwrite sparkler.zip dbfs:/FileStore/sparkler-standalone/
else
rm -rf build/sparkler-app-0.3.1-SNAPSHOT
+ export DATABRICKS_HOST=$DEV_DATABRICKS_HOST
+ export DATABRICKS_KEY=$DEV_DATABRICKS_KEY
+ ~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
+ export DATABRICKS_HOST=$NEWDEV_DATABRICKS_HOST
+ export DATABRICKS_KEY=$NEWDEV_DATABRICKS_TOKEN
+ ~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
+ export DATABRICKS_HOST=$TEST_DATABRICKS_HOST
+ export DATABRICKS_KEY=$TEST_DATABRICKS_TOKEN
~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
fi
From 04065f011c732643667970f6b4a539ad178999a8 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sun, 11 Jul 2021 18:22:09 +0100
Subject: [PATCH 121/335] change params to parameters
---
.../main/java/com/kytheralabs/databricks/DatabricksAPI.java | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
index 7b2b63e8..34214ffd 100644
--- a/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
+++ b/sparkler-core/sparkler-plugins/databricks-api-plugin/src/main/java/com/kytheralabs/databricks/DatabricksAPI.java
@@ -51,8 +51,8 @@ private void triggerJob(Map map){
}
String params = "{}";
- if(map.containsKey("params")){
- JSONObject j = new JSONObject((Map) map.get("params"));
+ if(map.containsKey("parameters")){
+ JSONObject j = new JSONObject((Map) map.get("parameters"));
params = j.toJSONString();
}
From f9a9a4deb81573cf8be7a759eec5970e0d3b5d51 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sun, 11 Jul 2021 18:28:27 +0100
Subject: [PATCH 122/335] change params to parameters
---
.github/workflows/build-sbt.yaml | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/.github/workflows/build-sbt.yaml b/.github/workflows/build-sbt.yaml
index d0121dc7..75fb2874 100644
--- a/.github/workflows/build-sbt.yaml
+++ b/.github/workflows/build-sbt.yaml
@@ -1,10 +1,7 @@
name: Scala CI
-#on:
-# push:
-# branches: [ mvn2sbt ]
-# pull_request:
-# branches: [ mvn2sbt ]
+on:
+ push:
jobs:
build:
From 5451b7bc72cc80786683395898048ff18c9e3289 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sun, 11 Jul 2021 18:33:24 +0100
Subject: [PATCH 123/335] change params to parameters
---
sparkler-core/deploy.sh | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/deploy.sh b/sparkler-core/deploy.sh
index b5a4c8c8..cdddb567 100755
--- a/sparkler-core/deploy.sh
+++ b/sparkler-core/deploy.sh
@@ -9,7 +9,7 @@ if [ "$method" = "standalone" ]; then
mv build sparkler
zip -r sparkler.zip sparkler
export DATABRICKS_HOST=$DEV_DATABRICKS_HOST
- export DATABRICKS_KEY=$DEV_DATABRICKS_KEY
+ export DATABRICKS_KEY=$DEV_DATABRICKS_TOKEN
~/.local/bin/databricks fs cp --recursive --overwrite sparkler.zip dbfs:/FileStore/sparkler-standalone/
export DATABRICKS_HOST=$NEWDEV_DATABRICKS_HOST
export DATABRICKS_KEY=$NEWDEV_DATABRICKS_TOKEN
@@ -20,7 +20,7 @@ if [ "$method" = "standalone" ]; then
else
rm -rf build/sparkler-app-0.3.1-SNAPSHOT
export DATABRICKS_HOST=$DEV_DATABRICKS_HOST
- export DATABRICKS_KEY=$DEV_DATABRICKS_KEY
+ export DATABRICKS_KEY=$DEV_DATABRICKS_TOKEN
~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
export DATABRICKS_HOST=$NEWDEV_DATABRICKS_HOST
export DATABRICKS_KEY=$NEWDEV_DATABRICKS_TOKEN
From f6c07caa1fc3e388fdf45895551288c46557f16b Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sun, 11 Jul 2021 18:39:16 +0100
Subject: [PATCH 124/335] change params to parameters
---
sparkler-core/deploy.sh | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/sparkler-core/deploy.sh b/sparkler-core/deploy.sh
index cdddb567..8ba05c1a 100755
--- a/sparkler-core/deploy.sh
+++ b/sparkler-core/deploy.sh
@@ -9,23 +9,23 @@ if [ "$method" = "standalone" ]; then
mv build sparkler
zip -r sparkler.zip sparkler
export DATABRICKS_HOST=$DEV_DATABRICKS_HOST
- export DATABRICKS_KEY=$DEV_DATABRICKS_TOKEN
+ export DATABRICKS_TOKEN=$DEV_DATABRICKS_TOKEN
~/.local/bin/databricks fs cp --recursive --overwrite sparkler.zip dbfs:/FileStore/sparkler-standalone/
export DATABRICKS_HOST=$NEWDEV_DATABRICKS_HOST
- export DATABRICKS_KEY=$NEWDEV_DATABRICKS_TOKEN
+ export DATABRICKS_TOKEN=$NEWDEV_DATABRICKS_TOKEN
~/.local/bin/databricks fs cp --recursive --overwrite sparkler.zip dbfs:/FileStore/sparkler-standalone/
export DATABRICKS_HOST=$TEST_DATABRICKS_HOST
- export DATABRICKS_KEY=$TEST_DATABRICKS_TOKEN
+ export DATABRICKS_TOKEN=$TEST_DATABRICKS_TOKEN
~/.local/bin/databricks fs cp --recursive --overwrite sparkler.zip dbfs:/FileStore/sparkler-standalone/
else
rm -rf build/sparkler-app-0.3.1-SNAPSHOT
export DATABRICKS_HOST=$DEV_DATABRICKS_HOST
- export DATABRICKS_KEY=$DEV_DATABRICKS_TOKEN
+ export DATABRICKS_TOKEN=$DEV_DATABRICKS_TOKEN
~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
export DATABRICKS_HOST=$NEWDEV_DATABRICKS_HOST
- export DATABRICKS_KEY=$NEWDEV_DATABRICKS_TOKEN
+ export DATABRICKS_TOKEN=$NEWDEV_DATABRICKS_TOKEN
~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
export DATABRICKS_HOST=$TEST_DATABRICKS_HOST
- export DATABRICKS_KEY=$TEST_DATABRICKS_TOKEN
+ export DATABRICKS_TOKEN=$TEST_DATABRICKS_TOKEN
~/.local/bin/databricks fs cp --recursive --overwrite build/ dbfs:/FileStore/sparkler-submit/
fi
From 8d30eb26f1ee54182f44a70c7ed5c15d497bccd0 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 12 Jul 2021 13:04:20 +0100
Subject: [PATCH 125/335] update scripter
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 9e231c6b..c3cc5e5c 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.4-20210707.233519-5"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.4-20210712.093510-6"
}
object FetcherHtmlUnit {
From f00c89751497b8e839798d05e960954ff8748a61 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 12 Jul 2021 14:01:13 +0100
Subject: [PATCH 126/335] add base64 encoded endpoints
---
.../edu/usc/irds/sparkler/pipeline/Crawler.scala | 13 +++++++++++++
.../edu/usc/irds/sparkler/service/Injector.scala | 12 ++++++++++++
2 files changed, 25 insertions(+)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index cd58ef74..cf7b3569 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -117,6 +117,11 @@ class Crawler extends CliTool {
usage = "Configuration override. JSON Blob, key values in this take priority over config values in the config file.")
var configOverride: Array[Any] = Array()
+ @Option(name = "-co64", aliases = Array("--config-override-encoded"),
+ handler = classOf[StringArrayOptionHandler],
+ usage = "Configuration override. JSON Blob, key values in this take priority over config values in the config file.")
+ var configOverrideEncoded: String = ""
+
/* Generator options, currently not exposed via the CLI
and only accessible through the config yaml file
*/
@@ -130,6 +135,14 @@ class Crawler extends CliTool {
if (configOverride != ""){
sparklerConf.overloadConfig(configOverride.mkString(" "));
}
+ if(configOverrideEncoded != ""){
+ import java.util.Base64
+ import java.nio.charset.StandardCharsets
+ val decoded = Base64.getDecoder().decode(configOverrideEncoded)
+ val str = new String(decoded, StandardCharsets.UTF_8)
+ sparklerConf.overloadConfig(str)
+ }
+
if (this.outputPath.isEmpty) {
this.outputPath = jobId
}
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala
index 0ccdc81f..360ea8a1 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/Injector.scala
@@ -69,10 +69,22 @@ class Injector extends CliTool {
usage = "Configuration override. JSON Blob, key values in this take priority over config values in the config file.")
var configOverride: Array[Any] = Array()
+ @Option(name = "-co64", aliases = Array("--config-override-encoded"),
+ handler = classOf[StringArrayOptionHandler],
+ usage = "Configuration override. JSON Blob, key values in this take priority over config values in the config file.")
+ var configOverrideEncoded: String = ""
+
override def run(): Unit = {
if (configOverride != ""){
conf.overloadConfig(configOverride.mkString(" "));
}
+ if(configOverrideEncoded != ""){
+ import java.util.Base64
+ import java.nio.charset.StandardCharsets
+ val decoded = Base64.getDecoder().decode(configOverrideEncoded)
+ val str = new String(decoded, StandardCharsets.UTF_8)
+ conf.overloadConfig(str)
+ }
if (!sparkStorage.isEmpty) {
val uri = conf.asInstanceOf[java.util.HashMap[String, String]]
uri.put("crawldb.uri", sparkStorage)
From 35b5167315955269821247575e1f3a419bc249bb Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 12 Jul 2021 16:04:34 +0100
Subject: [PATCH 127/335] update repartition
---
sparkler-core/conf/sparkler-default.yaml | 2 +-
.../service/CustomerPluginManager.java | 21 ---------------
.../sparkler/service/PluginManagerLoader.java | 27 -------------------
3 files changed, 1 insertion(+), 49 deletions(-)
delete mode 100644 sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/CustomerPluginManager.java
delete mode 100644 sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginManagerLoader.java
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index c87f17e3..ba2214da 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -40,7 +40,7 @@ elasticsearch.uri: http://localhost:9200
spark.master: local[*]
databricks.enable: false
-crawl.repartition: 500
+crawl.repartition: 1
##################### Apache Kafka Properties ###########################
# Enable Kafka Dump
# Type: Boolean. Default is "false"
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/CustomerPluginManager.java b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/CustomerPluginManager.java
deleted file mode 100644
index bcdc4e36..00000000
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/CustomerPluginManager.java
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
-package edu.usc.irds.sparkler.service;
-
-import org.pf4j.*;
-
-public class CustomerPluginManager {
-
- public static DefaultPluginManager getPluginManager(){
- return new DefaultPluginManager(){
- @Override
- protected PluginLoader createPluginLoader() {
- return new CompoundPluginLoader()
- .add(new PluginManagerLoader(this), this::isNotDevelopment);
- //.add(new JarPluginLoader(this), this::isNotDevelopment)
- //.add(new DefaultPluginLoader(this), this::isNotDevelopment);
-
- }
- };
- }
-}
-*/
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginManagerLoader.java b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginManagerLoader.java
deleted file mode 100644
index 6f13595b..00000000
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/service/PluginManagerLoader.java
+++ /dev/null
@@ -1,27 +0,0 @@
-package edu.usc.irds.sparkler.service;
-
-import org.pf4j.*;
-
-import java.nio.file.Path;
-
-public class PluginManagerLoader extends JarPluginLoader {
- public PluginManagerLoader(PluginManager pluginManager) {
- super(pluginManager);
- }
-
- @Override
- public ClassLoader loadPlugin(Path pluginPath, PluginDescriptor pluginDescriptor) {
- if(pluginPath.toString().contains("fetcher-chrome")) {
- PluginClassLoader pluginClassLoader = new PluginClassLoader(pluginManager, pluginDescriptor, getClass().getClassLoader(), ClassLoadingStrategy.PDA);
- pluginClassLoader.addFile(pluginPath.toFile());
- return pluginClassLoader;
- } else{
- PluginClassLoader pluginClassLoader = new PluginClassLoader(pluginManager, pluginDescriptor, getClass().getClassLoader());
- pluginClassLoader.addFile(pluginPath.toFile());
- return pluginClassLoader;
- }
-
-
- }
-
-}
From 6a67c1da906c4905aa350e4f7b7d33c9e3e673fc Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Mon, 12 Jul 2021 16:21:50 +0100
Subject: [PATCH 128/335] update repartition
---
.../sparkler-app/src/main/resources/sparkler-default.yaml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
index 0b824a43..e5eb367b 100644
--- a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
+++ b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
@@ -32,7 +32,7 @@ solr.uri: http://ec2-35-174-200-133.compute-1.amazonaws.com:8983/solr/crawldb
# elasticsearch settings
elasticsearch.uri: http://localhost:9200
-crawl.repartition: 500
+crawl.repartition: 1
##################### Apache Spark Properties ###########################
From 499506f975fd7202b87bb02a8270a4b98ea879a6 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Tue, 13 Jul 2021 01:14:11 +0100
Subject: [PATCH 129/335] add ability to change default query
---
.../usc/irds/sparkler/pipeline/Crawler.scala | 31 ++++++++-----------
1 file changed, 13 insertions(+), 18 deletions(-)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index cf7b3569..810a4e19 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -40,6 +40,8 @@ import java.util.UUID
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import scala.io.Source
+import java.util.Base64
+import java.nio.charset.StandardCharsets
/**
*
@@ -122,6 +124,11 @@ class Crawler extends CliTool {
usage = "Configuration override. JSON Blob, key values in this take priority over config values in the config file.")
var configOverrideEncoded: String = ""
+ @Option(name = "-dq", aliases = Array("--default-query"),
+ handler = classOf[StringArrayOptionHandler],
+ usage = "Configuration override. JSON Blob, key values in this take priority over config values in the config file.")
+ var defaultquery: String = ""
+
/* Generator options, currently not exposed via the CLI
and only accessible through the config yaml file
*/
@@ -136,8 +143,6 @@ class Crawler extends CliTool {
sparklerConf.overloadConfig(configOverride.mkString(" "));
}
if(configOverrideEncoded != ""){
- import java.util.Base64
- import java.nio.charset.StandardCharsets
val decoded = Base64.getDecoder().decode(configOverrideEncoded)
val str = new String(decoded, StandardCharsets.UTF_8)
sparklerConf.overloadConfig(str)
@@ -176,21 +181,6 @@ class Crawler extends CliTool {
//TODO: URL normalizers
//TODO: Robots.txt
- def mapCrawl(x: Iterator[(String, Iterable[Resource])]): Iterator[CrawlData] = {
- val m = 1000
- x.flatMap({case (grp, rs) => new FairFetcher(job, rs.iterator, m,
- FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer)})
- }
-
- def goup: String = {
- val uuid = UUID.randomUUID
- val uuidAsString = uuid.toString
- uuidAsString
- }
-
- def maplogic: Unit = {
-
- }
override def run(): Unit = {
//STEP : Initialize environment
@@ -245,7 +235,12 @@ class Crawler extends CliTool {
LOG.info(s"Starting the job:$jobId, task:$taskId")
val rc = new RunCrawl
- val rdd = new MemexCrawlDbRDD(sc, job, maxGroups = topG, topN = topN)
+ val rdd = if(defaultquery != "" && !defaultquery.isEmpty){
+ new MemexCrawlDbRDD(sc, job, generateQry= defaultquery, maxGroups = topG, topN = topN)
+ } else{
+ new MemexCrawlDbRDD(sc, job, maxGroups = topG, topN = topN)
+ }
+
//TODO RESTORE THIS HACK
val f = rc.map(rdd)
/*val f = rdd.map(r => (r.getDedupeId, r))
From 56fd55b4e1d113f686ecd812293ff694383dfea4 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Tue, 13 Jul 2021 11:43:18 +0100
Subject: [PATCH 130/335] update scripter
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index c3cc5e5c..6548f885 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.4-20210712.093510-6"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.5"
}
object FetcherHtmlUnit {
From c55fa22c8f914082f375e7339dedb5af9bed294e Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Thu, 15 Jul 2021 02:47:01 +0100
Subject: [PATCH 131/335] updates
---
.../usc/irds/sparkler/pipeline/Crawler.scala | 22 ++++++-------------
1 file changed, 7 insertions(+), 15 deletions(-)
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index 810a4e19..b73fae72 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -230,7 +230,7 @@ class Crawler extends CliTool {
storageProxy.commitCrawlDb()
}
- var taskId = JobUtil.newSegmentId(true)
+ val taskId = JobUtil.newSegmentId(true)
job.currentTask = taskId
LOG.info(s"Starting the job:$jobId, task:$taskId")
val rc = new RunCrawl
@@ -258,21 +258,13 @@ class Crawler extends CliTool {
var fetchedRdd: RDD[CrawlData] = null
- val rep: Int = sparklerConf.get("crawl.repartition").asInstanceOf[Number].intValue()
- if (rep > 0) {
- fetchedRdd = f.repartition(rep).flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
- FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq
- }).repartition(rep)
- .persist()
- } else {
- fetchedRdd = f.repartition(1).flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
- FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq
- }).repartition(1)
- .persist()
+ var rep: Int = sparklerConf.get("crawl.repartition").asInstanceOf[Number].intValue()
+ if(rep <= 0){
+ rep = 1
}
-
- //val coll = fetchedRdd.collect()
- //val d = fetchedRdd.getNumPartitions
+ fetchedRdd = f.repartition(rep).flatMap({ case (grp, rs) => new FairFetcher(job, rs.iterator, localFetchDelay,
+ FetchFunction, ParseFunction, OutLinkFilterFunction, StatusUpdateSolrTransformer).toSeq
+ }).persist()
if (kafkaEnable) {
storeContentKafka(kafkaListeners, kafkaTopic.format(jobId), fetchedRdd)
From 87d7d5db33a3d12ed0f0a62635380b3cac1d5b4e Mon Sep 17 00:00:00 2001
From: dmitri-mcguckin
Date: Thu, 15 Jul 2021 14:12:46 -0400
Subject: [PATCH 132/335] Selenium Scripter version bump
---
project/build.properties | 1 +
sparkler-core/project/PluginDependencies.scala | 2 +-
2 files changed, 2 insertions(+), 1 deletion(-)
create mode 100644 project/build.properties
diff --git a/project/build.properties b/project/build.properties
new file mode 100644
index 00000000..9edb75b7
--- /dev/null
+++ b/project/build.properties
@@ -0,0 +1 @@
+sbt.version=1.5.4
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 6548f885..5b7fc805 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.5"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.6"
}
object FetcherHtmlUnit {
From fcf82290a09eeae2f83f793980b07aec5a30fe1c Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 16 Jul 2021 13:51:45 +0100
Subject: [PATCH 133/335] add content hash logic and put files in better order
---
sparkler-core/conf/sparkler-default.yaml | 2 +
.../edu/usc/irds/sparkler/model/Resource.java | 421 +++++++++---------
.../irds/sparkler/util/FetcherDefault.java | 14 +-
.../src/main/resources/sparkler-default.yaml | 2 +
.../usc/irds/sparkler/pipeline/Crawler.scala | 1 -
5 files changed, 232 insertions(+), 208 deletions(-)
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index ba2214da..ce165a50 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -87,6 +87,8 @@ fetcher.headers:
Accept-Language: "en-US,en"
fetcher.persist.content.location: /dbfs/FileStore/bcf/content/
+fetcher.persist.content.filename: hash
+
# Rotating agents file.
# File should contain a list of agents which will be used to override the default agent string
# This is an unbounded list, it can take any number of agents you wish.
diff --git a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/model/Resource.java b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/model/Resource.java
index 6ed45bca..825574dd 100644
--- a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/model/Resource.java
+++ b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/model/Resource.java
@@ -1,206 +1,215 @@
-package edu.usc.irds.sparkler.model;
-
-import edu.usc.irds.sparkler.JobContext;
-import edu.usc.irds.sparkler.util.StringUtil;
-import org.apache.solr.client.solrj.beans.Field;
-
-import java.io.Serializable;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Random;
-
-/**
- * Created by karanjeetsingh on 10/22/16.
- */
-public class Resource implements Serializable {
-
- //NOTE: Keep the variable names in sync with solr schema
- @Field private String id;
- @Field("crawl_id") private String crawlId;
- @Field private String url;
- @Field private String group;
- @Field("fetch_timestamp") private Date fetchTimestamp;
- //@Field private Integer numTries = 0;
- //@Field private Integer numFetches = 0;
- @Field("discover_depth") private Integer discoverDepth = 0;
- //@Field("page_score") private Double score = 0.0;
- @Field("generate_score") private Double generateScore = 0.0;
- @Field("*_score") private Map score = new HashMap<>();
- @Field private String status = ResourceStatus.UNFETCHED.toString();
- @Field("last_updated_at") private Date lastUpdatedAt;
- @Field("indexed_at") private Date indexedAt;
- @Field private String hostname;
- @Field private String parent;
- @Field("dedupe_id") private String dedupeId;
- @Field("http_method") private String httpMethod;
- @Field("jobmeta") private String metadata;
-
- public Resource() {
- }
-
- public Resource(String url, String group, JobContext job) {
- super();
- //this.id = resourceId(url, job);
- this.url = url;
- this.group = group;
- this.hostname = group;
- this.crawlId = job.getId();
- this.dedupeId = StringUtil.sha256hash(url + "-" + job.getId());
- }
-
- public Resource(String url, String group, JobContext sparklerJob, Date fetchTimestamp) {
- this(url, group, sparklerJob);
- this.id = resourceId(url, sparklerJob, fetchTimestamp);
- this.fetchTimestamp = fetchTimestamp;
- }
-
- public Resource(String url, Integer discoverDepth, JobContext sparklerJob, ResourceStatus status) throws MalformedURLException {
- this(url, new URL(url).getHost(), sparklerJob);
- this.indexedAt = new Date();
- this.id = resourceId(url, sparklerJob, this.indexedAt);
- this.discoverDepth = discoverDepth;
- this.status = status.toString();
- }
-
- public Resource(String url, Integer discoverDepth, JobContext sparklerJob, ResourceStatus status,
- String parent, Map score, String metadata, String httpMethod) throws MalformedURLException {
- this(url, new URL(url).getHost(), sparklerJob);
- this.indexedAt = new Date();
- this.id = resourceId(url, sparklerJob, this.indexedAt);
- this.discoverDepth = discoverDepth;
- this.status = status.toString();
- this.parent = parent;
- this.score = score;
- this.httpMethod = httpMethod;
- this.metadata = metadata;
- }
-
- public Resource(String url, Integer discoverDepth, JobContext sparklerJob, ResourceStatus status,
- Date fetchTimestamp, String parent) throws MalformedURLException {
- this(url, new URL(url).getHost(), sparklerJob);
- this.id = resourceId(url, sparklerJob, fetchTimestamp);
- this.discoverDepth = discoverDepth;
- this.status = status.toString();
- this.parent = parent;
- }
-
- public Resource(String url, String group, JobContext sparklerJob, Date fetchTimestamp, Integer numTries,
- Integer numFetches, ResourceStatus status) {
- this(url, group, sparklerJob, fetchTimestamp);
- //this.numFetches = numFetches;
- this.status = status.toString();
- }
-
- public Resource(String url, Integer discoverDepth, JobContext sparklerJob, ResourceStatus status, Date fetchTimestamp, String parent, Map score) throws MalformedURLException {
-
- this(url, discoverDepth, sparklerJob, status, fetchTimestamp, parent);
- this.score = score;
-
- }
-
- @Override
- public String toString() {
- return String.format("Resource(%s, %s, %s, %d, %f, %s)",
- id, group, fetchTimestamp, discoverDepth, score, status);
- //id, group, fetchTimestamp, numTries, numFetches, discoverDepth, score, status);
- }
-
- public static String resourceId(String url, JobContext job) {
- return String.format("%s-%s", job.getId(), url);
- }
-
- public static String resourceId(String url, JobContext job, Date timestamp) {
- Random rand = new Random();
- int int_random = rand.nextInt(10000000);
-
- return StringUtil.sha256hash(String.format("%s-%s-%s-%s", job.getId(), url, timestamp.getTime(), int_random));
- }
-
- // Getters & Setters
- public String getId() {
- return id;
- }
-
- public void setId(String id) {
- this.id = id;
- }
-
- public String getUrl() {
- return url;
- }
-
- public void setUrl(String url) {
- this.url = url;
- }
-
- public String getGroup() {
- return group;
- }
-
- public Integer getDiscoverDepth() {
- return discoverDepth;
- }
-
- public String getStatus() { return status; }
-
- public void setStatus(String status) {
- this.status = status;
- }
-
- public Date getFetchTimestamp() { return fetchTimestamp; }
-
- public void setFetchTimestamp(Date fetchTimestamp) { this.fetchTimestamp = fetchTimestamp; }
-
- public String getCrawlId() { return crawlId; }
-
- public void setCrawlId(String crawlId) { this.crawlId = crawlId; }
-
- public String getDedupeId() { return dedupeId; }
-
- public void setDedupeId(String dedupeId) { this.dedupeId = dedupeId; }
-
- public Map getScore() {
- return this.score;
- }
-
- public void setScore(Map score) {
- this.score = score;
- }
-
- public Double getScore(String scoreKey) {
- return this.score.get(scoreKey);
- }
-
- public void setScore(String scoreKey, Double value) {
- this.score.put(scoreKey, value);
- }
-
- public Double getGenerateScore() {
- return this.generateScore;
- }
-
- public void setGenerateScore(Double generateScore) {
- this.generateScore = generateScore;
- }
-
- public Map getScoreAsMap(){
- HashMap hm = new HashMap<>();
- hm.put("generate_score", this.generateScore);
- return hm;
- }
-
- public String getHttpMethod(){
- if(this.httpMethod == null || this.httpMethod.equals("")){
- return "GET";
- } else{
- return this.httpMethod;
- }
- }
-
- public String getMetadata(){
- return this.metadata;
- }
-}
+package edu.usc.irds.sparkler.model;
+
+import edu.usc.irds.sparkler.JobContext;
+import edu.usc.irds.sparkler.util.StringUtil;
+import org.apache.solr.client.solrj.beans.Field;
+
+import java.io.Serializable;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+
+/**
+ * Created by karanjeetsingh on 10/22/16.
+ */
+public class Resource implements Serializable {
+
+ //NOTE: Keep the variable names in sync with solr schema
+ @Field private String id;
+ @Field("crawl_id") private String crawlId;
+ @Field private String url;
+ @Field private String group;
+ @Field("fetch_timestamp") private Date fetchTimestamp;
+ //@Field private Integer numTries = 0;
+ //@Field private Integer numFetches = 0;
+ @Field("discover_depth") private Integer discoverDepth = 0;
+ //@Field("page_score") private Double score = 0.0;
+ @Field("generate_score") private Double generateScore = 0.0;
+ @Field("*_score") private Map score = new HashMap<>();
+ @Field private String status = ResourceStatus.UNFETCHED.toString();
+ @Field("last_updated_at") private Date lastUpdatedAt;
+ @Field("indexed_at") private Date indexedAt;
+ @Field private String hostname;
+ @Field private String parent;
+ @Field("dedupe_id") private String dedupeId;
+ @Field("http_method") private String httpMethod;
+ @Field("jobmeta") private String metadata;
+ @Field("contenthash")private String contenthash;
+
+ public Resource() {
+ }
+
+ public Resource(String url, String group, JobContext job) {
+ super();
+ //this.id = resourceId(url, job);
+ this.url = url;
+ this.group = group;
+ this.hostname = group;
+ this.crawlId = job.getId();
+ this.dedupeId = StringUtil.sha256hash(url + "-" + job.getId());
+ }
+
+ public Resource(String url, String group, JobContext sparklerJob, Date fetchTimestamp) {
+ this(url, group, sparklerJob);
+ this.id = resourceId(url, sparklerJob, fetchTimestamp);
+ this.fetchTimestamp = fetchTimestamp;
+ }
+
+ public Resource(String url, Integer discoverDepth, JobContext sparklerJob, ResourceStatus status) throws MalformedURLException {
+ this(url, new URL(url).getHost(), sparklerJob);
+ this.indexedAt = new Date();
+ this.id = resourceId(url, sparklerJob, this.indexedAt);
+ this.discoverDepth = discoverDepth;
+ this.status = status.toString();
+ }
+
+ public Resource(String url, Integer discoverDepth, JobContext sparklerJob, ResourceStatus status,
+ String parent, Map score, String metadata, String httpMethod) throws MalformedURLException {
+ this(url, new URL(url).getHost(), sparklerJob);
+ this.indexedAt = new Date();
+ this.id = resourceId(url, sparklerJob, this.indexedAt);
+ this.discoverDepth = discoverDepth;
+ this.status = status.toString();
+ this.parent = parent;
+ this.score = score;
+ this.httpMethod = httpMethod;
+ this.metadata = metadata;
+ }
+
+ public Resource(String url, Integer discoverDepth, JobContext sparklerJob, ResourceStatus status,
+ Date fetchTimestamp, String parent) throws MalformedURLException {
+ this(url, new URL(url).getHost(), sparklerJob);
+ this.id = resourceId(url, sparklerJob, fetchTimestamp);
+ this.discoverDepth = discoverDepth;
+ this.status = status.toString();
+ this.parent = parent;
+ }
+
+ public Resource(String url, String group, JobContext sparklerJob, Date fetchTimestamp, Integer numTries,
+ Integer numFetches, ResourceStatus status) {
+ this(url, group, sparklerJob, fetchTimestamp);
+ //this.numFetches = numFetches;
+ this.status = status.toString();
+ }
+
+ public Resource(String url, Integer discoverDepth, JobContext sparklerJob, ResourceStatus status, Date fetchTimestamp, String parent, Map score) throws MalformedURLException {
+
+ this(url, discoverDepth, sparklerJob, status, fetchTimestamp, parent);
+ this.score = score;
+
+ }
+
+ @Override
+ public String toString() {
+ return String.format("Resource(%s, %s, %s, %d, %f, %s)",
+ id, group, fetchTimestamp, discoverDepth, score, status);
+ //id, group, fetchTimestamp, numTries, numFetches, discoverDepth, score, status);
+ }
+
+ public static String resourceId(String url, JobContext job) {
+ return String.format("%s-%s", job.getId(), url);
+ }
+
+ public static String resourceId(String url, JobContext job, Date timestamp) {
+ Random rand = new Random();
+ int int_random = rand.nextInt(10000000);
+
+ return StringUtil.sha256hash(String.format("%s-%s-%s-%s", job.getId(), url, timestamp.getTime(), int_random));
+ }
+
+ // Getters & Setters
+ public String getId() {
+ return id;
+ }
+
+ public void setId(String id) {
+ this.id = id;
+ }
+
+ public String getUrl() {
+ return url;
+ }
+
+ public void setUrl(String url) {
+ this.url = url;
+ }
+
+ public String getGroup() {
+ return group;
+ }
+
+ public Integer getDiscoverDepth() {
+ return discoverDepth;
+ }
+
+ public String getStatus() { return status; }
+
+ public void setStatus(String status) {
+ this.status = status;
+ }
+
+ public Date getFetchTimestamp() { return fetchTimestamp; }
+
+ public void setFetchTimestamp(Date fetchTimestamp) { this.fetchTimestamp = fetchTimestamp; }
+
+ public String getCrawlId() { return crawlId; }
+
+ public void setCrawlId(String crawlId) { this.crawlId = crawlId; }
+
+ public String getDedupeId() { return dedupeId; }
+
+ public void setDedupeId(String dedupeId) { this.dedupeId = dedupeId; }
+
+ public Map getScore() {
+ return this.score;
+ }
+
+ public void setScore(Map score) {
+ this.score = score;
+ }
+
+ public Double getScore(String scoreKey) {
+ return this.score.get(scoreKey);
+ }
+
+ public void setScore(String scoreKey, Double value) {
+ this.score.put(scoreKey, value);
+ }
+
+ public Double getGenerateScore() {
+ return this.generateScore;
+ }
+
+ public void setGenerateScore(Double generateScore) {
+ this.generateScore = generateScore;
+ }
+
+ public Map getScoreAsMap(){
+ HashMap hm = new HashMap<>();
+ hm.put("generate_score", this.generateScore);
+ return hm;
+ }
+
+ public String getHttpMethod(){
+ if(this.httpMethod == null || this.httpMethod.equals("")){
+ return "GET";
+ } else{
+ return this.httpMethod;
+ }
+ }
+
+ public String getMetadata(){
+ return this.metadata;
+ }
+
+ public void setContentHash(String md5hash) {
+ this.contenthash = md5hash;
+ }
+
+ public String getContentHash(){
+ return this.contenthash;
+ }
+}
diff --git a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
index b044de2f..5578ea69 100644
--- a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
+++ b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
@@ -19,10 +19,12 @@
import java.io.*;
import java.net.HttpURLConnection;
+import java.net.URI;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
+import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
@@ -170,9 +172,19 @@ public FetchedData fetch(Resource resource) throws Exception {
}
bufferOutStream.flush();
byte[] rawData = bufferOutStream.toByteArray();
+ byte[] md5hash = MessageDigest.getInstance("MD5").digest(rawData);
+ resource.setContentHash(new String(md5hash, StandardCharsets.UTF_8));
if(jobContext.getConfiguration().containsKey("fetcher.persist.content.location")){
File outputDirectory = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId()).toFile();
- File outputFile = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId(), FilenameUtils.getName(resource.getUrl())).toFile();
+ File outputFile;
+ URI uri = new URI(resource.getUrl());
+ String domain = uri.getHost();
+ if(jobContext.getConfiguration().get("fetcher.persist.content.filename").toString().equals("hash")){
+ String ext = FilenameUtils.getExtension(resource.getUrl());
+ outputFile = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId(), domain, new String(md5hash, StandardCharsets.UTF_8)+ext).toFile();
+ } else{
+ outputFile = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId(), domain, FilenameUtils.getName(resource.getUrl())).toFile();
+ }
outputDirectory.mkdirs();
try (FileOutputStream outputStream = new FileOutputStream(outputFile)) {
outputStream.write(rawData);
diff --git a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
index e5eb367b..4da81dca 100644
--- a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
+++ b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
@@ -88,6 +88,8 @@ fetcher.headers:
Accept-Language: "en-US,en"
fetcher.persist.content.location: /dbfs/FileStore/bcf/content/
+fetcher.persist.content.filename: hash
+
# Rotating agents file.
# File should contain a list of agents which will be used to override the default agent string
# This is an unbounded list, it can take any number of agents you wish.
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
index b73fae72..3212c626 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/Crawler.scala
@@ -286,7 +286,6 @@ class Crawler extends CliTool {
sc.stop()
}
-
def score(fetchedRdd: RDD[CrawlData]): RDD[CrawlData] = {
val job = this.job.asInstanceOf[SparklerJob]
From 9eaf262984b0a4b9d3fd931327c625b17efe4126 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Fri, 16 Jul 2021 20:47:16 +0100
Subject: [PATCH 134/335] fix md5summing
---
.../java/edu/usc/irds/sparkler/model/Resource.java | 2 +-
.../edu/usc/irds/sparkler/util/FetcherDefault.java | 10 ++--------
.../src/main/resources/sparkler-default.yaml | 4 +++-
3 files changed, 6 insertions(+), 10 deletions(-)
diff --git a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/model/Resource.java b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/model/Resource.java
index 825574dd..d0de422d 100644
--- a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/model/Resource.java
+++ b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/model/Resource.java
@@ -105,7 +105,7 @@ public Resource(String url, Integer discoverDepth, JobContext sparklerJob, Resou
@Override
public String toString() {
return String.format("Resource(%s, %s, %s, %d, %f, %s)",
- id, group, fetchTimestamp, discoverDepth, score, status);
+ id, group, fetchTimestamp, discoverDepth, 0.0, status);
//id, group, fetchTimestamp, numTries, numFetches, discoverDepth, score, status);
}
diff --git a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
index 5578ea69..d74e54b5 100644
--- a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
+++ b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
@@ -25,13 +25,7 @@
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.security.MessageDigest;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.LinkedHashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
+import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
@@ -173,7 +167,7 @@ public FetchedData fetch(Resource resource) throws Exception {
bufferOutStream.flush();
byte[] rawData = bufferOutStream.toByteArray();
byte[] md5hash = MessageDigest.getInstance("MD5").digest(rawData);
- resource.setContentHash(new String(md5hash, StandardCharsets.UTF_8));
+ resource.setContentHash(Base64.getEncoder().encodeToString(md5hash));
if(jobContext.getConfiguration().containsKey("fetcher.persist.content.location")){
File outputDirectory = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId()).toFile();
File outputFile;
diff --git a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
index 4da81dca..0e740251 100644
--- a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
+++ b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
@@ -89,7 +89,9 @@ fetcher.headers:
fetcher.persist.content.location: /dbfs/FileStore/bcf/content/
fetcher.persist.content.filename: hash
-
+fetcher.persist.content.types:
+ - pdf
+ - doc
# Rotating agents file.
# File should contain a list of agents which will be used to override the default agent string
# This is an unbounded list, it can take any number of agents you wish.
From 8f1ff3cef5920fab3541acc05104896433b52b81 Mon Sep 17 00:00:00 2001
From: dmitri-mcguckin
Date: Fri, 16 Jul 2021 16:02:36 -0400
Subject: [PATCH 135/335] Selenium Scripter 1.6.1 version bump
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 5b7fc805..38c0a6ae 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.6"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.6.1"
}
object FetcherHtmlUnit {
From 53c88dc58a12d7f6a966c897e86c9ab44e0b31ee Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sat, 17 Jul 2021 12:30:02 +0100
Subject: [PATCH 136/335] fix md5summing
---
sparkler-core/conf/sparkler-default.yaml | 2 +-
.../java/edu/usc/irds/sparkler/util/FetcherDefault.java | 6 +++---
.../sparkler-app/src/main/resources/sparkler-default.yaml | 2 +-
3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index ce165a50..75746bc2 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -86,7 +86,7 @@ fetcher.headers:
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
Accept-Language: "en-US,en"
-fetcher.persist.content.location: /dbfs/FileStore/bcf/content/
+fetcher.persist.content.location: /tmp/content/
fetcher.persist.content.filename: hash
# Rotating agents file.
diff --git a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
index d74e54b5..eb90eaa0 100644
--- a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
+++ b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
@@ -169,13 +169,13 @@ public FetchedData fetch(Resource resource) throws Exception {
byte[] md5hash = MessageDigest.getInstance("MD5").digest(rawData);
resource.setContentHash(Base64.getEncoder().encodeToString(md5hash));
if(jobContext.getConfiguration().containsKey("fetcher.persist.content.location")){
- File outputDirectory = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId()).toFile();
- File outputFile;
URI uri = new URI(resource.getUrl());
String domain = uri.getHost();
+ File outputDirectory = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId(), domain).toFile();
+ File outputFile;
if(jobContext.getConfiguration().get("fetcher.persist.content.filename").toString().equals("hash")){
String ext = FilenameUtils.getExtension(resource.getUrl());
- outputFile = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId(), domain, new String(md5hash, StandardCharsets.UTF_8)+ext).toFile();
+ outputFile = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId(), domain, Base64.getEncoder().encodeToString(md5hash)+"."+ext).toFile();
} else{
outputFile = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId(), domain, FilenameUtils.getName(resource.getUrl())).toFile();
}
diff --git a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
index 0e740251..a829deb6 100644
--- a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
+++ b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
@@ -87,7 +87,7 @@ fetcher.headers:
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
Accept-Language: "en-US,en"
-fetcher.persist.content.location: /dbfs/FileStore/bcf/content/
+fetcher.persist.content.location: /tmp/content
fetcher.persist.content.filename: hash
fetcher.persist.content.types:
- pdf
From 8f83067bc8c8a90d4554b669812a9daf4a48f0e4 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sat, 17 Jul 2021 12:35:11 +0100
Subject: [PATCH 137/335] fix md5summing
---
sparkler-core/project/PluginDependencies.scala | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sparkler-core/project/PluginDependencies.scala b/sparkler-core/project/PluginDependencies.scala
index 38c0a6ae..b8273d47 100644
--- a/sparkler-core/project/PluginDependencies.scala
+++ b/sparkler-core/project/PluginDependencies.scala
@@ -28,7 +28,7 @@ object FetcherChrome {
lazy val java = group % "selenium-java" % version
}
lazy val browserup = "com.browserup" % "browserup-proxy-core" % "3.0.0-SNAPSHOT"
- lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.6.1"
+ lazy val seleniumscripter = "com.kytheralabs" % "seleniumscripter" % "1.7.0"
}
object FetcherHtmlUnit {
From eeea7dcf9fe1301ac51839edd5fb6dd85be6264a Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sat, 17 Jul 2021 12:41:50 +0100
Subject: [PATCH 138/335] fix md5summing
---
sparkler-core/conf/sparkler-default.yaml | 2 +-
.../sparkler-app/src/main/resources/sparkler-default.yaml | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index 75746bc2..ce165a50 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -86,7 +86,7 @@ fetcher.headers:
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
Accept-Language: "en-US,en"
-fetcher.persist.content.location: /tmp/content/
+fetcher.persist.content.location: /dbfs/FileStore/bcf/content/
fetcher.persist.content.filename: hash
# Rotating agents file.
diff --git a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
index a829deb6..0e740251 100644
--- a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
+++ b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
@@ -87,7 +87,7 @@ fetcher.headers:
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
Accept-Language: "en-US,en"
-fetcher.persist.content.location: /tmp/content
+fetcher.persist.content.location: /dbfs/FileStore/bcf/content/
fetcher.persist.content.filename: hash
fetcher.persist.content.types:
- pdf
From 4657aae37550039b5b40a764dfc3c37ffbc76bb7 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sat, 17 Jul 2021 14:28:47 +0100
Subject: [PATCH 139/335] fix md5summing
---
.../java/edu/usc/irds/sparkler/Constants.java | 313 +++++++++---------
.../usc/irds/sparkler/model/FetchedData.java | 10 +
.../irds/sparkler/util/FetcherDefault.java | 37 ++-
.../usc/irds/sparkler/model/ParsedData.scala | 1 +
.../sparkler/pipeline/ParseFunction.scala | 247 +++++++-------
.../solr/StatusUpdateSolrTransformer.scala | 167 +++++-----
6 files changed, 397 insertions(+), 378 deletions(-)
diff --git a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/Constants.java b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/Constants.java
index 26c67135..d1da1b9a 100644
--- a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/Constants.java
+++ b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/Constants.java
@@ -1,156 +1,157 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.usc.irds.sparkler;
-
-import org.apache.commons.io.IOUtils;
-import org.yaml.snakeyaml.Yaml;
-
-import java.io.InputStream;
-import java.util.Map;
-import java.util.UUID;
-
-/**
- * A static container for all the constants
- * (NOTE: interface is used to make all the fields as public static final )
- */
-public interface Constants {
-
- interface key { //config key name
-
- //ID for config
- @ConfigKey
- String UUID_KEY = "sparkler.conf.uuid";
-
- // General Properties
- @ConfigKey
- String CRAWLDB_BACKEND = "crawldb.backend";
-
- // Apache Spark Properties
- @ConfigKey
- String SPARK_MASTER = "spark.master";
-
- // Apache Kafka Properties
- @ConfigKey
- String KAFKA_ENABLE = "kafka.enable";
-
- @ConfigKey
- String KAFKA_LISTENERS = "kafka.listeners";
-
- @ConfigKey
- String KAFKA_TOPIC = "kafka.topic";
-
- // Databricks Properties
- @ConfigKey
- String DATABRICKS_ENABLE = "databricks.enable";
-
- // HTTP Properties
- // Database Properties
-
- // Generator Properties
- @ConfigKey(type = int.class)
- String GENERATE_TOPN = "generate.topn";
-
- @ConfigKey(type = int.class)
- String GENERATE_TOP_GROUPS = "generate.top.groups";
-
- @ConfigKey
- String GENERATE_SORTBY = "generate.sortby";
-
- @ConfigKey
- String GENERATE_GROUPBY = "generate.groupby";
-
- // Fetcher Properties
- @ConfigKey(type = int.class)
- String FETCHER_SERVER_DELAY = "fetcher.server.delay";
-
- @ConfigKey
- String PLUGINS = "plugins";
-
- @ConfigKey
- String ACTIVE_PLUGINS = "plugins.active";
-
- @ConfigKey
- String FETCHER_HEADERS = "fetcher.headers";
-
- @ConfigKey
- String FETCHER_USER_AGENTS = "fetcher.user.agents";
- }
-
-
- abstract class defaults {
- /**
- * Create configuration instance for Sparkler
- */
- public static SparklerConfiguration newDefaultConfig(){
- //FIXME: needs rework!
- Yaml yaml = new Yaml();
- InputStream input = null;
- SparklerConfiguration sparklerConf = null;
- try {
- input = Constants.class.getClassLoader().getResourceAsStream(file.SPARKLER_DEFAULT);
- Map yamlMap = (Map) yaml.load(input);
- sparklerConf = new SparklerConfiguration(yamlMap);
- } catch (Exception e) {
- e.printStackTrace();
- } finally {
- IOUtils.closeQuietly(input);
- }
-
- if (sparklerConf != null) {
- sparklerConf.put(key.UUID_KEY, UUID.randomUUID().toString());
- }
- return sparklerConf;
- }
- }
-
- interface file {
- String SPARKLER_DEFAULT = "sparkler-default.yaml";
- }
-
- interface storage { // Storage Fields
- String ID = "id";
- String CRAWL_ID = "crawl_id";
- String URL = "url";
- String GROUP = "group";
- String FETCH_TIMESTAMP = "fetch_timestamp";
- String RETRIES_SINCE_FETCH = "retries_since_fetch";
- String NUM_FETCHES = "numFetches";
- String DISCOVER_DEPTH = "discover_depth";
- String FETCH_DEPTH = "fetch_depth";
- String SCORE = "page_score";
- String GENERATE_SCORE = "generate_score";
- String STATUS = "status";
- String LAST_UPDATED_AT = "last_updated_at";
- String EXTRACTED_TEXT = "extracted_text";
- String CONTENT_TYPE = "content_type";
- String FETCH_STATUS_CODE = "fetch_status_code";
- String SIGNATURE = "signature";
- String OUTLINKS = "outlinks";
- String RELATIVE_PATH = "relative_path";
- String DEDUPE_ID = "dedupe_id";
- String MD_SUFFIX = "_md";
- String HDR_SUFFIX = "_hd";
- String SEGMENT = "segment";
- String RAW_CONTENT = "raw_content";
- String WEBPAGE_MIMETYPE = "text/html";
- String JSON_MIMETYPE = "application/json";
- String PARENT = "parent";
- String RESPONSE_TIME = "response_time";
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.usc.irds.sparkler;
+
+import org.apache.commons.io.IOUtils;
+import org.yaml.snakeyaml.Yaml;
+
+import java.io.InputStream;
+import java.util.Map;
+import java.util.UUID;
+
+/**
+ * A static container for all the constants
+ * (NOTE: an interface is used so that all fields are implicitly public static final)
+ */
+public interface Constants {
+
+ interface key { // configuration key names
+
+ //ID for config
+ @ConfigKey
+ String UUID_KEY = "sparkler.conf.uuid";
+
+ // General Properties
+ @ConfigKey
+ String CRAWLDB_BACKEND = "crawldb.backend";
+
+ // Apache Spark Properties
+ @ConfigKey
+ String SPARK_MASTER = "spark.master";
+
+ // Apache Kafka Properties
+ @ConfigKey
+ String KAFKA_ENABLE = "kafka.enable";
+
+ @ConfigKey
+ String KAFKA_LISTENERS = "kafka.listeners";
+
+ @ConfigKey
+ String KAFKA_TOPIC = "kafka.topic";
+
+ // Databricks Properties
+ @ConfigKey
+ String DATABRICKS_ENABLE = "databricks.enable";
+
+ // HTTP Properties
+ // Database Properties
+
+ // Generator Properties
+ @ConfigKey(type = int.class)
+ String GENERATE_TOPN = "generate.topn";
+
+ @ConfigKey(type = int.class)
+ String GENERATE_TOP_GROUPS = "generate.top.groups";
+
+ @ConfigKey
+ String GENERATE_SORTBY = "generate.sortby";
+
+ @ConfigKey
+ String GENERATE_GROUPBY = "generate.groupby";
+
+ // Fetcher Properties
+ @ConfigKey(type = int.class)
+ String FETCHER_SERVER_DELAY = "fetcher.server.delay";
+
+ @ConfigKey
+ String PLUGINS = "plugins";
+
+ @ConfigKey
+ String ACTIVE_PLUGINS = "plugins.active";
+
+ @ConfigKey
+ String FETCHER_HEADERS = "fetcher.headers";
+
+ @ConfigKey
+ String FETCHER_USER_AGENTS = "fetcher.user.agents";
+ }
+
+
+ abstract class defaults {
+ /**
+ * Create configuration instance for Sparkler
+ */
+ public static SparklerConfiguration newDefaultConfig(){
+ //FIXME: needs rework!
+ Yaml yaml = new Yaml();
+ InputStream input = null;
+ SparklerConfiguration sparklerConf = null;
+ try {
+ input = Constants.class.getClassLoader().getResourceAsStream(file.SPARKLER_DEFAULT);
+ Map yamlMap = (Map) yaml.load(input);
+ sparklerConf = new SparklerConfiguration(yamlMap);
+ } catch (Exception e) {
+ e.printStackTrace();
+ } finally {
+ IOUtils.closeQuietly(input);
+ }
+
+ if (sparklerConf != null) {
+ sparklerConf.put(key.UUID_KEY, UUID.randomUUID().toString());
+ }
+ return sparklerConf;
+ }
+ }
+
+ interface file {
+ String SPARKLER_DEFAULT = "sparkler-default.yaml";
+ }
+
+ interface storage { // Storage Fields
+ String ID = "id";
+ String CRAWL_ID = "crawl_id";
+ String URL = "url";
+ String GROUP = "group";
+ String FETCH_TIMESTAMP = "fetch_timestamp";
+ String RETRIES_SINCE_FETCH = "retries_since_fetch";
+ String NUM_FETCHES = "numFetches";
+ String DISCOVER_DEPTH = "discover_depth";
+ String FETCH_DEPTH = "fetch_depth";
+ String SCORE = "page_score";
+ String GENERATE_SCORE = "generate_score";
+ String STATUS = "status";
+ String LAST_UPDATED_AT = "last_updated_at";
+ String EXTRACTED_TEXT = "extracted_text";
+ String CONTENT_TYPE = "content_type";
+ String FETCH_STATUS_CODE = "fetch_status_code";
+ String SIGNATURE = "signature";
+ String OUTLINKS = "outlinks";
+ String RELATIVE_PATH = "relative_path";
+ String DEDUPE_ID = "dedupe_id";
+ String MD_SUFFIX = "_md";
+ String HDR_SUFFIX = "_hd";
+ String SEGMENT = "segment";
+ String CONTENTHASH = "contenthash";
+ String RAW_CONTENT = "raw_content";
+ String WEBPAGE_MIMETYPE = "text/html";
+ String JSON_MIMETYPE = "application/json";
+ String PARENT = "parent";
+ String RESPONSE_TIME = "response_time";
+ }
+
+}
diff --git a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/model/FetchedData.java b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/model/FetchedData.java
index ce74080d..fd9aab64 100644
--- a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/model/FetchedData.java
+++ b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/model/FetchedData.java
@@ -34,6 +34,16 @@ public class FetchedData implements Serializable {
private int responseCode;
private long responseTime = -1;
+ public String getContenthash() {
+ return contenthash;
+ }
+
+ public void setContenthash(String contenthash) {
+ this.contenthash = contenthash;
+ }
+
+ private String contenthash;
+
private String segment;
public FetchedData() {
diff --git a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
index eb90eaa0..d55e859b 100644
--- a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
+++ b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
@@ -166,22 +166,26 @@ public FetchedData fetch(Resource resource) throws Exception {
}
bufferOutStream.flush();
byte[] rawData = bufferOutStream.toByteArray();
- byte[] md5hash = MessageDigest.getInstance("MD5").digest(rawData);
- resource.setContentHash(Base64.getEncoder().encodeToString(md5hash));
- if(jobContext.getConfiguration().containsKey("fetcher.persist.content.location")){
- URI uri = new URI(resource.getUrl());
- String domain = uri.getHost();
- File outputDirectory = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId(), domain).toFile();
- File outputFile;
- if(jobContext.getConfiguration().get("fetcher.persist.content.filename").toString().equals("hash")){
- String ext = FilenameUtils.getExtension(resource.getUrl());
- outputFile = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId(), domain, Base64.getEncoder().encodeToString(md5hash)+"."+ext).toFile();
- } else{
- outputFile = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId(), domain, FilenameUtils.getName(resource.getUrl())).toFile();
- }
- outputDirectory.mkdirs();
- try (FileOutputStream outputStream = new FileOutputStream(outputFile)) {
- outputStream.write(rawData);
+ String contentHash = null;
+ if(rawData.length>0) {
+ byte[] md5hash = MessageDigest.getInstance("MD5").digest(rawData);
+ contentHash = Base64.getEncoder().encodeToString(md5hash);
+ resource.setContentHash(contentHash);
+ if (jobContext.getConfiguration().containsKey("fetcher.persist.content.location")) {
+ URI uri = new URI(resource.getUrl());
+ String domain = uri.getHost();
+ File outputDirectory = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId(), domain).toFile();
+ File outputFile;
+ if (jobContext.getConfiguration().get("fetcher.persist.content.filename").toString().equals("hash")) {
+ String ext = FilenameUtils.getExtension(resource.getUrl());
+ outputFile = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId(), domain, contentHash + "." + ext).toFile();
+ } else {
+ outputFile = Paths.get(jobContext.getConfiguration().get("fetcher.persist.content.location").toString(), jobContext.getId(), domain, FilenameUtils.getName(resource.getUrl())).toFile();
+ }
+ outputDirectory.mkdirs();
+ try (FileOutputStream outputStream = new FileOutputStream(outputFile)) {
+ outputStream.write(rawData);
+ }
}
}
@@ -190,6 +194,7 @@ public FetchedData fetch(Resource resource) throws Exception {
resource.setStatus(ResourceStatus.FETCHED.toString());
fetchedData.setResource(resource);
fetchedData.setHeaders(urlConn.getHeaderFields());
+ fetchedData.setContenthash(contentHash);
if (truncated) {
fetchedData.getHeaders().put(TRUNCATED, Collections.singletonList(Boolean.TRUE.toString()));
}
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/model/ParsedData.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/model/ParsedData.scala
index ad11df21..9064b4c3 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/model/ParsedData.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/model/ParsedData.scala
@@ -27,4 +27,5 @@ class ParsedData extends Serializable {
var outlinks: Set[String] = Set.empty[String]
var metadata: Metadata = new Metadata()
var headers: Map[String, AnyRef] = Map.empty
+ var contentHash: String = _
}
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/ParseFunction.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/ParseFunction.scala
index 0471531c..615cd46e 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/ParseFunction.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/pipeline/ParseFunction.scala
@@ -1,123 +1,124 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.usc.irds.sparkler.pipeline
-
-import java.io.ByteArrayInputStream
-import java.text.{ParseException, SimpleDateFormat}
-import java.util
-import java.util.Date
-
-import edu.usc.irds.sparkler.base.Loggable
-import edu.usc.irds.sparkler.model.{CrawlData, ParsedData}
-import org.apache.commons.io.IOUtils
-import org.apache.tika.metadata.Metadata
-import org.apache.tika.parser.AutoDetectParser
-import org.apache.tika.sax.{BodyContentHandler, LinkContentHandler, WriteOutContentHandler}
-
-import scala.collection.JavaConversions._
-import scala.collection.JavaConverters._
-import scala.collection.mutable
-
-/**
- * This is a transformation function for transforming raw data from crawler to parsed data
- */
-object ParseFunction extends ((CrawlData) => (ParsedData)) with Serializable with Loggable {
-
- override def apply(data: CrawlData): (ParsedData) = {
- val parseData = new ParsedData()
- var stream = new ByteArrayInputStream(data.fetchedData.getContent)
- val linkHandler = new LinkContentHandler()
- val parser = new AutoDetectParser()
- var meta = new Metadata()
- val outHandler = new WriteOutContentHandler(-1)
- val contentHandler = new BodyContentHandler(outHandler)
- LOG.info("PARSING {}", data.fetchedData.getResource.getUrl)
-
- // parse outlinks
- try {
- // Parse OutLinks
- meta.set("resourceName", data.fetchedData.getResource.getUrl)
- parser.parse(stream, linkHandler, meta)
- parseData.outlinks = linkHandler.getLinks.asScala.map(_.getUri.trim).filter(!_.isEmpty).toSet
- } catch {
- case e: Throwable =>
- LOG.warn("PARSING-OUTLINKS-ERROR {}", data.fetchedData.getResource.getUrl)
- LOG.warn(e.getMessage, e)
- } finally { IOUtils.closeQuietly(stream) }
-
- //parse main text content
- try {
- meta = new Metadata
- meta.set("resourceName", data.fetchedData.getResource.getUrl)
- // Parse Text
- stream = new ByteArrayInputStream(data.fetchedData.getContent)
- parser.parse(stream, contentHandler, meta)
- parseData.extractedText = outHandler.toString
- parseData.metadata = meta
- } catch {
- case e: Throwable =>
- LOG.warn("PARSING-CONTENT-ERROR {}", data.fetchedData.getResource.getUrl + " " + e.getMessage())
- LOG.debug(e.getMessage, e)
- parseData
- } finally { IOUtils.closeQuietly(stream) }
-
- // parse headers
- val headers = data.fetchedData.getHeaders
- if (headers.containsKey("Location")) { // redirect
- val redirectUrls = headers.get("Location")
- parseData.outlinks ++= redirectUrls.asScala.filter(u => u != null && !u.isEmpty)
- }
- parseData.headers = parseHeaders(headers)
- parseData
- }
-
- def parseHeaders(headers: util.Map[String, util.List[String]]): Map[String, AnyRef] = {
- val dateHeaders = Set("Date", "Last-Modified", "Expires")
- val intHeaders = Set("ContentLength")
- val dateFmt = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz")
-
- val result = mutable.Map[String, AnyRef]()
- for (name <- headers.keySet()) {
- val values = headers.get(name)
- var parsed: AnyRef = values
- if (values.size() == 1){
- val value = values.get(0)
- parsed = value
- try {
- if (dateHeaders contains name) {
- parsed = parseDate(value)
- } else if (intHeaders contains name) {
- parsed = new java.lang.Long(value.toLong)
- }
- } catch {
- case e: Exception => LOG.debug(e.getMessage, e)
- } finally {
- result(name) = parsed
- }
- }
- }
- result.toMap
- }
-
- /**
- * Parse date string as per RFC7231 https://tools.ietf.org/html/rfc7231#section-7.1.1.1
- */
- val httpDateFormat = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz")
- @throws[ParseException] //but be aware of errors
- def parseDate(dateStr:String): Date = httpDateFormat.parse(dateStr.trim)
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.usc.irds.sparkler.pipeline
+
+import java.io.ByteArrayInputStream
+import java.text.{ParseException, SimpleDateFormat}
+import java.util
+import java.util.Date
+
+import edu.usc.irds.sparkler.base.Loggable
+import edu.usc.irds.sparkler.model.{CrawlData, ParsedData}
+import org.apache.commons.io.IOUtils
+import org.apache.tika.metadata.Metadata
+import org.apache.tika.parser.AutoDetectParser
+import org.apache.tika.sax.{BodyContentHandler, LinkContentHandler, WriteOutContentHandler}
+
+import scala.collection.JavaConversions._
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+
+/**
+ * Transformation function that converts raw crawl data from the crawler into parsed data
+ */
+object ParseFunction extends ((CrawlData) => (ParsedData)) with Serializable with Loggable {
+
+ override def apply(data: CrawlData): (ParsedData) = {
+ val parseData = new ParsedData()
+ var stream = new ByteArrayInputStream(data.fetchedData.getContent)
+ val linkHandler = new LinkContentHandler()
+ val parser = new AutoDetectParser()
+ var meta = new Metadata()
+ val outHandler = new WriteOutContentHandler(-1)
+ val contentHandler = new BodyContentHandler(outHandler)
+ LOG.info("PARSING {}", data.fetchedData.getResource.getUrl)
+
+ // parse outlinks
+ try {
+ // Parse OutLinks
+ meta.set("resourceName", data.fetchedData.getResource.getUrl)
+ parser.parse(stream, linkHandler, meta)
+ parseData.outlinks = linkHandler.getLinks.asScala.map(_.getUri.trim).filter(!_.isEmpty).toSet
+ } catch {
+ case e: Throwable =>
+ LOG.warn("PARSING-OUTLINKS-ERROR {}", data.fetchedData.getResource.getUrl)
+ LOG.warn(e.getMessage, e)
+ } finally { IOUtils.closeQuietly(stream) }
+
+ //parse main text content
+ try {
+ meta = new Metadata
+ meta.set("resourceName", data.fetchedData.getResource.getUrl)
+ // Parse Text
+ stream = new ByteArrayInputStream(data.fetchedData.getContent)
+ parser.parse(stream, contentHandler, meta)
+ parseData.extractedText = outHandler.toString
+ parseData.metadata = meta
+ } catch {
+ case e: Throwable =>
+ LOG.warn("PARSING-CONTENT-ERROR {}", data.fetchedData.getResource.getUrl + " " + e.getMessage())
+ LOG.debug(e.getMessage, e)
+ parseData
+ } finally { IOUtils.closeQuietly(stream) }
+
+ // parse headers
+ val headers = data.fetchedData.getHeaders
+ if (headers.containsKey("Location")) { // redirect
+ val redirectUrls = headers.get("Location")
+ parseData.outlinks ++= redirectUrls.asScala.filter(u => u != null && !u.isEmpty)
+ }
+ parseData.headers = parseHeaders(headers)
+ parseData.contentHash = data.fetchedData.getContenthash
+ parseData
+ }
+
+ def parseHeaders(headers: util.Map[String, util.List[String]]): Map[String, AnyRef] = {
+ val dateHeaders = Set("Date", "Last-Modified", "Expires")
+ val intHeaders = Set("ContentLength")
+ val dateFmt = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz")
+
+ val result = mutable.Map[String, AnyRef]()
+ for (name <- headers.keySet()) {
+ val values = headers.get(name)
+ var parsed: AnyRef = values
+ if (values.size() == 1){
+ val value = values.get(0)
+ parsed = value
+ try {
+ if (dateHeaders contains name) {
+ parsed = parseDate(value)
+ } else if (intHeaders contains name) {
+ parsed = new java.lang.Long(value.toLong)
+ }
+ } catch {
+ case e: Exception => LOG.debug(e.getMessage, e)
+ } finally {
+ result(name) = parsed
+ }
+ }
+ }
+ result.toMap
+ }
+
+ /**
+ * Parse date string as per RFC7231 https://tools.ietf.org/html/rfc7231#section-7.1.1.1
+ */
+ val httpDateFormat = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz")
+ @throws[ParseException] //but be aware of errors
+ def parseDate(dateStr:String): Date = httpDateFormat.parse(dateStr.trim)
+}
diff --git a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/storage/solr/StatusUpdateSolrTransformer.scala b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/storage/solr/StatusUpdateSolrTransformer.scala
index 779aa864..aa0d6204 100644
--- a/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/storage/solr/StatusUpdateSolrTransformer.scala
+++ b/sparkler-core/sparkler-app/src/main/scala/edu/usc/irds/sparkler/storage/solr/StatusUpdateSolrTransformer.scala
@@ -1,83 +1,84 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.usc.irds.sparkler.storage.solr
-
-import java.util
-
-import com.google.common.hash.{HashFunction, Hashing}
-import edu.usc.irds.sparkler.Constants
-import edu.usc.irds.sparkler.base.Loggable
-import edu.usc.irds.sparkler.model.CrawlData
-import edu.usc.irds.sparkler.storage.solr.schema.FieldMapper
-import edu.usc.irds.sparkler.util.URLUtil
-import org.apache.solr.common.SolrInputDocument
-
-import scala.collection.JavaConversions._
-import scala.collection.JavaConverters._
-
-/**
- * Created by Thamme Gowda on 6/7/16.
- * Modified by karanjeets
- */
-object StatusUpdateSolrTransformer extends (CrawlData => SolrInputDocument ) with Serializable with Loggable {
- LOG.debug("Solr Update Transformer Created")
- val fieldMapper: FieldMapper = FieldMapper.initialize()
-
- override def apply(data: CrawlData): SolrInputDocument = {
- val hashFunction: HashFunction = Hashing.sha256()
- val sUpdate = new SolrInputDocument()
- //FIXME: handle failure case
- //val x:java.util.Map[String, Object] = Map("ss" -> new Object).asJava
- sUpdate.setField(Constants.storage.ID, data.fetchedData.getResource.getId)
- sUpdate.setField(Constants.storage.STATUS, Map("set" -> data.fetchedData.getResource.getStatus).asJava)
- sUpdate.setField(Constants.storage.FETCH_TIMESTAMP, Map("set" -> data.fetchedData.getFetchedAt).asJava)
- sUpdate.setField(Constants.storage.LAST_UPDATED_AT, Map("set" -> new util.Date()).asJava)
- sUpdate.setField(Constants.storage.RETRIES_SINCE_FETCH, Map("inc" -> 1).asJava)
- sUpdate.setField(Constants.storage.EXTRACTED_TEXT, data.parsedData.extractedText)
- sUpdate.setField(Constants.storage.CONTENT_TYPE, data.fetchedData.getContentType.split("; ")(0))
- sUpdate.setField(Constants.storage.FETCH_STATUS_CODE, data.fetchedData.getResponseCode)
- sUpdate.setField(Constants.storage.SIGNATURE, hashFunction.hashBytes(data.fetchedData.getContent).toString)
- sUpdate.setField(Constants.storage.RELATIVE_PATH, URLUtil.reverseUrl(data.fetchedData.getResource.getUrl))
- sUpdate.setField(Constants.storage.OUTLINKS, data.parsedData.outlinks.toArray)
- sUpdate.setField(Constants.storage.SEGMENT, data.fetchedData.getSegment)
- val splitMimeTypes = data.fetchedData.getContentType.toLowerCase().split(";")
- if (splitMimeTypes.contains(Constants.storage.WEBPAGE_MIMETYPE.toLowerCase())) {
- sUpdate.setField(Constants.storage.RAW_CONTENT, new String(data.fetchedData.getContent))
- } else if (splitMimeTypes.contains(Constants.storage.JSON_MIMETYPE.toLowerCase())){
- sUpdate.setField(Constants.storage.RAW_CONTENT, new String(data.fetchedData.getContent))
- }
- sUpdate.setField(Constants.storage.RESPONSE_TIME, data.fetchedData.getResponseTime)
- for ((scoreKey, score) <- data.fetchedData.getResource.getScore) {
- sUpdate.setField(scoreKey, Map("set" -> score).asJava)
- }
-
- val md = data.parsedData.metadata
- val mdFields = md.names().map(name => (name, if (md.isMultiValued(name)) md.getValues(name) else md.get(name))).toMap
- updateFields(mdFields, Constants.storage.MD_SUFFIX, sUpdate)
- updateFields(data.parsedData.headers, Constants.storage.HDR_SUFFIX, sUpdate)
- sUpdate
- }
-
- def updateFields(dict: Map[String, AnyRef], suffix:String, solrDoc:SolrInputDocument): Unit ={
- val mapped = fieldMapper.mapFields(dict, true)
- for (k <- mapped.keySet()) {
- val key = if (suffix == null || suffix.isEmpty || k.endsWith(suffix)) k else k + suffix
- solrDoc.setField(key, mapped(k))
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.usc.irds.sparkler.storage.solr
+
+import java.util
+
+import com.google.common.hash.{HashFunction, Hashing}
+import edu.usc.irds.sparkler.Constants
+import edu.usc.irds.sparkler.base.Loggable
+import edu.usc.irds.sparkler.model.CrawlData
+import edu.usc.irds.sparkler.storage.solr.schema.FieldMapper
+import edu.usc.irds.sparkler.util.URLUtil
+import org.apache.solr.common.SolrInputDocument
+
+import scala.collection.JavaConversions._
+import scala.collection.JavaConverters._
+
+/**
+ * Created by Thamme Gowda on 6/7/16.
+ * Modified by karanjeets
+ */
+object StatusUpdateSolrTransformer extends (CrawlData => SolrInputDocument ) with Serializable with Loggable {
+ LOG.debug("Solr Update Transformer Created")
+ val fieldMapper: FieldMapper = FieldMapper.initialize()
+
+ override def apply(data: CrawlData): SolrInputDocument = {
+ val hashFunction: HashFunction = Hashing.sha256()
+ val sUpdate = new SolrInputDocument()
+ //FIXME: handle failure case
+ //val x:java.util.Map[String, Object] = Map("ss" -> new Object).asJava
+ sUpdate.setField(Constants.storage.ID, data.fetchedData.getResource.getId)
+ sUpdate.setField(Constants.storage.STATUS, Map("set" -> data.fetchedData.getResource.getStatus).asJava)
+ sUpdate.setField(Constants.storage.FETCH_TIMESTAMP, Map("set" -> data.fetchedData.getFetchedAt).asJava)
+ sUpdate.setField(Constants.storage.LAST_UPDATED_AT, Map("set" -> new util.Date()).asJava)
+ sUpdate.setField(Constants.storage.RETRIES_SINCE_FETCH, Map("inc" -> 1).asJava)
+ sUpdate.setField(Constants.storage.EXTRACTED_TEXT, data.parsedData.extractedText)
+ sUpdate.setField(Constants.storage.CONTENT_TYPE, data.fetchedData.getContentType.split("; ")(0))
+ sUpdate.setField(Constants.storage.FETCH_STATUS_CODE, data.fetchedData.getResponseCode)
+ sUpdate.setField(Constants.storage.SIGNATURE, hashFunction.hashBytes(data.fetchedData.getContent).toString)
+ sUpdate.setField(Constants.storage.RELATIVE_PATH, URLUtil.reverseUrl(data.fetchedData.getResource.getUrl))
+ sUpdate.setField(Constants.storage.OUTLINKS, data.parsedData.outlinks.toArray)
+ sUpdate.setField(Constants.storage.SEGMENT, data.fetchedData.getSegment)
+ sUpdate.setField(Constants.storage.CONTENTHASH, data.fetchedData.getContenthash)
+ val splitMimeTypes = data.fetchedData.getContentType.toLowerCase().split(";")
+ if (splitMimeTypes.contains(Constants.storage.WEBPAGE_MIMETYPE.toLowerCase())) {
+ sUpdate.setField(Constants.storage.RAW_CONTENT, new String(data.fetchedData.getContent))
+ } else if (splitMimeTypes.contains(Constants.storage.JSON_MIMETYPE.toLowerCase())){
+ sUpdate.setField(Constants.storage.RAW_CONTENT, new String(data.fetchedData.getContent))
+ }
+ sUpdate.setField(Constants.storage.RESPONSE_TIME, data.fetchedData.getResponseTime)
+ for ((scoreKey, score) <- data.fetchedData.getResource.getScore) {
+ sUpdate.setField(scoreKey, Map("set" -> score).asJava)
+ }
+
+ val md = data.parsedData.metadata
+ val mdFields = md.names().map(name => (name, if (md.isMultiValued(name)) md.getValues(name) else md.get(name))).toMap
+ updateFields(mdFields, Constants.storage.MD_SUFFIX, sUpdate)
+ updateFields(data.parsedData.headers, Constants.storage.HDR_SUFFIX, sUpdate)
+ sUpdate
+ }
+
+ def updateFields(dict: Map[String, AnyRef], suffix:String, solrDoc:SolrInputDocument): Unit ={
+ val mapped = fieldMapper.mapFields(dict, true)
+ for (k <- mapped.keySet()) {
+ val key = if (suffix == null || suffix.isEmpty || k.endsWith(suffix)) k else k + suffix
+ solrDoc.setField(key, mapped(k))
+ }
+ }
+}
From 498122e0f5458784db8aee30ccc64ae37e739749 Mon Sep 17 00:00:00 2001
From: Tom Barber
Date: Sat, 17 Jul 2021 22:10:46 +0100
Subject: [PATCH 140/335] update some flow logic
---
sparkler-core/conf/sparkler-default.yaml | 8 +--
.../irds/sparkler/util/FetcherDefault.java | 1 +
.../src/main/resources/sparkler-default.yaml | 8 +--
.../irds/sparkler/plugin/FetcherChrome.java | 49 ++++++++++++-------
4 files changed, 41 insertions(+), 25 deletions(-)
diff --git a/sparkler-core/conf/sparkler-default.yaml b/sparkler-core/conf/sparkler-default.yaml
index ce165a50..52ba4bb4 100644
--- a/sparkler-core/conf/sparkler-default.yaml
+++ b/sparkler-core/conf/sparkler-default.yaml
@@ -86,7 +86,7 @@ fetcher.headers:
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
Accept-Language: "en-US,en"
-fetcher.persist.content.location: /dbfs/FileStore/bcf/content/
+fetcher.persist.content.location: /tmp/content/
fetcher.persist.content.filename: hash
# Rotating agents file.
@@ -112,7 +112,7 @@ plugins.active:
# - scorer-dd-svn
# - fetcher-jbrowser
# - fetcher-htmlunit
-# - fetcher-chrome
+ - fetcher-chrome
# All Plugins are listed under this tree
plugins:
@@ -133,11 +133,11 @@ plugins:
#socket.timeout: 3000
#connect.timeout: 3000
fetcher.chrome:
- chrome.dns: "http://localhost:3000/webdriver"
+ chrome.dns: "local"
chrome.selenium.screenshotdir: "/dbfs/FileStore/screenshots/"
chrome.options:
- "--no-sandbox"
- - "--headless"
+
- "--disable-gpu"
- "--disable-extensions"
- "--ignore-certificate-errors"
diff --git a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
index d55e859b..55c6ee6d 100644
--- a/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
+++ b/sparkler-core/sparkler-api/src/main/java/edu/usc/irds/sparkler/util/FetcherDefault.java
@@ -202,6 +202,7 @@ public FetchedData fetch(Resource resource) throws Exception {
}
}
+
@Override
public FetchedData apply(Resource resource) {
try {
diff --git a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
index 0e740251..049614fe 100644
--- a/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
+++ b/sparkler-core/sparkler-app/src/main/resources/sparkler-default.yaml
@@ -87,7 +87,7 @@ fetcher.headers:
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
Accept-Language: "en-US,en"
-fetcher.persist.content.location: /dbfs/FileStore/bcf/content/
+fetcher.persist.content.location: /tmp/content
fetcher.persist.content.filename: hash
fetcher.persist.content.types:
- pdf
@@ -115,7 +115,7 @@ plugins.active:
# - scorer-dd-svn
# - fetcher-jbrowser
# - fetcher-htmlunit
-# - fetcher-chrome
+ - fetcher-chrome
# All Plugins are listed under this tree
plugins:
@@ -136,12 +136,12 @@ plugins:
#socket.timeout: 3000
#connect.timeout: 3000
fetcher.chrome:
- chrome.dns: "http://localhost:3000/webdriver"
+ chrome.dns: "local"
chrome.selenium.screenshotdir: "/dbfs/FileStore/screenshots/"
#chrome.proxy.address: 127.0.0.1:9998
chrome.options:
- "--no-sandbox"
- - "--headless"
+
- "--disable-gpu"
- "--disable-extensions"
- "--ignore-certificate-errors"
diff --git a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
index 12551d68..9e4a06a2 100644
--- a/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
+++ b/sparkler-core/sparkler-plugins/fetcher-chrome/src/main/java/edu/usc/irds/sparkler/plugin/FetcherChrome.java
@@ -45,6 +45,7 @@
import org.slf4j.LoggerFactory;
import java.io.File;
+import java.io.IOException;
import java.net.*;
import java.nio.file.Path;
import java.nio.file.Paths;
@@ -178,13 +179,13 @@ public void filterResponse(HttpResponse response, HttpMessageContents contents,
public FetchedData fetch(Resource resource) throws Exception {
startDriver(false);
LOG.info("Chrome FETCHER {}", resource.getUrl());
- FetchedData fetchedData;
- JSONObject json = null;
try {
checkSession();
} catch (Exception e){
System.out.println("failed to start selenium session");
}
+ long start = System.currentTimeMillis();
+ FetchedData fetchedData;
/*
* In this plugin we will work on only HTML data If data is of any other data
@@ -196,10 +197,34 @@ public FetchedData fetch(Resource resource) throws Exception {
// This should be true for all URLS ending with 4 character file extension
// return new FetchedData("".getBytes(), "application/html", ERROR_CODE) ;
return super.fetch(resource);
+ } else{
+ fetchedData = htmlFlow(resource, start);
}
- long start = System.currentTimeMillis();
+
+
+
+ LOG.debug("Time taken to load {} - {} ", resource.getUrl(), (System.currentTimeMillis() - start));
+
+ LOG.info("LATEST STATUS: " + latestStatus);
+
+
+ driver.quit();
+ driver = null;
+
+ return fetchedData;
+ }
+
+ public FetchedData dataFlow(Resource resource, long start){
+
+ return null;
+ }
+
+ public FetchedData htmlFlow(Resource resource, long start) throws IOException, java.text.ParseException {
+ FetchedData fetchedData;
+
LOG.debug("Time taken to create driver- {}", (System.currentTimeMillis() - start));
+ JSONObject json = null;
if(resource.getMetadata()!=null && !resource.getMetadata().equals("")){
json = processMetadata(resource.getMetadata());
@@ -272,19 +297,6 @@ public FetchedData fetch(Resource resource) throws Exception {
fetchedData = new FetchedData(html.getBytes(), "text/html", latestStatus);
fetchedData.setResource(resource);
- LOG.debug("Time taken to load {} - {} ", resource.getUrl(), (System.currentTimeMillis() - start));
-
- LOG.info("LATEST STATUS: " + latestStatus);
- /*if (!(latestStatus >= 200 && latestStatus < 300) && latestStatus != 0) {
- // If not fetched through plugin successfully
- // Falling back to default fetcher
- LOG.info("{} Failed to fetch the page. Falling back to default fetcher.", resource.getUrl());
- return super.fetch(resource);
- }*/
-
- driver.quit();
- driver = null;
-
return fetchedData;
}
@@ -312,7 +324,10 @@ private boolean isWebPage(String webUrl) {
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
HttpURLConnection conn = (HttpURLConnection)url.openConnection();
String contentType = conn.getHeaderField("Content-Type");
- return contentType.contains("json") || contentType.contains("text") || contentType.contains("ml") || conn.getResponseCode() == 302;
+ if(contentType == null && conn.getResponseCode() == 302){
+ return isWebPage(conn.getHeaderField("Location"));
+ }
+ return contentType.contains("json") || contentType.contains("text") || contentType.contains("ml");
} catch (Exception e) {
LOG.debug(e.getMessage(), e);
}
From 2e398a721fa3056d82735e3f58fb0679951d8ba9 Mon Sep 17 00:00:00 2001
From: Tom Barber