Skip to content

Commit

Permalink
Fix for #116 - Store segment path of url content in SOLR
Browse files Browse the repository at this point in the history
Add segment field to the managed schema

Add segment field to the managed schema
  • Loading branch information
sujen1412 committed Apr 9, 2018
1 parent 946f20e commit 0de316c
Show file tree
Hide file tree
Showing 5 changed files with 14 additions and 0 deletions.
1 change: 1 addition & 0 deletions conf/solr/crawldb/conf/managed-schema
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@
<field name="fetch_depth" type="int" indexed="true" stored="true" default="0" multiValued="false" docValues="true" />
<field name="relative_path" type="string" indexed="true" stored="true" multiValued="false" docValues="true" />
<field name="parent" type="string" indexed="true" stored="true" multiValued="false" docValues="true" />
<field name="segment" type="string" indexed="true" stored="true" multiValued="false" docValues="true" />

<field name="modified_time" type="date" indexed="true" stored="true" multiValued="false" default="NOW" />
<field name="retries_since_fetch" type="int" indexed="true" stored="true" default="-1" multiValued="false" docValues="true" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ interface solr { // Solr Fields
String MD_SUFFIX = "_md";
String HDR_SUFFIX = "_hd";
String RESPONSE_TIME = "response_time";
String SEGMENT = "segment";
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ public class FetchedData implements Serializable {
private int responseCode;
private long responseTime = -1;

private String segment;

/**
 * No-argument constructor. Its body is empty, so every field keeps its
 * declared initializer or the Java default for its type.
 */
public FetchedData() {
}
Expand Down Expand Up @@ -92,6 +93,15 @@ public void setContentLength(Integer contentLength) {
this.contentLength = contentLength;
}

/**
 * Returns the segment associated with this fetched data.
 *
 * @return the value last passed to {@link #setSegment(String)}; the field
 *         has no initializer, so this is presumably {@code null} until a
 *         caller assigns it
 */
public String getSegment() {
return segment;
}

/**
 * Records the segment associated with this fetched data.
 * The value is stored as given; no validation or normalization is applied.
 *
 * @param segment the segment identifier to store
 */
public void setSegment(String segment) {
this.segment = segment;
}


/**
 * Replaces the fetched content bytes.
 * The array reference is stored directly (no defensive copy), so later
 * changes made to the array by the caller are visible through this object.
 *
 * @param content the raw content bytes to store
 */
public void setContent(byte[] content) {
this.content = content;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class FairFetcher(val job: SparklerJob, val resources: Iterator[Resource], val d
data.fetchedData = fetchedData.next
val endTime = System.currentTimeMillis()
data.fetchedData.getResource.setFetchTimestamp(data.fetchedData.getFetchedAt)
data.fetchedData.setSegment(job.currentTask)
lastHit = data.fetchedData.getResource.getUrl
if (data.fetchedData.getResponseTime < 0) {
data.fetchedData.setResponseTime(endTime - startTime)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ object StatusUpdateSolrTransformer extends (CrawlData => SolrInputDocument ) wit
sUpdate.setField(Constants.solr.SIGNATURE, hashFunction.hashBytes(data.fetchedData.getContent).toString)
sUpdate.setField(Constants.solr.RELATIVE_PATH, URLUtil.reverseUrl(data.fetchedData.getResource.getUrl))
sUpdate.setField(Constants.solr.OUTLINKS, data.parsedData.outlinks.toArray)
sUpdate.setField(Constants.solr.SEGMENT, data.fetchedData.getSegment)
sUpdate.setField(Constants.solr.RESPONSE_TIME, data.fetchedData.getResponseTime)
for ((scoreKey, score) <- data.fetchedData.getResource.getScore) {
sUpdate.setField(scoreKey, Map("set" -> score).asJava)
Expand Down

0 comments on commit 0de316c

Please sign in to comment.