Skip to content

Commit

Permalink
Added support to CDRv3.1 in AcheToCdrExporter
Browse files Browse the repository at this point in the history
  • Loading branch information
aecio committed Jun 19, 2017
1 parent e655eed commit 3a5c616
Show file tree
Hide file tree
Showing 3 changed files with 359 additions and 16 deletions.
56 changes: 40 additions & 16 deletions src/main/java/focusedCrawler/memex/cdr/AcheToCdrExporter.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import java.util.Iterator;
import java.util.zip.GZIPOutputStream;

import com.fasterxml.jackson.databind.ObjectMapper;

import focusedCrawler.target.model.TargetModelJson;
import focusedCrawler.target.repository.FileSystemTargetRepository;
import focusedCrawler.target.repository.FileSystemTargetRepository.DataFormat;
Expand All @@ -20,15 +22,17 @@
@Command(name="AcheToCdrExporter", description="Exports crawled data to CDR format")
public class AcheToCdrExporter extends CliTool {

private static final ObjectMapper jsonMapper = new ObjectMapper();

//
// Input data options
//

@Option(name = "--input-path", description="Path to ACHE data target folder", required=true)
private String inputPath;

@Option(name={"--repository-type", "-rt"}, description="Which repository type should be used", required=true)
private RepositoryType repositoryType;
@Option(name={"--repository-type", "-rt"}, description="Which repository type should be used")
private RepositoryType repositoryType = RepositoryType.FILES;

public enum RepositoryType {
FILES, FILESYSTEM_JSON;
Expand All @@ -45,10 +49,10 @@ public enum RepositoryType {
//

@Option(name="--cdr-version", description="Which CDR version should be used")
private CDRVersion cdrVersion = CDRVersion.CDRv2;
private CDRVersion cdrVersion = CDRVersion.CDRv31;

public enum CDRVersion {
CDRv2, CDRv3
CDRv2, CDRv3, CDRv31
}

@Option(name="--output-file", description="Gziped output file containing data formmated as per CDR schema")
Expand All @@ -63,7 +67,7 @@ public enum CDRVersion {
String outputType;

@Option(name={"--output-es-url", "-ou"}, description="ElasticSearch full HTTP URL address")
String elasticSearchServer = "http://localhost:9200";
String elasticSearchServer = null;

@Option(name={"--output-es-auth", "-oa"}, description="User and password for ElasticSearch in format: user:pass")
String userPass = null;
Expand Down Expand Up @@ -117,7 +121,7 @@ public void execute() throws Exception {
try{
processRecord(pageModel);
processedPages++;
if(processedPages % 100 == 0) {
if(processedPages % 1 == 0) {
System.out.printf("Processed %d pages\n", processedPages);
}
} catch(Exception e) {
Expand All @@ -138,29 +142,30 @@ private void processRecord(TargetModelJson pageModel) throws IOException {
String contentType = pageModel.getContentType();

if (contentType == null || contentType.isEmpty()) {
System.err.println("Ignoring URL with no content-type: "+pageModel.getUrl());
System.err.println("Ignoring URL with no content-type: " + pageModel.getUrl());
return;
}

if (!contentType.startsWith("text/")) {
if (!contentType.startsWith("text/html")) {
return;
}

if(cdrVersion == CDRVersion.CDRv2) {
if (cdrVersion == CDRVersion.CDRv31) {
createCDR31DocumentJson(pageModel);
} else if (cdrVersion == CDRVersion.CDRv2) {
createCDR2DocumentJson(pageModel);
} else {
createCDR3DocumentJson(pageModel);
}
if(doc != null&& out != null) {
out.println(doc);

if (doc != null && out != null) {
out.println(jsonMapper.writeValueAsString(doc));
}
if(bulkIndexer != null) {

if (bulkIndexer != null) {
bulkIndexer.addDocument(outputIndex, outputType, doc, id);
}



}

public void createCDR2DocumentJson(TargetModelJson pageModel) {
Expand Down Expand Up @@ -200,4 +205,23 @@ public void createCDR3DocumentJson(TargetModelJson pageModel) {
this.doc = doc;
}

/**
 * Builds a CDRv3.1 document from a crawled page and stores it, together with
 * its generated id, in this exporter's {@code doc} and {@code id} fields.
 *
 * The team and crawler names are fixed to "NYU" and "ACHE".
 *
 * @param pageModel the crawled page to be converted
 */
public void createCDR31DocumentJson(TargetModelJson pageModel) {
    CDR31Document.Builder builder = new CDR31Document.Builder()
            .setUrl(pageModel.getUrl())
            // NOTE(review): assumes getFetchTime() is in epoch milliseconds,
            // as required by the Date constructor -- confirm against TargetModelJson.
            .setTimestampCrawl(new Date(pageModel.getFetchTime()))
            .setTimestampIndex(new Date())
            .setContentType(pageModel.getContentType())
            .setResponseHeaders(pageModel.getResponseHeaders())
            .setRawContent(pageModel.getContentAsString())
            .setTeam("NYU")
            .setCrawler("ACHE");

    CDR31Document doc = builder.build();
    this.id = doc.getId();
    this.doc = doc;
}

}
260 changes: 260 additions & 0 deletions src/main/java/focusedCrawler/memex/cdr/CDR31Document.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
package focusedCrawler.memex.cdr;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringJoiner;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.tika.mime.MediaType;

import com.fasterxml.jackson.annotation.JsonFormat;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonInclude.Include;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;

/**
 * Represents a web page according to the CDRv3.1 schema.
 *
 * Instances are created through {@link Builder}, which validates the mandatory
 * fields and fills in defaults for the optional ones. Fields annotated with
 * {@link JsonProperty} are written to JSON under the given names; null fields
 * are omitted ({@link Include#NON_NULL}) and the document id is excluded from
 * the JSON body via {@link JsonIgnore} on its getter.
 */
@SuppressWarnings("serial")
@JsonInclude(Include.NON_NULL)
public class CDR31Document implements Serializable {

    /**
     * A media object referenced by a document (an entry of the "objects" field).
     *
     * Implements {@link Serializable} because the enclosing document is
     * serializable and holds a list of these objects; otherwise Java
     * serialization of a document with a non-empty list would fail.
     */
    public static class CDR31MediaObject implements Serializable {

        @JsonProperty("obj_original_url")
        String objOriginalUrl;

        @JsonProperty("obj_stored_url")
        String objStoredUrl;

        @JsonProperty("content_type")
        private String contentType;

        @JsonProperty("response_headers")
        private Map<String, String> responseHeaders;

        @JsonProperty("timestamp_crawl")
        @JsonFormat(shape = JsonFormat.Shape.STRING, pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'")
        private Date timestampCrawl;

    }

    // Document identifier; not part of the JSON body (see getId()).
    private String _id;

    @JsonProperty("content_type")
    private String contentType;

    @JsonProperty("crawler")
    private String crawler;

    @JsonProperty("objects")
    private List<CDR31MediaObject> objects;

    @JsonProperty("raw_content")
    private String rawContent;

    @JsonProperty("response_headers")
    private Map<String, String> responseHeaders;

    @JsonProperty("team")
    private String team;

    @JsonProperty("timestamp_crawl")
    @JsonFormat(shape = JsonFormat.Shape.STRING, pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'")
    private Date timestampCrawl;

    @JsonProperty("timestamp_index")
    @JsonFormat(shape = JsonFormat.Shape.STRING, pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'")
    private Date timestampIndex;

    @JsonProperty("url")
    private String url;

    @JsonProperty("version")
    private final float version = 3.1f;

    public CDR31Document() {
        // required for JSON deserialization
    }

    /**
     * Copies all field values from the given builder. Prefer
     * {@link Builder#build()}, which validates and defaults the fields first.
     *
     * @param builder the builder holding the field values
     */
    public CDR31Document(Builder builder) {
        this._id = builder._id;
        this.contentType = builder.contentType;
        this.crawler = builder.crawler;
        this.objects = builder.objects;
        this.rawContent = builder.rawContent;
        this.responseHeaders = builder.responseHeaders;
        this.team = builder.team;
        this.timestampCrawl = builder.timestampCrawl;
        this.timestampIndex = builder.timestampIndex;
        this.url = builder.url;
    }

    public String getUrl() {
        return url;
    }

    public Date getTimestampCrawl() {
        return timestampCrawl;
    }

    public Date getTimestampIndex() {
        return timestampIndex;
    }

    public String getTeam() {
        return team;
    }

    public String getCrawler() {
        return crawler;
    }

    public String getRawContent() {
        return rawContent;
    }

    public String getContentType() {
        return contentType;
    }

    /**
     * @return the document id; excluded from the serialized JSON body
     */
    @JsonIgnore
    public String getId() {
        return this._id;
    }

    public float getVersion() {
        return version;
    }

    /**
     * Fluent builder for {@link CDR31Document}.
     */
    public static class Builder {

        private static final TikaExtractor extractor = new TikaExtractor();
        private static final ObjectMapper jsonMapper = new ObjectMapper();

        private String _id;
        private String contentType;
        private String crawler;
        private List<CDR31MediaObject> objects;
        private String rawContent;
        private Map<String, String> responseHeaders;
        private String team;
        private Date timestampCrawl;
        private Date timestampIndex;
        private String url;

        /**
         * Validates the mandatory fields, fills in defaults for the optional
         * ones and creates the document.
         *
         * Defaults applied when a field was not set: the content type is
         * detected from the raw content and URL, response headers become an
         * empty map, objects become an empty list, and the id is generated by
         * {@link #computeId()}.
         *
         * @return the new document
         * @throws IllegalArgumentException if url, raw content, crawler, team
         *         or index timestamp is missing
         */
        public CDR31Document build() {

            if (this.url == null)
                throw new IllegalArgumentException("Field 'url' is mandatory");
            if (this.rawContent == null)
                throw new IllegalArgumentException("Field 'raw_content' is mandatory");
            if (this.crawler == null)
                throw new IllegalArgumentException("Field 'crawler' is mandatory");
            if (this.team == null)
                throw new IllegalArgumentException("Field 'team' is mandatory");
            if (this.timestampIndex == null)
                throw new IllegalArgumentException("Field 'timestampIndex' is mandatory");

            if (this.contentType == null) {
                MediaType mediaType = extractor.detect(this.rawContent, this.url, this.contentType);
                this.contentType = mediaType.getBaseType().toString();
            }

            if (this.responseHeaders == null) {
                this.responseHeaders = new HashMap<>();
            }

            if (this.objects == null) {
                this.objects = new ArrayList<>();
            }

            if (this._id == null) {
                // auto-generate _id field
                this._id = computeId();
            }

            return new CDR31Document(this);
        }

        /**
         * Builds the document and serializes it to a JSON string.
         *
         * @return the JSON representation of the built document
         * @throws JsonProcessingException if serialization fails
         */
        public String buildAsJson() throws JsonProcessingException {
            return jsonMapper.writeValueAsString(this.build());
        }

        /**
         * Generates the id as the upper-case SHA-256 hex digest of
         * "url-timestampCrawl".
         *
         * NOTE(review): the timestamp is appended via Date.toString(), whose
         * output depends on the JVM default time zone (and is the text "null"
         * when no crawl timestamp was set), so the same page and crawl time can
         * yield different ids on differently configured machines. Left as-is to
         * keep already generated ids stable; consider hashing the epoch
         * milliseconds instead if ids may be changed.
         */
        private String computeId() {
            StringBuilder textForId = new StringBuilder();
            textForId.append(this.url);
            textForId.append("-");
            textForId.append(this.timestampCrawl);
            return DigestUtils.sha256Hex(textForId.toString()).toUpperCase();
        }

        public Builder setId(String id) {
            this._id = id;
            return this;
        }

        public Builder setUrl(String url) {
            this.url = url;
            return this;
        }

        public Builder setTimestampCrawl(Date timestampCrawl) {
            this.timestampCrawl = timestampCrawl;
            return this;
        }

        public Builder setTimestampIndex(Date timestampIndex) {
            this.timestampIndex = timestampIndex;
            return this;
        }

        public Builder setTeam(String team) {
            this.team = team;
            return this;
        }

        public Builder setCrawler(String crawler) {
            this.crawler = crawler;
            return this;
        }

        public Builder setRawContent(String rawContent) {
            this.rawContent = rawContent;
            return this;
        }

        /**
         * Sets the response headers, flattening multi-valued headers into a
         * single comma-separated string per header name. Headers whose value
         * list is null are skipped.
         *
         * @param responseHeaders header names mapped to their values; may be
         *        null, in which case the headers are left unset and
         *        {@link #build()} falls back to an empty map
         * @return this builder
         */
        public Builder setResponseHeaders(Map<String, List<String>> responseHeaders) {
            if (responseHeaders == null) {
                // Pages without stored headers must not abort the export.
                this.responseHeaders = null;
                return this;
            }
            Map<String, String> headers = new HashMap<>();
            for (Entry<String, List<String>> header : responseHeaders.entrySet()) {
                if (header.getValue() != null) {
                    StringJoiner joiner = new StringJoiner(",");
                    for (String value : header.getValue()) {
                        joiner.add(value);
                    }
                    headers.put(header.getKey(), joiner.toString());
                }
            }
            this.responseHeaders = headers;
            return this;
        }

        public Builder setContentType(String contentType) {
            this.contentType = contentType;
            return this;
        }

        public Builder setObjects(List<CDR31MediaObject> mediaObjects) {
            this.objects = mediaObjects;
            return this;
        }
    }
}
Loading

0 comments on commit 3a5c616

Please sign in to comment.