Resolve the backlink issue #23
rajatIIT authored and aecio committed Mar 10, 2016
1 parent fa551bc commit 9d1539b
Showing 10 changed files with 277 additions and 9 deletions.
5 changes: 4 additions & 1 deletion config/sample_config/link_storage/backlink.cfg
@@ -2,7 +2,10 @@
# This configuration file has the parameters to run the LinkClassifier
#
####
. ../home.cfg
.

MOZ_ACCESS_ID mozscape-4a1d0827fc
MOZ_KEY d6ea0c3b253ab44425769e422624a0f

PATTERN_INI ,"uu":"
2 changes: 1 addition & 1 deletion config/sample_config/link_storage/link_storage.cfg
@@ -14,7 +14,7 @@ GRAB_LINKS TRUE
USE_SCOPE FALSE

#Gets backlinks of the pages from a search engine; used by the bipartite crawling
SAVE_BACKLINKS FALSE
SAVE_BACKLINKS TRUE

#Type of classifier used by link storage
##LinkClassifierBaseline: random link strategy when no page classifier is provided, or Soumen's baseline strategy when a page classifier is provided
2 changes: 1 addition & 1 deletion config/sample_config/target_storage/target_storage.cfg
@@ -33,7 +33,7 @@ USE_CLASSIFIER TRUE
HARD_FOCUS TRUE

#Run bipartite crawler
BIPARTITE FALSE
BIPARTITE TRUE

#Relevance threshold for classified pages. Pages with probability of being
#relevant above this threshold are considered relevant
1 change: 1 addition & 0 deletions src/main/java/focusedCrawler/Main.java
@@ -279,6 +279,7 @@ private static void startCrawl(CommandLine cmd) throws MissingArgumentException

Path linkStorageConf = Paths.get(configPath, "/link_storage/link_storage.cfg");
ParameterFile linkStorageConfig = new ParameterFile(linkStorageConf.toFile());
linkStorageConfig.putParam("CONFIG_DIR", configPath);

try {
Storage linkStorage = LinkStorage.createLinkStorage(configPath, seedPath,
5 changes: 5 additions & 0 deletions src/main/java/focusedCrawler/link/LinkStorage.java
@@ -250,9 +250,14 @@ private static BipartiteGraphManager createBipartiteGraphManager(LinkStorageConf
String patternIniTitle = config.getBackSurferConfig().getPatternIniTitle();
String patternEndTitle = config.getBackSurferConfig().getPatternEndTitle();

String mozAccessID = config.getBackSurferConfig().getMozAccessId();
String mozKey = config.getBackSurferConfig().getMozKey();

SimpleWrapper simpleWrapper = new SimpleWrapper(patternIni, patternEnd);
SimpleWrapper simpleWrapperTitle = new SimpleWrapper(patternIniTitle, patternEndTitle);
BacklinkSurfer surfer = new BacklinkSurfer(simpleWrapper, simpleWrapperTitle);
surfer.setAccessID(mozAccessID);
surfer.setPassKey(mozKey);

LinkClassifier bClassifier = new LinkClassifierHub();
manager = new BipartiteGraphManager(frontierManager, graphRep, linkClassifier, bClassifier);
12 changes: 12 additions & 0 deletions src/main/java/focusedCrawler/link/LinkStorageConfig.java
@@ -10,12 +10,24 @@ public static class BackSurferConfig {
private final String patternEnd;
private final String patternIniTitle;
private final String patternEndTitle;
private final String mozAccessId;
private final String mozKey;

public BackSurferConfig(ParameterFile params) {
patternIni = params.getParam("PATTERN_INI");
patternEnd = params.getParam("PATTERN_END");
patternIniTitle = params.getParam("PATTERN_INI_TITLE");
patternEndTitle = params.getParam("PATTERN_END_TITLE");
mozAccessId = params.getParam("MOZ_ACCESS_ID");
mozKey = params.getParam("MOZ_KEY");
}

public String getMozAccessId() {
return mozAccessId;
}

public String getMozKey() {
return mozKey;
}

public String getPatternIni() {
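
For orientation only (not part of the commit), a minimal sketch of how the new getters can be exercised against the sample backlink.cfg shown earlier; the class name and config path below are illustrative:

import java.io.File;

import focusedCrawler.link.LinkStorageConfig.BackSurferConfig;
import focusedCrawler.util.ParameterFile;

public class BackSurferConfigExample {
    public static void main(String[] args) throws Exception {
        // Load the sample config that now carries the Moz credentials.
        ParameterFile params =
                new ParameterFile(new File("config/sample_config/link_storage/backlink.cfg"));
        BackSurferConfig backSurfer = new BackSurferConfig(params);
        // New accessors added by this commit.
        System.out.println(backSurfer.getMozAccessId()); // e.g. mozscape-4a1d0827fc in the sample config
        System.out.println(backSurfer.getMozKey());
    }
}
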
src/main/java/focusedCrawler/link/classifier/builder/BacklinkSurfer.java
@@ -38,14 +38,21 @@
import java.net.URL;
import java.net.MalformedURLException;
import java.net.URLEncoder;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.io.FileWriter;
import java.util.Vector;
import java.net.URLConnection;
import java.io.*;

import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class BacklinkSurfer {

private String googleBacklink; // = "http://www.google.com/search?sourceid=navclient&ie=UTF-8&q=link%3A";
@@ -61,6 +68,8 @@ public class BacklinkSurfer {
private HashSet<String> newURLs;
private HashMap<String, VSMElement>[] levels = new HashMap[3];
private Vector<LinkNeighborhood>[] lns = new Vector[3];
private String access;
private String key;

public BacklinkSurfer(StopList stoplist, SimpleWrapper wrapperURL,
String backlink, FileWriter out, int connectTimeout, int readTimeout, int numBack) {
@@ -129,18 +138,46 @@ record = record.substring(index,record.length());
}

private BackLinkNeighborhood[] downloadBacklinks(String host) throws IOException{
String backlink = "http://lsapi.seomoz.com/linkscape/links/" + host +"?AccessID=member-2e52b09aae&Expires=1365280453&Signature=WFcSAnhBG62xmt2f57bGrqCtiOM%3D&Filter=external&Scope=page_to_page&Limit=50&Sort=page_authority&SourceCols=4&TargetCols=4";

MozAuthenticator myAuthenticator = new MozAuthenticator(access,key,300);
String authStr = myAuthenticator.getAuthenticationStr();

String queryStr = "?Filter=external&Scope=page_to_page&Limit=50&Sort=page_authority&SourceCols=5&TargetCols=4&";

String backlink = "http://lsapi.seomoz.com/linkscape/links/" + host + queryStr + authStr;


Page page = downloadPage(newURL(backlink));
if (page == null) {
return null;
}
String[] links = wrapperURL.filterMultipleStrings(page.getContent());

ObjectMapper jacksonMapper = new ObjectMapper();
JsonNode root = jacksonMapper.readTree(page.getContent());


Iterator<JsonNode> childIterator = root.elements();
int resultSize = root.size();

String[] links = new String[resultSize];
String[] titles = new String[resultSize];

String mozLinksPattern = "uu";
String mozTitlePattern = "ut";

for(int i=0;i<resultSize;i++){

JsonNode next = childIterator.next();
links[i] = next.get(mozLinksPattern).asText();
titles[i] = next.get(mozTitlePattern).asText();
}

BackLinkNeighborhood[] backlinks = new BackLinkNeighborhood[links.length];
for (int i = 0; i < links.length; i++) {
backlinks[i] = new BackLinkNeighborhood();
backlinks[i].setLink("http://" + links[i]);
}
String[] titles = wrapperTitle.filterMultipleStrings(page.getContent());

for (int i = 0; i < titles.length; i++) {
backlinks[i].setTitle(titles[i]);
}
@@ -409,7 +446,17 @@ public static void main(String[] args) throws IOException {
ex1.printStackTrace();
}
out.close();
}
}

public void setAccessID(String accessID) {
this.access = accessID;
}


public void setPassKey(String passKey) {
this.key = passKey;
}

}


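For reference (not part of the commit), the rewritten downloadBacklinks() assumes the Mozscape links endpoint answers with a JSON array whose elements carry the linking URL in "uu" and its title in "ut". A self-contained sketch of that parsing step, with a made-up payload of the same shape:

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class MozResponseParsingExample {
    public static void main(String[] args) throws Exception {
        // Fabricated payload mimicking the shape downloadBacklinks() expects.
        String json = "[{\"uu\":\"example.org/page-a\",\"ut\":\"Page A\"},"
                    + "{\"uu\":\"example.org/page-b\",\"ut\":\"Page B\"}]";

        ObjectMapper mapper = new ObjectMapper();
        JsonNode root = mapper.readTree(json);

        String[] links = new String[root.size()];
        String[] titles = new String[root.size()];
        int i = 0;
        for (JsonNode element : root) {
            links[i] = element.get("uu").asText();  // linking URL ("uu")
            titles[i] = element.get("ut").asText(); // title of the linking page ("ut")
            i++;
        }
        // downloadBacklinks() then prefixes each link with "http://" when filling
        // the BackLinkNeighborhood array.
        System.out.println(links[0] + " | " + titles[0]);
    }
}
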
src/main/java/focusedCrawler/link/classifier/builder/MozAuthenticator.java
@@ -0,0 +1,148 @@
package focusedCrawler.link.classifier.builder;

import java.net.URLEncoder;
import java.security.InvalidKeyException;
import java.security.NoSuchAlgorithmException;
import java.util.Date;
import javax.xml.bind.DatatypeConverter;

import javax.crypto.Mac;
import javax.crypto.spec.SecretKeySpec;

public class MozAuthenticator {

private static final String HMAC_SHA1_ALGORITHM = "HmacSHA1";

/**
* accessID The user's Access ID
*/
private String accessID;

/**
* secretKey The user's Secret Key
*/
private String secretKey;

/**
* expiresInterval The interval after which the authentication string
* expires. Default: 300 seconds
*/
private long expiresInterval = 300;

public MozAuthenticator() {

}

/**
* Constructor to set all the variables
*
* @param accessID
* @param secretKey
* @param expiresInterval
*/
public MozAuthenticator(String accessID, String secretKey, long expiresInterval) {
this.accessID = accessID;
this.secretKey = secretKey;
this.expiresInterval = expiresInterval;
}

/**
*
* This method calculates the authentication String based on the user's
* credentials.
*
* Set the user credentials before calling this method
*
* @return the authentication string
*
* @see #setAccessID(String)
* @see #setSecretKey(String)
*/
public String getAuthenticationStr() {
long expires = ((new Date()).getTime()) / 1000 + expiresInterval;

String stringToSign = accessID + "\n" + expires;

SecretKeySpec signingKey = new SecretKeySpec(secretKey.getBytes(), HMAC_SHA1_ALGORITHM);

// get an hmac_sha1 Mac instance and initialize with the signing key
Mac mac = null;
try {
mac = Mac.getInstance(HMAC_SHA1_ALGORITHM);
mac.init(signingKey);
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
return "";
} catch (InvalidKeyException e) {
e.printStackTrace();
return "";
}

// compute the hmac on input data bytes
byte[] rawHmac = mac.doFinal(stringToSign.getBytes());

// base64-encode the hmac and URL-encode it for use in a query string
String urlSafeSignature = URLEncoder.encode(EncodeBase64(rawHmac));

String authenticationStr = "AccessID=" + accessID + "&Expires=" + expires + "&Signature="
+ urlSafeSignature;

return authenticationStr;
}

/**
* Encodes the raw data in Base64 format
*
* @param rawData
* @return
*/
public String EncodeBase64(byte[] rawData) {
return DatatypeConverter.printBase64Binary(rawData);
}

/**
* @return the accessID
*/
public String getAccessID() {
return accessID;
}

/**
* @param accessID
* the accessID to set
*/
public void setAccessID(String accessID) {
this.accessID = accessID;
}

/**
* @return the secretKey
*/
public String getSecretKey() {
return secretKey;
}

/**
* @param secretKey
* the secretKey to set
*/
public void setSecretKey(String secretKey) {
this.secretKey = secretKey;
}

/**
* @return the expiresInterval
*/
public long getExpiresInterval() {
return expiresInterval;
}

/**
* @param expiresInterval
* the expiresInterval to set
*/
public void setExpiresInterval(long expiresInterval) {
this.expiresInterval = expiresInterval;
}

}
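
A minimal usage sketch (not part of the commit) showing how MozAuthenticator plugs into the request URL that BacklinkSurfer.downloadBacklinks() builds; the credentials and host below are placeholders, and the query parameters are copied from that method:

import focusedCrawler.link.classifier.builder.MozAuthenticator;

public class MozAuthenticatorExample {
    public static void main(String[] args) {
        // Placeholder credentials; real values come from MOZ_ACCESS_ID / MOZ_KEY in backlink.cfg.
        MozAuthenticator authenticator = new MozAuthenticator("member-xxxxxxxxxx", "secret-key", 300);

        // "AccessID=...&Expires=...&Signature=...", where the signature is the
        // HMAC-SHA1 of "accessID\nexpires", base64- and URL-encoded.
        String authStr = authenticator.getAuthenticationStr();

        String host = "www.example.com";
        String queryStr = "?Filter=external&Scope=page_to_page&Limit=50&Sort=page_authority"
                + "&SourceCols=5&TargetCols=4&";
        String backlinkUrl = "http://lsapi.seomoz.com/linkscape/links/" + host + queryStr + authStr;

        System.out.println(backlinkUrl);
    }
}
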
10 changes: 8 additions & 2 deletions src/main/java/focusedCrawler/util/ParameterFile.java
@@ -426,5 +426,11 @@ else if (tipo == URL) {
}

}

}

public void putParam(String key, String value) {
Vector valueVector = new Vector();
valueVector.addElement(value);
hash.put(key, valueVector);
}

}
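
The new putParam lets callers inject values at runtime, which is what Main.java now does with CONFIG_DIR. A small sketch (not part of the commit) of the round trip, assuming getParam resolves entries from the same table that putParam writes to; the config path is illustrative:

import java.nio.file.Paths;

import focusedCrawler.util.ParameterFile;

public class PutParamExample {
    public static void main(String[] args) throws Exception {
        // In Main.java this path comes from the crawler's configuration directory.
        String configPath = "config/sample_config";

        ParameterFile linkStorageConfig =
                new ParameterFile(Paths.get(configPath, "/link_storage/link_storage.cfg").toFile());

        // Inject a value at runtime and read it back through the usual lookup.
        linkStorageConfig.putParam("CONFIG_DIR", configPath);
        System.out.println(linkStorageConfig.getParam("CONFIG_DIR")); // config/sample_config
    }
}
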
src/test/java/focusedCrawler/link/classifier/builder/BacklinkSurferTest.java
@@ -0,0 +1,46 @@
package focusedCrawler.link.classifier.builder;

import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

import org.apache.commons.validator.routines.UrlValidator;
import org.junit.Test;

import focusedCrawler.util.parser.BackLinkNeighborhood;
import focusedCrawler.util.parser.SimpleWrapper;

public class BacklinkSurferTest {

@Test
public void backlinksShouldBeDownloaded() throws MalformedURLException, IOException {

// The wrapper is of no use anymore since parsing is JSON-based; the pattern is
// taken from the config file.
SimpleWrapper wrapper = new SimpleWrapper(",\"uu\":\"", "\"}");

BacklinkSurfer surfer = new BacklinkSurfer(wrapper);
surfer.setAccessID("mozscape-4a1d0827fc");
surfer.setPassKey("d6ea0c3b253ab44425769e422624a0f");

String[] testURLs = { "http://www.bbc.co.uk/news/health-30577776" };

for (String url : testURLs) {
BackLinkNeighborhood[] backlinks = surfer.getLNBacklinks(new URL(url));
assertEquals("Backlink extraction not working! ", true, isBackLinkSetValid(backlinks));
}

}

public boolean isBackLinkSetValid(BackLinkNeighborhood[] backlinks) {
UrlValidator validator = new UrlValidator();
for (BackLinkNeighborhood backlink : backlinks) {
if (validator.isValid(backlink.getLink()))
return true;
}
return false;
}

}
