Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Power Users Can Use Redis to Filter Already Downloaded URLs, Use HashSet for URL Matching From File #1914

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@
<artifactId>junit-vintage-engine</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<!-- Embedded Redis server; test scope only, not shipped with the application -->
<groupId>it.ozimov</groupId>
<artifactId>embedded-redis</artifactId>
<version>0.7.2</version>
<scope>test</scope>
</dependency>
<dependency>
<!-- jsoup HTML parser library @ http://jsoup.org/ -->
<groupId>org.jsoup</groupId>
Expand Down Expand Up @@ -88,6 +94,11 @@
<artifactId>Java-WebSocket</artifactId>
<version>1.5.1</version>
</dependency>
<dependency>
<!-- Jedis Redis client, used by AbstractRipper for the optional Redis-backed URL history -->
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>3.6.0</version>
</dependency>
</dependencies>
<build>
<plugins>
Expand Down
86 changes: 72 additions & 14 deletions src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import java.util.Map;
import java.util.Observable;
import java.util.Scanner;
import java.util.HashSet;
import org.apache.log4j.FileAppender;
import org.apache.log4j.Logger;
import org.jsoup.HttpStatusException;
Expand All @@ -23,6 +24,7 @@
import com.rarchives.ripme.ui.RipStatusMessage;
import com.rarchives.ripme.ui.RipStatusMessage.STATUS;
import com.rarchives.ripme.utils.Utils;
import redis.clients.jedis.Jedis;

public abstract class AbstractRipper
extends Observable
Expand All @@ -40,7 +42,8 @@ public abstract class AbstractRipper
RipStatusHandler observer = null;

private boolean completed = true;

// Redis client for the URL history; only assigned by initializeHistoryStore() when a Redis host is configured.
private Jedis jedis;
// In-memory set of already-downloaded URLs: filled from the URL history file by
// initializeHistoryStore() and appended to by writeDownloadedURL().
protected HashSet<String> urlHistoryHashSet = new HashSet<>();
public abstract void rip() throws IOException;
public abstract String getHost();
public abstract String getGID(URL url) throws MalformedURLException;
Expand All @@ -62,6 +65,27 @@ protected void stopCheck() throws IOException {
}
}

/**
 * Prepares the store that is consulted to decide whether a URL was already downloaded.
 * <p>
 * When "url_history.redis_cache.host" is configured, a Jedis client is connected to that host
 * (port taken from "url_history.redis_cache.port", default 6379). Otherwise every line of the
 * URL history file, trimmed, is loaded into {@link #urlHistoryHashSet}; a missing file leaves
 * the set untouched.
 */
protected void initializeHistoryStore() {
    String host = Utils.getConfigString("url_history.redis_cache.host", "");
    // Compare by content: `!=` on Strings compares references, so an empty value that is not
    // the interned "" literal would wrongly be treated as a configured Redis host.
    if (host != null && !host.isEmpty()) {
        int port = Utils.getConfigInteger("url_history.redis_cache.port", 6379);
        jedis = new Jedis(host, port);
        return;
    }

    File file = new File(URLHistoryFile);
    if (!file.exists()) {
        return;
    }
    try (Scanner scanner = new Scanner(file)) {
        LOGGER.debug("Building url hash set");
        while (scanner.hasNextLine()) {
            final String lineFromFile = scanner.nextLine();
            urlHistoryHashSet.add(lineFromFile.trim());
        }
    } catch (FileNotFoundException e) {
        // The file can disappear between exists() and opening it; keep the stack trace in the log.
        LOGGER.error(e.toString(), e);
    }
}


/**
* Adds a URL to the url history file
Expand All @@ -72,7 +96,16 @@ protected void writeDownloadedURL(String downloadedURL) throws IOException {
if (Utils.getConfigBoolean("urls_only.save", false)) {
return;
}

if (Utils.getConfigString("url_history.redis_cache.host", "") != "") {
String keyPrefix = Utils.getConfigString("url_history.redis_cache.key_prefix", "");
String key = keyPrefix + downloadedURL.trim();
LOGGER.info("Setting in Redis: " + key);
jedis.set(key, "true");
}

downloadedURL = normalizeUrl(downloadedURL);
urlHistoryHashSet.add(downloadedURL);
BufferedWriter bw = null;
FileWriter fw = null;
try {
Expand Down Expand Up @@ -132,23 +165,46 @@ public String normalizeUrl(String url) {
* Returns false if not yet downloaded.
*/
protected boolean hasDownloadedURL(String url) {
File file = new File(URLHistoryFile);
url = normalizeUrl(url);

try (Scanner scanner = new Scanner(file)) {
while (scanner.hasNextLine()) {
final String lineFromFile = scanner.nextLine();
if (lineFromFile.equals(url)) {
return true;
}
}
} catch (FileNotFoundException e) {
return false;
if (Utils.getConfigString("url_history.redis_cache.host", "") != "") {
return redisContainsURL(url);
} else {
return fileContainsURL(url);
}
}

return false;
/**
 * Looks a URL up in Redis to determine whether Ripme has already downloaded it.
 * The key is the configured "url_history.redis_cache.key_prefix" followed by the
 * trimmed, normalized URL.
 * @param url URL to check if downloaded
 * @return
 *      Returns true if Redis holds a value for the key.
 *      Returns false if it does not.
 */
private boolean redisContainsURL(String url) {
    final String prefix = Utils.getConfigString("url_history.redis_cache.key_prefix", "");
    final String redisKey = prefix + normalizeUrl(url.trim());
    final boolean present = jedis.get(redisKey) != null;
    if (present) {
        LOGGER.info(redisKey + " was found in redis");
    } else {
        LOGGER.info(redisKey + " not found in redis");
    }
    return present;
}

/**
 * Checks the in-memory URL history set (populated from the history file) to see
 * whether Ripme has already downloaded a URL.
 * @param url URL to check if downloaded
 * @return
 *      Returns true if the trimmed, normalized URL is in the set.
 *      Returns false if it is not.
 */
private boolean fileContainsURL(String url) {
    final String normalized = normalizeUrl(url.trim());
    final boolean present = urlHistoryHashSet.contains(normalized);
    LOGGER.debug("Found url in hash set: " + present);
    return present;
}

/**
* Ensures inheriting ripper can rip this URL, raises exception if not.
Expand Down Expand Up @@ -333,6 +389,7 @@ protected boolean addURLToDownload(URL url, String prefix, String subdirectory,
LOGGER.info("[+] Creating directory: " + Utils.removeCWD(saveFileAs.getParent()));
saveFileAs.getParentFile().mkdirs();
}

if (Utils.getConfigBoolean("remember.url_history", true) && !isThisATest()) {
LOGGER.info("Writing " + url.toExternalForm() + " to file");
try {
Expand Down Expand Up @@ -613,6 +670,7 @@ public void sendUpdate(STATUS status, Object message) {
*/
public void run() {
try {
initializeHistoryStore();
rip();
} catch (HttpStatusException e) {
LOGGER.error("Got exception while running ripper:", e);
Expand Down
113 changes: 98 additions & 15 deletions src/test/java/com/rarchives/ripme/tst/AbstractRipperTest.java
Original file line number Diff line number Diff line change
@@ -1,32 +1,115 @@
package com.rarchives.ripme.tst;

import com.rarchives.ripme.ripper.AbstractRipper;
import com.rarchives.ripme.tst.TestRipper;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.net.URL;

import com.rarchives.ripme.utils.Utils;
import redis.clients.jedis.Jedis;
import static org.junit.jupiter.api.Assertions.assertEquals;

import redis.embedded.RedisServer;

public class AbstractRipperTest {

@Test
public void testGetFileName() throws IOException {
String fileName = AbstractRipper.getFileName(new URL("http://www.tsumino.com/Image/Object?name=U1EieteEGwm6N1dGszqCpA%3D%3D"), "test", "test");
assertEquals("test.test", fileName);
public void testGetFileName() throws IOException {
String fileName = AbstractRipper.getFileName(new URL("http://www.tsumino.com/Image/Object?name=U1EieteEGwm6N1dGszqCpA%3D%3D"), "test", "test");
assertEquals("test.test", fileName);

fileName = AbstractRipper.getFileName(new URL("http://www.tsumino.com/Image/Object?name=U1EieteEGwm6N1dGszqCpA%3D%3D"), "test", null);
assertEquals("test", fileName);
fileName = AbstractRipper.getFileName(new URL("http://www.tsumino.com/Image/Object?name=U1EieteEGwm6N1dGszqCpA%3D%3D"), "test", null);
assertEquals("test", fileName);

fileName = AbstractRipper.getFileName(new URL("http://www.tsumino.com/Image/Object?name=U1EieteEGwm6N1dGszqCpA%3D%3D"), null, null);
assertEquals("Object", fileName);
fileName = AbstractRipper.getFileName(new URL("http://www.tsumino.com/Image/Object?name=U1EieteEGwm6N1dGszqCpA%3D%3D"), null, null);
assertEquals("Object", fileName);

fileName = AbstractRipper.getFileName(new URL("http://www.test.com/file.png"), null, null);
assertEquals("file.png", fileName);
fileName = AbstractRipper.getFileName(new URL("http://www.test.com/file.png"), null, null);
assertEquals("file.png", fileName);

fileName = AbstractRipper.getFileName(new URL("http://www.test.com/file."), null, null);
assertEquals("file.", fileName);
}

@Test
public void testHasDownloadedURL() throws IOException {
int testRedisPort = 6379;
RedisServer redisServer = new RedisServer(testRedisPort);
try {

fileName = AbstractRipper.getFileName(new URL("http://www.test.com/file."), null, null);
assertEquals("file.", fileName);
}
URL ripURL = new URL("https://example.com");
URL fileURL = new URL("https://example.com/picture.jpg");
TestRipper ripper = new TestRipper(ripURL);
redisServer.start();
Jedis jedis = new Jedis("localhost", testRedisPort);
// Test with empty redis
Utils.setConfigString("url_history.redis_cache.host", "localhost");
Utils.setConfigString("url_history.redis_cache.port", Integer.toString(testRedisPort));
// Make the ripper connect to redis
ripper.callInitializeHistoryStore();
boolean hasAlreadyDownloaded = ripper.callHasDownloadedURL(fileURL.toString());
assertEquals(false, hasAlreadyDownloaded);

// Test with URL loaded into redis
String keyPrefix = "somePrefix";
Utils.setConfigString("url_history.redis_cache.key_prefix", keyPrefix);
String key = keyPrefix + fileURL.toString().trim();
jedis.set(key, "true");
hasAlreadyDownloaded = ripper.callHasDownloadedURL(fileURL.toString());
assertEquals(true, hasAlreadyDownloaded);

redisServer.stop();


// Re-initialize and test using hash set instead
Utils.setConfigString("url_history.redis_cache.host", "");
ripper.callInitializeHistoryStore();
hasAlreadyDownloaded = ripper.callHasDownloadedURL(fileURL.toString());
assertEquals(false, hasAlreadyDownloaded);

// Test using hashset with URL added
ripper.insertToURLHashSet(fileURL.toString());
hasAlreadyDownloaded = ripper.callHasDownloadedURL(fileURL.toString());
assertEquals(true, hasAlreadyDownloaded);
} catch (Exception exception) {
// Ensure that the redis server is destroyed, otherwise test re-runs will fail because it can't start a new server
// on the same port
redisServer.stop();
throw exception;
}
}

@Test
public void testWriteDownloadedURL() throws IOException {
int testRedisPort = 6379;
RedisServer redisServer = new RedisServer(testRedisPort);
try {
URL ripURL = new URL("https://example.com");
URL fileURL = new URL("https://example.com/picture.jpg");
TestRipper ripper = new TestRipper(ripURL);
redisServer.start();
Jedis jedis = new Jedis("localhost", testRedisPort);
String keyPrefix = "somePrefix";
String key = keyPrefix + fileURL.toString().trim();
Utils.setConfigString("url_history.redis_cache.key_prefix", keyPrefix);
Utils.setConfigString("url_history.redis_cache.host", "localhost");
Utils.setConfigString("url_history.redis_cache.port", Integer.toString(testRedisPort));
Boolean urlInHashSet = ripper.checkURLInHashSet(fileURL.toString());
String jedisResult = jedis.get(key);
assertEquals(null, jedisResult);
assertEquals(false, urlInHashSet);
ripper.callInitializeHistoryStore();
ripper.callWriteDownloadedURL(fileURL.toString());
urlInHashSet = ripper.checkURLInHashSet(fileURL.toString());
assertEquals(true, urlInHashSet);
jedisResult = jedis.get(key);
assertEquals("true", jedisResult);
redisServer.stop();
} catch (Exception exception) {
// Ensure that the redis server is destroyed, otherwise test re-runs will fail because it can't start a new server
// on the same port
redisServer.stop();
throw exception;
}
}

}
Loading