-
Notifications
You must be signed in to change notification settings - Fork 440
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement youtube subscription import from Google takeout
- Loading branch information
Showing
6 changed files
with
310 additions
and
162 deletions.
There are no files selected for viewing
127 changes: 36 additions & 91 deletions
127
...rg/schabi/newpipe/extractor/services/youtube/extractors/YoutubeSubscriptionExtractor.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,126 +1,71 @@ | ||
package org.schabi.newpipe.extractor.services.youtube.extractors; | ||
|
||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import com.grack.nanojson.JsonArray; | ||
import com.grack.nanojson.JsonObject; | ||
import com.grack.nanojson.JsonParser; | ||
import com.grack.nanojson.JsonParserException; | ||
|
||
import org.schabi.newpipe.extractor.exceptions.ExtractionException; | ||
import org.schabi.newpipe.extractor.services.youtube.YoutubeService; | ||
import org.schabi.newpipe.extractor.subscription.SubscriptionExtractor; | ||
import org.schabi.newpipe.extractor.subscription.SubscriptionItem; | ||
import org.schabi.newpipe.extractor.utils.Parser; | ||
|
||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.util.ArrayList; | ||
import java.util.Collections; | ||
import java.util.List; | ||
|
||
import javax.annotation.Nonnull; | ||
|
||
import static org.schabi.newpipe.extractor.subscription.SubscriptionExtractor.ContentSource.INPUT_STREAM; | ||
|
||
/** | ||
* Extract subscriptions from a YouTube export (OPML format supported) | ||
* Extract subscriptions from a Google takeout export (the user has to get the JSON out of the zip) | ||
*/ | ||
public class YoutubeSubscriptionExtractor extends SubscriptionExtractor { | ||
private static final String BASE_CHANNEL_URL = "https://www.youtube.com/channel/"; | ||
|
||
public YoutubeSubscriptionExtractor(YoutubeService service) { | ||
super(service, Collections.singletonList(INPUT_STREAM)); | ||
public YoutubeSubscriptionExtractor(final YoutubeService youtubeService) { | ||
super(youtubeService, Collections.singletonList(INPUT_STREAM)); | ||
} | ||
|
||
@Override | ||
public String getRelatedUrl() { | ||
return "https://www.youtube.com/subscription_manager?action_takeout=1"; | ||
return "https://takeout.google.com/takeout/custom/youtube"; | ||
} | ||
|
||
@Override | ||
public List<SubscriptionItem> fromInputStream(InputStream contentInputStream) throws ExtractionException { | ||
if (contentInputStream == null) throw new InvalidSourceException("input stream is null"); | ||
|
||
return getItemsFromOPML(contentInputStream); | ||
} | ||
|
||
/*////////////////////////////////////////////////////////////////////////// | ||
// OPML implementation | ||
//////////////////////////////////////////////////////////////////////////*/ | ||
|
||
private static final String ID_PATTERN = "/videos.xml\\?channel_id=([A-Za-z0-9_-]*)"; | ||
private static final String BASE_CHANNEL_URL = "https://www.youtube.com/channel/"; | ||
|
||
private List<SubscriptionItem> getItemsFromOPML(InputStream contentInputStream) throws ExtractionException { | ||
final List<SubscriptionItem> result = new ArrayList<>(); | ||
|
||
final String contentString = readFromInputStream(contentInputStream); | ||
Document document = Jsoup.parse(contentString, "", org.jsoup.parser.Parser.xmlParser()); | ||
|
||
if (document.select("opml").isEmpty()) { | ||
throw new InvalidSourceException("document does not have OPML tag"); | ||
} | ||
|
||
if (document.select("outline").isEmpty()) { | ||
throw new InvalidSourceException("document does not have at least one outline tag"); | ||
} | ||
|
||
for (Element outline : document.select("outline[type=rss]")) { | ||
String title = outline.attr("title"); | ||
String xmlUrl = outline.attr("abs:xmlUrl"); | ||
|
||
try { | ||
String id = Parser.matchGroup1(ID_PATTERN, xmlUrl); | ||
result.add(new SubscriptionItem(service.getServiceId(), BASE_CHANNEL_URL + id, title)); | ||
} catch (Parser.RegexException ignored) { /* ignore invalid subscriptions */ } | ||
} | ||
|
||
return result; | ||
} | ||
|
||
/*////////////////////////////////////////////////////////////////////////// | ||
// Utils | ||
//////////////////////////////////////////////////////////////////////////*/ | ||
|
||
/** | ||
* Throws an exception if the string does not have the right tag/string from a valid export. | ||
*/ | ||
private void throwIfTagIsNotFound(String content) throws InvalidSourceException { | ||
if (!content.trim().contains("<opml")) { | ||
throw new InvalidSourceException("input stream does not have OPML tag"); | ||
} | ||
} | ||
|
||
private String readFromInputStream(InputStream inputStream) throws InvalidSourceException { | ||
StringBuilder contentBuilder = new StringBuilder(); | ||
boolean hasTag = false; | ||
public List<SubscriptionItem> fromInputStream(@Nonnull final InputStream contentInputStream) | ||
throws ExtractionException { | ||
final JsonArray subscriptions; | ||
try { | ||
byte[] buffer = new byte[16 * 1024]; | ||
int read; | ||
while ((read = inputStream.read(buffer)) != -1) { | ||
String currentPartOfContent = new String(buffer, 0, read, "UTF-8"); | ||
contentBuilder.append(currentPartOfContent); | ||
subscriptions = JsonParser.array().from(contentInputStream); | ||
} catch (JsonParserException e) { | ||
throw new InvalidSourceException("Invalid json input stream", e); | ||
} | ||
|
||
// Fail-fast in case of reading a long unsupported input stream | ||
if (!hasTag && contentBuilder.length() > 128) { | ||
throwIfTagIsNotFound(contentBuilder.toString()); | ||
hasTag = true; | ||
} | ||
boolean foundInvalidSubscription = false; | ||
final List<SubscriptionItem> subscriptionItems = new ArrayList<>(); | ||
for (final Object subscriptionObject : subscriptions) { | ||
if (!(subscriptionObject instanceof JsonObject)) { | ||
foundInvalidSubscription = true; | ||
continue; | ||
} | ||
} catch (InvalidSourceException e) { | ||
throw e; | ||
} catch (Throwable e) { | ||
throw new InvalidSourceException(e); | ||
} finally { | ||
try { | ||
inputStream.close(); | ||
} catch (IOException ignored) { | ||
|
||
final JsonObject subscription = ((JsonObject) subscriptionObject).getObject("snippet"); | ||
final String id = subscription.getObject("resourceId").getString("channelId", ""); | ||
if (id.length() != 24) { // e.g. UCsXVk37bltHxD1rDPwtNM8Q | ||
foundInvalidSubscription = true; | ||
continue; | ||
} | ||
} | ||
|
||
final String fileContent = contentBuilder.toString().trim(); | ||
if (fileContent.isEmpty()) { | ||
throw new InvalidSourceException("Empty input stream"); | ||
subscriptionItems.add(new SubscriptionItem(service.getServiceId(), | ||
BASE_CHANNEL_URL + id, subscription.getString("title", ""))); | ||
} | ||
|
||
if (!hasTag) { | ||
throwIfTagIsNotFound(fileContent); | ||
if (foundInvalidSubscription && subscriptionItems.isEmpty()) { | ||
throw new InvalidSourceException("Found only invalid channel ids"); | ||
} | ||
|
||
return fileContent; | ||
return subscriptionItems; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.