Skip to content

Commit

Permalink
Merge pull request #78 from Alch-Emi/load-css-imports
Browse files Browse the repository at this point in the history
Load URLs in CSS and style attributes
  • Loading branch information
Sunshine committed Dec 10, 2019
2 parents 35f5e13 + cf347e0 commit 919e626
Show file tree
Hide file tree
Showing 4 changed files with 191 additions and 8 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
[package]
name = "monolith"
version = "2.0.23"
version = "2.1.0"
authors = [
"Sunshine <sunshine@uberspace.net>",
"Mahdi Robatipoor <mahdi.robatipoor@gmail.com>",
"Emmanuel Delaborde <th3rac25@gmail.com>",
"Emi Simpson <emi@alchemi.dev>",
]
description = "CLI tool for saving web pages as a single HTML file"

Expand Down
70 changes: 64 additions & 6 deletions src/html.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use http::retrieve_asset;
use js::attr_is_event_handler;
use std::collections::HashMap;
use std::default::Default;
use utils::{data_to_dataurl, is_valid_url, resolve_url, url_has_protocol};
use utils::{data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol};

lazy_static! {
static ref EMPTY_STRING: String = String::new();
Expand Down Expand Up @@ -129,18 +129,38 @@ pub fn walk_and_embed_assets(
let href_full_url: String =
resolve_url(&url, &attr.value.to_string())
.unwrap_or(EMPTY_STRING.clone());
let (css_dataurl, _) = retrieve_asset(
let replacement_text = match retrieve_asset(
cache,
&href_full_url,
true,
false,
"text/css",
opt_user_agent,
opt_silent,
opt_insecure,
)
.unwrap_or((EMPTY_STRING.clone(), EMPTY_STRING.clone()));
) {
// On successful retrieval, traverse CSS
Ok((css_data, _)) => resolve_css_imports(
cache,
&css_data,
true,
&href_full_url,
opt_no_images,
opt_user_agent,
opt_silent,
opt_insecure,
),

// If a network error occurred, warn
Err(e) => {
eprintln!("Warning: {}", e,);

// If failed to resolve, replace with absolute URL
href_full_url
}
};

attr.value.clear();
attr.value.push_slice(css_dataurl.as_str());
attr.value.push_slice(&replacement_text);
}
}
}
Expand Down Expand Up @@ -273,6 +293,24 @@ pub fn walk_and_embed_assets(
if opt_no_css {
// Empty inner content of STYLE tags
node.children.borrow_mut().clear();
} else {
for node in node.children.borrow_mut().iter_mut() {
if let NodeData::Text { ref contents } = node.data {
let mut tendril = contents.borrow_mut();
let replacement = resolve_css_imports(
cache,
tendril.as_ref(),
false,
&url,
opt_no_images,
opt_user_agent,
opt_silent,
opt_insecure,
);
tendril.clear();
tendril.push_slice(&replacement);
}
}
}
}
"form" => {
Expand Down Expand Up @@ -372,6 +410,7 @@ pub fn walk_and_embed_assets(
_ => {}
}

// Process style attributes
if opt_no_css {
// Get rid of style attributes
let mut style_attr_indexes = Vec::new();
Expand All @@ -384,6 +423,25 @@ pub fn walk_and_embed_assets(
for attr_index in style_attr_indexes {
attrs_mut.remove(attr_index);
}
} else {
// Otherwise, parse any links found in the attributes
for attribute in attrs_mut
.iter_mut()
.filter(|a| a.name.local.as_ref().eq_ignore_ascii_case("style"))
{
let replacement = resolve_css_imports(
cache,
attribute.value.as_ref(),
false,
&url,
opt_no_images,
opt_user_agent,
opt_silent,
opt_insecure,
);
attribute.value.clear();
attribute.value.push_slice(&replacement);
}
}

if opt_no_js {
Expand Down
124 changes: 124 additions & 0 deletions src/utils.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,46 @@
extern crate base64;

use self::base64::encode;
use http::retrieve_asset;
use regex::Regex;
use std::collections::HashMap;
use url::{ParseError, Url};

/// This monster of a regex is used to match any kind of URL found in CSS.
///
/// There are roughly three different categories that a found URL could fit
/// into:
/// - Font [found after a `src:` descriptor, as used in an `@font-face` rule]
/// - Stylesheet [denoted by an `@import` before the `url(...)`]
/// - Image [covers all other uses of the `url()` function]
///
/// This regex aims to extract the following information:
/// - What type of URL it is (font/image/css)
/// - Where the part that needs to be replaced is (incl. any wrapping quotes)
/// - What the URL is (excl. any wrapping quotes)
///
/// Essentially, the regex can be broken down into two parts:
///
/// `(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?`
/// This optional part matches the precursor to a font or CSS URL, and fills in
/// a match under either `<stylesheet>` (if it's a CSS URL) or `<font>` (if
/// it's a font). At most one of the two groups can match, and the precursor
/// only counts when at least one whitespace character separates it from
/// `url(`. A URL is treated as an image when neither group matched.
///
/// `url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)`
/// This matches the actual `url(...)` call and must always match. It sets
/// `<to_repl>` to everything between the parentheses (including any wrapping
/// quotes) and `<url>` to the URL itself (excluding wrapping quotes).
///
/// Note, however, that this does not perform any validation of the found URL.
/// Malformed CSS could lead to an invalid URL being present. It is therefore
/// recommended that the URL gets manually validated.
const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P<stylesheet>@import)|(?P<font>src\s*:))\s+)?url\((?P<to_repl>['"]?(?P<url>[^"'\)]+)['"]?)\)"###;

// Regexes compiled once and shared by the helpers in this module. Each
// `unwrap()` panics if its pattern fails to compile.
lazy_static! {
// Matches a leading run of lowercase ASCII letters/digits followed by a
// colon, i.e. something shaped like a URL scheme prefix ("http:", "data:").
static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap();
// Matches strings that start with "http://" or "https://".
static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap();
// Matches `url(...)` references in CSS; built from `CSS_URL_REGEX_STR`,
// which also defines the named capture groups.
static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap();
}

const MAGIC: [[&[u8]; 2]; 19] = [
Expand Down Expand Up @@ -75,3 +109,93 @@ pub fn resolve_url<T: AsRef<str>, U: AsRef<str>>(from: T, to: U) -> Result<Strin
};
Ok(result)
}

/// Rewrites every `url(...)` reference found in a piece of CSS so that it no
/// longer depends on the original location of the stylesheet.
///
/// Each reference matched by `REGEX_CSS_URL` is resolved against `href` and
/// then handled according to its kind:
/// - `@import url(...)`: the stylesheet is downloaded, processed recursively
///   (using its own URL as the new base) and embedded as a data URL.
/// - font (`src: url(...)`): always downloaded and embedded as a data URL.
/// - anything else (treated as an image): downloaded and embedded as a data
///   URL unless `opt_no_images` is set.
///
/// References that are opted out of, or whose download fails, are replaced
/// with their absolute URL; a failed download also prints a warning to
/// stderr. References whose URL cannot be resolved are left untouched. Every
/// replaced reference is emitted wrapped in double quotes.
///
/// # Arguments
/// - `cache`: asset cache forwarded to `retrieve_asset`.
/// - `css_string`: the CSS text to process.
/// - `as_dataurl`: when `true`, the processed CSS is returned encoded as a
///   `text/css` data URL instead of as plain CSS text.
/// - `href`: base URL that relative references are resolved against.
/// - `opt_no_images`: when `true`, image references are not downloaded.
/// - `opt_user_agent`, `opt_silent`, `opt_insecure`: forwarded unchanged to
///   `retrieve_asset`.
///
/// # Returns
/// The processed CSS, either as text or as a data URL (see `as_dataurl`).
///
/// NOTE(review): nothing in this function guards against stylesheets that
/// `@import` each other; unless `retrieve_asset` or the cache breaks such a
/// cycle, the recursion below would not terminate.
pub fn resolve_css_imports(
    cache: &mut HashMap<String, String>,
    css_string: &str,
    as_dataurl: bool,
    href: &str,
    opt_no_images: bool,
    opt_user_agent: &str,
    opt_silent: bool,
    opt_insecure: bool,
) -> String {
    // The output is assembled left to right from slices of the original text
    // and the replacements. This avoids tracking a running length difference
    // between input and output, which underflows (unsigned subtraction) as
    // soon as the replacements make the result shorter than the input.
    let mut resolved_css = String::with_capacity(css_string.len());
    // Byte offset in `css_string` up to which text has already been emitted.
    let mut copied_up_to = 0;

    for link in REGEX_CSS_URL.captures_iter(css_string) {
        let target_link = link
            .name("url")
            .expect("`url` is a mandatory group of REGEX_CSS_URL")
            .as_str();

        // Determine the type of link
        let is_stylesheet = link.name("stylesheet").is_some();
        let is_font = link.name("font").is_some();
        let is_image = !is_stylesheet && !is_font;

        // Generate absolute URL for content
        let embedded_url = match resolve_url(href, target_link) {
            Ok(url) => url,
            // Malformed URL: skip it. Its original text is copied verbatim
            // together with the next untouched slice.
            Err(_) => continue,
        };

        // Download the asset. If it's more CSS, resolve that too
        let fetched = if is_stylesheet {
            // The link is an @import link
            retrieve_asset(
                cache,
                &embedded_url,
                false,      // Formatting as data URL will be done later
                "text/css", // Expect CSS
                opt_user_agent,
                opt_silent,
                opt_insecure,
            )
            .map(|(content, _)| {
                resolve_css_imports(
                    cache,
                    &content,
                    true, // Finally, convert to a dataurl
                    &embedded_url,
                    opt_no_images,
                    opt_user_agent,
                    opt_silent,
                    opt_insecure,
                )
            })
        } else if (is_image && !opt_no_images) || is_font {
            // The link is some other, non-@import link
            retrieve_asset(
                cache,
                &embedded_url,
                true, // Format as data URL
                "",   // Unknown MIME type
                opt_user_agent,
                opt_silent,
                opt_insecure,
            )
            .map(|(data, _)| data)
        } else {
            // If it's a datatype that has been opt_no'd out of, replace with
            // absolute URL
            Ok(embedded_url.clone())
        };

        let content = fetched.unwrap_or_else(|e| {
            eprintln!("Warning: {}", e);

            // If failed to resolve, replace with absolute URL
            embedded_url
        });

        // Emit the untouched text preceding this reference, then the quoted
        // replacement in place of the original (possibly quoted) URL.
        let dest = link
            .name("to_repl")
            .expect("`to_repl` is a mandatory group of REGEX_CSS_URL");
        resolved_css.push_str(&css_string[copied_up_to..dest.start()]);
        resolved_css.push('"');
        resolved_css.push_str(&content);
        resolved_css.push('"');
        copied_up_to = dest.end();
    }

    // Emit whatever follows the last replaced reference.
    resolved_css.push_str(&css_string[copied_up_to..]);

    if as_dataurl {
        data_to_dataurl("text/css", resolved_css.as_bytes())
    } else {
        resolved_css
    }
}

0 comments on commit 919e626

Please sign in to comment.