Skip to content

Commit

Permalink
Merge pull request #7 from Y2Z/more-tests-and-improvements
Browse files Browse the repository at this point in the history
More tests and improvements
  • Loading branch information
Sunshine committed Oct 30, 2021
2 parents c9ad5e8 + 9aea6b9 commit 40393e1
Show file tree
Hide file tree
Showing 9 changed files with 227 additions and 155 deletions.
11 changes: 6 additions & 5 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "dataurl"
version = "0.1.1"
version = "0.1.2"
authors = [
"Sunshine <sunshine@uberspace.net>",
]
Expand All @@ -18,6 +18,7 @@ include = [
license = "CC0-1.0"

[dependencies]
atty = "0.2.14"
base64 = "0.13.0"
clap = "2.33.3"
encoding_rs = "0.8.29"
Expand Down
52 changes: 26 additions & 26 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,34 +41,12 @@ Every release contains pre-built binaries for Windows, GNU/Linux, as well as pla
---------------------------------------------------


## Usage (crate)

```rust
use dataurl::DataUrl;

let data_url: DataUrl = DataUrl::parse("data:,Hello,%20World!")?;

assert_eq!(data_url.get_media_type(), "text/plain".to_string());
assert_eq!(data_url.get_media_type_no_default(), None);
assert_eq!(data_url.get_charset(), "US-ASCII".to_string());
assert_eq!(data_url.get_charset_no_default(), None);
assert!(!data_url.get_is_base64_encoded());
assert_eq!(data_url.get_data(), [72, 101, 108, 108, 111, 44, 32, 87, 111, 114, 108, 100, 33]);
assert_eq!(data_url.get_fragment(), None);
assert_eq!(data_url.to_string(), "data:,Hello%2C%20World%21");
assert_eq!(data_url.get_text(), "Hello, World!");
```


---------------------------------------------------


## Usage (CLI)

```console
dataurl "some text"
```

```console
dataurl -d 'data:text/html,text<a id%3D"b">ok</a>?a=v#f' > index.html
```
Expand All @@ -88,6 +66,28 @@ cat file.png | dataurl
---------------------------------------------------


## Usage (crate)

```rust
use dataurl::DataUrl;

let data_url: DataUrl = DataUrl::parse("data:,Hello,%20World!")?;

assert_eq!(data_url.get_media_type(), "text/plain".to_string());
assert_eq!(data_url.get_media_type_no_default(), None);
assert_eq!(data_url.get_charset(), "US-ASCII".to_string());
assert_eq!(data_url.get_charset_no_default(), None);
assert!(!data_url.get_is_base64_encoded());
assert_eq!(data_url.get_data(), [72, 101, 108, 108, 111, 44, 32, 87, 111, 114, 108, 100, 33]);
assert_eq!(data_url.get_fragment(), None);
assert_eq!(data_url.to_string(), "data:,Hello%2C%20World%21");
assert_eq!(data_url.get_text(), "Hello, World!");
```


---------------------------------------------------


## Flags and options

- `-b`: Encode data using base64
Expand All @@ -96,16 +96,16 @@ cat file.png | dataurl
- `-c`: Use custom `charset`
- `-f`: Append `fragment`
- `-i`: Specify `file` to read data from (use `-` for STDIN)
- `-m`: Adjust `media type`
- `-t`: Adjust `media type`


---------------------------------------------------


## References

- https://datatracker.ietf.org/doc/html/rfc2397
- https://datatracker.ietf.org/doc/html/rfc6838
- [RFC 2397 (The "data" URL scheme)](https://datatracker.ietf.org/doc/html/rfc2397)
- [RFC 6838 (Media Type Specifications and Registration Procedures)](https://datatracker.ietf.org/doc/html/rfc6838)


---------------------------------------------------
Expand Down
162 changes: 88 additions & 74 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,35 @@ use url::Url;

/// Media type reported when a data URL does not specify one.
const DEFAULT_MEDIA_TYPE: &str = "text/plain";

/// Charset assumed when a data URL does not specify one.
const DEFAULT_CHARSET: &str = "US-ASCII";

/// Media types outside of `text/*` whose payload is nevertheless treated as text.
///
/// Entries are compared ASCII-case-insensitively by `DataUrl::is_binary`.
const TEXTUAL_MEDIA_TYPES: &[&str] = &[
    "application/atom+xml",
    "application/dart",
    "application/ecmascript",
    "application/javascript",
    "application/json",
    "application/jwt",
    "application/rdf+xml",
    "application/rss+xml",
    "application/soap+xml",
    "application/vnd.mozilla.xul+xml",
    "application/x-javascript",
    "application/x-yaml",
    "application/xhtml+xml",
    "application/xml",
    "application/xml-dtd",
    "application/xop+xml",
    "application/yaml",
    "image/svg+xml",
    "message/imdn+xml",
    "model/x3d+xml",
];

// TODO: add support for other optional parameters besides charset (filename, etc)
/// In-memory representation of a `data:` URL.
///
/// Holds the optional media type and charset, the base64 flag, the payload
/// bytes, and an optional fragment.
// NOTE(review): the two `data` lines below look like the before/after pair of a
// rendered diff hunk (only the trailing comment differs) — confirm which one the
// real file keeps.
pub struct DataUrl {
media_type: Option<String>, // Media type
charset: Option<String>, // US-ASCII is default, according to the spec
is_base64_encoded: bool, // Indicates if it's a base64-encoded data URL
data: Vec<u8>, // Data, bytes, always UTF-8
data: Vec<u8>, // Data, bytes, UTF-8 if text
fragment: Option<String>, // #something-at-the-end, None by default
}

Expand All @@ -27,11 +49,6 @@ impl fmt::Debug for DataUrlParseError {
}
}

/// Returns `true` when `input` contains exactly one `/` separator.
///
/// Nothing else is checked: the parts on either side of the slash may be empty.
pub(crate) fn validate_media_type(input: &str) -> bool {
    // One slash splits the string into exactly two pieces.
    let slash_count = input.matches('/').count();
    slash_count == 1
}

pub(crate) fn parse_data_url_meta_data(
meta_data_string: String,
) -> (Option<String>, Option<String>, bool) {
Expand Down Expand Up @@ -67,6 +84,11 @@ pub(crate) fn parse_data_url_meta_data(
(media_type, charset, is_base64_encoded)
}

/// Returns `true` when `media_type` contains exactly one `/` separator.
///
/// Nothing else is checked: the parts on either side of the slash may be empty.
pub(crate) fn validate_media_type(media_type: &str) -> bool {
    // One slash splits the string into exactly two pieces.
    let slash_count = media_type.matches('/').count();
    slash_count == 1
}

impl DataUrl {
pub fn new() -> DataUrl {
DataUrl {
Expand All @@ -92,24 +114,22 @@ impl DataUrl {
parse_data_url_meta_data(meta_data_string);

// Parse raw data into vector of bytes
let mut data_string: String = percent_decode_str(&path[comma_offset + 1..])
.decode_utf8_lossy()
.to_string();
let mut d: Vec<u8> = percent_decode_str(&path[comma_offset + 1..]).collect();
if let Some(query) = url.query() {
data_string += "?";
data_string += &percent_decode_str(&query).decode_utf8_lossy().to_string();
d.push("?".as_bytes()[0]);
d.append(&mut percent_decode_str(&query).collect());
}
let mut unable_to_decode_base64: bool = false;
let blob: Vec<u8> = if is_base64_encoded {
match base64::decode(&data_string) {
match base64::decode(&d) {
Ok(decoded) => decoded,
Err(_) => {
unable_to_decode_base64 = true;
[].to_vec()
}
}
} else {
data_string.as_bytes().to_vec()
d
};

if unable_to_decode_base64 {
Expand All @@ -135,6 +155,26 @@ impl DataUrl {
}
}

/// Reports whether the payload should be treated as binary rather than text.
///
/// Returns `false` when no media type is set, when the part before the first
/// `/` is `text`, or when the full media type is listed in
/// `TEXTUAL_MEDIA_TYPES`. All comparisons ignore ASCII case. Every other
/// media type counts as binary.
pub fn is_binary(&self) -> bool {
    // An unset media type is never considered binary.
    let media_type: &str = match self.media_type.as_deref() {
        Some(mt) => mt,
        None => return false,
    };

    // `split` always yields at least one piece, so the fallback is never used.
    let top_level_type = media_type.split('/').next().unwrap_or("");

    let textual = top_level_type.eq_ignore_ascii_case("text")
        || TEXTUAL_MEDIA_TYPES
            .iter()
            .any(|known| media_type.eq_ignore_ascii_case(known));

    !textual
}

pub fn get_media_type(&self) -> &str {
if let Some(mt) = &self.media_type {
mt
Expand Down Expand Up @@ -184,59 +224,21 @@ impl DataUrl {
}

/// Sets or clears the charset of this data URL.
///
/// `Some(label)` is looked up with `Encoding::for_label_no_replacement`. A
/// recognised label stores the name reported by the encoding (`e.name()`) and
/// yields `true`; an unrecognised label clears the charset and yields `false`.
/// `None` clears the charset and yields `true`.
// NOTE(review): this span appears to interleave the removed and the added lines
// of a rendered diff hunk (both the `c`/`success` bookkeeping and the direct
// `self.charset = ...` assignments are present) — confirm against the real file.
pub fn set_charset(&mut self, new_charset: Option<String>) -> bool {
let c: Option<String>;
let success;

if let Some(nc) = new_charset {
// Validate the input
if let Some(e) = Encoding::for_label_no_replacement(nc.as_bytes()) {
c = Some(e.name().to_string());
success = true;
self.charset = Some(e.name().to_string());
true
} else {
// Since browsers fall back to US-ASCII, so does this
c = None;
success = false;
self.charset = None;
false
}
} else {
// Unset
c = None;
success = true;
}

/*
// Check if already has the same charset
if self.charset != c {
if self.data.len() > 0 {
// Re-encode existing data from old charset into new one
// Can be lossy if not careful
// 1. Decode our current data into UTF-8 (if needed)
if self.charset.is_some()
&& self.charset != Some("US-ASCII".to_string())
&& self.charset != Some("windows-1252".to_string())
&& self.charset != Some("UTF-8".to_string())
{
if let Some(encoding) = Encoding::for_label_no_replacement(self.charset.as_ref().unwrap().as_bytes()) {
let (decoded, _, _) = encoding.decode(&self.data);
self.data = decoded.as_bytes().to_vec();
}
}
// 2. Encode our UTF-8 data into whatever encoding it's now set to have
if let Some(encoding) = Encoding::for_label_no_replacement(c.clone().unwrap().as_bytes()) {
let input = &String::from_utf8_lossy(&self.data);
let (encoded, _, _) = encoding.encode(input);
self.data = encoded.to_vec();
}
}
self.charset = c;
self.charset = None;
true
}
*/

self.charset = c;

success
}

// TODO: ditch get/set_is_base64_encoded and implement two separate functions, to_percent_encoded_string and to_base64_encoded_string?
Expand Down Expand Up @@ -314,7 +316,7 @@ impl DataUrl {
}

if let Some(c) = &self.charset {
// windows-1252 is another name for US-ASCII, the default charset in data URLs
// windows-1252 is another name for US-ASCII, the default charset for data URLs
if c != "windows-1252" {
result += ";charset=";
result += &c;
Expand All @@ -328,25 +330,37 @@ impl DataUrl {
result += ",";

if self.data.len() > 0 {
let data_as_utf8_string: String = String::from_utf8_lossy(&self.data).to_string();
let fallback_charset: String = if data_as_utf8_string.is_ascii() {
DEFAULT_CHARSET.to_string()
if self.is_binary() {
// Just encode as base64 or URI if data is binary
if self.is_base64_encoded {
result += &base64::encode(&self.data);
} else {
result += &percent_encode(&self.data, NON_ALPHANUMERIC).to_string();
}
} else {
"UTF-8".to_string()
};
// Charset only matters for textual data
let data_as_utf8_string: String =
String::from_utf8_lossy(&self.data).to_string();
let fallback_charset: String = if data_as_utf8_string.is_ascii() {
DEFAULT_CHARSET.to_string()
} else {
"UTF-8".to_string()
};

if let Some(encoding) = Encoding::for_label_no_replacement(
self.charset
.as_ref()
.unwrap_or(&fallback_charset)
.as_bytes(),
) {
let (encoded, _, _) = encoding.encode(&data_as_utf8_string);
if let Some(encoding) = Encoding::for_label_no_replacement(
self.charset
.as_ref()
.unwrap_or(&fallback_charset)
.as_bytes(),
) {
let (encoded, _, _) = encoding.encode(&data_as_utf8_string);

if self.is_base64_encoded {
result += &base64::encode(&encoded.to_vec());
} else {
result += &percent_encode(&encoded.to_vec(), NON_ALPHANUMERIC).to_string();
if self.is_base64_encoded {
result += &base64::encode(&encoded.to_vec());
} else {
result +=
&percent_encode(&encoded.to_vec(), NON_ALPHANUMERIC).to_string();
}
}
}
}
Expand Down
Loading

0 comments on commit 40393e1

Please sign in to comment.