forked from edgi-govdata-archiving/web-monitoring-ui
-
Notifications
You must be signed in to change notification settings - Fork 0
/
html-transforms.js
164 lines (145 loc) · 5.35 KB
/
html-transforms.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
/**
* HtmlTransforms are functions that take an HTML Document and modify it in
* some useful way, such as removing scripts.
* @typedef {(document: HTMLDocument) => HTMLDocument} HtmlTransform
*/
/**
 * Combines any number of transforms into a single transform. The combined
 * transform hands the document to each given transform in order, feeding each
 * one the result of the one before it, and returns the final result. Falsy
 * entries (such as `null` or `undefined`) are skipped, and if nothing usable
 * is left, the combined transform simply returns its input.
 * @param {...HtmlTransform} transforms The transforms to combine.
 * @returns {HtmlTransform}
 */
export function compose (...transforms) {
  const steps = transforms.filter(Boolean);
  if (!steps.length) {
    return document => document;
  }

  return (input) => {
    let output = input;
    for (const step of steps) {
      output = step(output);
    }
    return output;
  };
}
/**
 * Takes an html document and removes all the stylesheets (`<link>` elements
 * whose `rel` includes `stylesheet`, and `<style>` elements), all `<script>`
 * elements, and all inline `style` attributes from it. Stylesheet and script
 * elements with an id or a class that starts with 'wm-' are kept as an
 * exception. The document is modified in place.
 * @param {HTMLDocument} document The html document to change.
 * @returns {HTMLDocument} The same document that was passed in.
 */
export function removeStyleAndScript (document) {
  // Stylesheets and scripts. `rel` is a space-separated list of tokens, so
  // match the `stylesheet` token wherever it appears in the list (e.g.
  // `rel="alternate stylesheet"`) instead of requiring an exact match.
  document.querySelectorAll('link[rel~="stylesheet"], style, script').forEach(node => {
    const isDiffNode = node.id.startsWith('wm-') ||
      Array.from(node.classList).some(name => name.startsWith('wm-'));

    if (!isDiffNode) {
      node.remove();
    }
  });

  // Inline style attributes
  document.querySelectorAll('[style]').forEach(node => node.removeAttribute('style'));

  return document;
}
/**
 * Prevents navigation from within a diff by forcing links to open in a new
 * tab when clicked (excluding intra-page `#fragment` links and `javascript:`
 * links). This helps ensure we don’t get in a state where one side of a
 * side-by-side diff has been navigated and the viewer does not realize they
 * are no longer actually looking at a *diff*.
 *
 * Every link that gets `target="_blank"` also gets the `noopener` token added
 * to its `rel` attribute (existing tokens are kept), so the newly opened page
 * is not handed a reference back to the window showing the diff.
 *
 * NOTE: This requires the iframe displaying the diff to allow popups with the
 * `sandbox="allow-popups"` attribute.
 * @param {HTMLDocument} document The html document to transform.
 * @returns {HTMLDocument} The same document that was passed in.
 */
export function addTargetBlank (document) {
  document.querySelectorAll('a').forEach(node => {
    // Treat a missing href like an empty one. Surrounding whitespace is not
    // significant in a URL attribute, so ignore it when classifying the link.
    const href = (node.getAttribute('href') || '').trim();

    // URL schemes are case-insensitive, so `JavaScript:` must be skipped too.
    const staysInPage = href.startsWith('#') || /^javascript:/i.test(href);
    if (staysInPage) {
      return;
    }

    node.setAttribute('target', '_blank');

    const relTokens = (node.getAttribute('rel') || '').split(/\s+/).filter(Boolean);
    if (!relTokens.some(token => token.toLowerCase() === 'noopener')) {
      relTokens.push('noopener');
      node.setAttribute('rel', relTokens.join(' '));
    }
  });

  return document;
}
/**
 * Creates a transform that will rewrite subresource URLs to point to the
 * Wayback Machine. This is useful when we have snapshots of the page itself,
 * but not its subresources. It won't always work (Wayback won't always have
 * a snapshot of the subresource from a similar point in time), but it'll work
 * a lot better than just pointing to the original URL, which might be missing
 * or significantly altered by the time a diff is viewed.
 *
 * Note this *creates* the transform and is not the transform itself (because
 * the transform must be custom to a particular source URL and point in time).
 *
 * The transform currently rewrites the `href` of stylesheet `<link>` elements
 * and the `src` of `<script>` and `<img>` elements. Elements that lack the
 * relevant attribute are left alone.
 * @param {WebMonitoringDb.Page} page Page the document belongs to; its `url`
 *        is the base that relative subresource URLs are resolved against.
 * @param {WebMonitoringDb.Version} version Version being displayed; its
 *        `capture_time` determines the Wayback timestamp to request.
 * @returns {HtmlTransform}
 */
export function loadSubresourcesFromWayback (page, version) {
  return document => {
    const timestamp = createWaybackTimestamp(version.capture_time);

    // Require `href`: a <link> without one has nothing to rewrite, and passing
    // its `null` attribute value on would throw. `rel` is a space-separated
    // token list, so match the `stylesheet` token anywhere within it.
    document.querySelectorAll('link[rel~="stylesheet"][href]').forEach(node => {
      node.href = createWaybackUrl(node.getAttribute('href'), timestamp, page.url);
    });

    document.querySelectorAll('script[src],img[src]').forEach(node => {
      node.src = createWaybackUrl(node.getAttribute('src'), timestamp, page.url);
    });

    // TODO: handle <picture> with all its subelements
    // TODO: SVG <use> directives
    // TODO: video/audio (similar structure to <picture>)
    return document;
  };
}
// ---------------------- Support Functions -----------------------------
/**
 * Convert a Date object to a Wayback Machine-style timestamp string: the UTC
 * year followed by the two-digit UTC month, day, hours, minutes, and seconds,
 * all run together (e.g. `20190105030409`).
 * @param {Date} date A JS date object to convert
 * @returns {String}
 */
function createWaybackTimestamp (date) {
  const year = date.getUTCFullYear();
  const paddedParts = [
    date.getUTCMonth() + 1, // getUTCMonth() is zero-based
    date.getUTCDate(),
    date.getUTCHours(),
    date.getUTCMinutes(),
    date.getUTCSeconds()
  ].map(part => twoDigit(part));

  return `${year}${paddedParts.join('')}`;
}
// Matches URLs that begin with any scheme, e.g. `https:`, `data:`, `mailto:`.
const SCHEME_PATTERN = /^[a-z][a-z0-9+.-]*:/i;

// Matches URLs that begin with a scheme followed by `//`, e.g. `https://`.
const PROTOCOL_PATTERN = /^[a-z][a-z0-9+.-]*:\/\//i;

/**
 * Create a URL that points to a Wayback Machine-archived version of another
 * URL near a particular date.
 *
 * URLs that do not have the `scheme://…` form once resolved (for example
 * `data:` or `blob:` URLs) are returned unchanged instead of being wrapped in
 * a Wayback Machine URL.
 * @param {String} originalUrl URL of the resource to get from the Wayback
 *        Machine. May be relative, in which case it is resolved against
 *        `baseUrl`.
 * @param {Date|String} timestamp Date of the archived copy to point to: either
 *        a ready-made Wayback-style timestamp string or a Date to convert.
 * @param {String} baseUrl Absolute URL to resolve `originalUrl` against.
 * @returns {String}
 */
function createWaybackUrl (originalUrl, timestamp, baseUrl) {
  const url = resolveUrl(originalUrl, baseUrl);
  if (!PROTOCOL_PATTERN.test(url)) {
    return url;
  }

  const waybackTime = typeof timestamp === 'string'
    ? timestamp
    : createWaybackTimestamp(timestamp);

  // NOTE(review): the `id_` suffix presumably asks Wayback for the capture
  // without its own rewriting applied — confirm against Wayback docs.
  return `https://web.archive.org/web/${waybackTime}id_/${url}`;
}

/**
 * Resolve a full URL from a relative one.
 *
 * - Protocol-relative URLs (`//host/path`) are given the `https:` scheme.
 * - URLs that already start with a scheme are returned unchanged.
 * - Anything else is resolved against `baseUrl` with the standard URL
 *   resolution rules, so `../`, `?query`, and `#fragment` references work.
 * @param {String} url The URL to resolve
 * @param {String} baseUrl The base URL to resolve from
 * @returns {String}
 * @throws {TypeError} If `url` is relative and `baseUrl` is not a valid
 *         absolute URL.
 */
function resolveUrl (url, baseUrl) {
  if (url.startsWith('//')) {
    return `https:${url}`;
  }

  if (SCHEME_PATTERN.test(url)) {
    return url;
  }

  const base = new URL(baseUrl);
  try {
    return new URL(url, base).href;
  }
  catch (error) {
    // The reference itself is malformed. Resolving is best-effort, so hand it
    // back untouched rather than failing the whole transform over one URL.
    return url;
  }
}
/**
 * Format a number as a string that is at least two characters long, padding
 * it on the left with `0` when it is shorter (e.g. `5` becomes `'05'`).
 * @param {Number} number The number to format
 * @returns {String}
 */
function twoDigit (number) {
  const digits = number.toString();
  return digits.padStart(2, '0');
}