/
transformer.js
118 lines (101 loc) · 4.07 KB
/
transformer.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
const HtmlExtractor = require(`algolia-html-extractor`);
const Extractor = new HtmlExtractor();
/**
* Utility function, takes the output of HTML Extractor, and reduces it back down
* So that there is a group of HTML/content per heading
*
* @param {Array} accumulator
* @param {Object} fragment
*/
const reduceFragmentsUnderHeadings = (accumulator, fragment) => {
const existingFragment = accumulator.find(existing => existing.anchor === fragment.anchor);
if (existingFragment) {
// Merge our fragments together
if (fragment.node && fragment.node.tagName === `PRE`) {
// For pre-tags, we don't keep all the markup
existingFragment.html += ` ${fragment.content}`; // keep a space
existingFragment.content += ` ${fragment.content}`; // keep a space
} else {
existingFragment.html += fragment.html;
existingFragment.content += ` ${fragment.content}`; // keep a space
}
} else {
// If we don't already have a matching fragment with this anchor, add it
accumulator.push(fragment);
}
return accumulator;
};
/**
* Fragment Transformer
* breaks down large HTML strings into sensible fragments based on headings
*/
module.exports.fragmentTransformer = (recordAccumulator, node) => {
let htmlFragments = Extractor
// These are the top-level HTML elements that we keep - this results in a lot of fragments
.run(node.html, {cssSelector: `p,pre,td,li`})
// Use the utility function to merge fragments so that there is one-per-heading
.reduce(reduceFragmentsUnderHeadings, []);
// convert our fragments for this node into valid objects, and merge int the
const records = htmlFragments.reduce((fragmentAccumulator, fragment, index) => {
// Don't need a reference to the html node type
delete fragment.node;
// For now at least, we're not going to index the content string
// The HTML string is already very long, and there are size limits
delete fragment.content;
// If we have an anchor, change the URL to be a deep link
if (fragment.anchor) {
fragment.url = `${node.url}#${fragment.anchor}`;
}
let objectID = `${node.objectID}_${index}`;
// TODO: switch this on in verbose mode only
// // If fragments are too long, we need this to see which fragment it was
// console.log(`Created fragment: `, objectID, fragment.url || node.url, fragment.html.length); // eslint-disable-line no-console
return [
...fragmentAccumulator,
{...node, ...fragment, objectID: objectID}
];
}, []);
return [...recordAccumulator, ...records];
};
module.exports._testReduceFragmentsUnderHeadings = reduceFragmentsUnderHeadings;
/**
* Algolia Object Transformer
* takes a Ghost post and selects the properties needed to send to Algolia
*
* @param {Array} posts
*/
module.exports.transformToAlgoliaObject = (posts, ignoreSlugs) => {
const algoliaObjects = [];
posts.map((post) => {
// Define the properties we need for Algolia
const algoliaPost = {
objectID: post.id,
slug: post.slug,
url: post.url,
html: post.html,
image: post.feature_image,
title: post.title,
tags: [],
authors: []
};
// If we have an array of slugs to ignore, and the current
// post slug is in that list, skip this loop iteration
if (ignoreSlugs) {
if (ignoreSlugs.includes(post.slug)) {
return false;
}
}
if (post.tags && post.tags.length) {
post.tags.forEach((tag) => {
algoliaPost.tags.push({name: tag.name, slug: tag.slug});
});
}
if (post.authors && post.authors.length) {
post.authors.forEach((author) => {
algoliaPost.authors.push({name: author.name, slug: author.slug});
});
}
algoliaObjects.push(algoliaPost);
});
return algoliaObjects;
};