Skip to content

Commit

Permalink
# 2016/04/07
Browse files Browse the repository at this point in the history
- remove useless anchors, e.g. pagination and contact links
- remove related links if necessary
- improve link density algorithm
- options recognition
- cheerio object can be passed in
- more test cases
  • Loading branch information
Tjatse committed Apr 7, 2016
1 parent e565310 commit 9e92ed0
Show file tree
Hide file tree
Showing 6 changed files with 371 additions and 121 deletions.
117 changes: 95 additions & 22 deletions index.js
@@ -1,6 +1,7 @@
'use strict'

var req = require('req-fast')
var cheerio = require('cheerio')
var util = require('util')
var debug = require('debug')('read-art.main')
var Article = require('./lib/article')
Expand All @@ -15,18 +16,40 @@ module.exports = read
* 1: error
* 2: article
*/
function read (uri, options, callback) {
function read () {
// Public entry point: hands every received argument, unchanged, to handle()
// so that handle() can inspect how many arguments were actually supplied.
return handle.apply(null, arguments) // eslint-disable-line no-useless-call
}

/**
* Custom settings.
* @type {readArt.use}
*/
read.use = Article.use

function handle (uri, options, callback) {
if (arguments.length === 0) {
return new Error('Incorrect arguments.')
}
// organize parameters
if ((typeof options === 'function') && !callback) {
callback = options
}
if (options && typeof options === 'object') {
options.uri = uri
} else if (typeof uri === 'string') {
} else if (typeof uri === 'string' || isCheerio(uri)) {
options = { uri: uri }
} else {
options = uri
uri = options.uri || options.html
}

if (typeof options !== 'object') {
return new Error('options are required!')
}

uri = options.cheerio || options.html || options.uri

if ((typeof uri !== 'string' && !isCheerio(uri)) || !uri) {
return new Error('only accept cheerio, url or HTML as article content.')
}

options = util._extend({
Expand All @@ -52,12 +75,37 @@ function read (uri, options, callback) {
if (isNaN(options.minParagraphs)) {
options.minParagraphs = 3
}

if (typeof callback !== 'function') {
callback = false
}
// indicating whether uri is a cheerio object or not.
if (typeof uri !== 'string') {
if (isCheerio(uri)) {
if (!isHtml(options.html)) {
delete options.html
}
if (isHtml(options.uri)) {
if (!options.html) {
options.html = options.uri
}
delete options.uri
}
options.cheerio = uri
return parse({
cheerio: uri,
options: options,
callback: callback
})
}
var err = new Error('only accept cheerio, url or HTML as article content.')
return callback ? callback(err) : err
}
// indicating uri is html or url.
var isHTML = uri.match(/^\s*</)
if (isHTML && options.uri && !options.html) {
options.html = options.uri
delete options.uri
if (isHtml(uri) && options.uri && !options.html) {
options.html = uri
if (isHtml(options.uri)) {
delete options.uri
}
}

var parsingData = {
Expand All @@ -75,12 +123,12 @@ function read (uri, options, callback) {
if (debug.enabled) {
debug(' ∟ Error: ' + (err ? err.message : 'no response'))
}
return callback(err || new Error('Response is empty.'))
return callback && callback(err || new Error('Response is empty.'))
}
if (!resp.body) {
var errMsg = 'No body was found, status code: ' + resp.statusCode
debug(' ∟ Warning: ' + errMsg)
return callback(new Error(errMsg))
return callback && callback(new Error(errMsg))
}
debug(' ∟ succeed')

Expand All @@ -93,30 +141,55 @@ function read (uri, options, callback) {
}
}

/**
* Custom settings.
* @type {readArt.use}
*/
read.use = Article.use

/**
* Parse html to cheerio dom.
* @param o options
* @param e extra data
* @return {String}
*/
function parse (o, e) {
debug(' analyzing HTML')
if (!o.html) {
debug(' ∟ HTML content could not be found, simply returned')
return ''
debug(' parsing...')
if (!o.html && !o.cheerio) {
var errMsg = 'Article content could not be found.'
debug(' ∟ ' + errMsg)
return o.callback && o.callback(new Error(errMsg))
}
if (o.options.killBreaks) {
if (o.html && o.options.killBreaks) {
// replace <br />(blanks goes here) to <br />.
o.html = o.html.replace(/<br[^\/>]*\/?>/ig, '<br />')
// remove tab symbols like \r\t\n
o.html = o.html.replace(/[\n\r\t]{2,}/gi, ' ')
}

o.callback(null, new Article(o), o.options, e)
o.callback && o.callback(null, new Article(o), o.options, e)
}

// from cheerio module.
// Capture group 1 holds an HTML fragment (optionally preceded by plain text);
// capture group 2 holds the name of a bare `#id` selector.
var quickExpr = /^(?:[^#<]*(<[\w\W]+>)[^>]*$|#([\w\-]*)$)/
/**
 * Check if string is HTML.
 * Non-string values and bare `#id` selectors are never reported as HTML.
 * @param {String} str value to inspect
 * @return {Boolean} true when `str` contains markup, false otherwise
 */
function isHtml (str) {
  if (typeof str !== 'string') {
    return false
  }
  // Faster than running regex, if str starts with `<` and ends with `>`, assume it's HTML
  if (str.length >= 3 && str.charAt(0) === '<' && str.charAt(str.length - 1) === '>') {
    return true
  }

  // Only the first capture group means markup was found. Testing
  // `match.length` is not enough: exec() always returns one slot per capture
  // group, so a bare `#id` selector would be reported as HTML as well.
  var match = quickExpr.exec(str)
  return !!(match && match[1])
}

/**
 * Check if object is an instance of Cheerio.
 * True only when `o` has a `root()` method whose result is an instance of the
 * imported `cheerio` constructor.
 * @param {Object} o value to inspect
 * @return {Boolean} always a real boolean, never the falsy input itself
 */
function isCheerio (o) {
  // `!!o` keeps the documented Boolean contract: a bare `o && ...` chain would
  // hand back null / undefined / '' / 0 unchanged for falsy input.
  return !!o && typeof o.root === 'function' && o.root() instanceof cheerio
}
20 changes: 12 additions & 8 deletions lib/article.js
Expand Up @@ -15,15 +15,19 @@ module.exports = Article
* @constructor
*/
function Article (o) {
var co = {
decodeEntities: false
var $
if (o.html) {
var co = {
decodeEntities: false
}
var cheerioOptions = ['normalizeWhitespace', 'xmlMode', 'lowerCaseTags']
cheerioOptions.forEach(function (n) {
co[n] = !!o.options[n]
})
$ = cheerio.load(o.html, co)
} else {
$ = o.cheerio
}
var cheerioOptions = ['normalizeWhitespace', 'xmlMode', 'lowerCaseTags']
cheerioOptions.forEach(function (n) {
co[n] = !!o.options[n]
})

var $ = cheerio.load(o.html, co)
this.$ = $

this.caches = {}
Expand Down
58 changes: 53 additions & 5 deletions lib/reader.js
Expand Up @@ -21,6 +21,7 @@ var extRegexps = {
unlikely: /agegate|auth?or|bookmark|cat|com(?:bx|ment|munity)|date|disqus|extra|foot|header|ignore|link|menu|nav|pag(?:er|ination)|popup|related|remark|rss|share|shoutbox|sidebar|similar|social|sponsor|teaserlist|time|tweet|twitter|\bad[\s_-]?\b/i,
maybe: /and|article|body|column|main|column/i,
div2p: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul|span|font|label)/i,
uselessAnchors: /(\d+|next|prev|first|last|print|comment|mail|font|about|contact|(下|下|前|后)一|(首|尾)页)|打印|评论|邮件|信箱|转发|关于|联系|^(大|中|小)$/i,
images: /\.(gif|jpe?g|png)$/i
}
var tagsToSkip = ''
Expand Down Expand Up @@ -503,11 +504,13 @@ function grabArticle ($, topCandidate, options) {
}
}
}
var shouldRemoveRelatedLinks = false
if (shouldUseParent(topCandidate, options) && (parent = getNBParent(topCandidate))) {
// 1. topCandidate has not enough [P] children
// 2. parent exist and not [BODY]
debugRd(' ∟ top candidate has not enough <p /> children, take the parent node (not <body />) instead of it')
siblings = parent.children()
shouldRemoveRelatedLinks = true
} else {
// self children.
siblings = topCandidate.children()
Expand Down Expand Up @@ -539,6 +542,19 @@ function grabArticle ($, topCandidate, options) {
append = true
}
}

var extraDebugInfo
var ignoreMedias = false
if (append && shouldRemoveRelatedLinks) {
var linkDensity = getLinkDensity($, node, true) // eslint-disable-line no-redeclare
if (linkDensity >= 0.5) {
append = false
ignoreMedias = true
if (debugDo.enabled) {
extraDebugInfo = 'Probably related links which scores ' + linkDensity.toFixed(2)
}
}
}
if (append) {
// remove comments.
node.contents().filter(function (index, ele) {
Expand All @@ -549,16 +565,36 @@ function grabArticle ($, topCandidate, options) {
}
if (debugDo.enabled) {
debugDo(' ∟ <' + tagName + ' /> ' + na.classAndId)
debugDo(' ∟ ' + (append ? 'append' : 'remove'))
debugDo(' ∟ ' + (append ? 'append' : 'remove') + (extraDebugInfo ? ' (' + extraDebugInfo + ')' : ''))
}
// append medias.
if (!append) {
// append medias if neccessary
if (!append && !ignoreMedias) {
var medias = node.find(tagsOfMedia)
if (medias.length > 0) {
article.append(medias)
}
}
})

if (!options.keepAllLinks) {
article.find('a+a+a').each(function () {
var node = $(this)
var parent = node.parent()
if (!parent || parent.length === 0 || parent.is(article)) {
return
}
var reg = regexps.uselessAnchors
var prev
if (reg.test(node.text() || '') || reg.test((prev = node.prev()).text() || '') || reg.test(prev.prev().text() || '')) {
parent.remove()
if (debugDo.enabled) {
var na = getNodeAttr(parent)
debugDo(' ∟ <' + na.tagName + ' /> ' + na.classAndId)
debugDo(' ∟ removed \'Cause contains useless anchors')
}
}
})
}
// fix links
readArt.fixLink($, options.uri, article, options)
return article
Expand Down Expand Up @@ -782,9 +818,10 @@ function getClassWeight (node) {
* Get the density of links as a percentage of the content.
* @param $ dom
* @param node the node element.
* @param strictMode length must be calculated.
* @return {Number}
*/
function getLinkDensity ($, node) {
function getLinkDensity ($, node, strictMode) {
var textLen = node.text().length
if (textLen === 0) {
return 0
Expand All @@ -796,7 +833,18 @@ function getLinkDensity ($, node) {
if (!href || href[0] === '#') {
return
}
linkLen += anchor.text().length
var len = (anchor.text() || anchor.attr('title') || '').length
if (strictMode && len === 0) {
var children
if ((children = anchor.children()).length === 1 && children.get(0).tagName === 'img') {
len = (children.attr('alt') || children.attr('title') || '').length
}
if (len === 0) {
len = href.length
}
textLen += len
}
linkLen += len
})
return linkLen / textLen
}
Expand Down
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "read-art",
"version": "0.4.8",
"version": "0.4.9-alpha",
"description": "Scrape/Crawl article from any site automatically. Make any web page readable, no matter Chinese or English.",
"main": "index.js",
"scripts": {
Expand Down
24 changes: 24 additions & 0 deletions test/anchors.js
@@ -0,0 +1,24 @@
var read = require('../')
var chai = require('chai')
var expect = chai.expect
var should = chai.should()

describe('grab content', function () {
  describe('by requesting url', function () {
    // Feeds read() a live article URL and checks that the extracted text holds
    // none of the print / next-page / comment / mailbox / recommendation labels.
    it('should without related links and useless anchors', function (done) {
      var request = {
        uri: 'http://www.cq.xinhuanet.com/2016-03/28/c_1118467794.htm',
        output: 'text'
      }
      // Labels that must not survive in the extracted content.
      var unwanted = ['打印', '下一页', '评论', '信箱', '推荐']

      read(request, function (err, art) {
        should.not.exist(err)
        expect(art).to.be.an('object')
        unwanted.forEach(function (label) {
          art.content.should.not.contains(label)
        })
        done()
      })
    })
  })
})

0 comments on commit 9e92ed0

Please sign in to comment.