Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
ReedD committed Aug 20, 2017
0 parents commit 4f4a970
Show file tree
Hide file tree
Showing 6 changed files with 1,216 additions and 0 deletions.
65 changes: 65 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@

# Created by https://www.gitignore.io/api/node

### Node ###
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage

# nyc test coverage
.nyc_output

# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (http://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# Typescript v1 declaration files
typings/

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env


# End of https://www.gitignore.io/api/node
37 changes: 37 additions & 0 deletions crawl
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env node
// CLI entry point: parses options, optionally flushes the redis-backed queue,
// then runs the crawler starting from --url (or resumes the existing queue).
require('dotenv').config();
const argv = require('yargs')
  .usage('Usage: $0 [options]')
  .option('url', {
    alias: 'u',
    describe: 'The URL the crawler should enter the site from',
  })
  .option('resume', {
    alias: 'r',
    boolean: true,
    describe: 'Resume crawler from existing queue',
  })
  .option('max-radius', {
    alias: 'm',
    number: true,
    describe: 'The maximum radius from the entry URL to crawl',
    default: Infinity,
  })
  .check(({ url, resume }) => {
    // A fresh entry URL and a queue resume are contradictory requests.
    if (url && resume) {
      throw new Error('--url and --resume are mutually exclusive');
    }
    return true;
  })
  .help().argv;

const db = require('./db');
const crawl = require('./crawler');

(async () => {
  try {
    if (!argv.resume) {
      // Fresh run: drop the discovered-set and queue from any previous crawl.
      await db.flush();
    }
    await crawl(argv.url, { maxRadius: argv.maxRadius });
  } catch (err) {
    // Without this catch the IIFE is a floating promise: a redis or
    // navigation failure becomes an unhandled rejection and db.close()
    // is skipped, leaving the process hung on the open connection.
    console.error(err);
    process.exitCode = 1;
  } finally {
    db.close();
  }
})();
54 changes: 54 additions & 0 deletions crawler.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
const _ = require('lodash');
const db = require('./db');
const puppeteer = require('puppeteer');
const url = require('url');

const debug = {
  crawl: require('debug')('crawler:crawl'),
  page: require('debug')('crawler:page'),
};

/**
 * Breadth-first crawl of a single site using a redis-backed queue.
 *
 * Starts from `entry` (or, when the queue already has items, resumes from
 * the next queued URL). Each visited page is scraped for same-host links,
 * which are enqueued with radius = current radius + 1. Pages at or beyond
 * `options.maxRadius` are popped but not visited.
 *
 * @param {string} entry - URL to start from when the queue is empty.
 * @param {Object} [options]
 * @param {number} [options.maxRadius=Infinity] - hop limit from the entry URL.
 * @returns {Promise<void>}
 */
const crawl = async (entry, options = {}) => {
  debug.crawl('Crawler started');
  let target = (await db.getCrawlUrl()) || { url: entry, radius: 0 };
  const { maxRadius = Infinity } = options;
  if (!target.url) {
    debug.crawl('Nothing to crawl');
    return;
  }

  // NOTE(review): on --resume this is the host of the first *queued* URL,
  // not necessarily the original entry host — same-host filtering below
  // assumes they match; confirm that is the intent.
  const entryUrl = url.parse(target.url);
  const browser = await puppeteer.launch();
  // try/finally guarantees the Chromium process is torn down even when a
  // navigation or evaluate call throws; the original leaked the browser
  // on any page error and never awaited close().
  try {
    const page = await browser.newPage();
    debug.crawl('Puppeteer started');

    let count = 0;
    while (target) {
      if (target.radius >= maxRadius) {
        debug.page(`Max radius reached ${target.url} not scraped`);
      } else {
        count++;
        debug.page(`Crawling: ${target.url}`);
        await page.goto(target.url);
        debug.page(`Page loaded`);
        // Runs in the page context: collect every anchor's absolute href.
        const links = await page.evaluate(() => {
          return Array.from(document.querySelectorAll('a')).map(
            link => link.href
          );
        });
        // Plain Array#filter; the lodash chain wrapper added nothing.
        const urls = links.filter(
          link => url.parse(link).host === entryUrl.host
        );
        debug.page(`Scraped ${urls.length} urls`);
        // Children sit one hop further from the entry than their parent.
        await db.addCrawlUrls(urls, target.radius + 1);
      }
      target = await db.getCrawlUrl();
    }
    debug.crawl(`Crawler finished after crawling ${count} pages`);
  } finally {
    await browser.close();
  }
};

module.exports = crawl;
74 changes: 74 additions & 0 deletions db.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
const bluebird = require('bluebird');
const redis = require('redis');

bluebird.promisifyAll(redis.RedisClient.prototype);
bluebird.promisifyAll(redis.Multi.prototype);

const debug = {
redis: require('debug')('crawler:redis'),
};

const client = redis.createClient(
process.env.REDIS_PORT || 6379,
process.env.REDIS_HOST || 'localhost'
);

module.exports = {
addCrawlUrls: async (urls, radius) => {
debug.redis('Add scraped urls to redis');
const multi = client.multi();
urls.forEach(url => {
multi.sadd('discoveredPages', url);
});
const result = await multi.execAsync();
debug.redis('Added urls to discovered set');

let count = 0;
result.forEach((notDiscovered, i) => {
if (notDiscovered) {
count++;
const url = urls[i];
multi.rpush('pageQueue', `${url} ${radius}`);
}
});
await multi.execAsync();
debug.redis(`Added ${count} new urls to queue`);
debug.redis(`${urls.length - count} duplicates found`);
},

addCrawlUrl: async (url, radius) => {
const notDiscovered = await client.saddAsync('discoveredPages', url);
if (!notDiscovered) {
await client.rpushAsync('pageQueue', `${url} ${radius}`);
}
},

getCrawlUrl: async () => {
debug.redis('Pop url from queue');
const reply = await client.lpopAsync('pageQueue');
if (reply) {
debug.redis('Url popped');
if (debug.redis.enabled) {
const length = await client.llenAsync('pageQueue');
debug.redis(`${length} urls in queue`);
}
const parts = reply.match(/(.+) ([0-9]+)$/);
return {
url: parts[1],
radius: parseInt(parts[2]),
};
}
debug.redis('Queue empty');
return null;
},

flush: async () => {
debug.redis('Flush db');
await client.del('discoveredPages', 'pageQueue');
debug.redis('Redis flushed');
},

close: () => {
client.end(true);
},
};
31 changes: 31 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"name": "crawler",
"version": "1.0.0",
"main": "crawl",
"license": "MIT",
"author": {
"email": "reed@dadoune.com",
"name": "Reed Dadoune",
"url": "https://www.dadoune.com"
},
"scripts": {
"precommit": "lint-staged"
},
"lint-staged": {
    "*.{js,jsx}": ["prettier --single-quote --trailing-comma es5 --write", "git add"]
},
"dependencies": {
"bluebird": "^3.5.0",
"debug": "^3.0.0",
"dotenv": "^4.0.0",
"lodash": "^4.17.4",
"puppeteer": "^0.9.0",
"redis": "^2.8.0",
"yargs": "^8.0.2"
},
"devDependencies": {
"husky": "^0.14.3",
"lint-staged": "^4.0.3",
"prettier": "^1.5.3"
}
}
Loading

0 comments on commit 4f4a970

Please sign in to comment.