Skip to content

Commit

Permalink
Add mongo
Browse files: browse the repository at this point in the history
  • Loading branch information
ReedD committed Aug 20, 2017
1 parent 338891f commit 0afe5b9
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 32 deletions.
1 change: 1 addition & 0 deletions crawl
Expand Up @@ -29,6 +29,7 @@ const db = require('./db');
const crawl = require('./crawler');

(async () => {
await db.connect();
if (!argv.resume) {
await db.flush();
}
Expand Down
16 changes: 11 additions & 5 deletions crawler.js
Expand Up @@ -10,7 +10,7 @@ const debug = {

const crawl = async (entry, options = {}) => {
debug.crawl('Crawler started');
let target = (await db.getCrawlUrl()) || { url: entry, radius: 0 };
let target = (await db.popUrl()) || { url: entry, radius: 0 };
const { maxRadius = Infinity } = options;
if (!target.url) {
debug.crawl('Nothing to crawl');
Expand All @@ -36,18 +36,24 @@ const crawl = async (entry, options = {}) => {
link => link.href
);
});
const urls = _.chain(links)
const outboundUrls = _.chain(links)
.filter(link => {
return url.parse(link).host === entryUrl.host;
})
.value();
debug.page(`Scraped ${urls.length} urls`);
await db.addCrawlUrls(urls, ++target.radius);
debug.page(`Scraped ${outboundUrls.length} urls`);
await db.store({
outboundUrls,
radius: ++target.radius,
url: target.url,
});
}
target = await db.getCrawlUrl();
target = await db.popUrl();
}
debug.crawl(`Crawler finished after crawling ${count} pages`);

await db.getNodes();

browser.close();
};

Expand Down
71 changes: 48 additions & 23 deletions db.js
@@ -1,23 +1,34 @@
const bluebird = require('bluebird');
const redis = require('redis');
const MongoClient = require('mongodb').MongoClient;

bluebird.promisifyAll(redis.RedisClient.prototype);
bluebird.promisifyAll(redis.Multi.prototype);

const debug = {
db: require('debug')('crawler:db'),
redis: require('debug')('crawler:redis'),
mongo: require('debug')('crawler:mongo'),
};

const client = redis.createClient(
process.env.REDIS_PORT || 6379,
process.env.REDIS_HOST || 'localhost'
);

module.exports = {
addCrawlUrls: async (urls, radius) => {
connect: async () => {
this.db = await MongoClient.connect('mongodb://localhost:27017/crawler');
this.client = redis.createClient(
process.env.REDIS_PORT || 6379,
process.env.REDIS_HOST || 'localhost'
);
},
store: async page => {
debug.db(`Store page ${page.url}`);

debug.mongo('Add page to mongo');
await this.db.collection('pages').insertOne(page);
debug.mongo('Mongo save complete');

debug.redis('Add scraped urls to redis');
const multi = client.multi();
urls.forEach(url => {
const multi = this.client.multi();
page.outboundUrls.forEach(url => {
multi.sadd('discoveredPages', url);
});
const result = await multi.execAsync();
Expand All @@ -27,29 +38,23 @@ module.exports = {
result.forEach((notDiscovered, i) => {
if (notDiscovered) {
count++;
const url = urls[i];
multi.rpush('pageQueue', `${url} ${radius}`);
const url = page.outboundUrls[i];
multi.rpush('pageQueue', `${url} ${page.radius}`);
}
});
await multi.execAsync();
debug.redis(`Added ${count} new urls to queue`);
debug.redis(`${urls.length - count} duplicates found`);
},

addCrawlUrl: async (url, radius) => {
const notDiscovered = await client.saddAsync('discoveredPages', url);
if (!notDiscovered) {
await client.rpushAsync('pageQueue', `${url} ${radius}`);
}
debug.redis(`${page.outboundUrls.length - count} duplicates found`);
debug.db('Page stored');
},

getCrawlUrl: async () => {
popUrl: async () => {
debug.redis('Pop url from queue');
const reply = await client.lpopAsync('pageQueue');
const reply = await this.client.lpopAsync('pageQueue');
if (reply) {
debug.redis('Url popped');
if (debug.redis.enabled) {
const length = await client.llenAsync('pageQueue');
const length = await this.client.llenAsync('pageQueue');
debug.redis(`${length} urls in queue`);
}
const parts = reply.match(/(.+) ([0-9]+)$/);
Expand All @@ -62,13 +67,33 @@ module.exports = {
return null;
},

getNodes: async () => {
const pages = await this.db.collection('pages').find().toArray();

const nodes = [];
pages.forEach(page => {
page.outboundUrls.forEach(url => {
nodes.push({ source: page.url, target: url });
});
});

console.log(JSON.stringify(nodes));
},

flush: async () => {
debug.mongo('Drop page collection');
const pages = await this.db.collection('pages');
if (pages) {
await pages.drop();
}
debug.mongo('Page collection dropped');
debug.redis('Flush db');
await client.del('discoveredPages', 'pageQueue');
await this.client.del('discoveredPages', 'pageQueue');
debug.redis('Redis flushed');
},

// Shut down the datastore connections: ends the Redis client and closes the
// Mongo handle stored on `this.db`.
// NOTE(review): this span comes from a rendered diff. The bare
// `client.end(true)` line looks like the pre-change version of the
// `this.client.end(true)` line right below it; confirm which one belongs in
// the final db.js.
close: () => {
client.end(true);
this.client.end(true);
this.db.close();
},
};
6 changes: 5 additions & 1 deletion package.json
Expand Up @@ -12,13 +12,17 @@
"precommit": "lint-staged"
},
"lint-staged": {
"*.{js, jsx}": ["prettier --single-quote --trailing-comma es5 --write", "git add"]
"*.{js, jsx}": [
"prettier --single-quote --trailing-comma es5 --write",
"git add"
]
},
"dependencies": {
"bluebird": "^3.5.0",
"debug": "^3.0.0",
"dotenv": "^4.0.0",
"lodash": "^4.17.4",
"mongodb": "^2.2.31",
"puppeteer": "^0.9.0",
"redis": "^2.8.0",
"yargs": "^8.0.2"
Expand Down
56 changes: 53 additions & 3 deletions yarn.lock
Expand Up @@ -43,6 +43,14 @@ brace-expansion@^1.1.7:
balanced-match "^1.0.0"
concat-map "0.0.1"

bson@~1.0.4:
version "1.0.4"
resolved "https://registry.yarnpkg.com/bson/-/bson-1.0.4.tgz#93c10d39eaa5b58415cbc4052f3e53e562b0b72c"

buffer-shims@~1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/buffer-shims/-/buffer-shims-1.0.0.tgz#9978ce317388c649ad8793028c3477ef044a8b51"

builtin-modules@^1.0.0:
version "1.1.1"
resolved "https://registry.yarnpkg.com/builtin-modules/-/builtin-modules-1.1.1.tgz#270f076c5a72c02f5b65a47df94c5fe3a278892f"
Expand Down Expand Up @@ -179,6 +187,10 @@ error-ex@^1.2.0:
dependencies:
is-arrayish "^0.2.1"

es6-promise@3.2.1:
version "3.2.1"
resolved "https://registry.yarnpkg.com/es6-promise/-/es6-promise-3.2.1.tgz#ec56233868032909207170c39448e24449dd1fc4"

escape-string-regexp@^1.0.2, escape-string-regexp@^1.0.5:
version "1.0.5"
resolved "https://registry.yarnpkg.com/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz#1b61c0562190a8dff6ae3bb2cf0200ca130b86d4"
Expand Down Expand Up @@ -305,7 +317,7 @@ inflight@^1.0.4:
once "^1.3.0"
wrappy "1"

inherits@2, inherits@^2.0.3, inherits@~2.0.3:
inherits@2, inherits@^2.0.3, inherits@~2.0.1, inherits@~2.0.3:
version "2.0.3"
resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.3.tgz#633c2c83e3da42a502f52466022480f4208261de"

Expand Down Expand Up @@ -513,6 +525,21 @@ mkdirp@0.5.0:
dependencies:
minimist "0.0.8"

mongodb-core@2.1.15:
version "2.1.15"
resolved "https://registry.yarnpkg.com/mongodb-core/-/mongodb-core-2.1.15.tgz#841f53b87ffff4c7458189c35c8ae827e1169764"
dependencies:
bson "~1.0.4"
require_optional "~1.0.0"

mongodb@^2.2.31:
version "2.2.31"
resolved "https://registry.yarnpkg.com/mongodb/-/mongodb-2.2.31.tgz#1940445c661e19217bb3bf8245d9854aaef548db"
dependencies:
es6-promise "3.2.1"
mongodb-core "2.1.15"
readable-stream "2.2.7"

ms@0.7.1:
version "0.7.1"
resolved "https://registry.yarnpkg.com/ms/-/ms-0.7.1.tgz#9cd13c03adbff25b65effde7ce864ee952017098"
Expand Down Expand Up @@ -695,6 +722,18 @@ read-pkg@^2.0.0:
normalize-package-data "^2.3.2"
path-type "^2.0.0"

readable-stream@2.2.7:
version "2.2.7"
resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-2.2.7.tgz#07057acbe2467b22042d36f98c5ad507054e95b1"
dependencies:
buffer-shims "~1.0.0"
core-util-is "~1.0.0"
inherits "~2.0.1"
isarray "~1.0.0"
process-nextick-args "~1.0.6"
string_decoder "~1.0.0"
util-deprecate "~1.0.1"

readable-stream@^2.2.2:
version "2.3.3"
resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-2.3.3.tgz#368f2512d79f9d46fdfc71349ae7878bbc1eb95c"
Expand Down Expand Up @@ -741,6 +780,17 @@ require-main-filename@^1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/require-main-filename/-/require-main-filename-1.0.1.tgz#97f717b69d48784f5f526a6c5aa8ffdda055a4d1"

require_optional@~1.0.0:
version "1.0.1"
resolved "https://registry.yarnpkg.com/require_optional/-/require_optional-1.0.1.tgz#4cf35a4247f64ca3df8c2ef208cc494b1ca8fc2e"
dependencies:
resolve-from "^2.0.0"
semver "^5.1.0"

resolve-from@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/resolve-from/-/resolve-from-2.0.0.tgz#9480ab20e94ffa1d9e80a804c7ea147611966b57"

restore-cursor@^1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/restore-cursor/-/restore-cursor-1.0.1.tgz#34661f46886327fed2991479152252df92daa541"
Expand All @@ -764,7 +814,7 @@ safe-buffer@~5.1.0, safe-buffer@~5.1.1:
version "5.1.1"
resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.1.1.tgz#893312af69b2123def71f57889001671eeb2c853"

"semver@2 || 3 || 4 || 5":
"semver@2 || 3 || 4 || 5", semver@^5.1.0:
version "5.4.1"
resolved "https://registry.yarnpkg.com/semver/-/semver-5.4.1.tgz#e059c09d8571f0540823733433505d3a2f00b18e"

Expand Down Expand Up @@ -831,7 +881,7 @@ string-width@^2.0.0:
is-fullwidth-code-point "^2.0.0"
strip-ansi "^4.0.0"

string_decoder@~1.0.3:
string_decoder@~1.0.0, string_decoder@~1.0.3:
version "1.0.3"
resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.0.3.tgz#0fc67d7c141825de94282dd536bec6b9bce860ab"
dependencies:
Expand Down

0 comments on commit 0afe5b9

Please sign in to comment.