Skip to content

Commit

Permalink
Parsing ads is now supported for Google, Bing and DuckDuckGo
Browse files Browse the repository at this point in the history
  • Loading branch information
Nikolai Tschacher committed Jul 6, 2019
1 parent 09c1255 commit bbebe3c
Show file tree
Hide file tree
Showing 14 changed files with 338 additions and 1,567 deletions.
Binary file removed debug_se_scraper_google_apple tree.png
Binary file not shown.
Binary file added debug_se_scraper_google_cloud service.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
9 changes: 6 additions & 3 deletions examples/quickstart.js
Expand Up @@ -4,19 +4,22 @@ const se_scraper = require('./../src/node_scraper.js');
let browser_config = {
debug_level: 2,
output_file: 'examples/results/data.json',
test_evasion: true,
test_evasion: false,
headless: false,
block_assets: false,
random_user_agent: true,
};

let scrape_job = {
search_engine: 'google',
keywords: ['news', 'se-scraper'],
keywords: ['cloud service'],
num_pages: 1,
// add some cool google search settings
google_settings: {
gl: 'us', // The gl parameter determines the Google country to use for the query.
hl: 'en', // The hl parameter determines the Google UI language to return results.
start: 0, // Determines the results offset to use, defaults to 0.
num: 100, // Determines the number of results to show, defaults to 10. Maximum is 100.
num: 10, // Determines the number of results to show, defaults to 10. Maximum is 100.
},
};

Expand Down
1,626 changes: 77 additions & 1,549 deletions examples/results/data.json

Large diffs are not rendered by default.

Binary file added headless-test-result.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "se-scraper",
"version": "1.3.12",
"version": "1.3.13",
"description": "A module using puppeteer to scrape several search engines such as Google, Duckduckgo, Bing or Baidu",
"homepage": "https://scrapeulous.com/",
"main": "index.js",
Expand Down
13 changes: 13 additions & 0 deletions src/modules/bing.js
Expand Up @@ -18,6 +18,18 @@ class BingScraper extends Scraper {
})
});

// parse bing ads
const ads = [];
$('.b_ad .sb_add').each((i, element) => {
ads.push({
ad_visible_url: $(element).find('.b_adurl cite').text(),
ads_link: $(element).find('h2 a').attr('href'),
ads_link_target: $(element).find('h2 link').attr('href'),
title: $(element).find('h2 a').text(),
snippet: $(element).find('.b_caption').text(),
})
});

// 'Including results for', 'Einschließlich Ergebnisse'
let no_results = this.no_results(
['There are no results', 'Es gibt keine Ergebnisse'],
Expand All @@ -41,6 +53,7 @@ class BingScraper extends Scraper {
effective_query: effective_query,
num_results: $('#b_content .sb_count').text(),
results: cleaned,
ads: ads,
}
}

Expand Down
13 changes: 12 additions & 1 deletion src/modules/duckduckgo.js
Expand Up @@ -19,6 +19,16 @@ class DuckduckgoScraper extends Scraper {
});
});

const ads = [];
$('.results--ads.has-ad').each((i, element) => {
ads.push({
ad_visible_url: $(element).find('.result__url').text(),
ads_link: $(element).find('.result__title .result__a').attr('href'),
title: $(element).find('.result__title .result__a').text(),
snippet: $(element).find('.result__snippet').text(),
})
});

let effective_query = $('a.js-spelling-suggestion-link').attr('data-query') || '';

const cleaned = [];
Expand All @@ -33,7 +43,8 @@ class DuckduckgoScraper extends Scraper {
return {
time: (new Date()).toUTCString(),
effective_query: effective_query,
results: cleaned
results: cleaned,
ads: ads,
}
}

Expand Down
44 changes: 41 additions & 3 deletions src/modules/google.js
Expand Up @@ -13,7 +13,6 @@ class GoogleScraper extends Scraper {
// load the page source into cheerio
const $ = cheerio.load(html);

// perform queries
const results = [];
$('#center_col .g').each((i, link) => {
results.push({
Expand All @@ -25,6 +24,41 @@ class GoogleScraper extends Scraper {
})
});

// parse top ads
const top_ads = [];
$('#tads .ads-ad').each((i, element) => {
top_ads.push({
ad_visible_url: $(element).find('.ads-visurl cite').text(),
ads_link: $(element).find('a:first-child').attr('href'),
ads_link_target: $(element).find('a:nth-child(2)').attr('href'),
title: $(element).find('a h3').text(),
snippet: $(element).find('.ads-creative').text(),
})
});

// parse bottom ads
const bottomads = [];
$('#tadsb .ads-ad').each((i, element) => {
bottomads.push({
ad_visible_url: $(element).find('.ads-visurl cite').text(),
ads_link: $(element).find('a:first-child').attr('href'),
ads_link_target: $(element).find('a:nth-child(2)').attr('href'),
title: $(element).find('a h3').text(),
snippet: $(element).find('.ads-creative').text(),
})
});

// parse google places
const places = [];
$('.rllt__link').each((i, element) => {
places.push({
heading: $(element).find('[role="heading"] span').text(),
rating: $(element).find('.rllt__details div:first-child').text(),
contact: $(element).find('.rllt__details div:nth-child(2)').text(),
hours: $(element).find('.rllt__details div:nth-child(3)').text(),
})
});

// 'Ergebnisse für', 'Showing results for'
let no_results = this.no_results(
['Es wurden keine mit deiner Suchanfrage', 'did not match any documents', 'Keine Ergebnisse für',
Expand All @@ -51,8 +85,12 @@ class GoogleScraper extends Scraper {
num_results: $('#resultStats').text(),
no_results: no_results,
effective_query: effective_query,
results: cleaned
top_ads: top_ads,
bottom_ads: bottomads,
places: places,
results: cleaned,
}

}

async load_start_page() {
Expand Down Expand Up @@ -105,7 +143,7 @@ class GoogleScraper extends Scraper {
}

async wait_for_results() {
await this.page.waitForSelector('#center_col .g', { timeout: this.STANDARD_TIMEOUT });
await this.page.waitForSelector('#fbarcnt', { timeout: this.STANDARD_TIMEOUT });
}

async detected() {
Expand Down
2 changes: 1 addition & 1 deletion src/modules/se_scraper.js
Expand Up @@ -197,7 +197,7 @@ module.exports = class Scraper {
let html = await this.page.content();

if (this.config.html_output) {
this.html_output[keyword][page_num] = html;
this.html_output[keyword][this.page_num] = html;
}

let parsed = this.parse(html);
Expand Down
5 changes: 5 additions & 0 deletions src/node_scraper.js
Expand Up @@ -104,6 +104,8 @@ class ScrapeManager {
num_pages: 1,
// path to output file, data will be stored in JSON
output_file: '',
// whether to also passthru all the html output of the serp pages
html_output: false,
// whether to prevent images, css, fonts and media from being loaded
// will speed up scraping a great deal
block_assets: true,
Expand Down Expand Up @@ -357,7 +359,10 @@ class ScrapeManager {

let res = await this.scraper.run(this.page);
results = res.results;
metadata = this.scraper.metadata;
num_requests = this.scraper.num_requests;
html_output = this.scraper.html_output;

} else {
// Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine.
// https://github.com/GoogleChrome/puppeteer/issues/678
Expand Down
92 changes: 85 additions & 7 deletions test/test_bing.js
@@ -1,6 +1,7 @@
const se_scraper = require('./../index.js');
var assert = require('chai').assert;

const chai = require('chai');
chai.use(require('chai-string'));
const assert = chai.assert;
/*
* Use chai and mocha for tests.
* https://mochajs.org/#installation
Expand Down Expand Up @@ -189,8 +190,85 @@ function test_case_effective_query(response) {
}
}

(async () => {
await normal_search_test();
await no_results_test();
await effective_query_test();
})();

// Keywords queried by ads_test(); test_case_ads_test() asserts that each one
// appears as a key in the scrape results.
// NOTE(review): presumably chosen as commercial queries likely to trigger ads — confirm.
const ads_keywords = ['cloud services', 'buy shoes'];

/**
 * Scrapes one Bing result page per entry in `ads_keywords` with a headless
 * browser and hands the response to `test_case_ads_test()` for validation.
 *
 * @returns {Promise<void>} resolves once all assertions have passed; rejects
 *   if the scrape or any assertion throws.
 */
async function ads_test() {
    // Browser-level options passed as the first argument to scrape().
    const browser_options = {
        compress: false,
        debug_level: 1,
        headless: true,
        block_assets: false,
        random_user_agent: true,
    };

    // Job description: which engine, which keywords, how many pages each.
    const job = {
        search_engine: 'bing',
        keywords: ads_keywords,
        num_pages: 1,
    };

    console.log('ads_test()');
    const response = await se_scraper.scrape(browser_options, job);
    test_case_ads_test(response);
}

/**
 * Validates the response produced by the Bing ads scrape.
 *
 * Asserts that exactly two requests were issued, that every keyword in
 * `ads_keywords` is present in the results, and that every result page has
 * the expected keys, organic results and at least two well-formed ads.
 *
 * @param {object} response - value resolved by `se_scraper.scrape()`; read as
 *   `response.metadata.num_requests` and `response.results[query][page_number]`.
 * @throws {AssertionError} when any expectation is not met.
 */
function test_case_ads_test(response) {
    assert.equal(response.metadata.num_requests, 2);

    // Checked once, before iterating: inside the loop this check would be
    // silently skipped whenever `results` is empty.
    assert.containsAllKeys(response.results, ads_keywords, 'not all keywords were scraped.');

    for (let query in response.results) {

        for (let page_number in response.results[query]) {

            // NaN is of type 'number', so a plain type check can never fail here;
            // require an actual integer instead.
            assert.isTrue(Number.isInteger(Number.parseInt(page_number, 10)), 'page_number must be numeric');

            let obj = response.results[query][page_number];

            assert.containsAllKeys(obj, ['results', 'time', 'no_results', 'num_results', 'effective_query', 'ads'], 'not all keys are in the object');

            assert.isAtLeast(obj.results.length, 5, 'results must have at least 5 SERP objects');
            assert.equal(obj.no_results, false, 'no results should be false');
            assert.typeOf(obj.num_results, 'string', 'num_results must be a string');
            assert.isAtLeast(obj.num_results.length, 5, 'num_results should be a string of at least 5 chars');
            // Date.parse() returns NaN (still a 'number') for unparsable input,
            // so test for NaN explicitly.
            assert.isFalse(Number.isNaN(Date.parse(obj.time)), 'time should be a valid date');

            assert.isAtLeast(obj.ads.length, 2, 'ads must have at least 2 SERP object');

            for (let res of obj.ads) {

                assert.isOk(res.ads_link, 'link must be ok');
                assert.typeOf(res.ads_link, 'string', 'link must be string');
                assert.isAtLeast(res.ads_link.length, 5, 'link must have at least 5 chars');

                // Distinct messages so a failure here is not mistaken for ads_link.
                assert.isOk(res.ads_link_target, 'link target must be ok');
                assert.typeOf(res.ads_link_target, 'string', 'link target must be string');
                assert.isAtLeast(res.ads_link_target.length, 5, 'link target must have at least 5 chars');

                assert.isOk(res.ad_visible_url, 'visible_link must be ok');
                assert.typeOf(res.ad_visible_url, 'string', 'visible_link must be string');
                assert.isAtLeast(res.ad_visible_url.length, 5, 'visible_link must have at least 5 chars');

                assert.isOk(res.title, 'title must be ok');
                assert.typeOf(res.title, 'string', 'title must be string');
                assert.isAtLeast(res.title.length, 10, 'title must have at least 10 chars');

                assert.isOk(res.snippet, 'snippet must be ok');
                assert.typeOf(res.snippet, 'string', 'snippet must be string');
                assert.isAtLeast(res.snippet.length, 10, 'snippet must have at least 10 chars');
            }

        }
    }
}




// Mocha suite for the Bing scraper. A regular function (not an arrow) is used
// so that `this` refers to the suite context and `this.timeout()` is available.
describe('Bing', function () {
    // Per-test timeout for this suite.
    this.timeout(30000);

    // [test title, async test function], registered in this order.
    const cases = [
        ['normal search', normal_search_test],
        ['no results', no_results_test],
        ['effective query', effective_query_test],
        ['finds ads', ads_test],
    ];

    for (const [title, run] of cases) {
        it(title, run);
    }
});

0 comments on commit bbebe3c

Please sign in to comment.