Skip to content

Commit

Permalink
Update scraper.php
Browse files Browse the repository at this point in the history
  • Loading branch information
T346 committed Jul 22, 2020
1 parent d4ad77e commit 7765f70
Showing 1 changed file with 93 additions and 26 deletions.
119 changes: 93 additions & 26 deletions scraper.php
@@ -1,27 +1,94 @@
<?
// This is a template for a PHP scraper on morph.io (https://morph.io)
// including some code snippets below that you should find helpful

// require 'scraperwiki.php';
// require 'scraperwiki/simple_html_dom.php';
//
// // Read in a page
// $html = scraperwiki::scrape("http://foo.com");
//
// // Find something on the page using css selectors
// $dom = new simple_html_dom();
// $dom->load($html);
// print_r($dom->find("table.list"));
//
// // Write out to the sqlite database using scraperwiki library
// scraperwiki::save_sqlite(array('name'), array('name' => 'susan', 'occupation' => 'software developer'));
//
// // An arbitrary query against the database
// scraperwiki::select("* from data where 'name'='peter'")

// You don't have to do things with the ScraperWiki library.
// You can use whatever libraries you want: https://morph.io/documentation/php
// All that matters is that your final data is written to an SQLite database
// called "data.sqlite" in the current working directory which has at least a table
// called "data".
<?php
require 'scraperwiki.php';
require 'scraperwiki/simple_html_dom.php';

$startProductId = scraperwiki::get_var("currentId", -1);

if($startProductId == -1)
{
print "No previous saved position found. Starting from scratch.";
}
else
{
print "Resuming from product id $startProductId\n";
}

scraperwiki::attach("hobbyking_batteryidlist");
$batteries = scraperwiki::select("id from hobbyking_batteryidlist.data where id > $startProductId order by id asc");
$remainingCount = count($batteries);

print "Found $remainingCount batteries left to be scraped.";

$maxPerRun = 100;
$loopCount = 0;

foreach($batteries as $bat)
{
if ($loopCount > $maxPerRun)
{
print "Ending run after $maxPerRun iterations.";
break;
}

$productId = $bat['id'];
print "Retrieving " . $productId . "\n";
$html = scraperWiki::scrape("http://www.hobbyking.com/hobbyking/store/uh_viewItem.asp?idProduct=$productId");
//print $html . "\n";

$dom = new simple_html_dom();
$dom->load($html);

// Get the product data (located in a span tag). Should only be one product data area!
$productDataAreasDom = $dom->find("SPAN[id=prodDataArea]");
$productDataDom = $productDataAreasDom[0];
//print $productData . "\n";

$data = array();

// Loop over each row in the product data.
foreach ($productDataDom->find("tr") as $tr)
{
// Get the columns for this row of info.
$columns = $tr->find("td");

$attribute = $columns[0]->plaintext;
$value = intval($columns[1]->plaintext);

// Some rows are empty, and we should exclude them.
if (strlen($attribute) > 0)
{
//print $attribute . ": ";
//print $value . "\n";
$data["id"] = $productId;
$data[$attribute] = $value;
}
}

// Get the price.
$priceDom = $dom->find("td[background*=add_cart_bgd02.jpg]");
$price = floatval(str_replace('$', "", $priceDom[0]->plaintext));
$data["price"] = $price;

// Calculate out a few extra fields:
$cells = $data["Config(s)"];
$capacity = $data["Capacity(mAh)"];
$energy = $cells * 3.7 * $capacity / 1000;
$value = $energy / $price;

$data["Energy (Wh)"] = $energy;
$data["Value (Wh/$)"] = $value;

//print_r($data);
scraperwiki::save(array("id"), $data);
scraperwiki::save_var("currentId", $productId);
$loopCount++;
}

// Check to see if we have scraped everything - if so, start again!
$lastBattery = end($batteries);
if($lastBattery['id'] == $productId)
{
print "All known batteries processed. Clearing progress marker so scraper can start again.";
scraperwiki::save_var("currentId", -1);
}
?>

0 comments on commit 7765f70

Please sign in to comment.