Skip to content

Commit

Permalink
Update scraper.php
Browse files Browse the repository at this point in the history
  • Loading branch information
Uzlopak committed Nov 18, 2015
1 parent ebb159d commit 0c725a6
Showing 1 changed file with 85 additions and 2 deletions.
87 changes: 85 additions & 2 deletions scraper.php
@@ -1,7 +1,6 @@
<?
// This is a template for a PHP scraper on morph.io (https://morph.io)
// including some code snippets below that you should find helpful

// require 'scraperwiki.php';
// require 'scraperwiki/simple_html_dom.php';
//
Expand All @@ -18,10 +17,94 @@
//
// // An arbitrary query against the database
// scraperwiki::select("* from data where name='peter'")

// You don't have to do things with the ScraperWiki library.
// You can use whatever libraries you want: https://morph.io/documentation/php
// All that matters is that your final data is written to an SQLite database
// called "data.sqlite" in the current working directory which has at least a table
// called "data".
?>


<?php
require 'scraperwiki.php';
// Walk the fixed id range and (re)scrape every record that is not yet stored
// with a non-empty first name and surname (see validateEntry()).
//
// Progress output per id that needs work: the id, one "." per fetch attempt,
// then "! " once the record validates or "x " when the attempts are used up.

// Upper bound on fetch attempts per id. Without a bound, an id whose page
// never yields a first name and surname would keep this loop spinning forever.
$maxAttempts = 10;

for ($id = 900001; $id <= 1200000; $id++) {
    // Already stored and complete: nothing to do for this id.
    if (validateEntry($id)) {
        continue;
    }

    print $id;

    $delay = 250000;
    $stored = false;
    for ($i = 1; $i <= $maxAttempts; $i++) {
        print ".";

        // Growing back-off in microseconds, capped at 5 seconds.
        $delay = min($delay + $i * 250000, 5000000);
        usleep($delay);

        ripById($id);

        if (validateEntry($id)) {
            $stored = true;
            break;
        }
    }

    print $stored ? "! " : "x ";
}
/**
 * Fetch the detail page for one record id, extract the person fields from its
 * markup and save them as a row in the default "data" table.
 *
 * A field whose <span> is not found on the page is stored as an empty string,
 * so a row is written even when nothing could be extracted.
 *
 * @param int $id Value appended as the "srid" query parameter and stored as the row's id.
 * @return void
 */
function ripById($id){
    $pathToDetails = 'http://www.beheshtezahra.ir/Default.aspx?tabid=92&ctl=SearchDetails&mid=653&srid=' . $id;

    // Cast so the regex calls always receive a string.
    // NOTE(review): assumes scrape() may return a non-string on failure -- confirm.
    $output = (string) scraperwiki::scrape($pathToDetails);

    // Column name => distinguishing part of the <span> id that holds the value.
    $spanLabels = array(
        'firstname'  => 'lblNameBound',
        'surname'    => 'lblLastNameBound',
        'fathername' => 'lblFatherNameBound',
        'birthdate'  => 'lblBirthDateBound',
        'deathdate'  => 'lblDafnDateBound',
        'deathplace' => 'lblDeastTownshipTitle',
        'graveplace' => 'lblDafnPlace',
    );

    $record = array('id' => $id);
    foreach ($spanLabels as $column => $label) {
        // Ungreedy (U) capture of everything between <b> and the next closing tag.
        $pattern = '/<span id="dnn_ctr653_SearchDetails_dtlDetail_' . $label . '_0"><b>(.*)<\//smiU';
        $record[$column] = (preg_match($pattern, $output, $match) === 1) ? $match[1] : '';
    }

    // The first argument of save_sqlite() is the list of unique-key columns,
    // not the table name (the table defaults to "data"). Keying on "id" lets a
    // retry replace the earlier row for the same id.
    scraperwiki::save_sqlite(array('id'), $record);
}
/**
 * Report whether a usable row for the given id is already stored.
 *
 * A row counts as usable when the first row returned by the query has a
 * non-empty id and both its firstname and surname differ from the empty string.
 * Any Exception raised while querying is swallowed and reported as "not valid".
 *
 * @param int $id Record id to look up in the "data" table.
 * @return bool True when a complete row exists, false otherwise.
 */
function validateEntry($id){
    try {
        $rows = scraperwiki::select("* from data where id ='" . $id . "'");

        // No row, or a row without an id: not stored yet.
        if (empty($rows[0]['id'])) {
            return false;
        }

        // Both name fields must be filled for the row to count.
        if ($rows[0]['firstname'] != "" && $rows[0]['surname'] != "") {
            return true;
        }
        return false;
    } catch (Exception $e) {
        // Best effort: a failing query is treated the same as a missing row.
        return false;
    }
}

0 comments on commit 0c725a6

Please sign in to comment.