Skip to content

Commit

Permalink
Update scraper.php
Browse files Browse the repository at this point in the history
  • Loading branch information
RedFoxBot committed Mar 9, 2014
1 parent 710d085 commit 012ad89
Showing 1 changed file with 3 additions and 59 deletions.
62 changes: 3 additions & 59 deletions scraper.php
Expand Up @@ -8,7 +8,7 @@
#return;

$max = 10045263;
$counter = scraperwiki::get_metadata('counter');
$counter = scraperwiki::get_var('counter');
if($counter<10000000)
{
$counter=10000000;
Expand All @@ -17,7 +17,7 @@
for ($i=0; $i< 1000; $i++) {
$counter++;
if ($counter == $max) {
scraperwiki::save_metadata('counter',10000000);
scraperwiki::save_var('counter',10000000);
$i= 1001;
}
$html = oneline(scraperwiki::scrape("http://www.ukrlp.co.uk/ukrlp/ukrlp_provider.page_pls_provDetails?x=&pn_p_id=".$counter."&pv_status=VERIFIED&pv_vis_code=L"));
Expand Down Expand Up @@ -52,60 +52,4 @@
'legal_fax' => clean($legal['fax']),'legal_email' => clean($legal['email']),
'legal_web' => clean($legal['web']),
'primary_address' => clean($primary['address']),'primary_phone' => clean($primary['phone']),
'primary_fax' => clean($primary['fax']),'primary_email' => clean($primary['email']),
'primary_web' => clean($primary['web']), 'primary_courses' => clean($primary['courses']) ));
}
scraperwiki::save_metadata('counter',$counter);
}
}

/**
 * Pull the contact fields and the postal address out of one address HTML fragment.
 *
 * The fragment is expected to hold the address lines (separated by "<br />")
 * followed by "<strong>Label: </strong>value<br />" entries.
 *
 * @param string $val HTML fragment for a single address block.
 * @return array Keys 'phone', 'email', 'web', 'fax', 'courses' and 'address';
 *               a field that is not found in the fragment is an empty string.
 */
function parseAddress($val) {

    // One capture pattern per labelled field; group 1 is the value.
    $patterns = array(
        'phone'   => '|<strong>Telephone: </strong>(.*?)<br />|',
        'email'   => '|<strong>E-mail: </strong><a href="mailto:(.*?)">.*?</a><br />|',
        'web'     => '|<strong>Website: </strong><a target="_blank" href="(.*?)">.*?</a><br />|',
        'fax'     => '|<strong>Fax: </strong>(.*?)<br />|',
        'courses' => '|<strong>Courses: </strong>(.*?)<br />|',
    );

    $dat = array();
    foreach ($patterns as $field => $pattern) {
        // Run the match BEFORE reading its result. The previous code tested
        // $courses before it was populated, so 'courses' was always empty.
        if (preg_match($pattern, $val, $found) === 1) {
            $dat[$field] = trim($found[1]);
        } else {
            $dat[$field] = '';
        }
    }

    // Everything before the first "<strong>" label is the postal address,
    // one line per "<br />".
    $beforeLabels = explode('<strong>', $val);
    $lines = array();
    foreach (explode('<br />', $beforeLabels[0]) as $line) {
        $line = trim($line);
        if ($line !== '') {
            $lines[] = $line;
        }
    }
    $dat['address'] = implode(', ', $lines);

    // Placeholder text meaning "no separate address given".
    if ($dat['address'] === 'Not specified. Please use the above.') {
        $dat['address'] = '';
    }

    return $dat;

}

/**
 * Reduce a scraped HTML value to plain text.
 *
 * Steps, in order: "&nbsp;" becomes a space and "&amp;" becomes "&", remaining
 * HTML entities are decoded, tags are stripped, surrounding whitespace is
 * trimmed, and the result is passed through utf8_decode().
 *
 * Because "&amp;" is unescaped before html_entity_decode() runs, a
 * double-encoded entity such as "&amp;lt;" ends up fully decoded.
 *
 * @param string $val Raw value taken from the page.
 * @return string Cleaned text.
 */
function clean($val) {
    // str_replace applies the pairs in array order over the whole string,
    // i.e. first every "&nbsp;", then every "&amp;".
    $unescaped = str_replace(array('&nbsp;', '&amp;'), array(' ', '&'), $val);
    $text = trim(strip_tags(html_entity_decode($unescaped)));
    return utf8_decode($text);
}

/**
 * Collapse a string onto a single line by deleting every LF and CR character.
 *
 * @param string $code Text that may contain line breaks.
 * @return string The same text with all "\n" and "\r" removed.
 */
function oneline($code) {
    return str_replace(array("\n", "\r"), '', $code);
}


?>
'primary_

0 comments on commit 012ad89

Please sign in to comment.