From 012ad89c713ab8e4db2b882db9f3150fe59f4528 Mon Sep 17 00:00:00 2001 From: RedFoxBot Date: Sun, 9 Mar 2014 14:26:28 +0000 Subject: [PATCH] Update scraper.php --- scraper.php | 62 +++-------------------------------------------------- 1 file changed, 3 insertions(+), 59 deletions(-) diff --git a/scraper.php b/scraper.php index 9fe6ef4..643b76e 100644 --- a/scraper.php +++ b/scraper.php @@ -8,7 +8,7 @@ #return; $max = 10045263; -$counter = scraperwiki::get_metadata('counter'); +$counter = scraperwiki::get_var('counter'); if($counter<10000000) { $counter=10000000; @@ -17,7 +17,7 @@ for ($i=0; $i< 1000; $i++) { $counter++; if ($counter == $max) { - scraperwiki::save_metadata('counter',10000000); + scraperwiki::save_var('counter',10000000); $i= 1001; } $html = oneline(scraperwiki::scrape("http://www.ukrlp.co.uk/ukrlp/ukrlp_provider.page_pls_provDetails?x=&pn_p_id=".$counter."&pv_status=VERIFIED&pv_vis_code=L")); @@ -52,60 +52,4 @@ 'legal_fax' => clean($legal['fax']),'legal_email' => clean($legal['email']), 'legal_web' => clean($legal['web']), 'primary_address' => clean($primary['address']),'primary_phone' => clean($primary['phone']), - 'primary_fax' => clean($primary['fax']),'primary_email' => clean($primary['email']), - 'primary_web' => clean($primary['web']), 'primary_courses' => clean($primary['courses']) )); - } - scraperwiki::save_metadata('counter',$counter); - } - } - - function parseAddress($val) { - - preg_match_all('|Telephone: (.*?)
|',$val,$phone); - if (isset($phone[1][0])) { $dat['phone'] = trim($phone[1][0]);} else { $dat['phone']='';} - preg_match_all('|E-mail: .*?
|',$val,$email); - if (isset($email[1][0])) { $dat['email'] = trim($email[1][0]);} else { $dat['email']='';} - preg_match_all('|Website: .*?
|',$val,$web); - if (isset($web[1][0])) { $dat['web'] = trim($web[1][0]);} else { $dat['web']='';} - preg_match_all('|Fax: (.*?)
|',$val,$fax); - if (isset($fax[1][0])) { $dat['fax'] = trim($fax[1][0]);} else { $dat['fax']='';} - if (isset($courses[1][0])) { $dat ['courses'] = trim($courses[1][0]);} else { $dat['courses']='';} - preg_match_all('|Courses: (.*?)
|',$val,$courses); - $p = explode('',$val); - - $p = explode('
',$p[0]); - - $dat['address'] = ''; - foreach ($p as $a) { - $a = trim($a); - if ($a !='') { - if ($dat['address']!='') { $dat['address'] .=', '; } - $dat['address'] .= $a; - } - } - if ($dat['address'] == 'Not specified. Please use the above.') { - $dat['address'] = ''; - } - - return $dat; - - } - - function clean($val) { - $val = str_replace(' ',' ',$val); - $val = str_replace('&','&',$val); - $val = html_entity_decode($val); - $val = strip_tags($val); - $val = trim($val); - $val = utf8_decode($val); - return($val); - } - - function oneline($code) { - $code = str_replace("\n",'',$code); - $code = str_replace("\r",'',$code); - return $code; - } - - -?> + 'primary_