Permalink
Browse files

Bad scraper prototype hell

  • Loading branch information...
1 parent 402f801 commit f24ccda4584bf1f685b46a1ca15dae1473ba7659 @Almad Almad committed Jun 2, 2012
Showing with 25 additions and 4 deletions.
  1. +7 −0 scrapes/convert.py
  2. +18 −4 scrapes/get_chmi_objects.py
View
@@ -38,3 +38,10 @@ def convert(file):
if __name__ == "__main__":
    convert('stations.json')


# Two example object.php query URLs; they differ only in the `seq` parameter.
# They are kept as comments because a bare URL at module level is not valid
# Python and would make this file fail to parse.
#
# http://hydro.chmi.cz/isarrow/object.php?agenda=POV&objekty_chemdata=1&objekty_biodata=&taxon_tree=&seq=2000855701&data_sel=chemdata&chemie=1&biota=&rok_od=2007&rok_do=2012&matrice=2000868184&typodb=&tscongrp=&tscon=&data_mez_stanovitelnosti=&data_od=&data_do=&send=Chemick%E9+vzorky
# http://hydro.chmi.cz/isarrow/object.php?agenda=POV&objekty_chemdata=1&objekty_biodata=&taxon_tree=&seq=2000844377&data_sel=chemdata&chemie=1&biota=&rok_od=2007&rok_do=2012&matrice=2000868184&typodb=&tscongrp=&tscon=&data_mez_stanovitelnosti=&data_od=&data_do=&send=Chemick%E9+vzorky
+
+
@@ -1,13 +1,14 @@
import anyjson
import csv
+from threading import Thread
import urllib2
from lxml.html.soupparser import fromstring
from lxml.cssselect import CSSSelector
STATIONS = {}
-def get_station(link):
+def get_station(link, no):
link = "http://hydro.chmi.cz/isarrow/" + link
tree = fromstring(urllib2.urlopen(link).read().decode('cp1250'))
@@ -19,18 +20,27 @@ def get_station(link):
# 'name' : CSSSelector("table tr:nth-child(1) td")(tree)[0].text,
'x' : tree.xpath("//table/tr[14]/td")[0].text,
'y' : tree.xpath("//table/tr[15]/td")[0].text,
+ 'sequenceMagicNumber' : CSSSelector("input[name='seq']")(tree)[0].value,
}
+ print 'retrieved station', no
+
def scrape():
    """Fetch the station listing and retrieve every linked station in parallel.

    Downloads the listing page, collects every anchor inside ``table.tbl``,
    starts one thread per anchor that calls ``get_station(href, number)``
    (numbers start at 1), and blocks until all threads have finished.
    """
    complete_url = "http://hydro.chmi.cz/isarrow/objects.php?ukol_p=1&vod_typ=R&nadmh_sign=%3E&rickm_sign=%3E&rok_od=2007&rok_do=2012&objekty_chemdata=1&matrice=2000868184&typodb=41&seq=364787&ordrstr=NM&agenda=POV&limit_clsf=&matrice_clsf=&tscon_clsf=&rok_od_clsf=&rok_do_clsf=&val_sign_clsf=&val_clsf=&agg_clsf=&startpos=0&recnum=2770"
    tree = fromstring(urllib2.urlopen(complete_url).read().decode('cp1250'))
    links = CSSSelector("table.tbl a")(tree)

    pool = []
    for i, link in enumerate(links, 1):
        # A parenthesised single-argument print produces the same output
        # under the Python 2 print statement this file uses.
        print("Scheduling station " + str(i))
        # Pass the href and counter through `args` so each worker receives
        # the values of *this* iteration. A bare lambda would look up `link`
        # and `i` only when the thread actually runs, by which time the loop
        # may have advanced, so stations could be fetched twice or skipped.
        t = Thread(target=get_station, args=(link.get("href"), i))
        pool.append(t)
        t.start()

    # Wait for every download to complete before returning.
    for t in pool:
        t.join()
def store():
f = open('stations.json', 'w')
@@ -42,7 +52,11 @@ def store():
w = csv.writer(f)
for k in STATIONS:
row = STATIONS[k]
- w.writerow([row['id'].encode('utf-8'), row['x'].encode('utf-8'), row['y'].encode('utf-8')])
+ try:
+ w.writerow([row['id'].encode('utf-8'), row['x'].encode('utf-8'), row['y'].encode('utf-8')])
+ except Exception, e:
+ print "Error while writing row", e
+
f.close()
if __name__ == "__main__":

0 comments on commit f24ccda

Please sign in to comment.