
added some scraper code

1 parent a2b5554, commit 8eaa3e94c691a9f53f3615f19b633dbf8fbd0cfb, sweemeng committed Apr 30, 2012
Showing with 363 additions and 0 deletions.
  1. +148 −0 extract_engine.py
  2. +157 −0 extractor.py
  3. +58 −0 scrape.py
extract_engine.py
@@ -0,0 +1,148 @@
+import re
+import json
+import types
+
+
+# Extract tables from the CIDB contractor pages, using the parser pattern
+class TallTableExtractor(object):
+    def __init__(self, table):
+        self.table = table
+        self.result = []
+        self.keys = []
+
+    def extract_value(self):
+        rows = self.table.findAll('tr')
+        last_key = ''
+        for row in rows:
+            # skip rows that belong to a nested table
+            if row.parent.parent.name == 'td':
+                continue
+            data = row.findAll('td')
+            if len(data) == 1:
+                # a single-cell row marks the start of a new section
+                obj = TableObject(data[0].text)
+                self.result.append(obj)
+                working_obj = self.result[-1]
+                self.keys = []
+            else:
+                if not self.result:
+                    obj = TableObject('')
+                    self.result.append(obj)
+                working_obj = self.result[-1]
+                key = normalize_value(data[0].text)
+
+                if key == '':
+                    # continuation row: append to the previous key's value
+                    temp = getattr(working_obj, last_key)
+                    temp = temp + ' ' + data[1].text
+                else:
+                    last_key = key
+                    temp = data[1].text
+                if last_key not in self.keys:
+                    self.keys.append(last_key)
+
+                if data[1].find('table'):
+                    # flatten a nested table into a comma-separated string
+                    inner_rows = data[1].find('table').findAll('tr')
+                    t_list = []
+                    for inner_row in inner_rows:
+                        inner_data = inner_row.findAll('td')
+                        t_list.append(' '.join(
+                            [i_data.text for i_data in inner_data]))
+                    temp = ','.join(t_list)
+                setattr(working_obj, last_key, temp)
+                working_obj.keys = self.keys
+        assert self.result, "result list is empty"
+
+
+class WideTableExtractor(object):
+    def __init__(self, table):
+        self.table = table
+        self.result = []
+        self.keys = []
+
+    def extract_value(self):
+        keys = []
+        rows = self.table.findAll('tr')
+        title = ''
+        if len(rows) <= 2:
+            # header only, nothing to extract
+            return
+        for row in rows:
+            data = row.findAll('td')
+            if len(data) == 1:
+                # a single-cell row is the section title
+                title = data[0].text
+                keys = []
+            else:
+                if not keys:
+                    # the first multi-cell row holds the column headers
+                    for d in data:
+                        keys.append(d.text)
+                    self.keys = keys
+                else:
+                    obj = TableObject(title)
+                    obj.keys = self.keys
+                    self.result.append(obj)
+                    working_obj = self.result[-1]
+                    temp = dict(zip(keys, [i.text for i in data]))
+                    for k in temp:
+                        k_ = normalize_value(k)
+                        setattr(working_obj, k_, temp[k])
+
+        assert self.result, "result list is empty"
+
+
+class StringExtractor(object):
+    def __init__(self, table):
+        self.table = table
+        self.result = []
+
+    def extract_value(self):
+        rows = self.table.findAll('tr')
+        title = rows[0].text
+        self.obj = TableObject(title)
+        if len(rows) > 1:
+            keys = []
+            for row in rows[1:]:
+                # each remaining row is a "key : value" string
+                data = row.text.split(':')
+                key = normalize_value(data[0])
+                keys.append(key)
+                setattr(self.obj, key, data[1])
+            self.obj.keys = keys
+
+        self.result.append(self.obj)
+
+
+class TableObject(object):
+    def __init__(self, title):
+        self.title = title
+
+    def to_dict(self):
+        temp = {}
+        for key in dir(self):
+            if re.match('^__', key):
+                continue
+            if type(getattr(self, key)) == types.MethodType:
+                continue
+            if key in ('title', 'keys'):
+                continue
+            temp[key] = getattr(self, key)
+        return temp
+
+    def to_json(self):
+        return json.dumps(self.to_dict())
+
+
+# strip punctuation and whitespace so a label can be used as an attribute name
+def normalize_value(value):
+    value = value.replace(' ', '')
+    value = value.replace('(', '')
+    value = value.replace(')', '')
+    value = value.replace(':', '')
+    value = value.replace('*', '')
+    value = value.replace('-', '')
+    value = value.replace('\r\n', '')
+    value = value.replace('/', '')
+    value = value.replace('.', '')
+    value = value.replace('&nbsp;', '')
+    value = re.sub('_+', '', value)
+    # drop a leading roman-numeral list marker (i, ii, iii, iv, v, ...)
+    value = re.sub('^(i+|v)', '', value)
+    return value
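
For reference, a rough sketch of how these extractors are driven. The markup below is invented, and it assumes the BeautifulSoup 3 API that extractor.py imports:

    from BeautifulSoup import BeautifulSoup
    from extract_engine import TallTableExtractor

    # hypothetical markup in the shape of a CIDB "tall" section:
    # one title row followed by label/value rows
    html = """
    <table>
      <tr><td>A. Company Profile</td></tr>
      <tr><td>Company Name :</td><td>Example Sdn Bhd</td></tr>
      <tr><td>Registration No :</td><td>12345-X</td></tr>
    </table>
    """

    extractor = TallTableExtractor(BeautifulSoup(html).find('table'))
    extractor.extract_value()
    for obj in extractor.result:
        print obj.title, obj.to_dict()

WideTableExtractor and StringExtractor follow the same extract_value()/result protocol, so they slot in the same way for the wide and key:value sections.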
extractor.py
@@ -0,0 +1,157 @@
+import re
+
+from BeautifulSoup import BeautifulSoup
+
+from extract_engine import TallTableExtractor
+from extract_engine import WideTableExtractor
+from extract_engine import StringExtractor
+
+
+class CIDBEntry(object):
+    def __init__(self, page):
+        self.page = open(page)
+        # the file name (without extension) is the contractor id
+        self.reference = page.split('.')[0].split('/')[-1]
+        self.soup = BeautifulSoup(self.page)
+        self.target = self.soup.find('div', {'id': 'todaysoftware'})
+        self.result = []
+        self.url = "http://202.190.73.10/directory/local_contractor_details.php?cont_id=%s"
+
+    def process(self):
+        tables = self.target.findAll('table')
+        # sections that yield a flat list of objects vs. one nested list per table
+        extend_table = ['A', 'B', 'F', 'G']
+        append_table = ['C', 'D', 'E', 'H', 'I', 'J']
+        for table in tables:
+            check = table.find('tr')
+            if not check:
+                continue
+            check = check.text
+            if re.match('\S\.', check):
+                if check[0] in extend_table:
+                    self.result.extend(self.process_table(table))
+                elif check[0] in append_table:
+                    self.result.append(self.process_table(table))
+            elif re.match('^Status', check):
+                self.result.extend(self.process_table(table))
+
+    def process_table(self, table):
+        # pick an extractor based on the section letter in the table heading
+        tall_table = ['A', 'B']
+        wide_table = ['C', 'D', 'E', 'H', 'I', 'J']
+        string_table = ['F', 'G']
+
+        check = table.find('tr').text
+        if check[0] in tall_table:
+            extractor = TallTableExtractor(table)
+        elif check[0] in wide_table:
+            extractor = WideTableExtractor(table)
+        elif check[0] in string_table:
+            extractor = StringExtractor(table)
+        elif re.match('^Status', check):
+            extractor = TallTableExtractor(table)
+        else:
+            return []
+
+        extractor.extract_value()
+        return extractor.result
+
+    def get_keys(self):
+        keys = []
+        for item in self.result:
+            if type(item) == list:
+                if not item:
+                    continue
+                obj = item[0]
+            else:
+                obj = item
+            temp = obj.to_dict()
+            if not temp:
+                continue
+            keys.append([k.lower() for k in obj.keys])
+        for k in keys:
+            k.append('reference')
+            k.append('source')
+        # the fifth block is folded into the first record, so drop its keys
+        keys.pop(4)
+        return keys
+
+    def get_worksheet(self):
+        sheet_list = []
+        for item in self.result:
+            if type(item) == list:
+                if not item:
+                    continue
+                obj = item[0]
+            else:
+                obj = item
+            if not obj.to_dict():
+                continue
+            sheet_list.append(obj.title)
+        sheet_list = [normalize_value(i) for i in sheet_list]
+        sheet_list.pop(4)
+        return sheet_list
+
+    def is_good_record(self):
+        # the record is usable if at least one company field holds a
+        # non-blank, single-token value
+        company_info = self.result[0]
+        status = False
+        company_dict = company_info.to_dict()
+        for key in company_dict:
+            if company_dict[key]:
+                if re.match('^\S+$', company_dict[key]):
+                    status = True
+        return status
+
+    def get_data(self):
+        datas = self.result
+        result = []
+        for data in datas:
+            if type(data) == list:
+                if not data:
+                    continue
+                temp = []
+                for d in data:
+                    tdata = d.to_dict()
+                    if not tdata:
+                        continue
+                    t = {}
+                    for td in tdata:
+                        t[td.lower()] = tdata[td]
+                    temp.append(t)
+                temp[-1]['reference'] = self.reference
+                temp[-1]['source'] = self.url % self.reference
+            else:
+                temp = {}
+                tdata = data.to_dict()
+                if not tdata:
+                    continue
+                for t in tdata:
+                    temp[t.lower()] = tdata[t]
+                temp['reference'] = self.reference
+                temp['source'] = self.url % self.reference
+            result.append(temp)
+        # merge the fifth block into the first record and drop it
+        result[0].update(result[4])
+        result.pop(4)
+        return result
+
+
+def normalize_value(value):
+    # clean a worksheet title: strip markup leftovers and collapse whitespace
+    value = value.replace('\r\n', '')
+    value = value.replace('&nbsp;', '')
+    value = value.replace('/', '')
+
+    value = re.sub(':$', '', value)
+    value = re.sub('^\s+', '', value)
+    value = re.sub('\s+$', '', value)
+    value = re.sub('\s\s+', ' ', value)
+    # remove the section marker prefix, e.g. the "A. " in "A. Company Profile"
+    value = re.sub('\.\s', '.', value)
+    value = re.sub('\S\.', '', value)
+    value = value.replace(' ', '_')
+    return value.lower()
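
A similar sketch for the page-level extractor. The file path below is hypothetical, and this assumes a complete contractor detail page (one that contains the div with id "todaysoftware" and all of the expected sections, since get_data() merges the fifth block into the first):

    from extractor import CIDBEntry

    # hypothetical: a detail page previously saved by scrape.py
    entry = CIDBEntry('output/12345.html')
    entry.process()
    if entry.is_good_record():
        # one worksheet name per table, and one key list per worksheet
        print entry.get_worksheet()
        print entry.get_keys()
        for record in entry.get_data():
            # each record is a dict, or a list of dicts for the wide tables
            print record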
scrape.py
@@ -0,0 +1,58 @@
+import requests
+import random
+import sqlite3
+import time
+import datetime
+import os
+
+# Run the requests through a local privoxy proxy, again to hide the crawl
+# behind a few machines
+PROXY = "127.0.0.1:8118"
+
+PROXY_DICT = {'http': PROXY, 'https': PROXY}
+
+# This is ugly, but then so is the page; trying to find a fix
+ID_LIST = range(1, 158000)
+
+ADDRESS = "http://202.190.73.10/directory/local_contractor_details.php?cont_id=%s"
+
+USER_AGENT = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"
+
+
+def crawler():
+    crawl_completed = False
+    while not crawl_completed:
+        # pick a contractor id at random so the requests do not arrive in order
+        page_id = random.sample(ID_LIST, 1)[0]
+        output_path = 'output/%s.html' % (str(page_id))
+
+        while os.path.exists(output_path):
+            # just so we can hide our requests a bit more, most probably overkill
+            page_id = random.sample(ID_LIST, 1)[0]
+            output_path = 'output/%s.html' % (str(page_id))
+            ID_LIST.pop(ID_LIST.index(page_id))
+
+        header = {'User-Agent': USER_AGENT}
+        try:
+            data = requests.get(ADDRESS % str(page_id), headers=header,
+                                proxies=PROXY_DICT)
+            data.raise_for_status()
+        except requests.HTTPError:
+            # stop the crawl on an HTTP error and print the failing URL
+            # so it can be retried later
+            print ADDRESS % str(page_id)
+            return
+
+        f = open(output_path, "w")
+        # data.text is unicode; encode it before writing to the file
+        f.write(data.text.encode('utf-8'))
+        f.close()
+
+#        next_crawl = random.sample(xrange(5,10),1)[0]
+#        now = datetime.datetime.now()
+#        next_time = now + datetime.timedelta(0,next_crawl)
+        print "write to %s" % output_path
+#        print "next print in %d second at %s" % (next_crawl,next_time)
+
+#        time.sleep(next_crawl)
+        if len(os.listdir('output')) >= len(ID_LIST):
+            crawl_completed = True
+            break
+
+
+if __name__ == "__main__":
+    crawler()

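The commented-out lines in crawler() point at a random pause between requests; a minimal version of that throttle as a standalone helper (the function name here is mine) might look like:

    import datetime
    import random
    import time

    def throttle():
        # sleep a random 5-9 seconds between requests, per the
        # commented-out block in crawler()
        next_crawl = random.sample(xrange(5, 10), 1)[0]
        next_time = datetime.datetime.now() + datetime.timedelta(0, next_crawl)
        print "next request in %d seconds at %s" % (next_crawl, next_time)
        time.sleep(next_crawl)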