In [None]:
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm

import config

# Login credentials come from the local `config` module so they stay out of
# the notebook; `config.pd` is the value later typed into the password field.
account, pwd = config.account, config.pd

In [None]:
# Start Chrome through the chromedriver binary shipped next to the notebook.
# NOTE(review): passing the driver path positionally is the Selenium 3 calling
# convention; recent Selenium 4 releases expect a Service object instead --
# confirm against the installed Selenium version.
br = webdriver.Chrome('./drivers/chromedriver_mac64')

# Fill in the login form.
# The logout page is visited first, presumably to clear any previous session.
br.get('https://case.104.com.tw/logout_finish.cfm')
br.get('https://login.104.com.tw/login.cfm')
# find_element(By.<strategy>, value) is used instead of the find_element_by_*
# helpers: the helpers were removed from newer Selenium releases, while this
# form is available in both old and new versions.
br.find_element(By.ID, 'id_name').send_keys(account)
br.find_element(By.ID, 'password').send_keys(pwd)
br.find_element(By.XPATH, '//*[@id="wrapper"]/div[3]/div[4]/ul/li[1]/input').click()

In [None]:
%%time
# get the full list of cases

# Listing URL for page 1; the paging loop further down swaps the `page=1`
# parameter for the page it wants to fetch.
case_url = (
    'https://case.104.com.tw/postcase_list.cfm?'
    + 'cat=0&area=0&role=0&iType=1&caseno=&cat_s=0&money=&enddays=&orderby=0&'
    + 'page=1&other=cat&otherVal=1&casetype=0&begin=0&cfrom=&IDNO=20000000737006'
)

# Load the first page once to read the header of the listing.
br.get(case_url)
soup = BeautifulSoup(br.page_source, 'html.parser')

# Both values live inside the same <h2>: the <b> text is kept as a string
# (presumably the total number of cases), the <em> text is parsed as the
# number of the last page.
header_selector = 'div.insider_right_t h2 '
case_num = soup.select_one(header_selector + 'b').text.strip()
final_page = int(soup.select_one(header_selector + 'em').text.strip())

def get_case_list_by_page(url, max_retries=5, retry_wait=5):
    """Load one listing page in the shared browser and return its cases.

    Parameters
    ----------
    url : str
        Listing page to open with the module-level browser ``br``.
    max_retries : int, optional
        How many times the page is loaded before giving up.
    retry_wait : int or float, optional
        Seconds to sleep between two failed attempts.

    Returns
    -------
    pandas.DataFrame
        One row per case with the columns ``title`` and ``link``.

    Raises
    ------
    RuntimeError
        If the case list could not be parsed in any of the attempts
        (``div.caselist`` missing, or an anchor without ``href``).
    """
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            br.get(url)  # after login
            time.sleep(1)  # short pause before reading the page source
            soup = BeautifulSoup(br.page_source, 'html.parser')
            caselist = soup.select_one('div.caselist')
            # Title and link are read from the same anchors inside the case
            # list, so a pair can never be built from two different elements.
            anchors = caselist.select('li dt a')
            records = [
                (a.text, 'https://case.104.com.tw/' + a.get('href').split('&')[0])
                for a in anchors
            ]
            return pd.DataFrame(records, columns=['title', 'link'])
        except AttributeError as e:
            # select_one() / get() returned None: the expected markup is not
            # there (yet). Report it and retry a bounded number of times.
            last_error = e
            print(url, e)
            if attempt < max_retries:
                time.sleep(retry_wait)

    raise RuntimeError(
        'could not parse case list after {} attempts: {}'.format(max_retries, url)
    ) from last_error

# Walk through every listing page (1 .. final_page) and collect one frame per
# page; appending inside the loop keeps the pages fetched so far in `frame`.
frame = []
for page in tqdm(range(1, final_page + 1)):
    frame.append(
        get_case_list_by_page(case_url.replace('page=1', 'page=%d' % page))
    )

# Stack the per-page frames into one table with a fresh 0..n-1 index.
basic_info_df = pd.concat(frame, ignore_index=True)

In [None]:
%%time
# get the detailed info of each case by its link

# XPath of every detail field on a case page. The key order is also the column
# order of advance_info_df (after the leading 'link' column).
# example page:
# https://case.104.com.tw/seek_view.cfm?caseno=1182014&cfrom=postcase_list_seeknum&clist=0&begin=2
xpath_pair = {
    'case_type': '//*[@id="insider_wrapper"]/div[2]/div[1]/h1/span',
    'budget': '//*[@id="insider_wrapper"]/div[2]/div[3]/div[1]/dl/dd[1]',
    'content': '//*[@id="caseDesc1"]',
    'view': '//*[@id="insider_wrapper"]/div[2]/div[3]/div[2]/div[3]/dl/dd[1]',
    'propose_amount': '//*[@id="insider_wrapper"]/div[2]/div[3]/div[2]/div[3]/dl/dd[2]',
    'location': '//*[@id="insider_wrapper"]/div[2]/div[3]/div[3]/dl/dd[1]',
    'office': '//*[@id="insider_wrapper"]/div[2]/div[3]/div[3]/dl/dd[2]',
    'owner_type': '//*[@id="insider_wrapper"]/div[2]/div[3]/div[2]/div[2]/dl/dd[2]',
    'average_price': '//*[@id="insider_wrapper"]/div[2]/div[3]/div[2]/div[3]/dl/dd[4]',
    'highest_price': '//*[@id="insider_wrapper"]/div[2]/div[3]/div[2]/div[3]/dl/dd[5]',
    'lowest_price': '//*[@id="insider_wrapper"]/div[2]/div[3]/div[2]/div[3]/dl/dd[6]',
}
# These fields are only read when the page source contains '提案金額';
# otherwise they are stored as None.
price_fields = ('average_price', 'highest_price', 'lowest_price')
max_attempts = 4  # page loads per case before the case is skipped

record = []
# The links come from the listing frame built in the previous cell.
case_link_list = list(basic_info_df['link'])
for case_link in tqdm(case_link_list):
    for attempt in range(1, max_attempts + 1):
        try:
            br.get(case_link)
            time.sleep(1.5)  # short pause before reading elements

            has_prices = '提案金額' in br.page_source
            row = [case_link]
            for field, xpath in xpath_pair.items():
                if field in price_fields and not has_prices:
                    row.append(None)
                else:
                    # find_element(By.XPATH, ...) works on old and new
                    # Selenium releases, unlike find_element_by_xpath.
                    row.append(br.find_element(By.XPATH, xpath).text.strip())
            record.append(tuple(row))
            break
        except Exception as e:
            # Best effort: one broken page must not stop the whole scrape,
            # but the failure is reported instead of being dropped silently.
            print(case_link, repr(e))
            if attempt < max_attempts:
                time.sleep(5)
    else:
        # Every attempt failed; the case will be missing from rs_df because
        # the merge below only keeps links present in both frames.
        print('giving up on', case_link)


br.quit()
advance_info_df = pd.DataFrame(record, columns=['link'] + list(xpath_pair))
rs_df = basic_info_df.merge(advance_info_df, on='link')

In [None]:
# simple parse

def transform_price_col(val):
    """Convert a scraped price such as ``'$1,234'`` into an ``int``.

    Parameters
    ----------
    val : str, number or None
        Raw cell value. Strings may carry a label in front of the ``$`` sign
        and thousands separators; numbers are accepted so the conversion can
        be applied again to an already converted column.

    Returns
    -------
    int or None
        The price as an integer, or ``None`` for missing input
        (``None``, NaN/NA, empty or blank string).

    Raises
    ------
    ValueError
        If the text that should hold the amount is not a valid integer.
    """
    if val is None:
        return None
    if not isinstance(val, str):
        # pandas turns missing entries into NaN/NA once the column has been
        # processed; any other non-string value is an already parsed number.
        if pd.isna(val):
            return None
        return int(val)

    text = val.strip()
    if not text:
        return None

    # Take the piece right after the first '$'; fall back to the whole text
    # when there is no '$' at all instead of failing with an IndexError.
    parts = text.split('$')
    amount = parts[1] if len(parts) > 1 else parts[0]
    return int(amount.replace(',', '').strip())

# Turn the three scraped price columns into numbers.
for price_col in ('average_price', 'highest_price', 'lowest_price'):
    rs_df[price_col] = rs_df[price_col].map(transform_price_col)

# Strip the thousands separator from the view counter and store it as int.
# The column is cast to str first so this line also works when it is executed
# again on an already converted (integer) column; regex=False makes the
# replacement a plain literal one.
rs_df['view'] = (
    rs_df['view'].astype(str).str.replace(',', '', regex=False).astype(int)
)

rs_df.head(10)

In [None]:
# Write the merged result twice: as a tab-separated file without the index
# column, and as a JSON array of records with non-ASCII text kept readable.
rs_df.to_csv('../data/case104.tsv', sep='\t', index=False, encoding='utf-8')

rs_df.to_json('../data/case104.json', orient='records', force_ascii=False)