In [128]:
import time
from tqdm import tqdm
import requests 
from bs4 import BeautifulSoup
import pandas as pd

In [145]:
def get_ptt_codejob_case_info(url, include_pushes=True, timeout=10):
    """Fetch one PTT CodeJob article and extract its metadata and body text.

    Parameters
    ----------
    url : str
        Absolute URL of the article page.
    include_pushes : bool, optional
        When True (the default, matching the historical behaviour) whatever
        text sits inside ``div#main-content`` is kept, including reader
        comments. Pass False to remove ``div.push`` elements first
        (assumes PTT's web markup renders each comment row as ``div.push``).
    timeout : float or None, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        ``None`` waits indefinitely.

    Returns
    -------
    dict
        ``{'url', 'author', 'title', 'time', 'content'}`` on success.
        When the request fails or the page lacks the expected elements the
        problem is printed and only ``{'url': url}`` is returned, so a batch
        crawl can continue and the row shows up with missing values.
    """
    try:
        res = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Same best-effort contract as a malformed page: report and move on.
        print(url, e)
        return {'url': url}

    soup = BeautifulSoup(res.text, 'html.parser')
    meta_values = soup.select('span.article-meta-value')

    try:
        # Positions 0, 2 and 3 are read as author, title and post time;
        # position 1 is skipped.
        author = meta_values[0].text
        title = meta_values[2].text
        # Named post_time so the imported `time` module is not shadowed here.
        post_time = meta_values[3].text
    except IndexError as e:
        # Posts edited to remove the header fields end up here.
        print(url, e)
        return {'url': url}

    main_content = soup.select_one('div#main-content')
    if main_content is None:
        # select_one returns None when nothing matches; avoid AttributeError.
        print(url, 'div#main-content not found')
        return {'url': url}

    if not include_pushes:
        for push in main_content.select('div.push'):
            push.decompose()

    # The first line of the container text is dropped, empty lines are
    # skipped and tab characters are removed.
    body_lines = main_content.text.split('\n')[1:]
    content = '\n'.join(line.replace('\t', '') for line in body_lines if line != '')

    return {
        'url': url,
        'author': author,
        'title': title,
        'time': post_time,
        'content': content,
    }

def get_ptt_codejob_case_urls(index_start=355, index_end=365, timeout=10):
    """Collect the URLs of all [發案] posts listed on CodeJob index pages.

    Index pages ``index_start`` .. ``index_end`` (both inclusive) are fetched
    in order, sleeping one second before each request. Crawling stops early
    at the first page that lists no article links, which is how running past
    the last existing index page shows up.

    Parameters
    ----------
    index_start : int, optional
        Number of the first index page to fetch.
    index_end : int, optional
        Number of the last index page to fetch (inclusive).
    timeout : float or None, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        ``None`` waits indefinitely.

    Returns
    -------
    list of str
        Absolute article URLs whose link text contains ``[發案]``.

    Raises
    ------
    requests.RequestException
        Propagated when an index page cannot be fetched, so the caller knows
        the listing is incomplete.
    """
    index_urls = ['https://www.ptt.cc/bbs/CodeJob/index{i}.html'.format(i=i)
                  for i in range(index_start, index_end + 1)]
    rs_case_urls = []
    for url in tqdm(index_urls):
        time.sleep(1)
        res = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(res.text, 'html.parser')

        a_tags = soup.select('div.title a')
        if not a_tags:
            # No article links at all: treated as running past the last page.
            print('{} has no contents!'.format(url))
            break

        # Keep only posts tagged [發案]; skip anchors that carry no href so
        # the string concatenation cannot fail on None.
        rs_case_urls.extend(
            'https://www.ptt.cc' + a.get('href')
            for a in a_tags
            if '[發案]' in a.text and a.get('href')
        )
    return rs_case_urls


In [130]:
%%time
# Range of CodeJob index pages to crawl (both ends inclusive).
index_start = 355
index_end = 365

# Gather the URL of every [發案] post listed on those index pages.
codejob_case_urls = get_ptt_codejob_case_urls(index_start=index_start,
                                              index_end=index_end)

print('from index {0} to {1} has {2} cases.\n'.format(
    index_start, index_end, len(codejob_case_urls)))

 82%|████████▏ | 9/11 [00:18<00:04,  2.02s/it]

https://www.ptt.cc/bbs/CodeJob/index364.html has no contents!
from index 355 to 365 has 134 cases.

CPU times: user 403 ms, sys: 20 ms, total: 423 ms
Wall time: 20.5 s


In [143]:
# Known failure case: after the job was filled, the author edited the post and
# removed the required fields, which later triggers an IndexError when parsing.
print(codejob_case_urls[40])

https://www.ptt.cc/bbs/CodeJob/M.1486637432.A.F77.html


In [146]:
# Scrape every case page, pausing one second after each request, and gather
# the per-post dicts into one DataFrame (one row per post).
rs_list = []
for case_url in tqdm(codejob_case_urls):
    rs_list.append(get_ptt_codejob_case_info(case_url))
    time.sleep(1)

df = pd.DataFrame(rs_list)


  0%|          | 0/134 [00:00<?, ?it/s][A
  1%|          | 1/134 [00:02<04:58,  2.24s/it][A
  1%|▏         | 2/134 [00:04<04:47,  2.18s/it][A
  2%|▏         | 3/134 [00:06<04:33,  2.09s/it][A
 30%|██▉       | 40/134 [01:25<03:07,  2.00s/it]

https://www.ptt.cc/bbs/CodeJob/M.1486637432.A.F77.html list index out of range


 70%|███████   | 94/134 [03:15<01:24,  2.12s/it]

https://www.ptt.cc/bbs/CodeJob/M.1489736995.A.CA3.html list index out of range


 87%|████████▋ | 117/134 [04:02<00:37,  2.22s/it]

https://www.ptt.cc/bbs/CodeJob/M.1490807722.A.A9C.html list index out of range


100%|██████████| 134/134 [04:38<00:00,  2.16s/it]


In [162]:
# Parse the raw post-time strings into timestamps, element by element.
df['time'] = df['time'].map(pd.to_datetime)

# Write the table as a UTF-8 TSV named after the crawled index range.
df.to_csv(
    '../data/codejob_from_index_{}_to_{}.tsv'.format(index_start, index_end),
    sep='\t',
    index=False,
    encoding='utf-8',
)

## 已知問題
- content 欄位會抓到推文，太多的時候有點阿雜

In [161]:
# Show the 20 longest `content` values by character count.
(
    df['content']
    .str.len()
    .sort_values(ascending=False)
    .head(20)
)

3      11173.0
38      3080.0
61      1726.0
74      1654.0
26      1626.0
33      1597.0
54      1580.0
75      1544.0
4       1514.0
71      1365.0
2       1348.0
44      1307.0
41      1305.0
22      1279.0
119     1278.0
105     1272.0
99      1271.0
96      1242.0
25      1240.0
17      1221.0
Name: content, dtype: float64

In [174]:
# Build the file name from the crawled index range, like the TSV export does,
# so the name cannot go stale when index_start / index_end change.
df.to_json(
    '../data/codejob_from_index_{}_to_{}.json'.format(index_start, index_end),
    force_ascii=False,
    orient='records',
)