# Crawl the judgement documents

In [33]:
import numpy as np
import pandas as pd
import time
import json
import html
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from docx import Document

In [54]:
# Start a Chrome session using the chromedriver binary at this relative path.
# The resulting `browser` object is the one the crawl loops below drive.
browser = webdriver.Chrome('../../../../chromedriver')

In [3]:
# Load the list of case URLs, then discard the 'Unnamed: 0' column
# (presumably the index written by an earlier to_csv call -- confirm).
df = pd.read_csv('./case_urls_20190610.csv')
df = df.drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,casecourt,casenumber,judgedate,realid,stage,title,url,year,prosecution,downloaded
0,广东省高级人民法院,（2017）粤刑终67号,2017-03-08,cd9a2e03-feca-455a-8063-a82d011d12da,二审,郑秀怀走私珍贵动物、珍贵动物制品二审刑事裁定书,http://wenshu.court.gov.cn/content/content?Doc...,2017,刑事,False
1,浙江省高级人民法院,（2014）浙刑二终字第78号,2014-11-10,df9c3b1f-a0f9-4c6f-99aa-709f3fd6f87e,二审,朱新潮受贿罪二审刑事裁定书,http://wenshu.court.gov.cn/content/content?Doc...,2014,刑事,True
2,云南省高级人民法院,（2018）云刑终235号,2018-03-12,d0da7477-3229-449b-9b95-a91100f4c001,二审,杨忠华、苏家华走私、贩卖、运输、制造毒品二审刑事裁定书,http://wenshu.court.gov.cn/content/content?Doc...,2018,刑事,True
3,云南省高级人民法院,（2018）云刑终1128号,2018-11-19,75bfc4c4-f965-444a-a305-a9bb00bc13ef,二审,赵东国、赵信国走私、贩卖、运输、制造毒品二审刑事裁定书,http://wenshu.court.gov.cn/content/content?Doc...,2018,刑事,False
4,海南省高级人民法院,（2015）琼刑二终字第20号,2015-09-06,bd7c279d-6903-4cd7-8e17-98772b7db89a,二审,陈某甲、魏某甲等犯走私珍贵动物、珍贵动物制品罪二审刑事判决书,http://wenshu.court.gov.cn/content/content?Doc...,2015,刑事,False


In [4]:
def export_doc(ps, rowid):
    """Save the text of every element in ``ps`` to ``./docs/<rowid>.docx``.

    Each element contributes one paragraph, taken from its ``.text``
    attribute.

    Returns:
        True when a document was written; False (nothing written) when
        ``ps`` contains one element or fewer.
    """
    # Guard clause: with at most one element nothing is exported
    # (presumably the judgement body did not load -- confirm).
    if len(ps) <= 1:
        return False

    document = Document()
    for element in ps:
        document.add_paragraph(element.text)
    document.save('./docs/{}.docx'.format(rowid))
    return True

In [18]:
#https://selenium-python.readthedocs.io/waits.html
# Visit every case not yet downloaded, wait for the #DivContent container,
# and export the text of its child <div> elements to a .docx file.
for index, row in df.iterrows():
    # `== False` is kept on purpose: the flag comes from pandas and may be a
    # numpy bool, for which an identity check (`is False`) would not match.
    if row['downloaded'] == False:
        time.sleep(2)
        try:
            browser.get('http://wenshu.court.gov.cn') # !! Important to avoid Error 500 caused by parent path error
            time.sleep(3)
            browser.get(row['url'])
            # Block for up to 30 s until the content container is in the DOM.
            pdiv = WebDriverWait(browser, 30).until(
                EC.presence_of_element_located((By.ID, "DivContent"))
            )
            # Generic locator API; the find_elements_by_* helpers are
            # deprecated and no longer exist in Selenium 4.
            ps = pdiv.find_elements(By.CSS_SELECTOR, 'div')
            df.loc[index, 'downloaded'] = export_doc(ps, index)
        except Exception as exc:
            # Catch Exception rather than using a bare `except:` so that
            # Ctrl-C (KeyboardInterrupt) still stops the crawl, and report
            # the cause instead of hiding it.
            print('Exception occured in:', repr(exc))
    # Read the flag back from `df`: `row` is a copy made by iterrows() and
    # does not reflect the assignment above.
    print(row['title'], df.loc[index, 'downloaded'])

郑秀怀走私珍贵动物、珍贵动物制品二审刑事裁定书 True
朱新潮受贿罪二审刑事裁定书 True
杨忠华、苏家华走私、贩卖、运输、制造毒品二审刑事裁定书 True
赵东国、赵信国走私、贩卖、运输、制造毒品二审刑事裁定书 True
陈某甲、魏某甲等犯走私珍贵动物、珍贵动物制品罪二审刑事判决书 False
杨恩强、杨恩大走私、贩卖、运输、制造毒品二审刑事裁定书 False
走私珍贵动物、珍贵动物制品二审刑事裁定书 False
陈洪波、赵志勇走私珍贵动物、珍贵动物制品二审刑事判决书 False
Exception occured in:
余国荣走私珍贵动物制品一审刑事判决书 False
Exception occured in:
石贵荣、石玉开走私、贩卖、运输、制造毒品一审刑事判决书 False
Exception occured in:
冯栋彬犯走私珍贵动物、珍贵动物制品罪一审刑事判决书 False
Exception occured in:
瓦咪走私珍贵动物、珍贵动物制品一审刑事判决书 False
Exception occured in:
林玉辉受贿一审刑事判决书 False


KeyboardInterrupt: 

## Try extracting the case info from the hidden `input#hidCaseInfo` tag instead

In [47]:
def extract_doc_from_tag(_json, rowid):
    """Write the non-empty entries of ``_json`` to ``./docs/<rowid>.docx``.

    Each entry becomes one paragraph of the form ``"<key>: <value>"``, with
    HTML entities in the value unescaped. Entries whose value is ``None`` or
    the empty string are skipped.

    Args:
        _json: Mapping of field name to value (the caller passes the decoded
            JSON taken from the page's hidden input).
        rowid: Identifier used as the output file name.
    """
    doc = Document()
    for key, value in _json.items():
        if value is None or value == '':
            continue
        # html.unescape() only accepts str; decoded JSON may also contain
        # numbers, booleans or nested objects, which previously raised
        # TypeError and aborted the whole document.
        if not isinstance(value, str):
            value = str(value)
        doc.add_paragraph('{}: {}'.format(key, html.unescape(value)))
    doc.save('./docs/{}.docx'.format(rowid))


In [50]:
# Second attempt for the cases still not downloaded: read the JSON stored in
# the page's hidden input#hidCaseInfo and export its fields to a .docx file.
for index, row in df.iterrows():
    # `== False` is kept on purpose: the flag comes from pandas and may be a
    # numpy bool, for which an identity check (`is False`) would not match.
    if row['downloaded'] == False:
        time.sleep(2)
        try:
            browser.get('http://wenshu.court.gov.cn') # !! Important to avoid Error 500 caused by parent path error
            time.sleep(3)
            browser.get(row['url'])
            time.sleep(3)

            # Generic locator API; the find_element_by_* helpers are
            # deprecated and no longer exist in Selenium 4.
            info = browser.find_element(By.CSS_SELECTOR, 'input#hidCaseInfo')
            info_json = json.loads(info.get_attribute('value'))

            extract_doc_from_tag(info_json, index)
            df.loc[index, 'downloaded'] = True
        except Exception as exc:
            # Catch Exception rather than using a bare `except:` so that
            # Ctrl-C (KeyboardInterrupt) still stops the crawl, and report
            # the cause instead of hiding it.
            print('Exception occured in:', repr(exc))
    print(row['title'], df.loc[index, 'downloaded'])

郑秀怀走私珍贵动物、珍贵动物制品二审刑事裁定书 True
朱新潮受贿罪二审刑事裁定书 True
杨忠华、苏家华走私、贩卖、运输、制造毒品二审刑事裁定书 True
赵东国、赵信国走私、贩卖、运输、制造毒品二审刑事裁定书 True
陈某甲、魏某甲等犯走私珍贵动物、珍贵动物制品罪二审刑事判决书 True
Exception occured in:
杨恩强、杨恩大走私、贩卖、运输、制造毒品二审刑事裁定书 False
Exception occured in:
走私珍贵动物、珍贵动物制品二审刑事裁定书 False
Exception occured in:
陈洪波、赵志勇走私珍贵动物、珍贵动物制品二审刑事判决书 False
余国荣走私珍贵动物制品一审刑事判决书 False
Exception occured in:
石贵荣、石玉开走私、贩卖、运输、制造毒品一审刑事判决书 False
冯栋彬犯走私珍贵动物、珍贵动物制品罪一审刑事判决书 False
Exception occured in:
瓦咪走私珍贵动物、珍贵动物制品一审刑事判决书 False
Exception occured in:
林玉辉受贿一审刑事判决书 False


KeyboardInterrupt: 

In [63]:
# Display the rows whose `downloaded` flag is currently True.
df[df['downloaded']==True]

Unnamed: 0,casecourt,casenumber,judgedate,realid,stage,title,url,year,prosecution,downloaded
0,广东省高级人民法院,（2017）粤刑终67号,2017-03-08,cd9a2e03-feca-455a-8063-a82d011d12da,二审,郑秀怀走私珍贵动物、珍贵动物制品二审刑事裁定书,http://wenshu.court.gov.cn/content/content?Doc...,2017,刑事,True
1,浙江省高级人民法院,（2014）浙刑二终字第78号,2014-11-10,df9c3b1f-a0f9-4c6f-99aa-709f3fd6f87e,二审,朱新潮受贿罪二审刑事裁定书,http://wenshu.court.gov.cn/content/content?Doc...,2014,刑事,True
2,云南省高级人民法院,（2018）云刑终235号,2018-03-12,d0da7477-3229-449b-9b95-a91100f4c001,二审,杨忠华、苏家华走私、贩卖、运输、制造毒品二审刑事裁定书,http://wenshu.court.gov.cn/content/content?Doc...,2018,刑事,True
3,云南省高级人民法院,（2018）云刑终1128号,2018-11-19,75bfc4c4-f965-444a-a305-a9bb00bc13ef,二审,赵东国、赵信国走私、贩卖、运输、制造毒品二审刑事裁定书,http://wenshu.court.gov.cn/content/content?Doc...,2018,刑事,True


In [62]:
# browser.get('http://wenshu.court.gov.cn')
# browser.get(df.loc[10, 'url'])

In [64]:
# execute before logging out
# Write the table, including the updated `downloaded` flags, back to the CSV
# file it was loaded from, so a later session can skip the finished rows.
# 'utf_8_sig' is the UTF-8 codec that prepends a byte-order mark.
df.to_csv('case_urls_20190610.csv', encoding='utf_8_sig')