# 企查查企业信息收集

---

> 创建人 ： 王佳何

**环境配置：请运行下列代码确保selenium,bs4安装。**

In [None]:
!pip install selenium bs4

## 第一步，导入库

直接运行即可

In [13]:
from pandas import read_csv, DataFrame
from time import sleep
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

## 第二步，定义将要使用的函数

In [2]:
# utilities

def get_page_soup():
    '''
    Parse the page currently loaded in the global ``driver``.

    Returns the BeautifulSoup tree built from ``driver.page_source`` with the
    'html.parser' backend.
    '''
    html = driver.page_source
    soup = bs(html, 'html.parser')
    return soup


def get_company_list(path_to_company_csv):
    '''
    Load the company CSV and return the values of its '企业名称' column.

    path_to_company_csv : path of a CSV file readable by pandas.read_csv.
    Returns a plain list with one entry per row of the file.
    '''
    name_column = read_csv(path_to_company_csv)['企业名称']
    return name_column.tolist()


def click_button(CSS_Selector):
    '''
    Click the element matched by a CSS selector.

    CSS_Selector : CSS selector string of the element to click.

    Uses the global ``wait`` (a WebDriverWait) to block until the element is
    visible, pauses one second, then re-acquires the element through a second
    explicit wait before clicking.  ``wait.until`` raises Selenium's timeout
    error when the element never becomes visible / clickable.
    '''
    locator = (By.CSS_SELECTOR, CSS_Selector)

    wait.until(EC.visibility_of_element_located(locator))
    sleep(1)
    # Re-acquire the element through the wait instead of a bare find_element:
    # the lookup after the pause is then guarded as well, and the click is only
    # attempted once the element is reported clickable.
    button = wait.until(EC.element_to_be_clickable(locator))
    button.click()
    # waiting time after the click
    sleep(3)


def search(company_name):
    '''
    Submit *company_name* through the page's search box.

    The box is located by the CSS selector '#searchKey' on the page currently
    shown by the global ``driver``; it is emptied, filled with the name and
    submitted with the RETURN key.  The function then pauses three seconds.
    '''
    box = driver.find_element(By.CSS_SELECTOR, '#searchKey')
    box.clear()
    # Two separate send_keys calls: first the text, then the RETURN key.
    for keys in (company_name, Keys.RETURN):
        box.send_keys(keys)
    sleep(3)


def get_data(company_name):
    '''
    Scrape the three data sections of the company page currently loaded.

    company_name : name stored alongside the scraped values.
    Returns the three dictionaries produced by clean_data, in the order
    (business registration, shareholders, key personnel).
    '''
    soup = get_page_soup()

    raw_sections = (
        # business registration section
        soup.select_one('#cominfo'),
        # shareholder table
        soup.select_one('.app-tree-table > table:nth-child(1)'),
        # key personnel table
        soup.select_one('#mainmember > div:nth-child(2) > div:nth-child(2) > table:nth-child(1)'),
    )

    # turn the raw nodes into flat dictionaries
    info, holders, staff = clean_data(company_name, *raw_sections)

    sleep(3)

    return info, holders, staff


# Placeholder stored when a field or cell cannot be found on the page.
_MISSING_TEXT = 'None'

# Selector (relative to a table row) of the element holding a person's or
# shareholder's name inside the second cell.
_NAME_CELL_SELECTOR = 'td:nth-child(2) > div > span:nth-child(2) > span'

# Common prefix of most selectors used for the business registration table.
_INFO_TABLE = '.cominfo-normal > table:nth-child(1)'


def _strip_whitespace(text):
    '''
    Return *text* with every space and newline character removed.
    '''
    return text.replace(" ", "").replace("\n", "")


def _select_text(node, css_selector):
    '''
    Return the text of the first element under *node* matching *css_selector*.

    The placeholder 'None' is returned when *node* is None or nothing matches,
    so that one missing field does not abort the whole scrape.
    '''
    try:
        return node.select_one(css_selector).text
    except AttributeError:
        # node is None, or select_one() returned None because nothing matched.
        return _MISSING_TEXT


# (output key, CSS selector, optional post-processing function), in the order
# the keys are inserted into the result dictionary.
# NOTE: the key '人员规模 ' keeps its trailing space so that the produced
# column names stay exactly as before.
_BUSINESS_FIELDS = (
    ('统一社会信用代码', f'{_INFO_TABLE} > tr:nth-child(1) > td:nth-child(2) > span:nth-child(1) > span:nth-child(1)', None),
    ('法定代表人', 'span:nth-child(1) > a:nth-child(1)', None),
    ('组织机构代码', f'{_INFO_TABLE} > tr:nth-child(4) > td:nth-child(2) > span:nth-child(1) > span:nth-child(1)', None),
    ('企业类型', 'tr:nth-child(5) > td:nth-child(2)', None),
    ('人员规模 ', 'tr:nth-child(6) > td:nth-child(2)', None),
    ('所属地区', 'table:nth-child(1) > tr:nth-child(7) > td:nth-child(2) > span:nth-child(1) > span:nth-child(1)', None),
    ('国标行业', '.gb-wrapper > span:nth-child(1)', None),
    ('注册地址', f'{_INFO_TABLE} > tr:nth-child(9) > td:nth-child(2) > span:nth-child(1) > span:nth-child(1) > a:nth-child(1)', None),
    ('登记状态', f'{_INFO_TABLE} > tr:nth-child(2) > td:nth-child(4)', None),
    ('注册资本', f'{_INFO_TABLE} > tr:nth-child(3) > td:nth-child(2)', None),
    ('工商注册号', f'{_INFO_TABLE} > tr:nth-child(4) > td:nth-child(4) > span:nth-child(1) > span:nth-child(1)', None),
    ('营业期限', f'{_INFO_TABLE} > tr:nth-child(5) > td:nth-child(4)', _strip_whitespace),
    ('参保人数', f'{_INFO_TABLE} > tr:nth-child(6) > td:nth-child(4) > span:nth-child(1)', None),
    ('登记机关', f'{_INFO_TABLE} > tr:nth-child(7) > td:nth-child(4) > span:nth-child(1) > span:nth-child(1)', None),
    ('英文名', f'{_INFO_TABLE} > tr:nth-child(8) > td:nth-child(4) > span:nth-child(1) > span:nth-child(1) > span:nth-child(1)', None),
    ('成立日期', f'{_INFO_TABLE} > tr:nth-child(2) > td:nth-child(6) > span:nth-child(1) > span:nth-child(1)', str.strip),
    ('实缴资本', f'{_INFO_TABLE} > tr:nth-child(3) > td:nth-child(4)', _strip_whitespace),
    ('纳税人识别号', f'{_INFO_TABLE} > tr:nth-child(4) > td:nth-child(6) > span:nth-child(1) > span:nth-child(1)', None),
    ('纳税人资质', f'{_INFO_TABLE} > tr:nth-child(5) > td:nth-child(6)', None),
)


def _flatten_table(table, name_template, field_template):
    '''
    Flatten an HTML table into ``{key: text}`` pairs, one per data cell.

    The first ``tr`` supplies the column headers (its ``th`` cells); every
    later ``tr`` is one record, numbered from 1.  Column 0 is skipped.
    Column 1 is read through _NAME_CELL_SELECTOR and stored under
    ``name_template``; the remaining columns store the whitespace-stripped
    cell text under ``field_template``.  Both templates are expanded with
    ``str.format(row=<record number>, header=<column header>)``.

    Returns an empty dict when *table* is None or contains no rows.
    '''
    flat = {}
    if table is None:
        return flat

    # Query the rows once instead of once per loop iteration.
    rows = table.select('tr')
    if not rows:
        return flat

    headers = [_strip_whitespace(th.get_text()) for th in rows[0].select('th')]

    for row_no, row in enumerate(rows[1:], start=1):
        cells = [_strip_whitespace(td.get_text()) for td in row.select('td')]
        if not cells:
            # A row without data cells carries nothing to record.
            continue

        for col, header in enumerate(headers[1:], start=1):
            if col == 1:
                name_node = row.select_one(_NAME_CELL_SELECTOR)
                if name_node is not None:
                    value = name_node.get_text()
                else:
                    # Nested name element not found: fall back to the text of
                    # the whole cell rather than failing.
                    value = cells[1] if len(cells) > 1 else _MISSING_TEXT
                key = name_template.format(row=row_no, header=header)
            else:
                # Rows may hold fewer cells than there are headers.
                value = cells[col] if col < len(cells) else _MISSING_TEXT
                key = field_template.format(row=row_no, header=header)
            flat[key] = value

    return flat


def clean_data(company_name, business, stock_holder, main):
    '''
    Convert the raw page sections of one company into three flat dictionaries.

    company_name : stored under the key '企业名称' in every returned dict.
    business     : node of the business registration section, or None.
    stock_holder : node of the shareholder table, or None.
    main         : node of the key personnel table, or None.

    Returns ``(com_info, sh, man)``:
      * com_info - one entry per field listed in _BUSINESS_FIELDS; a field that
        cannot be found holds the string 'None'.
      * sh       - shareholder table flattened to '<header><n>' (name column)
        and '股东<n><header>' (other columns) keys.
      * man      - key personnel table flattened to '主要人员<n><header>' keys.

    A section that is None contributes only the '企业名称' entry instead of
    raising, so a company page lacking a table does not stop the crawl.
    '''
    # business registration
    com_info = {'企业名称': company_name}
    for key, selector, cleaner in _BUSINESS_FIELDS:
        text = _select_text(business, selector)
        com_info[key] = text if cleaner is None else cleaner(text)

    # shareholders
    sh = {'企业名称': company_name}
    sh.update(_flatten_table(stock_holder, '{header}{row}', '股东{row}{header}'))

    # key personnel
    man = {'企业名称': company_name}
    man.update(_flatten_table(main, '主要人员{row}{header}', '主要人员{row}{header}'))

    return com_info, sh, man

## 测试
以下的代码仅用于测试环境能否正常运行。

运行后会弹出firefox浏览器，并转到中国知网

> 这里firefox不是必须的，如果你有chrome或者ie或者edge都可以，只要修改下面的```driver = webdriver.Firefox()```为```driver = webdriver.Edge()```或你想用的浏览器即可。

在这个过程中你完全可以移动鼠标，点击其他的东西，或者将firefox隐藏到后台，不会影响程序进行。

然后三秒后关闭浏览器。

运行结束后，下面的代码单元左侧的 ```In [*]``` 会变成 ```In [3]```（3 也可能是其他数字）。

```In [*]```代表这个代码单元正在运行。

In [None]:
# testing
# Smoke test: open a browser, load one page, wait three seconds, close it.
driver = webdriver.Firefox()
try:
    driver.get('https://chn.oversea.cnki.net/index/')

    sleep(3)
finally:
    # Quit even when loading the page fails, so no browser is left running.
    driver.quit()

## 第三步，初始化

如果上面的代码按照预期的运行，那么可以开始正式的爬取数据。

- 运行下面的代码，会弹出企查查

- 这时候需要你手动完成下面任务：
 
  - 扫码登录
  
- 当完成上述所有后，你回到这里，在下面弹出的输入框输入任何东西即可。
(这里的输入其实毫无意义，我这里加个用户输入只是为了提醒你完成所有任务)

In [3]:
# Initialization....

# Parameters: CSV file that lists the companies to look up
path_to_company_files = '/home/nolan/Downloads/专精特新小巨人.csv'

# open the browser on the qcc.com home page
print('Starting driver......')
driver = webdriver.Firefox()
driver.get('https://www.qcc.com/')

# explicit-wait helper shared by click_button (timeout argument: 10)
wait = WebDriverWait(driver, 10)

# block here until the user confirms the manual login is done;
# the typed value itself is not used
input("Just input anything : ");

Starting driver......
Just input anything : 12


## 第四步，开始下载

下面的代码会执行以下操作：

- 对于公司列表中的每一个公司名称：
  - 搜索公司名称
  - 点击搜索结果的第一个
  - 获取数据
  - 关闭弹出的页面

**请注意** ： 
> 企查查非会员用户在搜索45次后会禁止搜索，进入1小时冷却。

In [7]:
# names of the companies to search for, read from the CSV file
company_list = get_company_list(path_to_company_files)

# one result list per output file; each collected company appends one dict
business = []
stock_holder = []
main = []

  company_data = read_csv(path_to_company_csv)


In [8]:
# start collecting
# Index of the first company to process; raise it to resume an interrupted run
# without repeating companies that were already collected.
START_INDEX = 45

for company in company_list[START_INDEX:]:

    # work from the most recently opened window (the search page)
    driver.switch_to.window(driver.window_handles[-1])
    search_window = driver.current_window_handle

    # print out info
    print(f'searching for : \033[1;32m{company}\033[0m')
    search(company)
    sleep(5)

    # click the first tr of the table
    click_button('.app-ltable > tr:nth-child(1) > td:nth-child(3) > div > span > span > a')
    sleep(5)

    # switch to the new tab
    print('switch tab to the company page.')
    driver.switch_to.window(driver.window_handles[-1])

    try:
        # collecting data (get_data parses the page itself)
        print('Start collecting data......')
        sleep(3)
        company_info, shareholders, key_staff = get_data(company)

        # save
        business.append(company_info)
        stock_holder.append(shareholders)
        main.append(key_staff)
        print(f'\033[1;32m{company}\033[0m\'data Successfully collected!')
    finally:
        # close page: only when a separate company tab is focused, so the
        # search window is never closed by accident, and also when collecting
        # failed, so the tab does not stay open.
        if driver.current_window_handle != search_window:
            driver.close()

    print('')
    sleep(15)

searching for : [1;32m杭州趣链科技有限公司[0m
switch tab to the company page.
Start collecting data......
[1;32m杭州趣链科技有限公司[0m'data Successfully collected!

searching for : [1;32m福建魔方电子科技有限公司[0m
switch tab to the company page.
Start collecting data......
[1;32m福建魔方电子科技有限公司[0m'data Successfully collected!

searching for : [1;32m南方环境有限公司[0m
switch tab to the company page.
Start collecting data......
[1;32m南方环境有限公司[0m'data Successfully collected!

searching for : [1;32m深圳大普微电子股份有限公司[0m
switch tab to the company page.
Start collecting data......
[1;32m深圳大普微电子股份有限公司[0m'data Successfully collected!

searching for : [1;32m深圳半岛医疗集团股份有限公司[0m
switch tab to the company page.
Start collecting data......
[1;32m深圳半岛医疗集团股份有限公司[0m'data Successfully collected!

searching for : [1;32m欧冶云商股份有限公司[0m
switch tab to the company page.
Start collecting data......
[1;32m欧冶云商股份有限公司[0m'data Successfully collected!

searching for : [1;32m永臻科技股份有限公司[0m
switch tab to the company page.
Start collecting

KeyboardInterrupt: 

## 存储

这里工商信息，股东信息，主要人员分了三个文件存储。

注意修改路径。

In [12]:
# saving: write each result list to its own CSV file
for records, csv_path in (
    (business, '/home/nolan/Downloads/工商信息.csv'),
    (stock_holder, '/home/nolan/Downloads/股东信息.csv'),
    (main, '/home/nolan/Downloads/主要人员.csv'),
):
    DataFrame(records).to_csv(csv_path)