# 天眼查企业信息收集
---

## 第一步，运行下面的代码即可

In [1]:
from pandas import read_csv, DataFrame
from time import sleep
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

# utilities
def get_page_soup():
    '''
    Parse the HTML currently loaded in the global `driver` and return it
    as a BeautifulSoup tree (built-in 'html.parser' backend).
    '''
    html = driver.page_source
    return bs(html, 'html.parser')


def get_company_list(path_to_company_csv):
    '''
    Load the company CSV and return the values of its '企业名称'
    (company name) column as a plain list, in file order.
    '''
    frame = read_csv(path_to_company_csv)
    names = frame['企业名称']

    return [name for name in names]


def click_button(CSS_Selector):
    '''
    Click the element matched by CSS_Selector.

    Blocks on the shared `wait` until the element is visible, pauses 1.5 s,
    looks the element up again and clicks it, then pauses another 1.5 s.
    Returns None.
    '''
    locator = (By.CSS_SELECTOR, CSS_Selector)
    wait.until(EC.visibility_of_element_located(locator))
    sleep(1.5)
    driver.find_element(*locator).click()
    # waiting time after clicking the button
    sleep(1.5)


def search(company_name):
    '''
    Search for a company: clear the page's search box, type company_name,
    press Return, then pause 2 s. Returns None.
    '''
    box = driver.find_element(By.CSS_SELECTOR, "input[class^='_']")
    box.clear()
    for keys in (company_name, Keys.RETURN):
        box.send_keys(keys)
    sleep(2)


def get_data(company_name):
    '''
    Scrape the company detail page currently shown in the browser.

    Waits until '.dim-group > div:nth-child(2)' is visible, pauses 1 s,
    parses the page and hands two of its tables to clean_data():
    the first as business registration info and the third as shareholder
    info. Returns the dict produced by clean_data().
    '''
    ready_marker = (By.CSS_SELECTOR, '.dim-group > div:nth-child(2)')
    wait.until(EC.visibility_of_element_located(ready_marker))
    sleep(1)

    # NOTE(review): `class_` is not part of the CSS selector string, so this
    # presumably matches every <table>; confirm before changing it, because
    # the indices below depend on the current result.
    tables = get_page_soup().select("table", class_='table-wrap')

    # tables[0]: business registration info, tables[2]: shareholder info
    return clean_data(company_name, tables[0], tables[2])


def clean_data(company_name, com, stock_holder):
    '''
    Flatten the raw business-info and shareholder tables into one dict.

    Parameters:
        company_name: name that was searched; stored under '企业名称'.
        com: parsed business-info table; only select_one() is used on it.
        stock_holder: parsed shareholder table; only select() is used on it.

    Returns:
        dict of field label -> text, in a fixed insertion order (this order
        becomes the CSV column order). A business field whose selector
        matches nothing is stored as the string 'None'.

    Raises:
        AttributeError / IndexError when a shareholder row lacks the expected
        name node or a third cell. This is deliberately not swallowed so a
        page-layout change is noticed instead of producing garbage.
    '''
    def get_txt(target, CSS_Selector):
        '''
        Return the text of the first match, or 'None' when nothing matches.
        '''
        try:
            return target.select_one(CSS_Selector).get_text()
        except AttributeError:
            # select_one() returned None: the field is absent on this page.
            # Only this case is tolerated; anything else (including Ctrl-C)
            # must propagate.
            return 'None'

    def remove_spaces_and_newlines(s):
        '''
        Strip every space and newline character from s.
        '''
        return s.replace(" ", "").replace("\n", "")

    # setting up result
    com_info = {}
    com_info['企业名称'] = company_name

    # business info
    # The selectors are kept as full literals (not assembled from parts) so
    # they can be compared with / replaced by selectors copied from the
    # browser's developer tools.
    com_info['法定代表人'] = get_txt(com, '.index_legal-representative__Kfdqv > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1)')
    com_info['统一社会信用代码'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(4) > td:nth-child(2) > div:nth-child(1) > span:nth-child(1)')
    com_info['组织机构代码'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(5) > td:nth-child(6) > div:nth-child(1) > span:nth-child(1)')
    com_info['企业类型'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(7) > td:nth-child(2)')
    # NOTE(review): this key ends with a space; kept as-is because it is the
    # column name already present in exported data.
    com_info['人员规模 '] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(7) > td:nth-child(6)')
    com_info['行业'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(7) > td:nth-child(4)')
    com_info['注册地址'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(9) > td:nth-child(4) > div:nth-child(1) > span:nth-child(1)')
    com_info['经营状态'] = get_txt(com, '.num-opening')
    com_info['工商注册号'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(5) > td:nth-child(2) > div:nth-child(1) > span:nth-child(1)')
    com_info['参保人数'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(8) > td:nth-child(2)')
    com_info['登记机关'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(9) > td:nth-child(2) > div:nth-child(1) > span:nth-child(1)')
    com_info['英文名'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(8) > td:nth-child(4) > div:nth-child(1) > span:nth-child(1)')
    com_info['成立日期'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(3) > td:nth-child(2)').strip()
    com_info['注册资本'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(4) > td:nth-child(4) > div:nth-child(1)')
    com_info['实缴资本'] = remove_spaces_and_newlines(get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(4) > td:nth-child(6)'))
    com_info['纳税人识别号'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(5) > td:nth-child(4) > div:nth-child(1) > span:nth-child(1)')
    com_info['纳税人资质'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(6) > td:nth-child(4)')
    com_info['核准日期'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(6) > td:nth-child(6)')

    # shareholders: "<name> <share> <name> <share> ..."
    # Query the rows once instead of once per iteration.
    rows = stock_holder.select('tr')
    pieces = []
    # rows[0] is skipped (presumably the header row).
    for row in rows[1:]:
        # The name keeps its raw text; only the share cell is whitespace-stripped.
        name = row.select_one('td:nth-child(2) > div > div:nth-child(2) > div > div').get_text()
        share = remove_spaces_and_newlines(row.select('td')[2].get_text())
        pieces.append(name)
        pieces.append(share)

    com_info['股东及持股比例'] = ' '.join(pieces).strip()
    com_info['经营范围'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(10) > td:nth-child(2) > div:nth-child(1) > span:nth-child(1)')

    return com_info

## 第二步，运行下面的代码；在弹出的天眼查网页中登录，然后在搜索框输入任意文字并按回车键

还需要修改下面公司文件的路径，改为你的专精特新名单文件或其子集。

In [2]:
# Initialization: start a Firefox session and open the Tianyancha home page.
# Log in manually in the opened window before starting the crawl.
print('Starting driver......')
driver = webdriver.Firefox()
driver.get('https://www.tianyancha.com/')

# Parameters
# CSV listing the companies to crawl; get_company_list() reads its
# '企业名称' column.
path_to_company_files = './南山区待爬取名单.csv'

# When True, the crawl loop asks for manual input on each detail page
# instead of scraping it.
is_checking = False

# Shared explicit wait used by the helpers: up to 10 seconds per condition.
wait = WebDriverWait(driver, 10)

Starting driver......


In [8]:
# Debugging aid: switch to the last window handle and keep the fourth matched
# <table> of that page in `stock_holder` for manual inspection.
driver.switch_to.window(driver.window_handles[-1])
stock_holder = get_page_soup().select("table", class_='table-wrap')[3]

In [None]:
def get_data(company_name):
    '''
    Scrape the company detail page currently shown in the browser.

    Waits until '.dim-group > div:nth-child(2)' is visible, pauses 1 s,
    parses the page and hands two of its tables to clean_data():
    the first as business registration info and the fourth as shareholder
    info. Returns the dict produced by clean_data().
    '''
    ready_marker = (By.CSS_SELECTOR, '.dim-group > div:nth-child(2)')
    wait.until(EC.visibility_of_element_located(ready_marker))
    sleep(1)

    # NOTE(review): `class_` is not part of the CSS selector string, so this
    # presumably matches every <table>; confirm before changing it, because
    # the indices below depend on the current result.
    tables = get_page_soup().select("table", class_='table-wrap')

    # tables[0]: business registration info, tables[3]: shareholder info
    return clean_data(company_name, tables[0], tables[3])


def clean_data(company_name, com, stock_holder):
    '''
    Build a reduced record: company name, shareholders and business scope.

    Parameters:
        company_name: name that was searched; stored under '企业名称'.
        com: parsed business-info table; only select_one() is used on it.
        stock_holder: parsed shareholder table; only select() is used on it.

    Returns:
        dict with the keys '企业名称', '股东及持股比例' and '经营范围', in that
        order. '经营范围' is the string 'None' when its selector matches
        nothing.

    Raises:
        AttributeError / IndexError when a shareholder row lacks the expected
        name node or a third cell. This is deliberately not swallowed so a
        page-layout change is noticed instead of producing garbage.
    '''
    def get_txt(target, CSS_Selector):
        '''
        Return the text of the first match, or 'None' when nothing matches.
        '''
        try:
            return target.select_one(CSS_Selector).get_text()
        except AttributeError:
            # select_one() returned None: the field is absent on this page.
            # Only this case is tolerated; anything else (including Ctrl-C)
            # must propagate.
            return 'None'

    def remove_spaces_and_newlines(s):
        '''
        Strip every space and newline character from s.
        '''
        return s.replace(" ", "").replace("\n", "")

    # setting up result
    com_info = {}
    com_info['企业名称'] = company_name

    # shareholders: "<name> <share> <name> <share> ..."
    # Query the rows once instead of once per iteration.
    rows = stock_holder.select('tr')
    pieces = []
    # rows[0] is skipped (presumably the header row).
    for row in rows[1:]:
        # The name keeps its raw text; only the share cell is whitespace-stripped.
        name = row.select_one('td:nth-child(2) > div > div:nth-child(2) > div > div').get_text()
        share = remove_spaces_and_newlines(row.select('td')[2].get_text())
        pieces.append(name)
        pieces.append(share)

    com_info['股东及持股比例'] = ' '.join(pieces).strip()
    com_info['经营范围'] = get_txt(com, '.index_tableBox__ZadJW > tbody:nth-child(1) > tr:nth-child(10) > td:nth-child(2) > div:nth-child(1) > span:nth-child(1)')

    return com_info

## 接下来就运行下面的代码开启爬虫

In [19]:
# get company list
company_list = get_company_list(path_to_company_files)
# total number of companies; used as the denominator of the progress display
task_length = len(company_list)

# prepare output: one dict per scraped company is appended here
company_data = []

In [37]:
# start collecting
# `a` collects the numbers typed in while is_checking is True.
a = []
# Companies whose detail page could not be scraped even after one retry;
# re-run the crawl on this list afterwards.
failed = []
# Index in company_list to resume from (earlier entries were already done).
start_index = 199
for company in company_list[start_index:]:

    # Work from the last window handle (the search page once detail tabs
    # have been closed).
    driver.switch_to.window(driver.window_handles[-1])

    # print out info
    print(f"==========Finished:\033[1;32m{round((len(company_data) / task_length) * 100, 2)}%\033[0m==========", end='\r')
    search(company)

    # click the first tr of the table
    click_button('.index_list-wrap___axcs > div > div > div:last-child > div:nth-child(2) > div > div')
    sleep(2)

    # switch to the new tab
    driver.switch_to.window(driver.window_handles[-1])
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.index_detail__JSmQM')))

    if is_checking:
        a.append(int(input('Checking....')))
        continue

    # collecting data: try once, wait 2 s and retry once more on failure.
    # `except Exception` (not a bare except) lets Ctrl-C stop the crawl.
    record = None
    for attempt in range(2):
        try:
            record = get_data(company)
            break
        except Exception:
            if attempt == 0:
                sleep(2)

    if record is None:
        # Keep going instead of aborting the whole crawl on one bad page.
        failed.append(company)
    else:
        company_data.append(record)

    # close page (exactly once, so a record can never be appended twice)
    driver.close()
    sleep(5)

if failed:
    print(f"\nFailed ({len(failed)}): {failed}")



## 前十个数据展示

In [11]:
# Notebook display of the first ten collected records.
company_data[:10]

[{'企业名称': '深圳市越疆科技股份有限公司',
  '法定代表人': '刘培超',
  '统一社会信用代码': '91440300349770526R',
  '组织机构代码': '34977052-6',
  '企业类型': '其他股份有限公司(非上市)',
  '人员规模 ': '-',
  '行业': '计算机、通信和其他电子设备制造业',
  '注册地址': '深圳市南山区桃源街道福光社区留仙大道3370号南山智园崇文园区2号楼1003',
  '经营状态': '存续',
  '工商注册号': '440307113525907',
  '参保人数': '-',
  '登记机关': '深圳市市场监督管理局',
  '英文名': 'None',
  '成立日期': '2015-07-30',
  '注册资本': '36000万人民币',
  '实缴资本': '953.8472万人民币',
  '纳税人识别号': '91440300349770526R',
  '纳税人资质': '一般纳税人',
  '核准日期': '2023-11-29',
  '股东及持股比例': '刘培超 26.6242% 深圳市松禾成长股权投资合伙企业（有限合伙） 6.0272% 前海股权投资基金（有限合伙） 5.4368% 中金祺智（上海）股权投资中心（有限合伙） 4.4913% 深圳市鲁墨咨询合伙企业（有限合伙） 4.1381% 宁波梅山保税港区同伴投资管理合伙企业（有限合伙） 3.8539% 中国互联网投资基金（有限合伙） 3.6818% 深圳市齐墨投资合伙企业（有限合伙） 3.6003% 深圳市越疆投资合伙企业（有限合伙） 3.5% 深圳市楚墨咨询合伙企业（有限合伙） 3.2316% 深圳市创新投资集团有限公司 2.8758% 温润振信壹号（珠海）股权投资基金合伙企业（有限合伙） 2.7531% 中车（青岛）科技创新创业股权投资合伙企业（有限合伙） 2.706% 深圳市南山红土股权投资基金合伙企业（有限合伙） 2.2941% 吴志文 2.2134% 郎需林 2.2134% 深圳群达科技有限合伙企业（有限合伙） 1.8674% 无锡产发服务贸易投资基金合伙企业（有限合伙） 1.6842% 无锡云晖物联网投资管理合伙企业（有限合伙） 1.6842% 深圳千帆企航壹号私募股权投资基

## 存储为csv

注意修改路径。

In [42]:
# saving
# Writes all collected records to 南山区.csv, relative to the current working
# directory (normally the folder of this notebook), without the index column.
DataFrame(company_data).to_csv('./南山区.csv', index=False)