In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
@Author: Shuyue Jia
@Date: Aug 20, 2020
"""

# Import necessary packages
import os
import ssl
import json
import time
import requests
import numpy as np
import pandas as pd
from random import randint
import urllib3
import threading

# Suppress urllib3 warnings; the HTTP requests in this file are sent with
# verify=False, which would otherwise produce a warning for each request
urllib3.disable_warnings()

# Make the standard library's default HTTPS context skip certificate
# verification (replaces the verified context factory with the unverified one)
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
def read_url(PageNumber: str) -> str:
    """
    Read URL and get corresponding contents

    The request is sent with a randomly chosen User-Agent and retried up to
    10 times, sleeping one second between attempts. If every attempt fails,
    an empty string is returned instead of raising, so that the caller can
    keep processing the remaining pages.

    :param PageNumber: The page number of the NSTL data
    :return contents: The contents of the website ('' if all attempts failed)
    """
    # Search-result listing URL; the requested page number is appended at the end
    url = 'https://www.nstl.gov.cn/execute?target=nstl4.search4&function=paper/pc/list/pl&query=%7B%22c%22%3A10%2C%22st%22%3A%220%22%2C%22f%22%3A%5B%5D%2C%22p%22%3A%22%22%2C%22q%22%3A%5B%7B%22k%22%3A%22%22%2C%22v%22%3A%22%22%2C%22e%22%3A1%2C%22es%22%3A%7B%7D%2C%22o%22%3A%22AND%22%2C%22a%22%3A0%7D%5D%2C%22op%22%3A%22AND%22%2C%22s%22%3A%5B%22yea%3Adesc%22%5D%2C%22t%22%3A%5B%22Concept%22%5D%7D&sl=2&pageSize=10&pageNumber=' + PageNumber

    # Pool of browser User-Agent strings; one is picked at random per call
    # to make the crawler look less uniform to anti-crawler checks
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    ]

    random_agent = USER_AGENTS[randint(0, len(USER_AGENTS) - 1)]
    headers = {
        'User-Agent': random_agent,
    }

    max_attempts = 10

    # Default result: guarantees `contents` is bound even when every attempt
    # fails (previously this path raised UnboundLocalError at the return)
    contents = ''

    # Retry to ride out refused connections / timeouts from the remote server
    for attempt in range(max_attempts):
        try:
            res = requests.get(url, headers=headers, verify=False, timeout=(10, 10))
            contents = res.text
        except requests.exceptions.RequestException as e:
            if attempt >= max_attempts - 1:
                # Out of retries: report the failure and fall through to
                # return the empty default
                print('The exception has happened', '-' * 100)
                print('Page %s failed after %d attempts: %r' % (PageNumber, max_attempts, e))
            else:
                time.sleep(1)
        else:
            # Success: pause briefly to throttle the crawl, then stop retrying
            time.sleep(1)
            break

    return contents

In [None]:
def getID(content: str):
    """
    Collect every ID that appears in a page's raw text.

    An ID is the 10 characters that follow each occurrence of the
    '{"f":"id","v":"' prefix when the value starts with 'C0'
    (typically a page holds 10 of them).
    :param content: The contents of the website
    :return IDs: All the IDs in this page, in order of appearance
    """
    marker = '{"f":"id","v":"C0'
    # The ID itself begins right after the quoted-key prefix (the 'C0' is part of it)
    prefix_len = len('{"f":"id","v":"')
    id_len = 10

    found = []
    hit = content.find(marker)
    while hit != -1:
        begin = hit + prefix_len
        stop = begin + id_len
        found.append(content[begin:stop])

        # Resume scanning just past the ID that was extracted
        hit = content.find(marker, stop)

    return found

In [None]:
def save_CSV(csv_path: str, ID: list):
    """
    Write the IDs to a CSV file, one ID per row, without header or index
    :param csv_path: The path and name of the CSV file
    :param ID: The IDs to be saved
    """
    id_array = np.asarray(ID)

    # Shape the values into a single column: one row per ID
    column = id_array.reshape((id_array.shape[0], 1))
    pd.DataFrame(column).to_csv(csv_path, sep=',', index=False, header=None)

In [None]:
def run_code(start_page: int, end_page: int):
    """
    Worker routine for one thread: crawl a range of pages and save their IDs
    :param start_page: The first page number to process (inclusive)
    :param end_page: The page number to stop at (exclusive)
    """
    collected = []

    for page in range(start_page, end_page):
        # Download the page, then pull every ID out of its text
        page_text = read_url(PageNumber=str(page))
        collected.extend(getID(content=page_text))

        print('%s Page has been processed and saved!' % page)

    # The file name carries the first and last page numbers actually processed
    csv_name = '%s_%s_pages.csv' % (str(start_page), str(end_page - 1))
    save_CSV(csv_path=csv_name, ID=collected)
    print('%s has been successfully saved!' % csv_name)

In [None]:
# The main function
if __name__ == '__main__':
    # Website: https://www.nstl.gov.cn/stkos.html?t=Concept&q=
    # The search reports 614959 records, shown 10 per page, 61496 pages in total
    # To save time, we use a single process with multiple threads
    # The 61496 pages are cut into 7 parts; each (start, end) pair is passed
    # to run_code, which processes pages start .. end - 1
    page_ranges = [
        (1, 10000),
        (10000, 20000),
        (20000, 30000),
        (30000, 40000),
        (40000, 50000),
        (50000, 60000),
        (60000, 61497),
    ]

    # One worker thread per page range, kept in `threadl` for later inspection
    threadl = [
        threading.Thread(target=run_code, args=page_range)
        for page_range in page_ranges
    ]

    # The threads are started but not joined here, so this block returns
    # immediately while the workers keep running
    for task in threadl:
        task.start()