# WebCrawler Test
아주 간단한 WebCralwer 를 테스트 해보았다. 간단하게 WikiPedia 한글 사이트를 처음으로 시작해서 해당 사이트에 존재하는 "P" 태그를 수집하고 해당 페이지에서 존재하는 Link를 찾아서 이동하고, 해당 페이지에서 "P" 태그를 찾아서 저장하는 행위를 반복하는 코드이다.  

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import os

# BeautifulSoup 간단하게 연결해 보기

In [15]:
def crawler(iter) : 
    url = "http://naver.com"
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text , 'lxml')
    for raw in soup.find_all('title') : 
        print(raw)
crawler(1)

<title>NAVER</title>


# Wikipedia 한글 페이지 Link를 따라가면서 데이터 수집

In [16]:
def task(page, max_pages, url_path, file_w):
    """
    지정된 수만큼 제귀 형태로 모든 링크를 따라가서 전부 수집한다. 
    """
    if page == max_pages :
        get_single_article(url_path, file_w)
    else : 
        source_code = requests.get(url_path)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'lxml')
        page += 1
        for link in soup.find_all('a'):
            href = link.get('href')
            if (href != None and re.search("https://ko", href)) : 
                print("href : {0}".format(href))
                task(page, max_pages, href, file_w)

def get_single_article(item_url, file_w):
    """
    p 태그를 가지고와서 파싱한다 
    """
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'lxml')
    for contents in soup.find_all('p'):
        #print("url : {0} ".format(item_url))
        #print("text : {0} ".format(contents.text))
        file_w.write(contents.text)

def spider(max_pages, url_path, path = "/home/dev/wiki/", file_name='test.txt') :
    """
    본 Function 을 실행하면 WikiPedia 첫 페이지에서 실행해서 
    지정된 횟수만큼 페이지를 따라 들어가서 정해진 패턴을 수집한다. 
    max_pages : 몇번 Page를 따라 들어갈 것인가를 정의하는 변수 
    """
    if not os.path.exists(path):
        os.makedirs(path)
    file_w = open(''.join([path, file_name]), "w")  
    print("# Job Start!!")
    task(1, max_pages, url_path, file_w)
    print("# Job Done!!")
    file_w.close()

# 주어진 횟수만큼 해당 사이트를 시작으로 크롤링 시작 
# first parm : Inception 횟수 
# second parm : initial site 
spider(2, 'https://ko.wikipedia.org/wiki/')

# Job Start!!
href : https://ko.wiktionary.org/wiki/
href : https://ko.wiktionary.org/wiki/
href : https://ko.wikinews.org/wiki/
href : https://ko.wikinews.org/wiki/
href : https://ko.wikisource.org/wiki/
href : https://ko.wikisource.org/wiki/
href : https://ko.wikiversity.org/wiki/
href : https://ko.wikiversity.org/wiki/
href : https://ko.wikivoyage.org/wiki/
href : https://ko.wikivoyage.org/wiki/%EB%8C%80%EB%AC%B8
href : https://ko.wikiquote.org/wiki/
href : https://ko.wikiquote.org/wiki/
href : https://ko.wikibooks.org/wiki/
href : https://ko.wikibooks.org/wiki/
href : https://ko.wikipedia.org/w/index.php?title=위키백과:대문&oldid=15252069
href : https://ko.wikibooks.org/wiki/%EC%9C%84%ED%82%A4%EC%B1%85:%EB%8C%80%EB%AC%B8
href : https://ko.wikinews.org/wiki/%EC%9C%84%ED%82%A4%EB%89%B4%EC%8A%A4:%EB%8C%80%EB%AC%B8
href : https://ko.wikiquote.org/wiki/%EC%9C%84%ED%82%A4%EC%9D%B8%EC%9A%A9%EC%A7%91:%EB%93%A4%EB%A8%B8%EB%A6%AC
href : https://ko.wikisource.org/wiki/%EC%9C%84%ED%82%A4%EB%AC%B8%ED

# 간단하게 Gensim Word2Vec 에 수집한 데이터 훈련
Word2Vec 훈련 및 서비스는 Djnago REST Service 기반으로 구현해 보자 

In [12]:
import requests
import json, os

url = "{0}:{1}".format(os.environ['HOSTNAME'] , "8000")
nn_id = "nn123"
nn_wf_ver_id ="1"

# Seq - 1
resp = requests.post('http://' + url + '/api/v1/type/common/target/nninfo/nnid/' + nn_id + '/',
                     json={
                         "biz_cate": "MES",
                         "biz_sub_cate": "M60",
                         "nn_title" : "test",
                         "nn_desc": "test desc",
                         "use_flag" : "Y",
                         "dir": "purpose?",
                         "config": "N"
                     })
data = json.loads(resp.json())
print("1.evaluation result : {0}".format(data))

# Seq - 2
resp = requests.post('http://' + url + '/api/v1/type/common/target/nninfo/nnid/' + nn_id + '/version/',
                 json={
                     "nn_def_list_info_nn_id": "",
                     "nn_wf_ver_info": "test version info",
                     "condition": "1",
                     "active_flag": "Y"
                 })
data = json.loads(resp.json())
print("2.evaluation result : {0}".format(data))

# Seq - 3
resp = requests.post('http://' + url + '/api/v1/type/wf/target/init/mode/simple/'+ nn_id + '/wfver/1/',
                     json={
                         "type": "word2vec"
                     })
data = json.loads(resp.json())
print("3.evaluation result : {0}".format(data))

# Seq - 4
return_dict = {}
return_dict['test'] = open('/home/dev/wiki/test.txt', 'rb')

resp = requests.post('http://' + url + '/api/v1/type/wf/state/textdata/src/local/form/raw/prg/source/nnid/'+nn_id+'/ver/1/node/data_node/',
                     files = return_dict)

data = json.loads(resp.json())
print("4.evaluation result : {0}".format(data))

# Seq - 5
resp = requests.put('http://' + url + '/api/v1/type/wf/state/textdata/src/local/form/raw/prg/source/nnid/'+ nn_id + '/ver/1/node/data_node/',
                     json={
                         "source_server": "local",
                         "source_sql": "all",
                         "max_sentence_len" : 10
                     })
data = json.loads(resp.json())
print("5.evaluation result : {0}".format(data))

# Seq - 6
resp = requests.put('http://' + url + '/api/v1/type/wf/state/textdata/src/local/form/raw/prg/pre/nnid/'+ nn_id + '/ver/1/node/data_node/',
                     json={
                         "preprocess":  "mecab",
                     })
data = json.loads(resp.json())
print("6.evaluation result : {0}".format(data))

# Seq - 7
resp = requests.put('http://' + url + '/api/v1/type/wf/state/textdata/src/local/form/raw/prg/store/nnid/'+ nn_id + '/ver/1/node/data_node/')
data = json.loads(resp.json())
print("7.evaluation result : {0}".format(data))

# Seq - 8
resp = requests.get('http://' + url + '/api/v1/type/wf/state/textdata/src/local/form/raw/prg/source/nnid/'+ nn_id + '/ver/1/node/data_node/')
data = json.loads(resp.json())
print("8.evaluation result : {0}".format(data))

# Seq - 9
resp = requests.put('http://' + url + '/api/v1/type/wf/state/netconf/detail/w2v/nnid/' + nn_id + '/ver/' + nn_wf_ver_id + '/node/netconf_node/',
                     json={
                        "model_path" : "test",
                        "window_size" : 5,
                        "vector_size" : 100,
                        "batch_size" : 100,
                        "iter" : 5,
                        "min_count" : 1
                     })
data = json.loads(resp.json())
print("9.evaluation result : {0}".format(data))

# Seq - 10
return_dict = {}
return_dict['test'] = open('/home/dev/wiki/test.txt', 'rb')
resp = requests.post('http://' + url + '/api/v1/type/wf/state/textdata/src/local/form/raw/prg/source/nnid/'+ nn_id + '/ver/1/node/test_data_node/',
                     files = return_dict)
data = json.loads(resp.json())
print("10.evaluation result : {0}".format(data))

# Seq - 11
resp = requests.put('http://' + url + '/api/v1/type/wf/state/textdata/src/local/form/raw/prg/source/nnid/'+ nn_id + '/ver/1/node/test_data_node/',
                     json={
                         "source_server": "local",
                         "source_sql": "all",
                         "max_sentence_len" : 50
                     })
data = json.loads(resp.json())
print("11.evaluation result : {0}".format(data))

# Seq - 12
resp = requests.put('http://' + url + '/api/v1/type/wf/state/textdata/src/local/form/raw/prg/pre/nnid/'+ nn_id + '/ver/1/node/test_data_node/',
                     json={
                         "preprocess":  "mecab",
                     })
data = json.loads(resp.json())
print("12.evaluation result : {0}".format(data))

# Seq - 13
resp = requests.put('http://' + url + '/api/v1/type/wf/state/textdata/src/local/form/raw/prg/store/nnid/'+ nn_id + '/ver/1/node/test_data_node/')
data = json.loads(resp.json())
print("13.evaluation result : {0}".format(data))

# Seq - 14
resp = requests.post('http://' + url + '/api/v1/type/runmanager/state/train/nnid/'+nn_id+'/ver/'+nn_wf_ver_id+'/')
data = json.loads(resp.json())
print("14.evaluation result : {0}".format(data))


evaluation result : nn123
evaluation result : nn123
evaluation result : word2vec
evaluation result : ['1 file upload success']
evaluation result : {'source_server': 'local', 'source_sql': 'all', 'source_type': 'local', 'source_parse_type': 'raw', 'source_path': '/hoya_src_root/nn123/1/data_node', 'max_sentence_len': 10}
evaluation result : mecab
evaluation result : /hoya_str_root/nn123/1/data_node
evaluation result : {'store_path': '/hoya_str_root/nn123/1/data_node', 'source_server': 'local', 'preprocess': 'mecab', 'source_sql': 'all', 'source_type': 'local', 'source_parse_type': 'raw', 'source_path': '/hoya_src_root/nn123/1/data_node', 'max_sentence_len': 10}
evaluation result : {'batch_size': 100, 'iter': 5, 'window_size': 5, 'model_path': '/hoya_model_root/nn123/1/netconf_node', 'min_count': 1, 'vector_size': 100}
evaluation result : ['1 file upload success']
evaluation result : {'source_server': 'local', 'source_sql': 'all', 'source_type': 'local', 'source_parse_type': 'raw', 'sour

In [14]:
# Run All Workflow
resp = requests.post('http://' + url + '/api/v1/type/service/state/predict/type/w2v/nnid/' + nn_id + '/ver/active/',
                     json={
                         "type": "sim",
                         "val_1":["포털"],
                         "val_2":["윤리"]
                     }
                     )
data = json.loads(resp.json())
print("evaluation result : {0}".format(data))

evaluation result : [[['침해/NNG', 0.02471858263015747], ['사전/NNG', 0.022121289744973183], ['도울/VV+ETM', 0.015608951449394226], ['모두/NNG', 0.015334065072238445], ['세요/EP+EF', 0.015333017334342003]]]
