#### 1. Create a new project

In [1]:
!rm -rf rp_crawler
!scrapy startproject rp_crawler

New Scrapy project 'rp_crawler', using template directory 'c:\anaconda3\lib\site-packages\scrapy\templates\project', created in:
    C:\Code\크롤링 프로젝트\rp_crawler

You can start your first spider with:
    cd rp_crawler
    scrapy genspider example example.com


#### 2. writefile "items.py"

In [2]:
%%writefile rp_crawler/rp_crawler/items.py
import scrapy

class CrawlerItem(scrapy.Item):
    """Container for the data scraped from one IR report detail page."""
    # Company name text read from the detail page.
    name = scrapy.Field()
    # Date string (YYYY-MM-DD) for the day before the crawl ran.
    date = scrapy.Field()
    # Link found on the detail page, or the detail page URL when no link exists.
    ir_link = scrapy.Field()
    # FnGuide URL (SVD_Finance.asp) built from the company's "gicode".
    fs_link = scrapy.Field()

Overwriting rp_crawler/rp_crawler/items.py


#### 3. writefile "spiders/spider.py"

In [3]:
%%writefile rp_crawler/rp_crawler/spiders/spider.py
import scrapy
import datetime

from rp_crawler.items import CrawlerItem
from datetime import timedelta

class Spider(scrapy.Spider):
    """Scrape the IRGO mobile IR list for entries published one day ago.

    ``parse`` walks the list page and follows every entry whose relative-date
    label is exactly "1일전" ("1 day ago"); ``page_content`` turns each detail
    page into a ``CrawlerItem``.
    """

    name = "RPCrawler"
    # Scrapy only honours ``allowed_domains`` and expects bare domain names
    # (no scheme or path); sub-domains such as m.irgo.co.kr are covered.
    allowed_domains = ["irgo.co.kr"]
    start_urls = ["https://m.irgo.co.kr/IR%EC%9E%90%EB%A3%8C"]

    @staticmethod
    def _is_one_day_ago(label):
        """Return True when ``label`` ends with "1일전" but not "11일전", "21일전", ..."""
        # label[-4:-3] is the character right before the suffix ("" if none).
        return label.endswith("1일전") and not label[-4:-3].isdigit()

    def parse(self, response):
        """Yield a request for every list entry dated one day ago.

        Each row is inspected on its own so that the date label and the link
        always belong to the same entry, even when a row lacks one of them.
        """
        for row in response.xpath('//*[@id="irDataList"]/div'):
            label = row.xpath('./div[2]/span[3]/text()').extract_first()
            link = row.xpath('./@data-href').extract_first()
            if label and link and self._is_one_day_ago(label):
                # urljoin leaves absolute URLs untouched and resolves relative ones.
                yield scrapy.Request(response.urljoin(link), callback=self.page_content)

    def page_content(self, response):
        """Build a ``CrawlerItem`` from one detail page.

        Pages without a company name or company link are skipped with a
        warning. A missing IR link falls back to the detail page URL.
        """
        name = response.xpath('//*[@id="content"]/div[1]/div/dl/dd[1]/a/span[2]/text()').extract_first()
        company_href = response.xpath('//*[@id="content"]/div[1]/div/dl/dd[1]/a/@href').extract_first()
        if name is None or company_href is None:
            self.logger.warning("Skipping %s: company name or link not found", response.url)
            return

        ir_link = response.xpath('//*[@id="content"]/div[1]/div/dl/dd[3]/a/@href').extract_first()
        if ir_link is None:
            ir_link = response.url

        item = CrawlerItem()
        item["name"] = name
        # Entries selected by parse() were posted yesterday.
        item["date"] = (datetime.date.today() - timedelta(days=1)).isoformat()
        item["ir_link"] = ir_link
        # Characters 29..34 of the company href are used as the 6-character code
        # (presumably the stock code after a fixed-length URL prefix - verify
        # against the live page if the site layout changes).
        code = "A" + company_href[29:35]
        item["fs_link"] = "http://comp.fnguide.com/SVO2/ASP/SVD_Finance.asp?pGB=1&gicode={}".format(code)
        yield item

Writing rp_crawler/rp_crawler/spiders/spider.py


#### 4. writefile "mongodb.py"
- Replace ##.###.###.### with your own MongoDB server address.

In [4]:
%%writefile rp_crawler/rp_crawler/mongodb.py
import pymongo

# Shared MongoDB handles, created once when this module is imported.
# The host in the URI is a placeholder and must be replaced with a real address.
client = pymongo.MongoClient('mongodb://##.###.###.###:27017/')
# Database "ir_report" and, inside it, the "ir" collection used by the pipeline.
db = client["ir_report"]
collection = db["ir"]

Writing rp_crawler/rp_crawler/mongodb.py


#### 5. writefile "pipelines.py"

In [5]:
%%writefile rp_crawler/rp_crawler/pipelines.py
import datetime

from scrapy.exporters import CsvItemExporter
from datetime import timedelta
from .mongodb import collection

# Class for saving items to a csv file
class CsvPipeline(object):
    """Item pipeline that exports every scraped item to a dated CSV file.

    The output file is ``save_csv/ir_<YYYY-MM-DD>.csv`` relative to the current
    working directory, where the date is the day before the crawl runs.
    """

    def __init__(self):
        """Create the output directory if needed, open the file, start exporting."""
        import os  # local import: only needed to prepare the output directory

        yesterday = (datetime.datetime.now() - timedelta(days=1)).date().isoformat()
        path = "save_csv/ir_{}.csv".format(yesterday)
        # A freshly generated project has no save_csv/ directory, so create it
        # rather than letting open() fail with FileNotFoundError.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        self.file = open(path, "wb")
        self.exporter = CsvItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Finish the export and close the file, even if finishing raises."""
        try:
            self.exporter.finish_exporting()
        finally:
            self.file.close()

    def process_item(self, item, spider):
        """Write ``item`` to the CSV file and pass it on to the next pipeline."""
        self.exporter.export_item(item)
        return item

# Class to save items in Mongodb
class MongdbPipeline(object):
    """Item pipeline that stores each scraped item as a MongoDB document."""

    # Item fields copied into the stored document.
    _FIELDS = ("name", "date", "ir_link", "fs_link")

    def process_item(self, item, spider):
        """Insert the item's fields into the shared collection and return the item.

        Raises KeyError if the item lacks one of the expected fields.
        """
        document = {field: item[field] for field in self._FIELDS}
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        collection.insert_one(document)
        return item

Overwriting rp_crawler/rp_crawler/pipelines.py


#### 6. writefile run.sh (a script that runs the crawler)
- "run.sh" is stored in the folder where this Jupyter notebook file is located

In [13]:
%%writefile run.sh
#!/bin/bash
# Run the RPCrawler spider from inside the Scrapy project directory.
#
# The project directory is resolved relative to this script's own location
# instead of the caller's working directory, and the script stops if the
# directory cannot be entered so that scrapy never runs in the wrong place.
cd "$(dirname "${BASH_SOURCE[0]}")/rp_crawler" || exit 1
scrapy crawl RPCrawler

Writing run.sh


#### 7. Add execute permission on "run.sh" for all users

In [14]:
# Mark run.sh as executable so it can be launched directly (e.g. ./run.sh).
!chmod +x run.sh