# DC Inside 만갤 개념글 다운로드
---

> 주의 : 국내에 출시되지 않은 만화책의 공유도 불법입니다.


- DCinside robots.txt 정책
    - 모든 유저(User-agent)의 크롤링을 금지합니다.
    - 빠른 속도로 크롤링하면 IP가 차단됩니다.

```
User-agent: *
Disallow: /
```

In [1]:
!scrapy startproject comics

Error: scrapy.cfg already exists in C:\Users\yoon hwa\Documents\dev\Crawling_Project\comics


In [2]:
!tree comics

폴더 PATH의 목록입니다.
볼륨 일련 번호는 A876-8D28입니다.
C:\USERS\YOON HWA\DOCUMENTS\DEV\CRAWLING_PROJECT\COMICS
├─comics
│  ├─spiders
│  │  └─__pycache__
│  └─__pycache__
└─images
    └─full


목록(list) 페이지 + 게시물 본문(contents) 페이지를 크롤링합니다.

### 1. items.py 만들기

In [3]:
%%writefile comics/comics/items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ComicsItem(scrapy.Item):
    """Item holding the data scraped from a single gallery post.

    The fields are split into two groups: post metadata intended for the
    MongoDB record, and the fields used for downloading images.
    """
    # Post metadata (stored in MongoDB).
    title = scrapy.Field()
    date = scrapy.Field()
    views = scrapy.Field()
    recommend = scrapy.Field()
    link = scrapy.Field()
    img_count = scrapy.Field()
    img_link = scrapy.Field()
    
    # Image download fields.
    image_urls = scrapy.Field()
    images = scrapy.Field()

Overwriting comics/comics/items.py


### 2. spider 만들기
- 요청 사이에 1초 딜레이를 두어 크롤링(IP) 차단 방지

In [4]:
%%writefile comics/comics/spiders/spider.py
import scrapy
import time
from comics.items import ComicsItem
from scrapy.pipelines.images import ImagesPipeline

class ComicsSpider(scrapy.Spider):
    """Crawl one list page of the gallery and yield one item per linked post.

    Spider arguments (e.g. ``scrapy crawl Comics -a pages=3``):
        pages: Page number of the list to crawl (default 1).
        page: Alias of ``pages``; when given it takes precedence, so the
            spelling ``-a page=N`` is honoured instead of silently ignored.
    """

    name = "Comics"
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
        },
        # Throttle requests through the downloader instead of calling
        # time.sleep() inside a callback, which would block the whole crawler.
        'DOWNLOAD_DELAY': 1,
    }

    _LIST_URL = "https://gall.dcinside.com/board/lists/?id=comic_new2&page={}&exception_mode=recommend"

    # Image URL locations inside a post, in order of preference. The first
    # expression that matches anything wins.
    _IMAGE_XPATHS = (
        '//*[@style="overflow:hidden;"]/a/@href',
        '//*[@style="overflow:hidden;"]/p/img/@src',
    )

    def __init__(self, pages=1, page=None, **kwargs):
        """Build the list-page URL from the ``pages`` (or ``page``) argument."""
        page_number = pages if page is None else page
        self.start_url = self._LIST_URL.format(page_number)
        super().__init__(**kwargs)

    def start_requests(self):
        """Request the single list page selected at construction time."""
        yield scrapy.Request(self.start_url, callback=self.parse)

    def parse(self, response):
        """Follow every post link found on the list page."""
        hrefs = response.xpath('//*[@class="ub-content us-post"]/td[2]/a[1]/@href').extract()
        for href in hrefs:
            yield scrapy.Request(response.urljoin(href), callback=self.page_parse)

    def page_parse(self, response):
        """Extract the metadata and image URLs of one post into a ComicsItem.

        The image fields are always populated: a post without images gets
        ``img_count`` 0, an empty ``img_link`` and an empty ``image_urls``.
        """
        item = ComicsItem()
        item["title"] = response.xpath('//*[@class="gallview_head clear ub-content"]/h3/span[2]/text()').extract_first()
        item["date"] = response.xpath('//*[@class="gall_date"]/text()').extract_first()
        # [3:] drops the first three characters (presumably a text label -
        # confirm against the page markup). default="" keeps the slice from
        # raising TypeError when the node is missing.
        item["views"] = response.xpath('//*[@class="fr"]/span[1]/text()').extract_first(default="")[3:]
        item["recommend"] = response.xpath('//*[@class="gall_reply_num"]/text()').extract_first(default="")[3:]
        item["link"] = response.url

        image_urls = self._extract_image_urls(response)
        item["image_urls"] = image_urls
        item["img_count"] = len(image_urls)
        item["img_link"] = image_urls[0] if image_urls else ""

        yield item

    def _extract_image_urls(self, response):
        """Return the post's image URLs as absolute URLs, or [] if none match."""
        for xpath in self._IMAGE_XPATHS:
            found = response.xpath(xpath).extract()
            if found:
                return [response.urljoin(url) for url in found]
        return []

Overwriting comics/comics/spiders/spider.py


### 3. pipeline for mongodb

In [5]:
%%writefile comics/comics/mongodb.py
import os

import pymongo

# Shared MongoDB handles for the project.
# The server address can be overridden with the MONGODB_URI environment
# variable; when it is not set, the original hard-coded address is used.
_MONGODB_URI = os.environ.get('MONGODB_URI', 'mongodb://15.165.28.220:27017/')

client = pymongo.MongoClient(_MONGODB_URI)
db = client.comics
collection = db.dcinsidecomics

Overwriting comics/comics/mongodb.py


In [6]:
%%writefile comics/comics/pipelines.py
from .mongodb import collection
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem


# mongo_db 설정
class ComicsPipeline(object):
    """Item pipeline that stores each post's metadata as a MongoDB document."""

    def process_item(self, item, spider):
        """Insert the item's metadata into the collection and pass the item on.

        ``img_count`` and ``img_link`` are optional: a post without images
        is stored with 0 and an empty string instead of raising KeyError.
        """
        data = {
            "title": item["title"],
            "date": item["date"],
            "views": item["views"],
            "recommend": item["recommend"],
            "link": item["link"],
            "img_count": item.get("img_count", 0),
            "img_link": item.get("img_link", ""),
        }
        # Collection.insert() is deprecated and was removed in PyMongo 4.
        collection.insert_one(data)
        return item

# image download
class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that sends the post URL as the Referer of each download."""

    def get_media_requests(self, item, info):
        """Yield one download request per image URL, with the post as Referer.

        An item without ``image_urls`` yields no requests instead of raising
        KeyError.
        """
        for image_url in item.get('image_urls') or []:
            yield scrapy.Request(url=image_url, headers={'Referer': item['link']})

    def item_completed(self, results, item, info):
        """Return the item, or drop it when no image download succeeded.

        Raises:
            DropItem: if none of the entries in ``results`` is marked ok.
        """
        if not any(ok for ok, _ in results):
            raise DropItem("Item contains no images")
        return item

Overwriting comics/comics/pipelines.py


### 4. settings.py 변경
- ROBOTSTXT_OBEY = False
- ITEM_PIPELINES 에 `ComicsPipeline`(우선순위 300) 과 `ImagesPipeline`(우선순위 1) 등록

In [7]:
%%writefile comics/comics/settings.py

# Scrapy settings for comics project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "comics"

NEWSPIDER_MODULE = "comics.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'comics (+http://www.yourdomain.com)'

# robots.txt rules are not obeyed by this project.
ROBOTSTXT_OBEY = False


# Item pipelines: the image download pipeline (1) and the MongoDB
# pipeline (300).
# NOTE(review): the stock ImagesPipeline is registered here, not
# comics.pipelines.MyImagesPipeline - confirm that this is intended.
ITEM_PIPELINES = {
    "scrapy.pipelines.images.ImagesPipeline": 1,
    "comics.pipelines.ComicsPipeline": 300,
}

# Directory where downloaded images are stored.
IMAGES_STORE = "images"

# Download delay.
DOWNLOAD_DELAY = 1

Overwriting comics/comics/settings.py


### 5. run 실행 파일 생성
- `-o 파일이름.csv` 옵션을 붙이면 결과를 CSV 파일로 저장합니다.
- 기본값: `-a pages=1` (spider 의 인자 이름은 `pages` 입니다.)

In [8]:
%%writefile run.sh
# Run the Comics spider from inside the Scrapy project directory.
# Stop if the directory is missing, so scrapy never runs from the wrong place.
cd ./comics || exit 1
# The spider's argument is named `pages` (the list page number to crawl).
scrapy crawl Comics -a pages=1

Overwriting run.sh
