Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Anibel/list parser #102

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
5 changes: 5 additions & 0 deletions apps/parse/anibel/list_parser/consts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# XPath selectors used by the Anibel list spider to pull fields out of each
# manga card on the listing page.
# NOTE(review): the live site's markup may have changed — these selectors
# reportedly only match on old.anibel.net; confirm against https://anibel.net.
MANGA_CARD_TAG = '//div[@class = "anime-card grid-6 top-05"]'  # one card per manga
TITLE_TAG = "//h1//a/text()"  # full title text, e.g. "Title / Alt title [extra]"
SOURCE_URL_TAG = "//h1//a/@href"  # site-relative link to the manga detail page
GENRES_TAG = '//div[@class = "tags"]//a/text()'  # genre names
THUMBNAIL_IMG_URL_TAG = '//div[contains(@class, "grid-4 left")]//img/@data-src'  # lazy-loaded cover image
Comment on lines +1 to +5
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Что-то мне кажется, что разметка сайта поменялась, потому что эти XPATH не работают.
У них есть сейчас сайт old.anibel.net и там работает

80 changes: 80 additions & 0 deletions apps/parse/anibel/list_parser/manga_spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import logging

import requests
import scrapy
from lxml import etree
from scrapy.http import HtmlResponse
from twisted.python.failure import Failure

from apps.core.commands import ParseCommandLogger

from .consts import GENRES_TAG, MANGA_CARD_TAG, SOURCE_URL_TAG, THUMBNAIL_IMG_URL_TAG, TITLE_TAG

# Removed: `logging.getLogger(__name__)` discarded its return value and
# configured nothing (flagged in review).  The spider logs exclusively
# through the injected `management_logger`.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Это же ничего не делает
У нас management_logger, обычный не нужен

ANIBEL_URL = "https://anibel.net"


class AnibelMangaSpider(scrapy.Spider):
management_logger: "ParseCommandLogger"
name = "anibel_manga"

def __init__(self, *args, logger, **kwargs):
super().__init__(*args, **kwargs)
self.__dict__.update({"management_logger": logger})

    @property
    def logger(self):
        """Expose the injected management logger under scrapy's ``logger`` name."""
        return self.management_logger

def start_requests(self):
self.logger.info("Starting requests")
self.logger.info("=================")
mangas_list = requests.get(f"{ANIBEL_URL}/manga")
if not mangas_list.status_code == 200:
self.logger.error(f"Failed request with code {mangas_list.status_code}")
return
mangas_list = mangas_list.text

xpath_selector = '//li[@class = "page-item-num"]//a[@class = "page-link"]/text()'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Это тоже можно вынести в consts.py

html_parser = etree.HTML(mangas_list)
max_page = html_parser.xpath(xpath_selector)[-1]
base_url = f"{ANIBEL_URL}/manga?page="
pages = [page for page in range(1, int(max_page) + 1)]
urls = [base_url + str(page) for page in pages]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)

def request_fallback(self, failure: Failure):
self.logger.error(
f'Request for url "{failure.value.response.url}" '
f"failed with status {failure.value.response.status}"
)

def parse(self, response):
mangas = []
descriptions = response.xpath(MANGA_CARD_TAG).extract()
for description in descriptions:
response = HtmlResponse(url="", body=description, encoding="utf-8")

full_title = response.xpath(TITLE_TAG).extract_first("")
alt_title = full_title.split(" / ")[1]
alt_title = alt_title.split("[")[0]
title = full_title.split(" / ")[0]
source_url = ANIBEL_URL + response.xpath(SOURCE_URL_TAG).extract_first("")
genres = response.xpath(GENRES_TAG).extract()
image = ANIBEL_URL + response.xpath(THUMBNAIL_IMG_URL_TAG).extract_first("")
mangas.append(
{
"title": title,
"alt_title": alt_title,
"thumbnail": image,
"genres": genres,
"image": image,
"source_url": source_url,
}
)
self.logger.info('Parsed ,anga "{}"'.format(title))

self.logger.info("Processing items...")
self.logger.info("===================")
return mangas
49 changes: 49 additions & 0 deletions apps/parse/anibel/list_parser/pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import logging
from copy import deepcopy
from typing import List, Tuple

from django.db import transaction

from apps.parse.anibel.list_parser.manga_spider import AnibelMangaSpider
from apps.parse.models import Genre, Manga

# Use a module-named logger rather than the root logger, so log records are
# attributable to this module and root-level configuration is untouched.
logger = logging.getLogger(__name__)


@transaction.atomic
def bulk_get_or_create(cls, names: List[str]) -> Tuple:
    """Get or create one *cls* row per name and return the model instances.

    Runs inside a single transaction so a failure part-way through creates
    nothing.  The ``created`` flag from ``get_or_create`` is discarded.

    NOTE(review): a similar helper may already exist elsewhere in the
    project — consider reusing it instead of duplicating it here.
    """
    return tuple(cls.objects.get_or_create(name=name)[0] for name in names)


class AnibelPipeline:
    """Persist parsed manga items into the database."""

    @staticmethod
    def process_item(item, spider: "AnibelMangaSpider"):
        """Create or update the Manga row for *item* and attach its genres.

        Returns the item unchanged so later pipelines can still run.
        Raises KeyError when the item carries no title.
        """
        data = deepcopy(item)

        title = data.pop("title")
        genres = data.pop("genres")
        source_url = data.pop("source_url")

        if not title:
            message = f"Error processing {data}: No title name was set"
            spider.logger.error(message)
            raise KeyError(message)

        genre_objects = bulk_get_or_create(Genre, genres)

        # Bug fix: the original called Manga.objects.get_or_create() first,
        # which created a blank row, so the exists() check that followed was
        # always true and the "create" branch was unreachable (and would have
        # been a duplicate-insert hazard).  update_or_create does the intended
        # upsert atomically and reports whether the row was created.
        manga, created = Manga.objects.update_or_create(
            source_url=source_url,
            defaults={"title": title, **data},
        )
        if created:
            spider.logger.info(f'Created item "{manga}"')
        else:
            spider.logger.info(f'Updated item "{manga}"')

        manga.genres.add(*genre_objects)

        return item
19 changes: 19 additions & 0 deletions apps/parse/anibel/list_parser/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from apps.parse.anibel.list_parser.pipelines import AnibelPipeline

# Scrapy settings for the Anibel list spider.
BOT_NAME = "anibel"

SPIDER_MODULES = ["apps.parse.anibel.list_parser"]
NEWSPIDER_MODULE = "apps.parse.anibel.list_parser"

# Respect the site's robots.txt.
ROBOTSTXT_OBEY = True

# Fixed delay between requests, on top of the AutoThrottle settings below.
DOWNLOAD_DELAY = 2

ITEM_PIPELINES = {
    AnibelPipeline: 300,
}

# Adapt the crawl rate to how quickly the server responds.
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 2
AUTOTHROTTLE_MAX_DELAY = 15
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
2 changes: 1 addition & 1 deletion apps/parse/management/commands/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def add_arguments(self, parser: CommandParser) -> None:
"parser",
type=str,
default="readmanga",
choices=["readmanga", "mangalib"],
choices=["readmanga", "mangalib", "anibel"],
help="parser to use which respresents a website source",
)

Expand Down
1 change: 1 addition & 0 deletions apps/parse/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ class Manga(BaseModel):
SOURCE_MAP = {
"https://readmanga.live": "Readmanga",
"https://mangalib.me": "Mangalib",
"https://anibel.net": "Anibel",
}

title = TextField()
Expand Down
18 changes: 18 additions & 0 deletions apps/parse/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from apps.parse.anibel.list_parser.manga_spider import AnibelMangaSpider
from apps.parse.mangalib.chapter_parser.parse import chapters_manga_info as mangalib_chapters_info
from apps.parse.mangalib.detail_parser.parse import detail_manga_parser
from apps.parse.mangalib.image_parser.parse import images_manga_info as mangalib_images_info
Expand All @@ -15,6 +16,7 @@
from apps.parse.readmanga.list_parser.manga_spider import MangaSpider

SETTINGS_PATH = "apps.parse.readmanga.list_parser.settings"
ANIBEL_SETTINGS_PATH = "apps.parse.anibel.list_parser.settings"


def readmanga_parser(settings=None, logger=None):
Expand All @@ -35,6 +37,19 @@ def mangalib_parser(settings=None, logger=None):
asyncio.get_event_loop().run_until_complete(crawler.get_list())


def anibel_parser(settings=None, logger=None):
    """Run the Anibel list spider in a blocking scrapy CrawlerProcess.

    Project settings are loaded from ANIBEL_SETTINGS_PATH (unless the
    environment already names a settings module) and any *settings* passed
    in override them.  *logger* is forwarded to the spider as its
    management logger.
    """
    os.environ.setdefault("SCRAPY_SETTINGS_MODULE", ANIBEL_SETTINGS_PATH)
    merged = {**get_project_settings()}
    if settings:
        merged.update(settings)
    process = CrawlerProcess(merged)
    process.crawl(AnibelMangaSpider, logger=logger)
    process.start()


DETAIL_PARSER = "details"
LIST_PARSER = "list"
IMAGE_PARSER = "images"
Expand All @@ -53,4 +68,7 @@ def mangalib_parser(settings=None, logger=None):
CHAPTER_PARSER: mangalib_chapters_info,
IMAGE_PARSER: mangalib_images_info,
},
Manga.SOURCE_MAP.get("https://anibel.net"): {
LIST_PARSER: anibel_parser,
},
}