In [1]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.remote.webelement import WebElement

from datetime import timedelta, date
import os
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

def selenium_initializer():
    """Start a Chrome WebDriver session that tolerates certificate/SSL errors.

    Returns:
        The ``webdriver.Chrome`` instance, launched from the ``../chromedriver``
        binary with both ignore-* command-line switches applied.
    """
    options = webdriver.ChromeOptions()
    # Keep loading pages even when the certificate cannot be validated.
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--ignore-ssl-errors')
    # `options=` replaces the deprecated `chrome_options=` keyword.
    driver = webdriver.Chrome('../chromedriver', options=options)
    return driver


In [2]:
from tqdm import tqdm_notebook

In [None]:
!pip install peewee

In [3]:
def daterange(start_date, end_date):
    """Yield consecutive days from start_date up to, but excluding, end_date.

    Nothing is yielded when end_date is not after start_date.
    """
    total_days = int((end_date - start_date).days)
    offset = 0
    while offset < total_days:
        yield start_date + timedelta(offset)
        offset += 1


In [7]:
# Scraping window. daterange() stops one day before end_date, so end_date
# itself is never used as the start of a search window.
start_date = date(2020, 1, 1)
end_date = date(2020, 1, 3)

In [8]:
# Shared browser session: the scraping functions below read this global `driver`.
driver = selenium_initializer()

  


In [9]:
# Open the search results for every one-day window [from_date, from_date + 1 day].
# TARGET_SEARCH_LINK keeps the link of the last window after the loop ends.
# NOTE(review): month/day are not zero-padded (e.g. 2020-1-1) and the
# date_update_* bounds are fixed to 2019-12-27 - confirm the site accepts this;
# the recorded run of process_one_date found 0 fanfics.
for from_date in daterange(start_date, end_date):
    to_date = from_date + timedelta(days=1)
    window_parts = (from_date.year, from_date.month, from_date.day,
                    to_date.year, to_date.month, to_date.day)
    print("From: {}-{}-{} | To: {}-{}-{}".format(*window_parts))
    TARGET_SEARCH_LINK = 'https://ficbook.net/find?title=&fandom_filter=any&fandom_group_id=1&sizes%5B%5D=3&sizes%5B%5D=4&pages_min=&pages_max=&ratings%5B%5D=5&ratings%5B%5D=6&ratings%5B%5D=7&ratings%5B%5D=8&ratings%5B%5D=9&transl=1&statuses%5B%5D=2&directions%5B%5D=1&directions%5B%5D=2&directions%5B%5D=3&directions%5B%5D=4&directions%5B%5D=7&directions%5B%5D=6&directions%5B%5D=5&author=0&likes_min=&likes_max=&dateFilter=1&date_create_min={}-{}-{}&date_create_max={}-{}-{}&date_update_min=2019-12-27&date_update_max=2019-12-27&sort=3&rnd=777383900&find=%D0%9D%D0%B0%D0%B9%D1%82%D0%B8%21#result'.format(
        *window_parts)
    logging.info("Searching by link: {}".format(TARGET_SEARCH_LINK))
    driver.get(TARGET_SEARCH_LINK)

INFO:Searching by link: https://ficbook.net/find?title=&fandom_filter=any&fandom_group_id=1&sizes%5B%5D=3&sizes%5B%5D=4&pages_min=&pages_max=&ratings%5B%5D=5&ratings%5B%5D=6&ratings%5B%5D=7&ratings%5B%5D=8&ratings%5B%5D=9&transl=1&statuses%5B%5D=2&directions%5B%5D=1&directions%5B%5D=2&directions%5B%5D=3&directions%5B%5D=4&directions%5B%5D=7&directions%5B%5D=6&directions%5B%5D=5&author=0&likes_min=&likes_max=&dateFilter=1&date_create_min=2020-1-1&date_create_max=2020-1-2&date_update_min=2019-12-27&date_update_max=2019-12-27&sort=3&rnd=777383900&find=%D0%9D%D0%B0%D0%B9%D1%82%D0%B8%21#result


From: 2020-1-1 | To: 2020-1-2


INFO:Searching by link: https://ficbook.net/find?title=&fandom_filter=any&fandom_group_id=1&sizes%5B%5D=3&sizes%5B%5D=4&pages_min=&pages_max=&ratings%5B%5D=5&ratings%5B%5D=6&ratings%5B%5D=7&ratings%5B%5D=8&ratings%5B%5D=9&transl=1&statuses%5B%5D=2&directions%5B%5D=1&directions%5B%5D=2&directions%5B%5D=3&directions%5B%5D=4&directions%5B%5D=7&directions%5B%5D=6&directions%5B%5D=5&author=0&likes_min=&likes_max=&dateFilter=1&date_create_min=2020-1-2&date_create_max=2020-1-3&date_update_min=2019-12-27&date_update_max=2019-12-27&sort=3&rnd=777383900&find=%D0%9D%D0%B0%D0%B9%D1%82%D0%B8%21#result


From: 2020-1-2 | To: 2020-1-3


In [11]:
# Start from a clean slate: delete the SQLite file left by a previous run.
# On a fresh checkout the file does not exist yet, which is fine.
# NOTE: if an earlier run left `db` open in this kernel, call db.close() first.
try:
    os.remove('ficbook-net.db')
except FileNotFoundError:
    pass

In [12]:
from peewee import *

# SQLite file that receives everything this notebook scrapes.
db = SqliteDatabase('ficbook-net.db')

class BaseModel(Model):
    """Base class that binds the models derived from it to the shared `db`."""
    class Meta:
        database = db
        
class Tags(BaseModel):
    """A tag that can be attached to fanfics (linked through TagsFanfic)."""
    # Explicit primary key; insert_fanfic_description() supplies the number
    # parsed from the end of the tag's URL.
    id = IntegerField(primary_key=True)
    url = TextField()
    name = TextField()
    description = TextField()
    # Nullable; not populated by the insert code visible in this notebook.
    synonyms = TextField(null=True)

class Fandoms(BaseModel):
    """A fandom; the URL is declared unique, the name is not."""
    fandomURL = TextField(unique=True)
    fandomName = TextField()

class Authors(BaseModel):
    """A fanfic author."""
    # Explicit primary key; insert_fanfic_description() supplies the number
    # parsed from the end of the author's profile URL.
    id = IntegerField(unique=True, primary_key=True)
    authorName = TextField()
    authorURL = TextField()

class Rating(BaseModel):
    """An age rating: its label and description. Neither column is declared unique."""
    ratingText = TextField()
    ratingDescription = TextField()

class Pairing(BaseModel):
    """A pairing/characters entry; the name is declared unique, the URL is not."""
    pairingURL = TextField()
    pairingName = TextField(unique=True)
    
class Size(BaseModel):
    """A fanfic size category with its description; the name is declared unique."""
    sizeName = TextField(unique=True)
    sizeDescription = TextField()

class Direction(BaseModel):
    """A fanfic direction (the "main topic" of a search result); the name is declared unique."""
    directionName = TextField(unique = True)
    directionDescription = TextField()


class FanficDescription(BaseModel):
    """One fanfic as shown in a search result, with links to its lookup rows.

    NOTE(review): every backref below repeats the name of the foreign-key
    field itself; a name describing the reverse relation (e.g. 'fanfics')
    would be clearer - confirm nothing relies on the current names.
    """
    # Explicit primary key; the scraper supplies the number parsed from the fanfic URL.
    id = IntegerField(primary_key = True)
    fanficURL = TextField()
    # A single author column: only one author can be stored per fanfic.
    authorID = ForeignKeyField(Authors, backref = 'authorID')
    fanficName = TextField()
    directionID = ForeignKeyField(Direction, backref = 'directionID')
    likes = IntegerField()
    pages = IntegerField()
    parts = IntegerField()
    rewards = IntegerField()
    # Nullable: the scraper only sets a pairing when the result block has one.
    pairingID = ForeignKeyField(Pairing, backref = 'pairingID', null=True)
    ratingID = ForeignKeyField(Rating, backref = 'ratingID')
    sizeID = ForeignKeyField(Size, backref = 'sizeID')
    shortDescription = TextField()
    
    
    
class TagsFanfic(BaseModel):
    """Link table: one row per (fanfic, tag) pair."""
    fanficID = ForeignKeyField(FanficDescription, backref='fanficID')
    tagID = ForeignKeyField(Tags, backref='tagID')
        
# Open the connection and create a table for every model declared above.
db.connect()
db.create_tables([Tags, Fandoms,Authors,Rating,Pairing,Size,FanficDescription,Direction,TagsFanfic])


In [13]:
# Keys of the dict that process_one_fanfic_description() builds for one fanfic
# and insert_fanfic_description() reads back.
MAIN_TOPIC_FIELD = 'main-topic'
TITLE_FIELD = 'title'
FANFIC_LINK_FIELD = 'fanfic-link'
FANFIC_ID_FIELD = 'fanfic-id'
AUTHORS_FIELD = 'authors'
LIKES_COUNT_FIELD = 'likes-count'
REWARD_COUNT_FIELD = 'rewards-count'
STATUS_FIELD = 'status'
PAIRING_FIELD = 'pairing'
TAGS_FIELD = 'tags'
DESCRIPTION_FIELD = 'description'
FANDOM_FIELD = 'fandom'
RATING_FIELD = 'age_rating'
# Size is stored twice: a short tuple and a full tuple with the counters.
FANFIC_SIZE_FIELD = 'fanfic-size-short'       # (size_category, size_description)
FANFIC_SIZE_FULL_FIELD = 'fanfic-size-full'   # (size_category, size_description, count_of_pages, count_of_parts)

def insert_fanfic_description(fanfic_description_element):
    """Store one scraped fanfic and the lookup rows it refers to.

    Args:
        fanfic_description_element: dict keyed by the ``*_FIELD`` constants, as
            built by ``process_one_fanfic_description``. ``PAIRING_FIELD`` is
            optional; every other key read below must be present.

    Raises:
        ValueError: if the record lists no authors (the fanfic row needs one).
        KeyError: if a required key is missing from the record.

    Rows with a primary/unique key are looked up by that key only; the other
    columns go through ``defaults`` and are used solely when the row is first
    created. Filtering on every column would miss an existing row as soon as a
    non-key value changed (e.g. the like count) and the follow-up insert would
    then violate the key constraint.
    """
    authors = fanfic_description_element[AUTHORS_FIELD]
    if len(authors) > 1:
        print('Count of author more than one!')
    fanfic_id = fanfic_description_element[FANFIC_ID_FIELD]
    print('Current fanfic ID: {}'.format(fanfic_id))
    if not authors:
        # Fail early with a clear message instead of an unbound-variable error later.
        raise ValueError('Fanfic {} has no authors'.format(fanfic_id))

    # Authors processing. Every author gets a row, but the fanfic has a single
    # author column, so the last listed author is the one stored on it.
    for (author_id, author_name, author_url) in authors:
        authorID, was_created = Authors.get_or_create(
            id=int(author_id),
            defaults={'authorName': author_name, 'authorURL': author_url})
        print('External id = {}, Internal (db) = {}'.format(author_id, authorID))

    # Directions
    main_topic = fanfic_description_element[MAIN_TOPIC_FIELD]
    direction_id, _ = Direction.get_or_create(
        directionName=main_topic[0],
        defaults={'directionDescription': main_topic[1]})

    # Fandom (stored as a lookup row only; the fanfic table has no fandom column)
    fandom = fanfic_description_element[FANDOM_FIELD]
    Fandoms.get_or_create(
        fandomURL=fandom[1],
        defaults={'fandomName': fandom[0]})

    # Rating (no unique column on this table, so both values form the lookup)
    rating = fanfic_description_element[RATING_FIELD]
    ratingID, _ = Rating.get_or_create(
        ratingText=rating[0],
        ratingDescription=rating[1])

    # Size
    size_short = fanfic_description_element[FANFIC_SIZE_FIELD]
    sizeID, _ = Size.get_or_create(
        sizeName=size_short[0],
        defaults={'sizeDescription': size_short[1]})

    # Pairing (optional)
    if PAIRING_FIELD in fanfic_description_element:
        pairing = fanfic_description_element[PAIRING_FIELD]
        PairingID, _ = Pairing.get_or_create(
            pairingName=pairing[0],
            defaults={'pairingURL': pairing[1]})
    else:
        PairingID = None

    # Fanfic row; the full size tuple is (category, description, pages, parts).
    size_full = fanfic_description_element[FANFIC_SIZE_FULL_FIELD]
    fanficID, was_create = FanficDescription.get_or_create(
        id=fanfic_id,
        defaults={
            'fanficURL': fanfic_description_element[FANFIC_LINK_FIELD],
            'authorID': authorID,
            'fanficName': fanfic_description_element[TITLE_FIELD],
            'directionID': direction_id,
            'likes': fanfic_description_element[LIKES_COUNT_FIELD],
            'rewards': fanfic_description_element[REWARD_COUNT_FIELD],
            'pairingID': PairingID,
            'ratingID': ratingID,
            'sizeID': sizeID,
            'pages': size_full[2],
            'parts': size_full[3],
            'shortDescription': fanfic_description_element[DESCRIPTION_FIELD],
        })

    # Tags: link them only after the fanfic row exists, and reuse an existing
    # link so that inserting the same fanfic twice does not duplicate it.
    for (tag_id, tag_name, tag_description, tag_url) in fanfic_description_element[TAGS_FIELD]:
        tagID, was_created = Tags.get_or_create(
            id=tag_id,
            defaults={'url': tag_url, 'name': tag_name, 'description': tag_description})
        TagsFanfic.get_or_create(fanficID=fanficID, tagID=tagID)

In [14]:
def process_one_fanfic_description(fanfic_description_element : WebElement):
    """Extract the fields of one search-result block into a plain dict.

    Args:
        fanfic_description_element: WebElement of a single fanfic block, as
            returned by ``process_one_search_page``.

    Returns:
        dict keyed by the ``*_FIELD`` constants. Likes and rewards default to 0
        and tags to an empty list when their nodes are absent; fandom, pairing,
        rating, size and status are present only when the info section lists them.

    Raises:
        NoSuchElementException: if a mandatory node (heading, visit link,
            author, info section, description) is missing from the block.
    """
    one_row = {}
    heading = fanfic_description_element.find_element_by_tag_name('h3')
    main_topic = heading.find_element_by_tag_name('span').get_attribute('title')
    main_topic = main_topic.strip()
    # (text before the dash, full title text)
    one_row[MAIN_TOPIC_FIELD] = (main_topic.split('—')[0].strip(), main_topic)

    visit_element = fanfic_description_element.find_element_by_class_name('visit-link')
    title_of_fanfic = visit_element.text
    link_of_fanfic = visit_element.get_attribute('href')
    one_row[TITLE_FIELD] = title_of_fanfic
    one_row[FANFIC_LINK_FIELD] = link_of_fanfic
    # The id is the last path segment of the link, without any query string.
    one_row[FANFIC_ID_FIELD] = int(link_of_fanfic.split('/')[-1].split('?')[0])

    # There can be several authors
    authors = fanfic_description_element.find_element_by_class_name('author').find_elements_by_tag_name('a')
    one_row[AUTHORS_FIELD] = []
    for one_author in authors:
        # Fetch the href once; each attribute read is a WebDriver round-trip.
        author_url = one_author.get_attribute('href')
        one_row[AUTHORS_FIELD].append(
            (int(author_url.split('/')[-1]),
             one_author.text,
             author_url)
        )

    try:
        count_of_likes = fanfic_description_element.find_element_by_xpath('.//*/sup[@class="count"]/span[@class="value"]').text
        one_row[LIKES_COUNT_FIELD] = int(count_of_likes)
    except NoSuchElementException:
        one_row[LIKES_COUNT_FIELD] = 0
    try:
        # The reward badge is not always rendered
        count_of_rewards = fanfic_description_element.find_element_by_xpath('.//*/sup[@class="reward"]').text
        one_row[REWARD_COUNT_FIELD] = int(count_of_rewards)
    except NoSuchElementException:
        one_row[REWARD_COUNT_FIELD] = 0

    ### Additional information
    current_info_section = fanfic_description_element.find_element_by_class_name('info')

    # NOTE(review): the last <dd> is dropped before pairing labels with values -
    # presumably it has no matching <dt>; confirm against the page markup.
    for title, value in zip(current_info_section.find_elements_by_tag_name('dt'),
                            current_info_section.find_elements_by_tag_name('dd')[:-1]):
        # Read the label once (each `.text` is a WebDriver round-trip) and drop
        # its last character (presumably a trailing colon).
        label = title.text[:-1]
        if label == 'Фэндом':
            try:
                one_row[FANDOM_FIELD] = (value.text, value.find_element_by_tag_name('a').get_attribute('href'))
            except NoSuchElementException:
                one_row[FANDOM_FIELD] = (value.text, '')
        elif label == 'Пэйринг и персонажи':
            # Same fallback as for the fandom: keep the text when there is no link.
            try:
                pairing_url = value.find_element_by_tag_name('a').get_attribute('href')
            except NoSuchElementException:
                pairing_url = ''
            one_row[PAIRING_FIELD] = (value.text, pairing_url)
        elif label == 'Рейтинг':
            one_row[RATING_FIELD] = (value.text,
                                     value.find_element_by_tag_name('strong').get_attribute('data-original-title').split('—')[1].strip().capitalize())
        elif label == 'Размер':
            size_description = value.find_element_by_tag_name('strong').get_attribute('data-original-title').split('—')[1].strip().capitalize()
            # Expects exactly "category, pages ..., parts ..."
            category, count_of_pages, count_of_parts = value.text.split(',')
            one_row[FANFIC_SIZE_FIELD] = (
                category,
                size_description)
            one_row[FANFIC_SIZE_FULL_FIELD] = (
                category,
                size_description,
                count_of_pages.strip().split()[0],
                count_of_parts.strip().split()[0]
                )
        elif label == 'Статус':
            one_row[STATUS_FIELD] = value.text

    one_row[TAGS_FIELD] = []
    try:
        tags_element = fanfic_description_element.find_element_by_class_name('tags')
        # Reveal hidden (spoiler) tags before reading the list.
        spoilers = tags_element.find_elements_by_class_name('show-hidden-tags-btn.js-show-hidden-tags.tag.icon-warning2')
        if spoilers:
            spoilers[0].click()
        # Look the container up again after the click.
        tags_element = fanfic_description_element.find_element_by_class_name('tags')
        # NOTE(review): the class name carries a trailing newline; kept as-is - confirm it is intended.
        for one_tag in tags_element.find_elements_by_class_name('tag\n'):
            tag_url = one_tag.get_attribute('href')
            one_row[TAGS_FIELD].append(
                (int(tag_url.split('/')[-1]),
                 one_tag.text,
                 one_tag.get_attribute('data-original-title').split('—')[1].strip(),
                 tag_url)
            )
    except NoSuchElementException:
        # Discard anything collected so far when a tag node is missing.
        one_row[TAGS_FIELD] = []

    description_block = fanfic_description_element.find_element_by_class_name("fanfic-description")
    description = description_block.find_element_by_class_name("wrap.word-break.urlize.fanfic-description-text").text
    one_row[DESCRIPTION_FIELD] = description
    return one_row


In [15]:
def process_one_search_page(current_link):
    """Load one search results page in the shared driver and return its fanfic blocks.

    Args:
        current_link: URL of the search results page to open.

    Returns:
        The list of elements with class ``fanfic-thumb-block`` found on the page.
    """
    driver.get(current_link)
    # Pick out the fanfic blocks on the current page
    found_blocks = driver.find_elements_by_class_name('fanfic-thumb-block')
    block_count = len(found_blocks)
    print('Count of fanfics on this page: {}'.format(block_count))
    return found_blocks

def process_one_date(current_link):
    """Scrape every result page reachable from `current_link`, then store the rows.

    Pages are followed through the "next page" link until it can no longer be
    found. All fanfic blocks are parsed first; the database inserts happen
    afterwards, in one pass with a progress bar.

    Args:
        current_link: URL of the first search results page.
    """
    # tqdm.tqdm_notebook is deprecated in favour of tqdm.notebook.tqdm.
    from tqdm.notebook import tqdm

    current_search_page = current_link
    indexator = 1  # number of result pages visited

    list_of_fanfic_descriptions = []

    while current_search_page:
        fanfic_sections = process_one_search_page(current_search_page)
        # Parse the blocks before navigating away: the elements belong to this page.
        for one_fanfic_section in fanfic_sections:
            current_row = process_one_fanfic_description(one_fanfic_section)
            list_of_fanfic_descriptions.append(current_row)
        try:
            current_search_page = driver.find_element_by_xpath('//*[@id="main"]/div[1]/section/nav[2]/div[3]/a').get_attribute('href')
            print('Page finished. Go to: {}'.format(current_search_page))
            indexator += 1
        except NoSuchElementException:
            # No "next page" link: this was the last page.
            print('Search finished. It takes {} iterations'.format(indexator))
            break
    for one_fanfic_description in tqdm(list_of_fanfic_descriptions):
        insert_fanfic_description(one_fanfic_description)
    print('Insertion process completed')


In [17]:
# Scrape the link left in TARGET_SEARCH_LINK by the last iteration of the date loop.
process_one_date(TARGET_SEARCH_LINK)

Count of fanfics on this page: 0
Search finished. It takes 1 iterations


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Insertion process completed


In [16]:
# Display the search link most recently built by the date loop.
TARGET_SEARCH_LINK

'https://ficbook.net/find?title=&fandom_filter=any&fandom_group_id=1&sizes%5B%5D=3&sizes%5B%5D=4&pages_min=&pages_max=&ratings%5B%5D=5&ratings%5B%5D=6&ratings%5B%5D=7&ratings%5B%5D=8&ratings%5B%5D=9&transl=1&statuses%5B%5D=2&directions%5B%5D=1&directions%5B%5D=2&directions%5B%5D=3&directions%5B%5D=4&directions%5B%5D=7&directions%5B%5D=6&directions%5B%5D=5&author=0&likes_min=&likes_max=&dateFilter=1&date_create_min=2020-1-2&date_create_max=2020-1-3&date_update_min=2019-12-27&date_update_max=2019-12-27&sort=3&rnd=777383900&find=%D0%9D%D0%B0%D0%B9%D1%82%D0%B8%21#result'

In [None]:
# Scratch: take the first fanfic block for manual inspection.
# NOTE(review): as far as this notebook shows, a module-level `fanfic_sections`
# is only assigned by the cell that follows, so this fails on a top-to-bottom run.
fanfic_description_element = fanfic_sections[0]

In [None]:
# Single-page run: parse every fanfic block of the page and store it right away.
# (The helper is named process_one_search_page; `process_search_page` is not defined.)
fanfic_sections = process_one_search_page(TARGET_SEARCH_LINK)
for fanfic_description_element in fanfic_sections:
    dict_fanfic_descriptor = process_one_fanfic_description(fanfic_description_element)
    insert_fanfic_description(dict_fanfic_descriptor)

In [None]:
# Scratch: parse and store the block currently held in `fanfic_description_element`,
# then show the parsed dict as the cell output.
dict_fanfic_descriptor = process_one_fanfic_description(fanfic_description_element)
insert_fanfic_description(dict_fanfic_descriptor)
dict_fanfic_descriptor