In [18]:
from bs4 import BeautifulSoup
from collections import defaultdict
import pandas as pd
import requests
import logging
import pickle
import time
import json
import re

In [2]:
from natasha import (NamesExtractor, SimpleNamesExtractor)
from natasha.markup import format_json
from urllib.parse import urlparse

extractor = SimpleNamesExtractor()

In [3]:
from tqdm import tqdm_notebook

## WIKI

### Вспомогательные функции

In [4]:
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)


In [5]:
def get_soup(url):
    """
    Download a page
    """
    global session
    req = session.get(url)
    if req.status_code == 200:
        html = req.text
        soup = BeautifulSoup(html, 'lxml')
        return soup
    return None


def preproc(text):
    """
    Preprocessing text
    """
    global replacements
    for repl in replacements:
        text = re.sub(repl, replacements[repl], text)
    return text


def preproc_book(text):
    """
    Preprocessing book names
    """
    text = preproc(text)
    text = re.sub('«|»', '', text)
    return text


In [10]:
def get_wiki_id(link):
    """
    Extract wiki id from a link (Qid)
    """
    soup = get_soup(link)
    res = list(soup.find_all('li', id="t-wikibase"))
    w_id = None
    if res:
        w_id = res[0].a.attrs['href']
        w_id = urlparse(w_id).path
        w_id = w_id.split('/')[-1]
        w_id = w_id.title()
    return w_id


def person_wiki_page(line):
    """
    Extract wiki links for authors
    """
    global base_url
    wiki_link = line.a.attrs['href']
    urlp = urlparse(wiki_link)
    w_id = None
    if urlp.path.startswith('/wiki/'):
        if urlp.scheme == '' and urlp.netloc == '':
            url = base_url + wiki_link
        else:
            url = wiki_link
        w_id = get_wiki_id(url)
    return w_id


In [11]:
def book_name_parser(line, all_books, person_id, book_id):
    """
    Extract and add book information
    """
    for title in line.find_all('li'):
        links = title.find_all('a')
        book_url = None

        if links == []: 
            book_name = list(title.children)[0]
        else:
            res = links[0]
            book_name = res.text
            lnk = res.attrs['href']
            if urlparse(lnk).path.startswith('/wiki/'):
                book_url = lnk

        all_books['id'].append(book_id)
        all_books['author_id'].append(person_id)
        all_books['book_name'].append(preproc_book(book_name))
        all_books['book_url'].append(book_url)
        book_id += 1

    return all_books, book_id


In [12]:
def person_name_parser(line, person_id, main_tag, small_tag, col_names):
    """
    Extract authors' names
    """
    person = {}
    person['id'] = person_id
    person['main_cat'] = main_tag
    person['add_cat'] = small_tag
    person['wiki_id'] = person_wiki_page(line)

    name = preproc(line.text)
    person['name'] = name

    words = name.split(' ')
    ttls = [n for n in words if n.istitle()]

    if len(ttls) == len(words):
        matches = extractor(name)
        for index, _ in enumerate(matches):
            ordered = _.fact.as_json
            for name_type in ordered:
                col = name_type + str(index)
                value = ordered[name_type]
                person[col] = value.title()
                col_names.add(col)

    person_id += 1
    return person, person_id


### Скачиваем

Лог

In [25]:
dir_path = 'new/new_data/{}'

logging.basicConfig(filename=dir_path.format('log3.txt'),
                    filemode='a', level=logging.DEBUG)
logging.info("Wiki parse")

Скачиваем страницу Викитека:Школьная_программа

In [26]:
link_wiki = 'https://ru.wikisource.org/wiki/Викитека:Школьная_программа'
base_url = 'https://ru.wikisource.org'
soup_wiki = get_soup(link_wiki)
body = soup_wiki.find("div", {"class": "mw-content-ltr"})
body = list(body)[0]

Обработка

In [27]:
replacements = {'\[.+?\]': '', '\.-': '. ',
                '\(.+?\)': '', '—|–':  '-',
                ' +': ' ', ' ?- ?': '-', 
                '(?:^ | $|\n|\*|\xa0)': ''}

In [28]:
all_books = {'id': [], 'author_id': [], 'book_name': [], 'book_url': []}
person_id, book_id = 0, 0
main_tag, small_tag = '', ''
all_persons = []
col_names = set()

In [29]:
index = 0
check_id = 0

for line in body:

    if line.name == 'ul':
        if check_id > -150:
            all_books, book_id = book_name_parser(line, all_books,
                                                  person_id, book_id)

    if line.name == 'h2':  # главные разделы
        main_tag = preproc(line.text)
        small_tag = ''

    if line.name == 'h4':  # имя
        check_id += 1
        if check_id > -150:
            person, person_id = person_name_parser(line, person_id, main_tag,
                                                   small_tag, col_names)
            all_persons.append(person)
            logging.info(str(person))
            time.sleep(2)

    if line.name == 'h3':
        if main_tag == 'Русская литература':  # подкатегории
            small_tag = preproc(line.text)

        else:  # имя
            check_id += 1
            if check_id > -150:
                person, person_id = person_name_parser(line, person_id,
                                                       main_tag, small_tag,
                                                       col_names)
                all_persons.append(person)
                logging.info(str(person))
                time.sleep(2)

    if index > 0 and index % 50 == 0:
        time.sleep(25)
    index += 1


In [30]:
len(all_persons)

171

In [14]:
len(all_books['id'])

612

### Записть в файл

Таблица писателей

* id 
* name
* middle
* last1
* last2
* tag_main
* tag_smal

In [31]:
col_names = ['id', 'main_cat', 'add_cat', 'wiki_id', 'name',
            'first0', 'first1', 'middle0', 'middle1', 'last0', 'last1']

In [32]:
n_people = len(all_persons)
d = {i:[0]*n_people for i in col_names}

for index, data in enumerate(all_persons):
    for col in data:
        d[col][index] = data[col]

In [34]:
df = pd.DataFrame(d)
df.to_csv(dir_path.format('all_persons.tsv'), sep='\t')

Таблица книг

* id
* author_id
* book_name
* book_link

In [35]:
df = pd.DataFrame(all_books)
df.to_csv(dir_path.format('all_books.tsv'), sep='\t')

In [36]:
df

Unnamed: 0,id,author_id,book_name,book_url
0,0,1,"Помню, я ещё младшенька была…","/wiki/%D0%9F%D0%BE%D0%BC%D0%BD%D1%8E,_%D1%8F_%..."
1,1,1,"Ай вы, ветры, ветры буйные…","/wiki/%D0%90%D0%B9_%D0%B2%D1%8B,_%D0%B2%D0%B5%..."
2,2,2,Ермак готовится к походу на Сибирь,/wiki/%D0%95%D1%80%D0%BC%D0%B0%D0%BA_%D0%B3%D0...
3,3,2,Пугачёв в темнице,/wiki/%D0%9F%D1%83%D0%B3%D0%B0%D1%87%D1%91%D0%...
4,4,3,Два Ивана-солдатских сына,/wiki/%D0%9D%D0%B0%D1%80%D0%BE%D0%B4%D0%BD%D1%...
...,...,...,...,...
606,606,171,Стандарт основного общего образования по литер...,
607,607,171,Стандарт основного общего образования по литер...,
608,608,171,Базовый уровень,
609,609,171,Базовый уровень,


# Список по классам

Данные: http://lickey3.narod.ru/spisok_reed.pdf

Открытие данных об авторах и произведениях

In [52]:
person_data = pd.read_csv(dir_path.format('all_persons.tsv'), sep='\t')
book_data = pd.read_csv(dir_path.format('all_books.tsv'), sep='\t')

In [59]:
book_data = book_data.drop(columns=['Unnamed: 0'])

In [589]:
person_data.head(2)

Unnamed: 0.1,Unnamed: 0,id,main_cat,add_cat,wiki_id,name,first0,first1,middle0,middle1,last0,last1
0,0,0,Русская литература,Фольклор,,Устное народное творчество,0,0,0,0,0,0
1,1,1,Русская литература,Фольклор,,Исторические песни,0,0,0,0,0,0


In [590]:
book_data.head(2)

Unnamed: 0,id,author_id,book_name,book_url
0,0,0,"Помню, я ещё младшенька была…","/wiki/%D0%9F%D0%BE%D0%BC%D0%BD%D1%8E,_%D1%8F_%..."
1,1,0,"Ай вы, ветры, ветры буйные…","/wiki/%D0%90%D0%B9_%D0%B2%D1%8B,_%D0%B2%D0%B5%..."


Список по классам

In [41]:
with open('/Users/Stoneberry/Desktop/школьный архив/spisok_reed.txt', 'r', encoding='utf-8') as f:
    array = f.read()
    literature = array.split('\n\n')

In [62]:
year_list = {}

for cat in literature:
    res = cat.split('\n')
    year_list[res[0]] = res[1:]

### Сопоставление данных

In [44]:
def make_replacements(line, names, repl):
    """
    Convert names to one format
    """
    for name in names:
        name2 = name
        for rep in repl:
            name2 = re.sub(rep, repl[rep], name2)
        line = re.sub(name, name2, line)
    return line

In [63]:
name_pattern1 = '(?:[А-Я]\. ?)?-?[А-Я]\. ?[А-Я]\w+?\\b'
name_pattern1 = re.compile(name_pattern1)
repl1 = {'-': '', ' ': '_'}

name_pattern2 = '[А-Я]\w+? (?:[А-Я]\. )?-?[А-Я]\.'
name_pattern2 = re.compile(name_pattern2)
repl2 = {'-': '', '\. ': '.'}

Приведение имен авторов к одному формату

In [64]:
for class_ in year_list:
    for index, line in enumerate(d[class_]):
        names = re.findall(name_pattern2, line)
        if names:
            for name1 in names:
                name2 = name1.split(' ')
                name2 = name2[1:] + name2[:1]
                name2 = ''.join(name2)
                line = re.sub(name1, name2, line)
            year_list[class_][index] = line
        else:
            names = re.findall(name_pattern1, line)
            if names:
                line = make_replacements(line, names, repl2)
                year_list[class_][index] = line

Сопоставление данных

In [65]:
class_n = [None for i in range(book_data.shape[0])]

In [66]:
for class_ in year_list:
    for index, line in enumerate(year_list[class_]):

        if ':' in line: sent = line.split(':')
        else: sent = line.split(' ', 1)
        
        name = sent[0]
        books = sent[1]
        in_data = False
        
        names = preproc(name).split(',')
        for name in names:
            if '.' in name:
                name = [i for i in name.split('.') if i != '']
                for field in ['first1', 'last0', 'last1', 'name']:
                    res = person_data[person_data[field] == name[-1]]
                    if not res.empty and len(res.index) == 1:
                        in_data = res['id'].values[0]
        
        if re.findall('\.|"|,', books) != []:

            if '"' in books: books = re.findall('"(.*?)"', books)
            elif ',' in books:  books = books.split(',')
            elif '.' in books:  books = books.split('.')
            
            for book in books:
                book = preproc_book(book)
                if book != '':
                    res = book_data[book_data['book_name'] == book]
                    if not res.empty and len(res.index) == 1:
                        class_n[res.index[0]] = class_
                    elif res.empty and in_data and len(book) > 1:
                        bk_id = book_data.shape[0]
                        class_n.append(class_)
                        book_data.loc[bk_id] = [bk_id, in_data, book, None]


In [67]:
book_data['grade'] = class_n

In [68]:
book_data.head()

Unnamed: 0,id,author_id,book_name,book_url,grade
0,0,1,"Помню, я ещё младшенька была…","/wiki/%D0%9F%D0%BE%D0%BC%D0%BD%D1%8E,_%D1%8F_%...",
1,1,1,"Ай вы, ветры, ветры буйные…","/wiki/%D0%90%D0%B9_%D0%B2%D1%8B,_%D0%B2%D0%B5%...",
2,2,2,Ермак готовится к походу на Сибирь,/wiki/%D0%95%D1%80%D0%BC%D0%B0%D0%BA_%D0%B3%D0...,
3,3,2,Пугачёв в темнице,/wiki/%D0%9F%D1%83%D0%B3%D0%B0%D1%87%D1%91%D0%...,
4,4,3,Два Ивана-солдатских сына,/wiki/%D0%9D%D0%B0%D1%80%D0%BE%D0%B4%D0%BD%D1%...,


In [600]:
book_data.to_csv(dir_path.format('all_books3.tsv'), sep='\t', index=False)