In [1]:
import requests
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from bs4 import BeautifulSoup

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ksenia/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
session = requests.session()
poets, titles, poems = [], [], []

In [4]:
def get_poem(poem_url):
  poem_req = session.get(poem_url)
  poem_page = poem_req.text
  poem_soup = BeautifulSoup(poem_page)
  poem = poem_soup.find('div', {'class': 'entry-content poem-text'}).text.split('(adsbygoogle')[0]
  if len(poem.split()) >= 110:
    return poem
  return False

In [7]:
def get_poet(poet_url):
  poet_req = session.get(poet_url)
  poet_page = poet_req.text
  poet_soup = BeautifulSoup(poet_page)
  all_poems = poet_soup.find_all('div', {'class': 'entry-title'})
  global poets
  global titles
  global poems
  for poem_url in all_poems:
    poem = get_poem(poem_url.find('a', href=True)['href'])
    if poem != False:
      try:
        poet, title = poem_url.text.strip().split(' — ')
        poets.append(poet)
        titles.append(title)
        poems.append(poem)
      except:
        print(poem_url.text)
  return poets, titles, poems

In [8]:
def get_data(url):
  req = session.get(url)
  page = req.text
  soup = BeautifulSoup(page)
  article = soup.find('article')
  links = article.find_all('li')
  for link in links[:5]:
    get_poet(link.find('a', href=True)['href'])

In [9]:
get_data('https://rustih.ru/stixi-sovetskix-poetov/')

Марина Цветаева — Асе (Ты — принцесса из царства не светского)
Сергей Есенин — Да! Теперь — решено. Без возврата


In [117]:
len_poems = []
for poem in poems:
  len_poems.append(len(poem.split()))

In [12]:
df = pd.DataFrame(columns=['poet', 'title', 'sentence'])

In [10]:
def parse_poem(poem):
  sentences = sent_tokenize(poem)
  clean_sentences = []
  for sent in sentences:
    clean_sent = [word.strip() for word in word_tokenize(sent) if word.strip().isalpha()]
    clean_sentences.append(' '.join(clean_sent))
  return clean_sentences

In [13]:
for i, poet in enumerate(poets):
  poem = poems[i]
  clean_poem = parse_poem(poem)
  for sent in clean_poem:
    df = df.append({'poet': poet, 'title': titles[i], 'sentence': sent}, ignore_index=True)

In [14]:
df.head()

Unnamed: 0,poet,title,sentence
0,Иван Бунин,Вечер,О счастье мы всегда лишь вспоминаем
1,Иван Бунин,Вечер,А счастье всюду
2,Иван Бунин,Вечер,Может быть оно Вот этот сад осенний за сараем ...
3,Иван Бунин,Вечер,В бездонном небе легким белым краем Встает сия...
4,Иван Бунин,Вечер,Давно Слежу за Мы мало видим знаем А счастье т...


In [24]:
import sqlite3

In [30]:
conn = sqlite3.connect('poems_corpus.db')
cur = conn.cursor()

In [31]:
cur.execute("""
CREATE TABLE IF NOT EXISTS info (
    id_info INTEGER PRIMARY KEY, 
    poet TEXT,
    title TEXT
)
""")

cur.execute("""
CREATE TABLE IF NOT EXISTS sentences (
    id_sent INTEGER PRIMARY KEY, 
    sent TEXT
)
""")

cur.execute("""
CREATE TABLE IF NOT EXISTS poems_to_info
(id INTEGER PRIMARY KEY AUTOINCREMENT, id_info int, id_sent int) 
""")

<sqlite3.Cursor at 0x7fe8430adb20>

In [32]:
cur.execute('SELECT id_info, poet, title FROM info')
db_info = {}
max_id = 0
for idx, poet, title in cur.fetchall():
    max_id += 1
    if poet in db_info:
        db_info[poet].append(title)
    else:
        db_info[poet] = [title]

poem_cnt = 1
for index, row in df.iterrows():
    poet = row['poet']
    title = row['title']
    if poet not in db_info:
        db_info[poet] = []
    
    if title not in db_info[poet]:
        db_info[poet].append(title)
        max_id += 1
        cur.execute('INSERT INTO info VALUES (?, ?, ?)', (max_id, poet, title))
        conn.commit()

    poem_sent = row['sentence']
    cur.execute('INSERT INTO sentences VALUES (?, ?)', (poem_cnt, poem_sent))
    cur.execute('INSERT INTO poems_to_info (id_info, id_sent) VALUES (?, ?)', (max_id, poem_cnt))
    poem_cnt += 1

In [177]:
cur.execute('SELECT poet_id, poet FROM poets')
db_poets = {}
for idx, name in cur.fetchall():
    db_poets[name] = idx

cur.execute('SELECT title_id, title FROM titles')
db_titles = {}
for name, idx in cur.fetchall():
    db_titles[name] = idx
 
been_titles = []
poem_cnt = 1
for index, row in df.iterrows():
  poet = row['poet']
  if poet not in db_poets:
    if db_poets.values():
      db_poets[poet] = max(db_poets.values()) + 1 
    else:
      db_poets[poet] = 1
    cur.execute('INSERT INTO poets VALUES (?, ?)', (db_poets[poet], poet))
    conn.commit()
  
  title = row['title']
  if title not in db_titles:
    if db_titles.values():
      db_titles[title] = max(db_titles.values()) + 1 
    else:
      db_titles[title] = 1
    cur.execute('INSERT INTO titles VALUES (?, ?)', (db_titles[title], title))
    conn.commit()

  if title not in been_titles:
    cur.execute('INSERT INTO poets_to_titles (id_poet, id_title) VALUES (?, ?)', (db_poets[poet], db_titles[title]))
    been_titles.append(title)
  
  poem_sent = row['sentence']
  cur.execute('INSERT INTO sentences VALUES (?, ?)', (poem_cnt, poem_sent))
  poem_cnt += 1
  
  cur.execute('INSERT INTO poems_to_titles (id_poem, id_title) VALUES (?, ?)', (poem_cnt, db_titles[title]))