In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup


In [2]:
class Parser_anecdotes:
    """Scrape anecdote texts from a site and append them to a text file.

    Pages are downloaded with ``urlopen`` and parsed with BeautifulSoup.
    Relative ``href`` values found on pages are turned into absolute URLs
    by plain concatenation with ``start_url``.
    """

    def __init__(self, start_url):
        """Store the site root URL used for the entry page and for link building."""
        self.start_url = start_url

    def _get_soup(self, url):
        """Download ``url`` and return its UTF-8 decoded HTML as a parsed soup."""
        # The context manager closes the HTTP response even if reading or
        # decoding fails; the original left the connection open.
        with urlopen(url) as response:
            html_code = response.read().decode('utf-8')
        return BeautifulSoup(html_code, "html.parser")

    def get_text(self, url):
        """Collect the anecdotes on one page and the links to follow-up pages.

        Returns:
            A tuple ``(anecs, pages)``: the text of every ``<div class="text">``
            and the absolute URLs built from the pagination block. ``pages`` is
            empty when the page has no ``<div class="pageslist">``.
        """
        soup = self._get_soup(url)
        # All anecdotes on the page.
        anecs = [element.text for element in soup.find_all('div', {"class": 'text'})]

        # Links to the following pages: <a> siblings that come after a <span>.
        pages = []
        pages_block = soup.find('div', {"class": 'pageslist'})
        if pages_block is not None:
            links = pages_block.select('span ~ a')
            # The last matched link is skipped, as in the original code
            # (presumably it is not a regular page link -- TODO confirm).
            pages = [self.start_url + tag["href"] for tag in links[:-1]]
        return anecs, pages

    def write_data(self, file, data):
        """Append every item of ``data`` to ``file`` as its own UTF-8 line."""
        with open(file, 'a', encoding='utf-8') as f:
            for line in data:
                f.write(line + '\n')

    def get_data_categories(self, file, start_url):
        """Crawl one category starting at ``start_url`` and append its anecdotes to ``file``.

        The first page seeds the queue with its pagination links. Each
        visited page may add one more URL (the last link it exposes) if
        that URL has not been queued before.
        """
        anecs_from_page, pages_url = self.get_text(start_url)
        self.write_data(file, anecs_from_page)

        # Explicit work queue instead of appending to a list while a for-loop
        # iterates over it; ``seen`` gives O(1) duplicate checks and also keeps
        # the crawl from returning to the entry page.
        queue = list(pages_url)
        seen = {start_url, *queue}
        position = 0
        while position < len(queue):
            curr_url = queue[position]
            position += 1
            anecs_from_page, pages_url = self.get_text(curr_url)
            self.write_data(file, anecs_from_page)
            if pages_url and pages_url[-1] not in seen:
                seen.add(pages_url[-1])
                queue.append(pages_url[-1])

    def get_urls(self):
        """Return the absolute URLs of the "best of *" categories from the start page.

        Raises:
            ValueError: if the start page has no ``<ul class="second">`` menu.
        """
        soup = self._get_soup(self.start_url)
        url_block = soup.find('ul', {"class": "second"})
        if url_block is None:
            raise ValueError(
                "no <ul class='second'> menu found on " + str(self.start_url)
            )
        # Only the 2nd..4th links of the menu are used.
        return [self.start_url + tag["href"] for tag in url_block.select("a")[1:4]]

    def create_dataset(self, file):
        """Crawl every category returned by ``get_urls`` and append the anecdotes to ``file``."""
        start_urls = self.get_urls()

        for url in start_urls:
            self.get_data_categories(file, url)

        print("Данные получены и записаны в файл ", file)


In [3]:
file = 'anecdotes_data.txt'
start_url = 'https://www.anekdot.ru'

# Parser_anecdotes.write_data() opens the file in append mode, so start from an
# empty file; otherwise every re-run of this cell duplicates the whole corpus.
with open(file, 'w', encoding='utf-8'):
    pass

parser = Parser_anecdotes(start_url)
parser.create_dataset(file)

Данные получены и записаны в файл  anecdotes_data.txt


In [5]:
!pip install markovify

Collecting markovify
  Downloading markovify-0.9.4-py3-none-any.whl.metadata (23 kB)
Collecting unidecode (from markovify)
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading markovify-0.9.4-py3-none-any.whl (19 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode, markovify
Successfully installed markovify-0.9.4 unidecode-1.4.0


In [6]:
import markovify
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


In [7]:
class Markov_anecdotes:
    """Hold a text corpus together with the markovify model built from it."""

    def __init__(self, text):
        """Build a ``markovify.Text`` model from ``text`` and keep both on the instance."""
        model = markovify.Text(text)
        self.text, self.text_model = text, model

In [8]:
file = 'anecdotes_data.txt'
# Read through a context manager so the file handle is closed as soon as the
# corpus is loaded instead of being left to the garbage collector.
with open(file, encoding='utf-8') as corpus_file:
    text = corpus_file.read()
markov_model = Markov_anecdotes(text)
print(markov_model.text_model.make_sentence())

как отдохнули?Общее мнение, что после премьеры фильма «Ирония Судьбы» 1 января эта услуга платная - дополнительные 200 рублей с человека.


In [9]:
class LSTM_anecdotes:
    """Embedding -> LSTM -> softmax network that is built and trained on construction.

    Args:
        data: training inputs passed to ``fit`` as ``x`` (in this notebook:
            all but the last column of the padded n-gram matrix).
        labels: training targets passed to ``fit`` as ``y``; integer class
            ids, as required by the sparse categorical cross-entropy loss.
        total_words: vocabulary size; used as the embedding input dimension
            and as the width of the softmax output layer.
        embedding_dim: size of the embedding vectors (default 200).
        lstm_units: number of LSTM units (default 150).
        epochs: number of training epochs (default 10).
        verbose: verbosity flag forwarded to ``fit`` (default 1).

    Attributes set by the constructor: ``model`` (the compiled network) and
    ``history`` (the object returned by ``fit``).
    """

    def __init__(self, data, labels, total_words,
                 embedding_dim=200, lstm_units=150, epochs=10, verbose=1):
        self.model = Sequential()
        self.model.add(Embedding(total_words, embedding_dim))
        self.model.add(LSTM(lstm_units))
        self.model.add(Dense(total_words, activation='softmax'))
        self.model.compile(optimizer='adam',
                           loss='sparse_categorical_crossentropy',
                           metrics=['accuracy'])
        # Training starts immediately; lower ``epochs`` or the layer sizes for
        # quick experiments, since the defaults are expensive on a large corpus.
        self.history = self.model.fit(x=data, y=labels, epochs=epochs, verbose=verbose)


def tokenizer_text(text):
    """Return a new ``Tokenizer`` fitted on ``text``.

    The whole string is handed to ``fit_on_texts`` as a single document.
    """
    fitted = Tokenizer()
    corpus = [text]
    fitted.fit_on_texts(corpus)
    return fitted

def text_to_seq(text, tokenizer, max_len=None):
    """Turn ``text`` into a left-padded matrix of n-gram token sequences.

    Each line of ``text`` (split on ``'\\n'``) is converted to token ids with
    ``tokenizer.texts_to_sequences``; every prefix of that line with at least
    two tokens becomes one row. Rows are left-padded to a common width.

    Args:
        text: the corpus, one sample per line.
        tokenizer: object providing ``texts_to_sequences``.
        max_len: optional cap on the row width. Longer rows are truncated
            from the front, so the final token of each row is always kept.
            ``None`` (default) pads to the longest row, as before.

    Returns:
        A 2-D numpy array with one row per n-gram.

    Raises:
        ValueError: if ``max_len`` is smaller than 2, or if no line of
            ``text`` yields at least two tokens.
    """
    if max_len is not None and max_len < 2:
        raise ValueError("text_to_seq: max_len must be at least 2")

    sequences = []
    for line in text.split('\n'):
        token_list = tokenizer.texts_to_sequences([line])[0]
        # Prefixes of length 2..len(token_list); shorter lines contribute nothing.
        sequences.extend(token_list[:end] for end in range(2, len(token_list) + 1))

    if not sequences:
        # Explicit message instead of the opaque "max() arg is an empty sequence".
        raise ValueError("text_to_seq: no line of the text contains at least two tokens")

    width = max(len(seq) for seq in sequences)
    if max_len is not None:
        width = min(width, max_len)

    padded = pad_sequences(sequences, maxlen=width, padding='pre', truncating='pre')
    return np.asarray(padded)

In [10]:
# Fit the vocabulary on the corpus, then expand it into padded n-gram rows.
tokenizer = tokenizer_text(text)
input_seq = text_to_seq(text, tokenizer)

# One more than the number of known words (presumably to leave room for the
# padding id 0 -- confirm against the Keras Tokenizer documentation).
total_words = 1 + len(tokenizer.word_index)

# Every column but the last is the model input; the last column is the target.
data, labels = input_seq[:, :-1], input_seq[:, -1]

lstm_model = LSTM_anecdotes(data, labels, total_words)

Epoch 1/10
[1m   7/5593[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:18:18[0m 1s/step - accuracy: 6.3776e-04 - loss: 10.5135

KeyboardInterrupt: 