## Парсинг сообщений из вашего личного архива VK

В архиве вашего профиля ВК можно найти папку `messages`, где каждая переписка вынесена в отдельную папку. Чтобы понять, какую именно папку открывать, заглянем в `index-messages.html` и найдём нужного человека. Перейдя по ссылке, в адресной строке вы увидите что-то вроде:
```
file:///home/username/.fr-GZFN5j/messages/-209008242/messages0.html
```
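Здесь `-209008242` — именно тот номер, который нас интересует. Находим эту папку в `messages` и переносим её в папку `users` (код ищет папку `users` в текущей рабочей директории, поэтому она должна лежать там же, откуда запускается ноутбук).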

In [1]:
from bs4 import BeautifulSoup
import bs4 as bs4
import os

USERS_DIR = os.path.join(os.getcwd(), "users")


def get_user_message_files(user_dir_name: str) -> list:
    """
    Collect the paths of every HTML file stored for one dialog.

    :param user_dir_name: name of the dialog folder located inside USERS_DIR
    :return: paths (dialog folder joined with the file name) of all entries
        whose name ends with ".html", in the order os.listdir reports them
    """
    dialog_dir = os.path.join(USERS_DIR, user_dir_name)
    return [
        os.path.join(dialog_dir, entry)
        for entry in os.listdir(dialog_dir)
        if entry.endswith(".html")
    ]


def parse_messages(html_code: str, user_name: str) -> str:
    """
    Extracts all text messages from an HTML document with correspondence.

    Only ``div.message`` elements that contain a ``div.kludges`` element are
    processed. The text is taken from the direct string children of the <div>
    enclosing that ``kludges`` element, so anything wrapped in nested tags
    (forwarded messages, pictures and other attachments) is ignored.
    Each text fragment of a message is saved on a separate line in the form
    ``<author>:<text>``.

    :param html_code: a string containing the HTML code to extract messages from
    :param user_name: author name used for messages whose header has no <a> link
        (presumably the archive owner's own messages -- confirm against the export)
    :return: a string containing the extracted messages, one line per fragment,
        in reverse document order
    """
    extracted_lines = []

    soup = BeautifulSoup(html_code, 'html.parser')

    # Walk the page bottom-up (presumably the export lists the newest messages
    # first, so this yields chronological order -- confirm against the archive).
    for message_div in reversed(soup.find_all('div', class_='message')):
        kludges_div = message_div.find('div', class_='kludges')
        if kludges_div is None:
            continue
        text_container = kludges_div.find_parent('div')

        # A message without a header block (or without an author link in it)
        # falls back to user_name instead of raising AttributeError on None.
        header_div = message_div.find('div', class_='message__header')
        author_link = header_div.find('a') if header_div is not None else None
        name = author_link.text.strip() if author_link is not None else user_name

        for node in text_container:
            # Exact-type check on purpose: NavigableString subclasses such as
            # Comment must not be treated as message text.
            if type(node) is bs4.element.NavigableString:
                extracted_lines.append(f"{name}:{node}\n")

    return "".join(extracted_lines)


def _message_file_sort_key(path):
    """Return (numeric index found in the file name, file name) for natural ordering."""
    file_name = os.path.basename(path)
    digits = "".join(ch for ch in file_name if ch.isdecimal())
    # Names without any digits sort below every numbered file.
    return (int(digits) if digits else -1, file_name)


def write_user_messages_to_file(user_dir_name, user_name, output_file):
    """
    Extracts the text of all messages for the user and writes them to the output file.

    The dialog's HTML files are processed in descending order of the number
    embedded in their name (e.g. messages100.html before messages50.html before
    messages0.html) and the parsed text of each file is appended to the output.

    :param user_dir_name: the ID of the user whose messages will be processed
    :param user_name: author name passed to parse_messages for messages that
        carry no author link in their header
    :param output_file: the file path where the text of all messages will be written to
    """
    messages_files = get_user_message_files(user_dir_name)
    # A plain string sort would put "messages100.html" below "messages50.html";
    # compare the numeric part so dialogs with 100+ messages keep their order.
    sorted_files = sorted(messages_files, key=_message_file_sort_key, reverse=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        for file in sorted_files:
            # get_user_message_files already returns full paths, so open them as is.
            with open(file, 'r', encoding='windows-1251') as html_file:
                f.write(parse_messages(html_file.read(), user_name))

## Находим сообщения, содержащие нужные слова

Сначала используем парсер и выгрузим сообщения в переменную `messages_text`:

In [2]:
# Name of the dialog folder inside the "users" directory (the numeric ID from the archive).
USER = "!!!ENTER DIALOG ID!!!"
SELF_NAME = "Иван Петров" # ENTER YOUR NAME

# Dump the whole dialog as plain text, then load it back into memory.
write_user_messages_to_file(USER, SELF_NAME, "output.txt")
# The dump is written as UTF-8, so read it with the same encoding instead of
# the platform default; the context manager closes the file even on errors.
with open("output.txt", "r", encoding="utf-8") as f:
    messages_text = f.read()

## Получаем датасет

In [4]:
import json

# Sliding-window settings used when cutting the message dump into samples.
step_size = 4  # How many lines the window start advances between samples
input_lines = 8  # Lines taken as the "input" part of a sample
output_lines = 3  # Lines right after the input, taken as the "output" part

In [5]:
input_list = []
output_list = []

# The dump was written as UTF-8; read it back with the same encoding rather
# than the platform default, which would garble Cyrillic text on some systems.
with open('output.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Replace the full author names with the short forms wanted in the dataset.
lines = [line.replace("Дмитрий Мордвинов", "Дима") for line in lines]
lines = [line.replace("Владимир Сидоров", "Владимир") for line in lines]

# Slide a window over the dump: `input_lines` lines form the input and the
# following `output_lines` lines form the expected output of one sample.
for i in range(0, len(lines), step_size):
    start_line = i
    end_line = min(i + input_lines, len(lines))
    input_code = "".join(lines[start_line:end_line]).strip()

    output_start_line = end_line
    output_end_line = min(output_start_line + output_lines, len(lines))
    output_code = "".join(lines[output_start_line:output_end_line]).strip()

    # Windows near the end of the dump have nothing left to use as output.
    if not output_code:
        continue

    input_list.append(input_code)
    output_list.append(output_code)

    print(f"Parsing lines {start_line}-{end_line}:")
    print(f"Parsing lines {output_start_line}-{output_end_line}:")
    print("----------------------------\n")

data = [
    {"input": input_code, "output": output_code}
    for input_code, output_code in zip(input_list, output_list)
]

# One compact JSON object per line, with no trailing newline after the last one.
with open("dataset.json", "w", encoding="utf-8") as json_file:
    json_file.write("\n".join(
        json.dumps(item, separators=(',', ':'), ensure_ascii=False)
        for item in data
    ))

Parsing lines 0-8:
Parsing lines 8-11:
----------------------------

Parsing lines 4-12:
Parsing lines 12-15:
----------------------------

Parsing lines 8-16:
Parsing lines 16-19:
----------------------------

Parsing lines 12-20:
Parsing lines 20-23:
----------------------------

Parsing lines 16-24:
Parsing lines 24-27:
----------------------------

Parsing lines 20-28:
Parsing lines 28-31:
----------------------------

Parsing lines 24-32:
Parsing lines 32-35:
----------------------------

Parsing lines 28-36:
Parsing lines 36-39:
----------------------------

Parsing lines 32-40:
Parsing lines 40-43:
----------------------------

Parsing lines 36-44:
Parsing lines 44-47:
----------------------------

Parsing lines 40-48:
Parsing lines 48-51:
----------------------------

Parsing lines 44-52:
Parsing lines 52-55:
----------------------------

Parsing lines 48-56:
Parsing lines 56-59:
----------------------------

Parsing lines 52-60:
Parsing lines 60-63:
---------------------------