In [1]:
! python3 -m ipykernel install --user --name=main-venv

Installed kernelspec main-venv in /home/roman/.local/share/jupyter/kernels/main-venv


In [2]:
import os

import json

from tqdm import tqdm

import pandas as pd
import numpy as np

from pydantic import BaseModel, Field

from typing import List

from langchain_core.messages import SystemMessage, AIMessage, HumanMessage
from langchain_core.exceptions import OutputParserException

from langchain_gigachat.chat_models import GigaChat

from parser.static_parsers import build_hranidengi_parser
from parser.static_parsers import build_findozor_parser

In [3]:
DATA_DIR = "data"

# ‚úÖBuild Giga solution

In [4]:
class GigaOutput(BaseModel):

    topic: str = Field(description='''
        –ù–∞–∑–≤–∞–Ω–∏–µ —Ç–µ–º—ã –Ω–∞ —Ñ–æ—Ä—É–º–µ.
    ''')
    menace_status: str = Field(description='''
        –û—Ç–≤–µ—Ç –æ—Ç –ì–∏–≥–∞—á–∞—Ç–∞ –Ω–∞ –≤–æ–ø—Ä–æ—Å, –æ–±—Å—É–∂–¥–∞–µ—Ç—Å—è –ª–∏
        –≤ —ç—Ç–æ–π —Ç–µ–º–µ –Ω–∞—Ä—É—à–µ–Ω–∏–µ –∑–∞–∫–æ–Ω–∞ —Å —Ü–µ–ª—å—é –ª–∏—á–Ω–æ–π –≤—ã–≥–æ–¥—ã.
    '''),
    comment: str = Field(description='''
        –ù–µ–±–æ–ª—å—à–æ–π –∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–π –æ—Ç –ì–∏–≥–∞—á–∞—Ç–∞,
        –ø–æ—á–µ–º—É –æ–Ω –ø–æ—Å—á–∏—Ç–∞–ª –æ–±—Å—É–∂–¥–µ–Ω–∏–µ –≤ —Ç–µ–º–µ –º–æ—à–µ–Ω–Ω–∏—á–µ—Å–∫–∏–º–∏.
    ''')
    highlights: str = Field(description='''
        –ì–ª–∞–≤–Ω—ã–µ, –ø–æ –º–Ω–µ–Ω–∏—é –≥–∏–≥–∞—á–∞—Ç–∞, —Å–ª–æ–≤–∞. 
    ''')

class GigaMenace:

    def __init__(self, token: str, prompt: str, model: str = 'GigaChat-2'):

        self.model = GigaChat(
            credentials=token,
            verify_ssl_certs=False,
            model=model
        ).with_structured_output(GigaOutput, method='json_mode')

        self.prompt = prompt
        self.messages = []

    def _init_chat(self, messages: List[str]):
        
        self.messages = [
            SystemMessage(content=self.prompt)
        ]

        for message in messages:
            self.messages.append(
                HumanMessage(content=str(message))
            )
    
    def inference(self, topics: List[str]):
        
        res = []
        
        for topic in tqdm(topics):

            try:
                self._init_chat(topics[topic])
                answer = self.model.invoke(
                    self.messages
                )
            except OutputParserException:
                answer = {'topic': topic, 'menace_status': '–¥–∞', 'comment': '—á—É–≤—Å—Ç–≤–∏—Ç–µ–ª—å–Ω–∞—è —Ç–µ–º–∞', 'highlights': ''}
                
            res.append(dict(answer))
        
        return res

In [5]:
with open('gc-token.txt', 'r') as file:
    TOKEN = file.read()

system_prompt = '''
    –¢—ã - –ø–æ–º–æ—à–Ω–∏–∫ –ø–æ –±–æ—Ä—å–±–µ —Å –æ–Ω–ª–∞–π–Ω –º–æ—à–µ–Ω–Ω–∏—á–µ—Å—Ç–≤–æ–º –≤ –±–∞–Ω–∫–µ.
    –¢–µ–±–µ –¥–∞—é—Ç –¥–∏–∞–ª–æ–≥ –Ω–∞ –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–Ω—É—é —Ç–µ–º—É –Ω–∞ –æ–¥–Ω–æ–º –∏–∑ –æ–Ω–ª–∞–π–Ω –±–∞–Ω–∫–æ–≤—Å–∫–∏—Ö —Ñ–æ—Ä—É–º–æ–≤.
    –¢–≤–æ—è –∑–∞–¥–∞—á–∞ - –æ–ø—Ä–µ–¥–µ–ª–∏—Ç—å, —Å–æ–¥–µ—Ä–∂–∏—Ç –ª–∏ –¥–∏–∞–ª–æ–≥ —ç–ª–µ–º–µ–Ω—Ç—ã –º–æ—à–µ–Ω–Ω–∏—á–µ—Å—Ç–≤–∞ - 
    –µ—Å–ª–∏ –¥–∞, –æ—Ç–≤–µ—Ç—å "–¥–∞", –∏–Ω–∞—á–µ - "–Ω–µ—Ç".
    –¢–∞–∫–∂–µ –¥–∞–π –Ω–µ–±–æ–ª—å—à–æ–π –∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–π, –ø–æ—á–µ–º—É —Ç—ã —Ç–∞–∫ —Ä–µ—à–∏–ª.
    –ì–ª–∞–≤–Ω—ã–µ —Å–ª–æ–≤–∞ –≤ –¥–∏–∞–ª–æ–≥–µ - –ø–æ–¥—Å–≤–µ—Ç–∏.

    –ü—Ä–æ–∞–Ω–∞–ª–∏–∑–∏—Ä—É–π —Ç–µ–º—É –∏ –≤–µ—Ä–Ω–∏ –æ—Ç–≤–µ—Ç –≤ —Ñ–æ—Ä–º–∞—Ç–µ JSON —Å–æ —Å—Ç—Ä—É–∫—Ç—É—Ä–æ–π:
        {
          "topic": "–æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω–æ–µ –Ω–∞–∑–≤–∞–Ω–∏–µ —Ç–µ–º—ã"
          "menace_status": "–¥–∞/–Ω–µ—Ç",
          "comment": "—Ç–≤–æ–π –∫–æ—Ä–æ—Ç–∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–π",
          "highlights": "—Å–∞–º—ã–µ —è—Ä–∫–∏–µ —Ç–µ–∑–∏—Å—ã –¥–∏–∞–ª–æ–≥–∞, –µ—Å–ª–∏ –º–æ—à–µ–Ω–Ω–∏—á–µ—Å—Ç–≤–æ, —Ç–æ —è–≤–Ω—ã–µ —Å–ª–æ–≤–∞ –æ –º–æ—à–µ–Ω–Ω–∏—á–µ—Å—Ç–≤–µ"
        }

    –ó–∞–ø—Ä–µ—â–µ–Ω–æ:
        - –î–æ–±–∞–≤–ª—è—Ç—å –ø–æ—Å—Ç–æ—Ä–æ–Ω–Ω–∏–π —Ç–µ–∫—Å—Ç
        - –ú–µ–Ω—è—Ç—å —Å—Ç—Ä—É–∫—Ç—É—Ä—É JSON
'''

giga = GigaMenace(TOKEN, system_prompt)

# Hranidengi.com

In [6]:
hranidengi_parser = build_hranidengi_parser()

hranidengi_topics = pd.read_csv('data/hranidengi_mapping_credit_story.csv')
hranidengi_topics = hranidengi_topics[hranidengi_topics['menace_status'] == '–¥–∞']

hranidengi_topics_links = hranidengi_parser.parse_topics('https://hranidengi.com/forums/kreditnaja-istorija/')

In [7]:
hranidengi_topics_links

{'–ù–µ–∑–∞–∫–æ–Ω–Ω—ã–µ –∑–∞–ø—Ä–æ—Å—ã –∫—Ä–µ–¥–∏—Ç–Ω–æ–π –∏—Å—Ç–æ—Ä–∏–∏': 'https://hranidengi.com/threads/nezakonnye-zaprosy-kreditnoj-istorii.229/',
 '–û–±—Å—É–∂–¥–µ–Ω–∏–µ –∏ –∞–Ω–∞–ª–∏–∑ –∫—Ä–µ–¥–∏—Ç–Ω–æ–π –∏—Å—Ç–æ—Ä–∏–∏': 'https://hranidengi.com/threads/obsuzhdenie-i-analiz-kreditnoj-istorii.1119/',
 '–ö–∞–∫ –±–µ—Å–ø–ª–∞—Ç–Ω–æ –ø—Ä–æ–≤–µ—Ä–∏—Ç—å –∫—Ä–µ–¥–∏—Ç–Ω—É—é –∏—Å—Ç–æ—Ä–∏—é': 'https://hranidengi.com/threads/kak-besplatno-proverit-kreditnuju-istoriju.401/',
 '–°–µ—Ä–≤–∏—Å—ã –æ–ø–ª–∞—Ç—ã —á–∞—Å—Ç—è–º–∏ –∏ –∏—Ö –≤–ª–∏—è–Ω–∏–µ –Ω–∞ –∫—Ä–µ–¥–∏—Ç–Ω—É—é –∏—Å—Ç–æ—Ä–∏—é': 'https://hranidengi.com/threads/servisy-oplaty-chastjami-i-ix-vlijanie-na-kreditnuju-istoriju.669/',
 '–ë–∞–Ω–∫ "–†–æ—Å—Å–∏—è" –∑–∞–≤—ã—à–∞–µ—Ç —Å—Ä–µ–¥–Ω–µ–º–µ—Å—è—á–Ω—ã–µ –ø–ª–∞—Ç–µ–∂–∏': 'https://hranidengi.com/threads/bank-rossija-zavyshaet-srednemesjachnye-platezhi.920/',
 '–í–Ω–µ—Å–µ–Ω–∏–µ –∏–∑–º–µ–Ω–µ–Ω–∏–π –≤ –∫—Ä–µ–¥–∏—Ç–Ω—É—é –∏—Å—Ç–æ—Ä–∏—é': 'https://hranidengi.com/threads/vnesenie-izmenenij-v-kreditnuju-ist

In [8]:
messages = dict()

for topic in hranidengi_topics['topic']:
    parsed = list(
        hranidengi_parser.parse_topic_comments(hranidengi_topics_links[topic], 5).values()
    )[0]
    parsed = ''.join(parsed).split('#')
    messages[topic] = parsed

In [9]:
hranidengi_res = giga.inference(messages)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:01<00:00,  1.89s/it]


In [10]:
hranidengi_res

[{'topic': '–í–æ–ø—Ä–æ—Å –ª–∞–º–µ—Ä–∞: –∫–∞–∫ –∏—Å–ø–æ—Ä—Ç–∏—Ç—å –ö–ò –ª—é–±–∏–º–æ–º—É —Ä–æ–¥—Å—Ç–≤–µ–Ω–Ω–∏–∫—É, –∑–∞ –∫–æ—Ç–æ—Ä—ã–º –Ω—É–∂–µ–Ω –ø—Ä–∏—Å–º–æ—Ç—Ä, –µ—Å–ª–∏ –Ω–µ—Ç —Ä–µ—Å—É—Ä—Å–æ–≤ –Ω–∞ –∫—Ä—É–≥–ª–æ—Å—É—Ç–æ—á–Ω—ã–π –ø—Ä–∏—Å–º–æ—Ç—Ä.',
  'menace_status': '–¥–∞',
  'comment': '—á—É–≤—Å—Ç–≤–∏—Ç–µ–ª—å–Ω–∞—è —Ç–µ–º–∞',
  'highlights': ''}]

# Findozor.net

In [11]:
findozor_parser = build_findozor_parser()

findozor_topics = pd.read_csv('data/findozor_mapping_credit_story.csv')
# findozor_topics = findozor_topics[findozor_topics['menace_status'] == '–¥–∞']
findozor_topics = {
    '–ù–∞ –∞–≤–∏—Ç–æ –ø—Ä–µ–¥–ª–∞–≥–∞—é—Ç –∑–∞ –¥–µ–Ω—å–≥–∏ —Ä–∞–∑–±–ª–æ–∫–∏—Ä–æ–≤–∞—Ç—å —Å—á—ë—Ç –≤ –±–∞–Ω–∫–µ',
    '–†–µ—Ñ–∏–Ω–∞–Ω—Å–∏—Ä–æ–≤–∞–Ω–∏–µ –≤ –ø–æ—á—Ç–∞ –±–∞–Ω–∫–µ',
    '–§–µ–π–∫–æ–≤–æ–µ –º–µ—Å—Ç–æ —Ä–∞–±–æ—Ç—ã',
    '–ö—Ä–µ–¥–∏—Ç—ã',
    '–ù—É–¥–Ω–∞ –ø–æ–º–æ—â—å! –ó–∞–π–º–æ–≤ 6 —à—Ç—É–∫, –Ω–∞ 100 000, —Ä–µ—Ñ–µ–Ω–∞–Ω—Å, –∫—Ä–µ–¥–∏—Ç —á—Ç–æ –¥–µ–ª–∞—Ç—å?',
    '–†–µ—Ñ–∏–Ω–∞–Ω—Å–∏—Ä–æ–≤–∞–Ω–∏–µ'
}

findozor_topics_links = findozor_parser.parse_topics('https://findozor.net/forum/forums/banki2/', deep = 5)

In [12]:
findozor_topics_links

{'–†–µ—Ñ–∏–Ω–∞–Ω—Å–∏—Ä–æ–≤–∞–Ω–∏–µ –≤ –ø–æ—á—Ç–∞ –±–∞–Ω–∫–µ': 'https://findozor.net/forum/threads/refinansirovaniye-v-pochta-banke.29579/',
 '–ù–∞ –∞–≤–∏—Ç–æ –ø—Ä–µ–¥–ª–∞–≥–∞—é—Ç –∑–∞ –¥–µ–Ω—å–≥–∏ —Ä–∞–∑–±–ª–æ–∫–∏—Ä–æ–≤–∞—Ç—å —Å—á—ë—Ç –≤ –±–∞–Ω–∫–µ': 'https://findozor.net/forum/threads/na-avito-predlagayut-za-den-gi-razblokirovat-schet-v-banke.25207/',
 '–í –∫–∞–∫–∏—Ö –±–∞–Ω–∫–∞—Ö –æ–¥–æ–±—Ä—è—é—Ç –∫–∞—Ä—Ç—ã –Ω–∞ —Ä–∞—Å—Å—Ä–æ—á–∫—É': 'https://findozor.net/forum/threads/v-kakikh-bankakh-odobryayut-karty-na-rassrochku.29540/',
 '–í–∫–ª–∞–¥ –∏–ª–∏ –Ω–∞–∫–æ–ø–∏—Ç–µ–ª—å–Ω—ã–π —Å—á–µ—Ç ‚Äî —á—Ç–æ –ø–æ—Å–æ–≤–µ—Ç—É–µ—Ç–µ?': 'https://findozor.net/forum/threads/vklad-ili-nakopitel-nyi-schet-chto-posovetuyete.29433/',
 '–ú–æ–≥—É—Ç –ª–∏ –±–∞–Ω–∫–∏ –Ω–∞–∑–≤–∞–Ω–∏–≤–∞—Ç—å —á–∞—Å—Ç–æ, –µ—Å–ª–∏ –Ω–µ –±–µ—Ä—É —Ç—Ä—É–±–∫—É?': 'https://findozor.net/forum/threads/mogut-li-banki-nazvanivat-chasto-yesli-ne-beru-trubku.26408/',
 '–í—Å–µ–º –ø—Ä–∏–≤–µ—Ç üëã –≥–¥–µ –≤–∑—è—Ç—å –¥–µ–Ω—å–≥–∏ —Å –æ—á–µ–Ω—å –ø–ª–æ—

In [13]:
messages = dict()

for topic in findozor_topics:
    parsed = list(
        findozor_parser.parse_topic_comments(findozor_topics_links[topic], 5).values()
    )[0]
    parsed = ''.join(parsed).split('#')
    messages[topic] = parsed

In [14]:
findozor_res = giga.inference(messages)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:06<00:00,  1.10s/it]


In [15]:
findozor_res

[{'topic': '–ó–∞–º–æ—Ä–æ–∑–∫–∞ —Å—É–º–º—ã –∫—Ä–µ–¥–∏—Ç–∞ –¥–ª—è –ø–æ–≥–∞—à–µ–Ω–∏—è',
  'menace_status': '–Ω–µ—Ç',
  'comment': '–û–±—Å—É–∂–¥–µ–Ω–∏–µ –Ω–æ—Å–∏—Ç –∏–Ω—Ñ–æ—Ä–º–∞—Ç–∏–≤–Ω—ã–π —Ö–∞—Ä–∞–∫—Ç–µ—Ä –±–µ–∑ —è–≤–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –º–æ—à–µ–Ω–Ω–∏—á–µ—Å—Ç–≤–∞.',
  'highlights': '–¥–∏–∞–ª–æ–≥ –ø–æ—Å–≤—è—â–µ–Ω –≤–æ–ø—Ä–æ—Å–∞–º —Ä–µ—Å—Ç—Ä—É–∫—Ç—É—Ä–∏–∑–∞—Ü–∏–∏ –¥–æ–ª–≥–∞ –∏ –ø–æ–ª—É—á–µ–Ω–∏—è —Å–∫–∏–¥–æ–∫ –æ—Ç –±–∞–Ω–∫–æ–≤ –ø—Ä–∏ –Ω–∞–ª–∏—á–∏–∏ –ø—Ä–æ—Å—Ä–æ—á–µ–∫, –±–µ–∑ —É–ø–æ–º–∏–Ω–∞–Ω–∏—è –º–æ—à–µ–Ω–Ω–∏—á–µ—Å–∫–∏—Ö —Å—Ö–µ–º'},
 {'topic': '–ù–∞ –∞–≤–∏—Ç–æ –ø—Ä–µ–¥–ª–∞–≥–∞—é—Ç –∑–∞ –¥–µ–Ω—å–≥–∏ —Ä–∞–∑–±–ª–æ–∫–∏—Ä–æ–≤–∞—Ç—å —Å—á—ë—Ç –≤ –±–∞–Ω–∫–µ',
  'menace_status': '–¥–∞',
  'comment': '—á—É–≤—Å—Ç–≤–∏—Ç–µ–ª—å–Ω–∞—è —Ç–µ–º–∞',
  'highlights': ''},
 {'topic': '–§–µ–π–∫–æ–≤–æ–µ –º–µ—Å—Ç–æ —Ä–∞–±–æ—Ç—ã',
  'menace_status': '–¥–∞',
  'comment': '—á—É–≤—Å—Ç–≤–∏—Ç–µ–ª—å–Ω–∞—è —Ç–µ–º–∞',
  'highlights': ''},
 {'topic': '–†–µ—Ñ–∏–Ω–∞–Ω—Å–∏—Ä–æ–≤–∞–Ω–∏–µ –≤ –ø–æ—á—Ç–∞ –±–∞–Ω–∫–µ',
  'mena