In [1]:
import json
import requests
from bs4 import BeautifulSoup
import spacy
from collections import Counter
from dotenv import load_dotenv
import os
from PyQt5.QtWidgets import QApplication, QWidget, QVBoxLayout, QLabel, QCheckBox, QPushButton, QScrollArea, QVBoxLayout
from openai import OpenAI
import openai

In [11]:
import os
from anki.storage import Collection

# Укажите путь к файлу вашей коллекции Anki (.anki2)
collection_path = os.path.expanduser(r'~/Library/Application Support/Anki2/User 1/collection.anki2')

# Открываем коллекцию
col = Collection(collection_path)

# Укажите название вашей колоды
deck_name = 'Название вашей колоды'

# Получаем ID колоды
deck_id = col.decks.id(deck_name)

# Получаем все заметки из колоды
note_ids = col.find_notes(f"deck:{deck_name}")

# Список для хранения значений поля "FrontText"
front_texts = []

# Проходимся по всем заметкам и извлекаем поле "FrontText"
for note_id in note_ids:
    note = col.get_note(note_id)
    front_text = note.fields[0]  # Предполагается, что первое поле является "FrontText"
    front_texts.append(front_text)

# Закрываем коллекцию
col.close()

# Выводим результаты
for text in front_texts:
    print(text)

In [2]:
load_dotenv()  # Загружает переменные из .env файла в окружение
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [3]:

# Загрузим модель испанского языка в spacy
nlp = spacy.load("es_core_news_md")


In [4]:
client = OpenAI(api_key=OPENAI_API_KEY)

# Загрузим модель испанского языка в spacy
nlp = spacy.load("es_core_news_md")

KNOWN_WORDS_FILE = 'known_words.txt'
TRANSLATIONS_FILE = 'translations.txt'
CACHE_FILE = 'translation_cache.json'

POS_NAMES = {
    "ADJ": "прилагательное",
    "ADP": "предлог",
    "ADV": "наречие",
    "AUX": "вспомогательный глагол",
    "CONJ": "союз",
    "CCONJ": "сочинительный союз",
    "DET": "определитель",
    "INTJ": "междометие",
    "NOUN": "существительное",
    "NUM": "числительное",
    "PART": "частица",
    "PRON": "местоимение",
    "PROPN": "собственное имя",
    "PUNCT": "пунктуация",
    "SCONJ": "подчинительный союз",
    "SYM": "символ",
    "VERB": "глагол",
    "X": "неизвестное"
}

# def extract_text_from_html(html):
#     soup = BeautifulSoup(html, 'html.parser')
#     text = ''
#     for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'div']):
#         if element.string:
#             text += element.string.strip() + ' '
#     return text.strip()

def extract_text_from_html(html):
    # Создаем объект BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    
    # Извлекаем все блоки с классом "col-md-8"
    blocks = soup.find_all('div', class_='col-md-8')
    
    # Извлекаем текст из каждого блока и объединяем их
    text = "\n".join([block.get_text(separator=" ", strip=True) for block in blocks])
    
    return text.strip()

def fetch_page_content(url):
    response = requests.get(url)
    response.raise_for_status()
    return extract_text_from_html(response.text)

def fetch_page_content_local_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return extract_text_from_html(file.read())

def fetch_page_content_local_file_dir(path):
    text = ''
    for filename in os.listdir(path):
        print(f'Processing {filename}')
        with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
            text += extract_text_from_html(file.read()) + '\n'

    print(f'Text:\n{text}')
    return text





def normalize_and_count_words(text):
    doc = nlp(text)
    lemmatized_words = [f"{token.lemma_.lower()} ({POS_NAMES.get(token.pos_, 'другая часть речи')})" for token in doc if token.is_alpha]
    return Counter(lemmatized_words)

def load_known_words():
    if os.path.exists(KNOWN_WORDS_FILE):
        with open(KNOWN_WORDS_FILE, 'r', encoding='utf-8') as file:
            return set(file.read().splitlines())
    return set()

def save_known_words(known_words):
    with open(KNOWN_WORDS_FILE, 'w', encoding='utf-8') as file:
        file.write('\n'.join(sorted(known_words)))

def load_translations():
    translations = {}
    if os.path.exists(TRANSLATIONS_FILE):
        with open(TRANSLATIONS_FILE, 'r', encoding='utf-8') as file:
            for line in file:
                parts = line.strip().split('\t')
                if len(parts) == 2:
                    word, translation = parts
                    translations[word] = translation
    return translations

def save_translation(word, pos, translation):
    with open(TRANSLATIONS_FILE, 'a', encoding='utf-8') as file:
        pos_str = POS_NAMES.get(pos, 'другая часть речи')
        file.write(f'=[ {word} ({pos_str}) ]='.center(40, '-'))
        file.write(f'\n{translation}\n')
        file.write('-' * 40)
        file.write('\n\n')

def load_cache():
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, 'r', encoding='utf-8') as file:
            return json.load(file)
    return {}

def save_cache(cache):
    with open(CACHE_FILE, 'w', encoding='utf-8') as file:
        json.dump(cache, file)

def get_translation(word, pos, debug=False):
    cache = load_cache()
    cache_key = f"{word}_{pos}"
    print(f"Translating {word} ({pos})")
    
    if cache_key in cache and not debug:
        return cache[cache_key]
    
    with open('question_template.txt', 'r', encoding='utf-8') as file:
        question = file.read()
    question = question.format(word=word, pos=pos)
    if debug:
        print(f"Question for OpenAI: {question}")
        raise ValueError('Debug mode')

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            # model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": question}
            ]
        )
        translation = response.choices[0].message.content.strip()
        cache[cache_key] = translation
        save_cache(cache)
    except openai.APIError as e:
        #Handle API error here, e.g. retry or log
        print(f"OpenAI API returned an API Error: {e}")
        pass
    except openai.APIConnectionError as e:
        #Handle connection error here
        print(f"Failed to connect to OpenAI API: {e}")
        pass
    except openai.RateLimitError as e:
        #Handle rate limit error (we recommend using exponential backoff)
        print(f"OpenAI API request exceeded rate limit: {e}")
        pass
    return translation

class WordFrequencyChecker(QWidget):
    def __init__(self, source, source_type='url'):
        super().__init__()
        self.known_words = load_known_words()
        self.translations = load_translations()
        self.initUI(source, source_type)
    
    def initUI(self, source, source_type = 'url'):
        if source_type == 'url':
            text_content = fetch_page_content(source)
        elif source_type == 'file':
            text_content = fetch_page_content_local_file(source)
        elif source_type == 'dir':
            text_content = fetch_page_content_local_file_dir(source)
        else:
            raise ValueError('Unknown source type')
            
        
        word_frequencies = normalize_and_count_words(text_content)
        
        layout = QVBoxLayout()
        layout.addWidget(QLabel('Выберите слова, которые вы знаете'))
        
        self.checkboxes = {}
        
        print(word_frequencies)
        for word, freq in word_frequencies.most_common(1000):  # выводим топ-40 слов
            if freq > 5:
                word_only = word.split(' (')[0]
                if word_only not in self.known_words:
                    checkbox = QCheckBox(f'{word} ({freq})')
                    self.checkboxes[word] = checkbox
                    layout.addWidget(checkbox)
        
        save_button = QPushButton('Сохранить')
        save_button.clicked.connect(self.save_known_words)
        layout.addWidget(save_button)
        
        translate_button = QPushButton('Перевести оставшиеся')
        translate_button.clicked.connect(self.translate_words)
        layout.addWidget(translate_button)
        
        self.translations_output = QLabel('')
        layout.addWidget(self.translations_output)
        
        scroll = QScrollArea()
        container = QWidget()
        container.setLayout(layout)
        scroll.setWidget(container)
        scroll.setWidgetResizable(True)
        
        main_layout = QVBoxLayout()
        main_layout.addWidget(scroll)
        self.setLayout(main_layout)
        
        self.setWindowTitle('Word Frequency Checker')
        self.show()
    
    def save_known_words(self):
        for word, checkbox in self.checkboxes.items():
            if checkbox.isChecked():
                self.known_words.add(word.split(' (')[0])
        save_known_words(self.known_words)
        self.close()
    
    def translate_words(self):
        untranslated_words = [(word.split(' (')[0], token.pos_) for word, token in [(word.split(' (')[0], nlp(word.split(" (")[0])[0]) for word in self.checkboxes if not self.checkboxes[word].isChecked()]]
        translations = {}
        for word, pos in untranslated_words:
            if word in self.translations:
                translations[word] = self.translations[word]
            else:
                translation = get_translation(word, POS_NAMES.get(pos, 'другая часть речи'))
                if translation:
                    print(f'{word}: {translation}')
                    translations[word] = translation
                    save_translation(word, pos, translation)
        
        self.translations_output.setText('\n'.join([f'\n[{word} ({pos})]:\n{translation}' for word, translation in translations.items()]))

def main(source, is_url=True):
    app = QApplication([])
    ex = WordFrequencyChecker(source, is_url)
    app.exec_()

In [5]:
# Пример использования
if __name__ == "__main__":
    source = './data/Albina'  # Замените на ваш файл или URL
    source_type = 'dir'  # Установите True, если source это URL
    main(source, source_type)

Processing Test_del_Permiso_B_Gratis_de_la_DGT_con_Explicaciones_2024_05_07.html
Processing Test_del_Permiso_B_Gratis_de_la_DGT_con_Explicaciones_2024_30_06.html
Processing Test_del_Permiso_B_Gratis_de_la_DGT_con_Explicaciones_2024_03_07.html
Processing Test_del_Permiso_B_Gratis_de_la_DGT_con_Explicaciones_2024_02_07.html
Processing Test_del_Permiso_B_Gratis_de_la_DGT_con_Explicaciones_2024_01_07.html
Text:
Guardar 1. Usted conduce en una carretera que conecta dos localidades y observa que está nevando ligeramente. ¿Puede utilizar la luz antiniebla? 1 312 A) No, está prohibido utilizar la luz antiniebla si nieva ligeramente. B) Sí, pero solamente la luz antiniebla delantera ya que en este caso se prohíbe utilizar la luz antiniebla trasera. C) Si, tiene que utilizar la luz antiniebla trasera obligatoriamente.
Guardar 2. Fumar durante la conducción... 1 231 A) está prohibido. B) proporciona tranquilidad. C) puede disminuir el nivel de atención.
Guardar 3. ¿Qué distancia de seguridad debe

In [6]:
# # Пример использования
# if __name__ == "__main__":
#     source = './data/Test del Permiso B Gratis de la DGT con Explicaciones 2024.html'  # Замените на ваш файл или URL
#     source_type = 'file'  # Установите True, если source это URL
#     main(source, source_type)