In [None]:
import ast
import json
import os
import re
import xml.etree.ElementTree as ET
from html.parser import HTMLParser

# Class for parsing and processing the `html` data

In [None]:
class HskHtmlParser(HTMLParser):
    """Parse an HSK vocabulary page and export its content as XML decks.

    While a page is fed to the parser, ``handle_data`` looks for the
    ``window.__REACT_DATA = ...`` assignment in the page's text data and stores
    the decoded object in ``dict_content``. The ``create_*`` methods then read
    the ``hskLevel``, ``words`` and ``localizedSentences`` entries of that
    dictionary to build a ``<deck>`` XML document and write it out.
    """

    # Text that precedes the embedded data object in the page.
    _DATA_MARKER = "window.__REACT_DATA = "

    # (tag, attributes) of the fields declared by the "automatic pinyin" decks.
    _AUTOMATIC_FIELDS = (
        ('chinese', {'name': 'Chinois', 'sides': '10', 'lang': 'zh-CN', 'pinyinMode': 'back'}),
        ('text', {'name': 'Traduction', 'sides': '01', 'lang': 'fr-FR'}),
    )

    # (tag, attributes) of the fields declared by the "manual pinyin" decks.
    _MANUAL_FIELDS = (
        ('text', {'name': 'Front', 'sides': '11', 'lang': 'zh-CN'}),  # Visible on both sides
        ('text', {'name': 'Back', 'sides': '01', 'lang': 'zh-CN'}),
        ('rich-text', {'name': 'Pinyin', 'sides': '01'}),
    )

    def __init__(self):
        super().__init__()
        # Decoded page data; stays empty until the marker is found by handle_data.
        self.dict_content = {}

    def handle_data(self, data):
        """Decode the embedded data object when ``data`` contains it.

        Args:
            data: A chunk of text data reported by ``HTMLParser``.

        Raises:
            ValueError, SyntaxError: If the text after the marker is neither a
                Python literal nor valid JSON (the ``ast.literal_eval`` error is
                the one propagated).
        """
        if self._DATA_MARKER not in data:
            return

        # Remove surrounding whitespace and the single ';' that ends the
        # statement. Cutting a fixed number of trailing characters would eat the
        # closing brace whenever the ';' is not followed by exactly one character.
        content = data.split(self._DATA_MARKER)[1].strip()
        if content.endswith(";"):
            content = content[:-1]

        try:
            self.dict_content = ast.literal_eval(content)
        except (ValueError, SyntaxError) as literal_error:
            # literal_eval rejects JSON-only tokens (true / false / null), so
            # give the JSON decoder a chance before giving up.
            try:
                self.dict_content = json.loads(content)
            except ValueError:
                raise literal_error

    def _new_deck(self, list_name, field_specs):
        """Create the ``<deck>`` skeleton and return ``(deck, cards)`` elements."""
        deck = ET.Element('deck', attrib={'name': f'HSK {self.dict_content["hskLevel"]} {list_name}'})
        fields = ET.SubElement(deck, 'fields')
        for tag, attributes in field_specs:
            ET.SubElement(fields, tag, attrib=attributes)
        cards = ET.SubElement(deck, "cards")
        return deck, cards

    @staticmethod
    def _localized_hanzi(word_entry, grammar_indicator):
        """Return the entry's ``hanzi`` with grammar indicators replaced."""
        hanzi = word_entry["hanzi"]
        if hanzi != word_entry["hanziRaw"]:
            # Replacing chinese grammar indicators
            for key, value in (grammar_indicator or {}).items():
                hanzi = hanzi.replace(key, value)
        return hanzi

    @staticmethod
    def _add_automatic_card(cards, hanzi, definition):
        """Append a ``Chinois`` / ``Traduction`` card to ``cards``."""
        card = ET.SubElement(cards, "card")
        ET.SubElement(card, 'chinese', attrib={'name': 'Chinois'}).text = hanzi
        ET.SubElement(card, 'text', attrib={'name': 'Traduction'}).text = definition

    @staticmethod
    def _add_manual_card(cards, front, back, pinyin_tone):
        """Append a ``Front`` / ``Back`` / ``Pinyin`` card to ``cards``."""
        card = ET.SubElement(cards, "card")
        ET.SubElement(card, 'text', attrib={'name': 'Front'}).text = front
        ET.SubElement(card, 'text', attrib={'name': 'Back'}).text = back
        pinyin = ET.SubElement(card, 'rich-text', attrib={'name': 'Pinyin'})
        ET.SubElement(pinyin, 'i').text = pinyin_tone

    @staticmethod
    def _write_deck(deck, output_file):
        """Serialize ``deck`` to ``output_file`` (a path or a text file object)."""
        deck_tree = ET.ElementTree(deck)

        if hasattr(output_file, "write"):
            # Already an open text stream: let ElementTree write into it.
            deck_tree.write(output_file, encoding="unicode")
            return

        if isinstance(output_file, (str, bytes, os.PathLike)):
            # Create the target directory so a fresh checkout does not fail.
            parent = os.path.dirname(output_file)
            if parent:
                os.makedirs(parent, exist_ok=True)

        # Open the file ourselves so the Chinese text is always stored as UTF-8,
        # whatever the platform's default encoding is.
        with open(output_file, "w", encoding="utf-8", errors="xmlcharrefreplace") as stream:
            deck_tree.write(stream, encoding="unicode")

    def create_word_xml_automatic(self, output_file, grammar_indicator=None):
        """Write the word deck whose ``chinese`` field declares ``pinyinMode``.

        Args:
            output_file: Destination path, or a text file object.
            grammar_indicator: Optional mapping of substrings to replace in the
                ``hanzi`` of entries whose ``hanzi`` differs from ``hanziRaw``.
        """
        deck, cards = self._new_deck("Word List", self._AUTOMATIC_FIELDS)

        for word_entry in self.dict_content["words"]:
            definition = word_entry["def"]
            hanzi = self._localized_hanzi(word_entry, grammar_indicator)
            self._add_automatic_card(cards, hanzi, definition)

        self._write_deck(deck, output_file)

    def create_sentence_xml_automatic(self, output_file):
        """Write the sentence deck whose ``chinese`` field declares ``pinyinMode``.

        Args:
            output_file: Destination path, or a text file object.
        """
        deck, cards = self._new_deck("Sentence List", self._AUTOMATIC_FIELDS)

        for sentence_entry in self.dict_content["localizedSentences"]:
            self._add_automatic_card(cards, sentence_entry["hanzi"], sentence_entry["def"])

        self._write_deck(deck, output_file)

    def create_word_xml(self, output_file, grammar_indicator=None):
        """Write the word deck with an explicit ``Pinyin`` field.

        Each word produces two cards: hanzi -> definition, then
        definition -> hanzi. Both carry the entry's ``pinyinToneSpace``.

        Args:
            output_file: Destination path, or a text file object.
            grammar_indicator: Optional mapping of substrings to replace in the
                ``hanzi`` of entries whose ``hanzi`` differs from ``hanziRaw``.
        """
        deck, cards = self._new_deck("Word List", self._MANUAL_FIELDS)

        for word_entry in self.dict_content["words"]:
            definition = word_entry["def"]
            pinyin_tone = word_entry["pinyinToneSpace"]
            hanzi = self._localized_hanzi(word_entry, grammar_indicator)

            self._add_manual_card(cards, hanzi, definition, pinyin_tone)
            self._add_manual_card(cards, definition, hanzi, pinyin_tone)

        self._write_deck(deck, output_file)

    def create_sentence_xml(self, output_file):
        """Write the sentence deck with an explicit ``Pinyin`` field.

        Each sentence produces one card carrying the entry's ``pinyinTone``.

        Args:
            output_file: Destination path, or a text file object.
        """
        deck, cards = self._new_deck("Sentence List", self._MANUAL_FIELDS)

        for sentence_entry in self.dict_content["localizedSentences"]:
            self._add_manual_card(
                cards,
                sentence_entry["hanzi"],
                sentence_entry["def"],
                sentence_entry["pinyinTone"],
            )

        self._write_deck(deck, output_file)


# Reading the data
Download the HTML page of the HSK vocabulary list from [here](https://hsk.academy/fr/hsk-1-vocabulary-list) and save it as `html_files/HSK_1.html`, which is the path the next cell reads.

If the link does not work, copy this link: `https://hsk.academy/fr/hsk-1-vocabulary-list`

In [None]:
# The page holds Chinese text, so decode it explicitly instead of relying on
# the platform's default encoding (assumes the saved page is UTF-8 — adjust the
# encoding if it was saved differently).
with open("html_files/HSK_1.html", encoding="utf-8") as f:
    html_file = f.read()

# Feeding the page fills parser.dict_content with the embedded data.
parser = HskHtmlParser()
parser.feed(html_file)

In [None]:
# Replacement text for the Chinese grammar indicators that can appear in a
# word's "hanzi" entry.
grammar_indicator = {
    "(助动词)": "(verbe auxiliaire)",
    "(助词)": "(particule)",
    "(动词)": "(verbe)",
    "(叹词)": "(interjection)",
    "(形容词)": "(adjectif)",
    "(介词)": "(préposition)",
    "(副词)": "(adverbe)",
    "(名词)": "(nom)",
    "(量词)": "(quantificateur)"
}

# Checking if there are other indicators like '(助词)' in
# the data that are not already in grammar_indicator
# (raw string: "\(" is not a valid escape sequence in a regular string literal)
parenthesis_regex = re.compile(r"\(.*?\)")
list_missing_indicator = []
for word_entry in parser.dict_content["words"]:
    if word_entry["hanzi"] != word_entry["hanziRaw"]:
        parenthesis_words = parenthesis_regex.findall(word_entry["hanzi"])
        if parenthesis_words:
            list_missing_indicator += [k for k in parenthesis_words if k not in grammar_indicator]

# Sorted so the report is identical from one run to the next.
print(f"Missing indicators: {sorted(set(list_missing_indicator))}")

# Creating the decks

In [None]:
hsk_level = parser.dict_content['hskLevel']

# Make sure the output folders exist before writing; creating the nested
# folder also creates its parent "xml_outputs".
os.makedirs("xml_outputs/automatic_pinyin", exist_ok=True)

# Classic HSK decks with automatic pinyin detection
output_word = f"xml_outputs/automatic_pinyin/HSK_{hsk_level}_word_list.xml"
parser.create_word_xml_automatic(output_word, grammar_indicator=grammar_indicator)

output_sentence = f"xml_outputs/automatic_pinyin/HSK_{hsk_level}_sentence_list.xml"
parser.create_sentence_xml_automatic(output_sentence)


# Custom HSK decks with manual pinyin implementation
output_word = f"xml_outputs/HSK_{hsk_level}_word_list.xml"
parser.create_word_xml(output_word, grammar_indicator=grammar_indicator)

output_sentence = f"xml_outputs/HSK_{hsk_level}_sentence_list.xml"
parser.create_sentence_xml(output_sentence)