In [None]:
import ast
import json
import os
import xml.etree.ElementTree as ET
from html.parser import HTMLParser

In [None]:
class MyHTMLParser(HTMLParser):
    """HTML parser that extracts the `window.__REACT_DATA` payload of a page.

    Every text node containing the marker ``window.__REACT_DATA = `` is decoded
    and appended to ``list_content``, in the order the nodes are encountered.
    """

    def __init__(self):
        super().__init__()
        # One decoded payload per text node that contained the marker
        self.list_content = []

    def handle_data(self, data):
        """Decode the payload following the marker, if `data` contains it.

        Args:
            data: Text content handed over by `HTMLParser`.

        Raises:
            ValueError, SyntaxError: If the text after the marker is neither
                valid JSON nor a valid Python literal.
        """
        marker = "window.__REACT_DATA = "
        if marker not in data:
            return

        # Drop surrounding whitespace and the trailing ';' of the JavaScript
        # statement, however many characters follow the closing brace.
        payload = data.split(marker)[1].strip().rstrip(";").rstrip()

        try:
            # The payload is a JavaScript object literal: JSON understands
            # `true` / `false` / `null`, which `ast.literal_eval` rejects.
            dict_content = json.loads(payload)
        except ValueError:
            # Fall back to the Python-literal parser for payloads that are not
            # strict JSON (e.g. single-quoted strings).
            dict_content = ast.literal_eval(payload)

        self.list_content.append(dict_content)

# You need to download the HTML files from [here](https://hsk.academy/en/hsk-1-vocabulary-list)
If the link does not work, copy this URL into your browser: `https://hsk.academy/en/hsk-1-vocabulary-list`

In [None]:
with open("html_files/HSK_6.html") as f:
    html_file = "".join(f.readlines())

parser = MyHTMLParser()
parser.feed(html_file)

In [None]:
# Keep the first payload collected by the parser (index 0 of `parser.list_content`)
dict_content = parser.list_content[0]

# Structure of the content dictionary

### 1. Key `words`
Contains a list of dictionaries.
Each dictionary has 8 keys:

`'id'`, `'hanzi'`, `'hanziRaw'`, `'trad'`, `'pinyinToneSpace'`, `'def'`, `'mp3File'`, `'oggFile'`

Example of such a dictionary:
```
{'id': 684,
 'hanzi': '得(助动词)',
 'hanziRaw': '得',
 'trad': '得(助動詞)',
 'pinyinToneSpace': 'dé',
 'def': 'devoir, pouvoir (particule utilisée pour exprimer la possibilité, la capacité, l’effet, le degré)',
 'mp3File': '得(助动词).mp3',
 'oggFile': '得(助动词).ogg'
}

```

- `id` contains just the number of the entry. **Useless**
- `hanzi` contains the Chinese characters (simplified), with an annotation for auxiliary particles (助词) or auxiliary verbs (助动词)
- `hanziRaw` contains the Chinese characters without the annotation for auxiliary particles (助词) or auxiliary verbs (助动词). **Useless**
- `trad` contains the traditional hanzi. **Useless (for me)**
- `pinyinToneSpace` contains the pinyin.
- `def` contains the definition/translation.
- `mp3File` contains the name of the mp3 file with the audio of the word. **Useless**
- `oggFile` contains the name of the ogg file with the audio of the word. **Useless**

### 2. Key `wordIdToCharacters` (useless)
Contains a dictionary whose keys are the `id`s of every word, and whose values are the decomposition of the word into its individual characters.
Example of `(key, value)` pairs:
```
'762': [{'slug': '果', 'hanzi': '果', 'wordId': 762},
        {'slug': '汁', 'hanzi': '汁', 'wordId': 762}],
'684': [{'slug': '得', 'hanzi': '得', 'wordId': 684}],
```
It seems that `slug` and `hanzi` values are always the same.

### 3. Key `localizedSentences`
Contains a list of dictionaries. Each dictionary has three keys:
`'hanzi'`, `'pinyinTone'`, `'def'`
Example:
```
{'hanzi': '你把火点着吧。',
 'pinyinTone': 'Nǐ bǎ huǒ diǎnzhe ba.',
 'def': 'Vous allumez le feu.'
}
```

### 4. Key `hskLevel`
The corresponding HSK level, e.g. `4`.

# Checking if there are special word entries

In [None]:
# Show every entry whose displayed headword carries an annotation,
# i.e. whose `hanzi` differs from its `hanziRaw`.
for entry in dict_content["words"]:
    if entry["hanzi"] == entry["hanziRaw"]:
        continue
    print(entry)

# Desired structure of the `.xml` file
(two possible structures for a card):

```
<deck name="Chinois">
    <fields>
        <chinese name='Chinois' sides='11' lang='zh-CN'  pinyinMode='hint'></chinese>
        <text name='Traduction' sides='01' lang='fr-FR'></text>
    </fields>
    <cards>
        <card>
            <chinese name='Chinois'>月亮</chinese>
            <text name='Traduction'>Lune</text>
        </card>
        <card>
            <chinese name='Chinois'>
                <chinese>介绍</chinese>
            </chinese>
            <text name='Traduction'>Introduire, présenter qqun </text>
        </card>
    </cards>
</deck>
```


## Creating words `.xml` file

In [None]:
# Root element of the word deck, named after the HSK level of the parsed page
deck = ET.Element("deck", name=f'HSK {dict_content["hskLevel"]} Word List')

# Field declarations: one Chinese field and one translation field
fields = ET.SubElement(deck, "fields")
chinese = ET.SubElement(
    fields,
    "chinese",
    name="Chinois",
    sides="11",
    lang="zh-CN",
    pinyinMode="hint",
)
text = ET.SubElement(
    fields,
    "text",
    name="Traduction",
    sides="01",
    lang="fr-FR",
)

# Empty container that the next cell fills with one <card> per word
cards = ET.SubElement(deck, "cards")
ET.dump(deck)

In [None]:
# Chinese grammar indicators found in some headwords, mapped to the French
# label that replaces each of them.
grammar_labels = {
    "(助动词)": "(verbe auxiliaire)",
    "(助词)": "(particule)",
    "(叹词)": "(interjection)",
    "(形容词)": "(adjectif)",
    "(介词)": "(préposition)",
    "(副词)": "(adverbe)",
    "(名词)": "(nom)",
    "(量词)": "(quantificateur)",
}

for word_entry in dict_content["words"]:
    hanzi, definition = word_entry["hanzi"], word_entry["def"]

    # Only headwords that differ from their raw form can carry an indicator
    if hanzi != word_entry["hanziRaw"]:
        for indicator, label in grammar_labels.items():
            hanzi = hanzi.replace(indicator, label)

    # One <card> per word: Chinese side first, then the translation
    card = ET.SubElement(cards, "card")
    chinese = ET.SubElement(card, "chinese", name="Chinois")
    chinese.text = hanzi
    text = ET.SubElement(card, "text", name="Traduction")
    text.text = definition

# For testing only
# ET.dump(deck)

In [None]:
# Create the output folder if needed so the write below does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs("xml_outputs", exist_ok=True)

deck_tree = ET.ElementTree(deck)
# Write UTF-8 explicitly: with encoding="unicode" and a file name, the byte
# encoding of the file is left to the interpreter (the locale codec on older
# Python versions), which can fail on Chinese text.
deck_tree.write(f"xml_outputs/HSK_{dict_content['hskLevel']}_word_list.xml", encoding="utf-8")

## Creating sentence `.xml` file

In [None]:
# Root element of the sentence deck, named after the HSK level of the parsed page
deck = ET.Element("deck", name=f'HSK {dict_content["hskLevel"]} Sentence List')

# Field declarations: one Chinese field and one translation field
fields = ET.SubElement(deck, "fields")
chinese = ET.SubElement(
    fields,
    "chinese",
    name="Chinois",
    sides="11",
    lang="zh-CN",
    pinyinMode="hint",
)
text = ET.SubElement(
    fields,
    "text",
    name="Traduction",
    sides="01",
    lang="fr-FR",
)

# Empty container that the next cell fills with one <card> per sentence
cards = ET.SubElement(deck, "cards")
ET.dump(deck)

In [None]:
for sentence_entry in dict_content["localizedSentences"]:
    hanzi, definition = sentence_entry["hanzi"], sentence_entry["def"]

    # One <card> per sentence: Chinese side first, then the translation
    card = ET.SubElement(cards, "card")
    chinese = ET.SubElement(card, "chinese", name="Chinois")
    chinese.text = hanzi
    text = ET.SubElement(card, "text", name="Traduction")
    text.text = definition

# For testing only
# ET.dump(deck)

In [None]:
# Create the output folder if needed so the write below does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs("xml_outputs", exist_ok=True)

deck_tree = ET.ElementTree(deck)
# Write UTF-8 explicitly: with encoding="unicode" and a file name, the byte
# encoding of the file is left to the interpreter (the locale codec on older
# Python versions), which can fail on Chinese text.
deck_tree.write(f"xml_outputs/HSK_{dict_content['hskLevel']}_sentence_list.xml", encoding="utf-8")