<a href="https://colab.research.google.com/github/ShiiZhongTian/colab-dict-dev/blob/main/wiktionary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ダンプファイルのストリームサイズを計測する関数

In [None]:
import bz2

def create_streamlen_tsv(lang):
    """Measure the compressed length of every bz2 stream in a multistream dump.

    Reads ``{lang}wiktionary-20250101-pages-articles-multistream.xml.bz2`` and
    writes ``{lang}streamlen.tsv`` with one compressed stream length (in bytes)
    per line, in file order and without a trailing newline.

    Args:
        lang: Prefix of the dump file name (e.g. "en" or "ja").
    """
    dump_path = f"{lang}wiktionary-20250101-pages-articles-multistream.xml.bz2"
    tsv_path = f"{lang}streamlen.tsv"

    chunk_size = 1024 * 1024  # 1MB
    with open(dump_path, "rb") as dump, open(tsv_path, "w") as out:
        decompressor = bz2.BZ2Decompressor()
        consumed = 0  # compressed bytes consumed by the current stream so far
        pending = b''
        stream_lengths = []
        # Re-feed leftover bytes first; only read from disk when nothing is pending.
        while pending or (pending := dump.read(chunk_size)):
            fed = len(pending)
            # The decompressed output is discarded: only the compressed size matters.
            decompressor.decompress(pending)
            # Bytes found after the end of the current stream belong to the next one.
            pending = decompressor.unused_data
            consumed += fed - len(pending)
            if decompressor.eof:
                stream_lengths.append(str(consumed))
                consumed = 0
                # A decompressor cannot be reused once it has reached end-of-stream.
                decompressor = bz2.BZ2Decompressor()
        out.write("\n".join(stream_lengths))


In [None]:
import bz2

# Peek at the beginning of the English dump. bz2.open decompresses transparently,
# so no explicit BZ2Decompressor is needed.
target = "/content/drive/MyDrive/Wiktionary/enwiktionary-20250101-pages-articles-multistream.xml.bz2"

size = 1024 * 1024  # in text mode read() counts characters, so this is up to ~1M characters
with bz2.open(target, "rt", encoding="utf-8") as f:
    data = f.read(size)
    print(data)

<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.11/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.11/ http://www.mediawiki.org/xml/export-0.11.xsd" version="0.11" xml:lang="en">
  <siteinfo>
    <sitename>Wiktionary</sitename>
    <dbname>enwiktionary</dbname>
    <base>https://en.wiktionary.org/wiki/Wiktionary:Main_Page</base>
    <generator>MediaWiki 1.44.0-wmf.8</generator>
    <case>case-sensitive</case>
    <namespaces>
      <namespace key="-2" case="case-sensitive">Media</namespace>
      <namespace key="-1" case="first-letter">Special</namespace>
      <namespace key="0" case="case-sensitive" />
      <namespace key="1" case="case-sensitive">Talk</namespace>
      <namespace key="2" case="first-letter">User</namespace>
      <namespace key="3" case="first-letter">User talk</namespace>
      <namespace key="4" case="case-sensitive">Wiktionary</namespace>
      <namespace key="5" case="case-sensitive">Wikt

In [None]:
import bz2

# Peek at the beginning of the Japanese dump. bz2.open decompresses transparently,
# so no explicit BZ2Decompressor is needed.
target = "/content/drive/MyDrive/Wiktionary/jawiktionary-20250101-pages-articles-multistream.xml.bz2"

size = 1024 * 1024  # in text mode read() counts characters, so this is up to ~1M characters
with bz2.open(target, "rt", encoding="utf-8") as f:
    data = f.read(size)
    print(data)

<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.11/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.11/ http://www.mediawiki.org/xml/export-0.11.xsd" version="0.11" xml:lang="ja">
  <siteinfo>
    <sitename>Wiktionary</sitename>
    <dbname>jawiktionary</dbname>
    <base>https://ja.wiktionary.org/wiki/Wiktionary:%E3%83%A1%E3%82%A4%E3%83%B3%E3%83%9A%E3%83%BC%E3%82%B8</base>
    <generator>MediaWiki 1.44.0-wmf.8</generator>
    <case>case-sensitive</case>
    <namespaces>
      <namespace key="-2" case="case-sensitive">メディア</namespace>
      <namespace key="-1" case="first-letter">特別</namespace>
      <namespace key="0" case="case-sensitive" />
      <namespace key="1" case="case-sensitive">トーク</namespace>
      <namespace key="2" case="first-letter">利用者</namespace>
      <namespace key="3" case="first-letter">利用者・トーク</namespace>
      <namespace key="4" case="case-sensitive">Wiktionary</namespace>
      <namespace

# WikiTextParserテスト

In [None]:
!pip install wikitextparser

In [None]:
import wikitextparser as wtp

# s = "{{context|lang=ru|figuratively}} {{q|{{m|ru|за|tr=-}} 造格の}} [[後]]を[[おう|追う]]"
# parsed = wtp.parse(s)
# print(parsed.templates)
# print(parsed.templates[1].string[2:-2])

def process_wikilink(s):
    """Replace each wikilink in ``s`` with its display text, or its target when it has none."""
    for wikilink in wtp.parse(s).wikilinks:
        shown = wikilink.text or wikilink.title
        s = s.replace(wikilink.string, shown)
    return s

def process_templates(s):
    """Expand the furigana/okurigana templates in ``s`` and drop every other template.

    * ``ふりがな``: replaced by the value of its first argument.
    * ``おくりがな``: pipe-separated fields 1 and 2 joined together.
    * ``おくりがな2``: every second pipe-separated field starting at field 1.
    * ``おくりがな3``: pipe-separated fields 1, 3, 4 and 6 joined together.
    * anything else: removed.
    """
    for tpl in wtp.parse(s).templates:
        # Only templates with nesting_level <= 1 are handled here
        # (presumably the ones not nested inside another template).
        if tpl.nesting_level > 1:
            continue
        name = tpl.name
        if name == "ふりがな":
            replacement = tpl.arguments[0].value
        elif name == "おくりがな":
            # [2:-2] strips the surrounding "{{" and "}}"
            replacement = "".join(tpl.string[2:-2].split("|")[1:3])
        elif name == "おくりがな2":
            replacement = "".join(tpl.string[2:-2].split("|")[1::2])
        elif name == "おくりがな3":
            fields = tpl.string[2:-2].split("|")
            replacement = "".join([fields[i] for i in (1, 3, 4, 6)])
        else:
            replacement = ""
        s = s.replace(tpl.string, replacement)
    return s

# Smoke tests for the two helpers above; each call prints one line.
# Wikilinks are reduced to their display text, furigana/okurigana templates are
# expanded and every other template is dropped.
print(process_templates(process_wikilink("{{context|lang=ru|figuratively}} {{q|{{m|ru|за|tr=-}} 造格の}} [[後]]を[[おう|追う]]")))
print(process_templates(process_wikilink("{{q|{{m|ru|к|tr=-}} 与格に}} [[ならす|慣らす]]")))
print(process_wikilink("[[くりかえす|繰り返し]][[いう|言う]]"))
print(process_templates("{{ふりがな|栗|くり}}"))
print(process_templates("{{おくりがな2|教|おし|える|おしえる}}"))
print(process_templates("{{おくりがな|小|さい|ちいさい}}"))
print(process_templates("{{おくりがな3|見|み||積|つ|もる|みつもる}}"))

  後を追う
 慣らす
繰り返し言う
栗
教える
小さい
見積もる


In [None]:
s = """{{also|中國}}
==Chinese==
{{zh-see|中國}}

==Japanese==

===Etymology 1===
{{wp|lang=ja}}
{{ja-kanjitab|yomi=o|ちゅう|こく|k2=ごく}}
From {{bor|ja|ltc|-}} {{ltc-l|中國|id=1|lit=[[central]] [[country]]}}. Compare modern {{cog|yue|中國|tr=zung<sup>1</sup> gwok<sup>3</sup>}}. Attested from at least the 12th century.<ref>{{R:Kokugo Dai Jiten}}</ref>

{{rfe|ja|Where the "China" sense first attested in a Japanese document?}}

====Proper noun====
{{ja-pos|proper|ちゅうごく}}

#[[China]]
## {{lb|ja|historical|_|or contemporary|sort=ちゅうごく}} any of several [[kingdom]]s, [[empire]]s, or [[state]]s of the [[Chinese]] people
##: {{hyponyms|ja|殷|tr1=In|周|tr2=Shū|秦|tr3=Shin|漢|tr4=Kan|唐|tr5=Tō|中華民国|tr6=Chūka Minkoku}}
##* {{quote-book|ja|year=c. 1120|title=Konjaku monogatari shū|trans-title=Tales of Times Now Past|section=scroll 6}}
##*: {{ja-usex|造り奉れる体を見るに、'''中%國'''には不%似ず、面[めう]胡%国に似たり。|^つくり まつれる すがた を みる に、'''^ちゅう%ごく''' に は ふ-%にず、 めん[めう] こ%こく に にたり。|Judging from the appearance [of the statue], it is not unlike those in '''China''', though the face resembles the northern nations.}}
## {{lb|ja|especially|in contemporary usage|sort=ちゅうごく}} {{short for|ja|中華人民共和国|tr=Chūka Jinmin Kyōwakoku|nocap=1|dot=,}} the [[People's Republic of China]]
##* {{quote-book
|ja
|genre=fiction
|author={{w|lang=ja|藤子・F・不二雄|Fujiko F. Fujio}}
|title={{lw|ja|ドラえもん のび太とアニマル惑星|{{ruby|のび[太](た)とアニマル[惑星](プラネット)}}}}
|trans-title={{w|Doraemon: Nobita and the Animal Planet|Nobita and the Animal Planet}}
|location=Tokyo
|publisher=Shogakukan
|date=Nov 30 1998
|edition=22nd
|origdate=Nov 25 1990
|isbn=4-09-140610-6
|volume=10
|page=27
|series=lw:ja:大長編ドラえもん
|seriesvolume=10}}
##*: {{ja-usex|クニ？クニ って なあに。|Country? What’s a country?}}{{ja-usex|アメリカとか'''中%国'''とかいろいろあるじゃない。|^アメリカ とか '''^ちゅう%ごく''' とか いろいろ ある じゃ ない。|You know, countries. There are a bunch of them like America and '''China'''.}}

=====Derived terms=====
* {{ja-r|中%国%語|^ちゅう%ごく%-ご|[[Chinese]] [[language]]}}
* {{ja-r|中%国%人|^ちゅう%ごく%-じん|Chinese [[person]]/[[people]]}}
* {{ja-r|中%国 料%理|^ちゅう%ごく りょう%り|Chinese [[cuisine]]}}

=====Related terms=====
* {{ja-r|中%華|^ちゅう%か}}
* {{ja-r|支%那|^し%な}} {{q|used before World War II, now offensive}}

===Etymology 2===
{{ja-kanjitab|yomi=o|ちゅう|こく|k2=ごく}}
From {{bor|ja|ltc|-}} {{ltc-l|中國|id=1}}. {{rfv-etym|ja|sort=ちゆうこく}}

Analyzable as a compound of {{com|ja|sort=ちゆうこく|中|tr1=chū|t1=[[center]], [[middle]]|国|tr2=koku|t2=[[province]]}}. {{rendaku2|sort=ちゆうこく|koku|goku}}

====Pronunciation====
{{ja-pron|ちゅうごく|acc=1|acc2=0|acc_ref=SMK5|acc2_ref=DJR|a=Ja-Chugoku.ogg}}

====Noun====
{{ja-noun|ちゅうごく}}

# the [[central]] [[part]] of a [[country]], historically, commonly known as the [[capital city]] or the [[main]] [[region]], where an [[emperor]] or other [[ruler]] [[reside]]s
# something [[within]] a country
# {{lb|ja|sort=ちゆうこく|historical}} under the {{m|ja|律令|tr={{w|Ritsuryō}}}} system, a [[province]] of the [[third]] [[rank]] according to a four-rank system based on [[population]] and [[area]]
# {{lb|ja|historical|sort=ちゅうごく}} under the {{m|ja|律令|tr={{w|Ritsuryō}}}} system, a [[province]] of the [[second]] rank according to the [[distance]] from the [[capital city]]
# [[collective]] [[term]] for the {{m|ja|山陽道|tr={{w|San'yōdō}}}} and {{m|ja|山陰道|tr={{w|San'indō}}}} [[regions]] of [[Japan]]

=====Derived terms=====
* {{ja-r|中%国 地%方|^ちゅう%ごく ち%ほう}}

====Proper noun====
{{ja-pos|proper|ちゅうごく}}

# {{short for|ja|中国地方|tr=Chūgoku chihō|dot=:}} the {{w|Chūgoku region}} of [[Japan]]

===Etymology 3===
{{ja-kanjitab|yomi=k|なか|くに}}

From {{com|ja|中|tr1=naka|t1=[[center]], [[middle]]|国|tr2=kuni|t2=[[country]]}}.

====Proper noun====
{{ja-pos|proper|なかくに}}

# {{surname|ja|sort=なかくに}}

===See also===
* {{ja-r|中つ国|なか つ くに}}

===References===
<references/>

{{attention|ja|additional pronunciations in jawiki articles https://ja.wikipedia.org/wiki/中国_(曖昧さ回避) つぉんぐぉ and チュンコック}}

{{topics|ja|China|Countries in Asia|sort=ちゅうごく}}

==Zhuang==

===Proper noun===
{{za-head|proper noun}}

# {{za-sawndip form of|Cunghgoz}}
"""

# Experiment: walk the section tree of the sample entry in `s` down to the
# first list item of a "Proper noun" section.
parsed = wtp.parse(s)
print(parsed.sections)
# Level-2 sections are the per-language sections (==Chinese==, ==Japanese==, ...)
sections = list(filter(lambda sec: sec.level == 2, parsed.sections))
print(len(sections))
jp_section = list(filter(lambda sec: "Japanese" in sec.title, sections))[0]
# Level-3 sections under the Japanese section (===Etymology 1===, ...)
jp_section_lev3 = list(filter(lambda sec: sec.level == 3, jp_section.sections))
print(dir(jp_section_lev3[0]))  # inspect what a Section object offers
print(jp_section_lev3[0].sections)
# Level-4 sections under the first level-3 section (====Proper noun====, ...)
jp_sections_lev4 = list(filter(lambda sec: sec.level == 4, jp_section_lev3[0].sections))
print(jp_sections_lev4)
meaning = list(filter(lambda sec: "Proper noun" in sec.title, jp_sections_lev4))[0]
# First item of the first wikitext list in that section
print(meaning.get_lists()[0].items[0])

[Section('{{also|中國}}\n'), Section('==Chinese==\n{{zh-see|中國}}\n\n'), Section('==Japanese==\n\n===Etymology 1===\n{{wp|lang=ja}}\n{{ja-kanjitab|yomi=o|ちゅう|こく|k2=ごく}}\nFrom {{bor|ja|ltc|-}} {{ltc-l|中國|id=1|lit=[[central]] [[country]]}}. Compare modern {{cog|yue|中國|tr=zung<sup>1</sup> gwok<sup>3</sup>}}. Attested from at least the 12th century.<ref>{{R:Kokugo Dai Jiten}}</ref>\n\n{{rfe|ja|Where the "China" sense first attested in a Japanese document?}}\n\n====Proper noun====\n{{ja-pos|proper|ちゅうごく}}\n\n#[[China]]\n## {{lb|ja|historical|_|or contemporary|sort=ちゅうごく}} any of several [[kingdom]]s, [[empire]]s, or [[state]]s of the [[Chinese]] people\n##: {{hyponyms|ja|殷|tr1=In|周|tr2=Shū|秦|tr3=Shin|漢|tr4=Kan|唐|tr5=Tō|中華民国|tr6=Chūka Minkoku}}\n##* {{quote-book|ja|year=c. 1120|title=Konjaku monogatari shū|trans-title=Tales of Times Now Past|section=scroll 6}}\n##*: {{ja-usex|造り奉れる体を見るに、\'\'\'中%國\'\'\'には不%似ず、面[めう]胡%国に似たり。|^つくり まつれる すがた を みる に、\'\'\'^ちゅう%ごく\'\'\' に は ふ-%にず、 めん[めう] こ%こく に にたり。|Ju

In [None]:
meaning = "{{infl|ru|verb|head=зака́нчивать|tr=zakánčivatʹ|不完了体}}(完了体:[[закончить|зако́нчить]])<br>{{おくりがな2|終|お|える|おえる}}。{{おくりがな2|終|お|わる|おわる}}。[[完成]]させる。"
# The sample text has no list markers, so get_lists() finds nothing and this prints [].
print(wtp.parse(meaning.replace("<br>", "\n")).get_lists())
# prime_meaning = wtp.parse(meaning.replace("<br>", "\n")).get_lists()[0].items[0]

[]


In [None]:
# Experiment: the parser exposes HTML comments through `.comments`;
# `.string` gives the full comment text including the <!-- --> delimiters.
s = "<!-- to become [[mossy]], -->to become [[covered]] or [[overgrown]] with [[moss]]"
parsed = wtp.parse(s)
print(parsed.comments[0].string)

<!-- to become [[mossy]], -->


In [None]:
# Experiment: find the ";" argument inside an {{inflection of}} template
# that sits in a list item.
s = "# {{inflection of|ru|есть||2|s|pres|ind|impfv|;|2|s|imp|impfv}}"
parsed = wtp.parse(s)
args = parsed.get_lists()[0].templates[0].arguments
print(list(filter(lambda a: a.value == ";", args)))

[Argument('|;')]


# 言語と品詞を指定してダンプファイルから該当する語を取得するスクリプト

In [None]:
!pip install wikitextparser

Collecting wikitextparser
  Downloading wikitextparser-0.56.3-py3-none-any.whl.metadata (13 kB)
Downloading wikitextparser-0.56.3-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.3/66.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wikitextparser
Successfully installed wikitextparser-0.56.3


In [None]:
%cd "/content/drive/MyDrive/Wiktionary/"

import bz2, io, re, os, xml.etree.ElementTree as ET
import concurrent.futures
import wikitextparser as wtp
from wikitextparser import Section
from typing import Optional

# Level-2 heading titles that mark a target-language section.
# Indexed as lang_dict[target language code][Wiktionary edition code].
lang_dict = {
    "ru": {
        "en": {"Russian"},
        "ja": {"{{L|ru}}", "{{ru}}", "ロシア語"}
    }
}

# Heading titles that mark a part-of-speech section.
# Indexed as pos_dict[part of speech][Wiktionary edition code].
pos_dict = {
    "verb": {
        "en": {"Verb"},
        "ja": {"{{verb}}", "動詞"}
    },
    "adjective": {
        "en": {"Adjective"},
        "ja": {"{{adjective}}", "形容詞"}
    },
    "noun": {
        "en": {"Noun"},
        "ja": {"{{noun}}", "名詞"}
    },
    "proper_noun": {
        "en": {"Proper noun"},
        "ja": {"{{name}}", "固有名詞"}
    },
    "pronoun": {
        "en": {"Pronoun"},
        "ja": {"{{pronoun}}", "人称代名詞","物主代名詞"}
    }
}

# Heading titles of a second etymology section, indexed by Wiktionary edition code.
# Their presence is used as the signal that an entry is split into several etymologies.
etymology_dict = {
    "en": {"Etymology 2"},
    "ja": {"{{etym}}2", "{{etym}} 2", "語源2", "語源 2"}
}


def getpages_xml(bz2data):
    """Yield ``(title, wikitext)`` for every namespace-0 page in one bz2 stream.

    Args:
        bz2data: Raw bytes of a single compressed stream holding ``<page>`` elements.

    Yields:
        Tuples of the page title and its revision text. The text is ``""``
        (never ``None``) when the ``<text>`` element is empty or missing, so
        callers can always treat it as a string.
    """
    fragment = bz2.decompress(bz2data).decode("utf-8")
    # The fragment is wrapped in a synthetic root element so that it parses as
    # a single well-formed document whose children are the pages.
    root = ET.fromstring(f"<pages>{fragment}</pages>")
    for page in root:
        if int(page.find("ns").text) == 0:
            title = page.find("title").text
            # findtext returns "" for an element without text and the default when it is missing
            yield title, page.findtext("revision/text", default="")

def les_read(lang):
    """Load ``{lang}streamlen.tsv`` and derive the start offset of every stream.

    Args:
        lang: Prefix of the TSV file name (e.g. "en" or "ja").

    Returns:
        ``(spos, slen)``: two parallel lists holding, for each stream, its byte
        offset (the running sum of the preceding lengths) and its length.
    """
    tsv = f"{lang}streamlen.tsv"
    spos, slen = [], []
    offset = 0
    with open(tsv) as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at the end of the file)
                continue
            length = int(line)
            spos.append(offset)
            slen.append(length)
            offset += length
    return spos, slen

def get_target_lang_section(lang_code: str, target_lang: str, lev2_sections: list[Section]) -> Optional[Section]:
    """Return the first level-2 section whose title names the target language.

    Args:
        lang_code: Wiktionary edition code used as the inner key of ``lang_dict``.
        target_lang: Target language code used as the outer key of ``lang_dict``.
        lev2_sections: Candidate sections of one page.

    Returns:
        The first section whose title is in ``lang_dict[target_lang][lang_code]``,
        or ``None`` when the page has no such section.
    """
    lang_set = lang_dict[target_lang][lang_code]
    for sec in lev2_sections:
        title = sec.title
        # Strip so that padded headings such as "== Russian ==" match as well
        # (the parser appears to keep that padding in the title -- TODO confirm).
        if title is not None and title.strip() in lang_set:
            return sec
    return None

def extract_target_words(lang_code, target_lang_alpha2, pos):
    """Collect the raw part-of-speech section of every target-language entry in a dump.

    Reads ``{lang_code}wiktionary-20250101-pages-articles-multistream.xml.bz2``
    one bz2 stream at a time, using the stream lengths from ``les_read``. For
    every page with a level-2 section for the target language, it keeps the
    contents of the first section whose title is in ``pos_dict[pos][lang_code]``:

    * a level-3 section directly under the language section, or
    * if there is none and a second-etymology heading is present, a level-4
      section under the first level-3 section that has one.

    The result is written to ``{lang_code}output-{target_lang_alpha2}-{pos}.tsv``
    as ``title<TAB>contents`` lines, with tabs removed from the contents and
    newlines replaced by ``<br>``.

    Args:
        lang_code: Wiktionary edition to read ("en" or "ja").
        target_lang_alpha2: Language of the entries to extract (key of ``lang_dict``).
        pos: Part of speech to extract (key of ``pos_dict``).
    """
    target = f"{lang_code}wiktionary-20250101-pages-articles-multistream.xml.bz2"
    outtsv = f"{lang_code}output-{target_lang_alpha2}-{pos}.tsv"
    _, slen = les_read(lang_code)  # only the stream lengths are needed here
    target_pos = pos_dict[pos][lang_code]
    etymology_titles = etymology_dict[lang_code]
    result = []

    # Delete the output file if it already exists
    if os.path.exists(outtsv):
        os.remove(outtsv)

    with open(target, "rb") as f:
        # Skip the first stream and stop before the last one; presumably they hold
        # the dump header and footer rather than pages -- TODO confirm.
        f.seek(slen[0])
        for length in slen[1:-1]:
            bz2data = f.read(length)
            for title, text in getpages_xml(bz2data):
                if not text:
                    # A page without text has no sections, and a None text cannot be parsed
                    continue
                # Find the section for the target language (None when the page has none)
                lev2_sections = [sec for sec in wtp.parse(text).sections if sec.level == 2]
                target_lang_section = get_target_lang_section(lang_code, target_lang_alpha2, lev2_sections)
                if target_lang_section is None:
                    continue

                # Sections one level below the language section
                lev3_sections = [sec for sec in target_lang_section.sections if sec.level == 3]
                lev3_titles = {sec.title for sec in lev3_sections}
                meaning = ""

                if lev3_titles & target_pos:
                    # No homographs: the part of speech sits directly under the language
                    meaning = next(sec for sec in lev3_sections if sec.title in target_pos).contents
                elif lev3_titles & etymology_titles:
                    # Homographs: the part of speech sits one level deeper, under an
                    # etymology section; take the first level-3 section that has one.
                    for lev3_section in lev3_sections:
                        match = next(
                            (sec for sec in lev3_section.sections
                             if sec.level == 4 and sec.title in target_pos),
                            None,
                        )
                        if match is not None:
                            meaning = match.contents
                            break

                if meaning:
                    # Keep the (headword, raw section contents) pair
                    result.append((title, meaning))

    with open(outtsv, "w", encoding="utf-8") as f:
        for title, meaning in result:
            # One record per line: tabs and newlines inside the contents would break the TSV
            meaning = meaning.replace('\t', '').replace('\n', '<br>')
            f.write(f"{title}\t{meaning}\n")


# Extract from the Japanese-edition dump
# extract_target_words("ja", "ru", "verb")
# extract_target_words("ja", "ru", "adjective")
# extract_target_words("ja", "ru", "noun")
# extract_target_words("ja", "ru", "proper_noun")
# extract_target_words("ja", "ru", "pronoun")

# Extract from the English-edition dump
# extract_target_words("en", "ru", "verb")
extract_target_words("en", "ru", "adjective")
# extract_target_words("en", "ru", "noun")
# extract_target_words("en", "ru", "proper_noun")
# extract_target_words("en", "ru", "pronoun")

/content/drive/MyDrive/Wiktionary


# Wiktionaryの処理結果から語の第1義を抽出するスクリプト

In [None]:
import csv, re, regex, os
import wikitextparser as wtp
import pandas as pd

def process_link(s):
    """Replace piped wikilinks ``[[target|label]]`` in ``s`` with just ``label``.

    Links without a pipe (``[[target]]``) are left untouched.
    """
    # Raw string: "\[" in a normal literal is an invalid escape sequence.
    # The capture group holds the label after the pipe.
    pattern = re.compile(r"\[{2}[^\[\]\|]+\|([^\[\]\|]+)\]{2}")
    return pattern.sub(lambda m: m.group(1), s)

def process_parted(line):
    """Keep only the part before "/" (or else before "#") of every whitespace-separated word.

    Words are re-joined with single spaces.
    """
    def leading_part(word):
        # "/" takes precedence over "#", mirroring an if/elif chain
        for separator in ("/", "#"):
            if separator in word:
                return word.split(separator)[0]
        return word

    return " ".join(leading_part(word) for word in line.split())


def process_comment(s: str) -> str:
    """Return ``s`` with every HTML comment found by the wikitext parser removed."""
    cleaned = s
    for comment_text in [comment.string for comment in wtp.parse(s).comments]:
        cleaned = cleaned.replace(comment_text, "")
    return cleaned


def process_wikilink(s: str) -> str:
    """Replace each wikilink in ``s`` with its display text, or its target when it has none."""
    replacements = [(link.string, link.text or link.title) for link in wtp.parse(s).wikilinks]
    for markup, shown in replacements:
        s = s.replace(markup, shown)
    return s


def process_templates(s: str) -> str:
    parsed = wtp.parse(s)
    if templates := parsed.templates:
        for template in templates:
            if template.nesting_level > 1:
                continue
            match template.name:
                case "ふりがな" | "wikipedia-s" | "w":
                    s = s.replace(template.string, template.arguments[0].value)
                case "おくりがな":
                    ret = "".join(template.string[2:-2].split("|")[1:3])
                    s = s.replace(template.string, ret)
                case "おくりがな2":
                    ret = "".join(template.string[2:-2].split("|")[1::2])
                    s = s.replace(template.string, ret)
                case "おくりがな3":
                    args = template.string[2:-2].split("|")
                    ret = "".join(list(map(lambda i: args[i], [1,3,4,6])))
                    s = s.replace(template.string, ret)
                case "m" | "l":
                    s = s.replace(template.string, template.arguments[1].value)
                case _:
                    s = s.replace(template.string, "")
    return s


def remove_cite(s):
    """Remove ``<ref>`` citations from ``s``.

    Handles both self-closing tags (``<ref name="x"/>``) and paired tags
    (``<ref ...>...</ref>``). Each citation is removed on its own, so text
    between two citations is preserved.
    """
    # Self-closing refs go first so they are not mistaken for an opening tag.
    # \b keeps other tags that merely start with "ref" (e.g. <references/>) intact.
    s = re.sub(r"<ref\b[^>]*/>", "", s)
    # Non-greedy: stop at the nearest closing tag instead of the last one.
    s = re.sub(r"<ref\b[^>]*>.*?</ref\s*>", "", s, flags=re.DOTALL)
    return s

def process_colon_comma(s, is_infl):
    """Cut a gloss at the separators ";", ":" and ",".

    When ``is_infl`` is false, everything from the first separator on is dropped
    (";" first, then ":", then ",").

    When ``is_infl`` is true, the first of ";", ":", "," that occurs in ``s`` is
    chosen, and the text after its first occurrence (up to the next occurrence of
    the same separator) is returned if it starts with "to " once leading
    whitespace is removed; in every other case the result is "".
    """
    separators = (";", ":", ",")

    if not is_infl:
        for separator in separators:
            s = s.split(separator)[0]
        return s

    for separator in separators:
        pieces = s.split(separator)
        if len(pieces) > 1:
            candidate = pieces[1].lstrip()
            return candidate if candidate.startswith("to ") else ""
    return ""

# print(process_wikilink("[[くりかえす|繰り返し]][[いう|言う]]"))
# print(process_templates("{{ふりがな|栗|くり}}"))
# print(process_templates("{{おくりがな2|教|おし|える|おしえる}}"))
# print(process_templates("{{おくりがな|小|さい|ちいさい}}"))
# print(process_templates("{{おくりがな3|見|み||積|つ|もる|みつもる}}"))

def get_first_meaning_ja(meaning: str) -> str | None:
    """Reduce a Japanese-Wiktionary section to a short first gloss.

    Args:
        meaning: Raw section contents with newlines encoded as ``<br>``.

    Returns:
        The cleaned first gloss, or ``None`` when the entry is skipped (the
        first sense contains ``{{l``) or nothing usable remains (empty, or
        still containing Cyrillic characters).
    """
    # Take the first item of the first wikitext list; fall back to the whole
    # text when the section contains no list.
    meaning_lists = wtp.parse(meaning.replace("<br>", "\n")).get_lists()
    if meaning_lists:
        prime_meaning = meaning_lists[0].items[0]
    else:
        prime_meaning = meaning.replace("<br>", " ")

    # Keep only the first sentence
    prime_meaning = prime_meaning.split("。")[0]

    # Exclude inflected forms (senses containing a template that starts with "{{l")
    if "{{l" in prime_meaning:
        return None

    # Remove parenthesised remarks (ASCII and full-width parentheses)
    prime_meaning = re.sub(r"(\(|（)[^\)）]+(\)|）)", '', prime_meaning)

    prime_meaning = process_wikilink(prime_meaning)
    prime_meaning = process_templates(prime_meaning)
    prime_meaning = remove_cite(prime_meaning)
    prime_meaning = re.sub(r'<[^>]+>', '', prime_meaning)  # remove remaining XML/HTML tags
    prime_meaning = re.sub(r'《[^》]+》', '', prime_meaning)  # remove labels such as 《鳥類》
    prime_meaning = prime_meaning.replace(" ", "")
    prime_meaning = prime_meaning.replace(":", "")
    prime_meaning = re.sub(r"'{2,}", '', prime_meaning)  # remove runs of two or more single quotes
    # Keep only the first alternative of an enumeration
    prime_meaning = prime_meaning.split('、')[0]
    prime_meaning = prime_meaning.split('・')[0]
    if prime_meaning and not regex.search(r'\p{Cyrillic}', prime_meaning):
        return prime_meaning
    return None

def get_first_meaning_en(meaning: str, pos: str) -> str | None:
    """Reduce an English-Wiktionary section to a short first gloss.

    Args:
        meaning: Raw section contents with newlines encoded as ``<br>``.
        pos: Part of speech; double quotes are kept only for "verb".

    Returns:
        The cleaned first gloss, or ``None`` when the first sense starts with
        ``{{infl`` or nothing usable remains after cleaning.
    """
    # Take the first item of the first wikitext list; fall back to the whole
    # text when the section contains no list.
    meaning_lists = wtp.parse(meaning.replace("<br>", "\n")).get_lists()
    if meaning_lists:
        first_meaning = meaning_lists[0].items[0]
    else:
        first_meaning = meaning.replace("<br>", " ")

    if first_meaning.startswith("{{infl"):
        return None

    # Remember whether an {{infl...}} template occurs later in the sense:
    # process_colon_comma treats such glosses differently.
    is_infl = "{{infl" in first_meaning

    first_meaning = remove_cite(first_meaning)
    first_meaning = process_comment(first_meaning)
    first_meaning = process_wikilink(first_meaning)
    first_meaning = process_templates(first_meaning)
    first_meaning = process_templates(first_meaning)  # second pass for nested templates
    first_meaning = re.sub(r"\([^\(\)]*\)", "", first_meaning)
    first_meaning = re.sub(r"\([^\(\)]*\)", "", first_meaning)  # second pass for nested parentheses
    first_meaning = re.sub(r"\[[^\[\]]*\]", "", first_meaning)  # remove bracketed remarks
    # Handles lemma entries that nevertheless carry an infl template
    first_meaning = process_colon_comma(first_meaning, is_infl)
    first_meaning = first_meaning.split(",")[0]
    first_meaning = first_meaning.split(" - ")[0]
    first_meaning = process_parted(first_meaning)
    first_meaning = first_meaning.replace("  ", " ")
    first_meaning = re.sub(r"\.{1,}$", '', first_meaning)  # drop trailing periods
    first_meaning = first_meaning.replace("’", "'")
    first_meaning = re.sub(r"'{2,}", '', first_meaning)  # remove runs of two or more single quotes
    if pos != "verb":
        first_meaning = first_meaning.replace('"', "")
    first_meaning = first_meaning.replace('?', "")
    first_meaning = first_meaning.strip()

    # Reject empty results, a lone dash, and results made only of Cyrillic
    # letters and accent marks, e.g. what is left of
    # "# {{archaic form of|ru|внема́точной|nodot=1}} – {{inflection of|ru|внема́точный||ins|s|f}}"
    if first_meaning and first_meaning != "–" and not regex.search(r'^[\p{Cyrillic}́]+$', first_meaning):
        return first_meaning.replace("To ", "to ")
    return None

def make_simple_dictionary(lang, pos, wiktlang):
    """Build ``{lang}-{pos}-{wiktlang}.csv`` with one ``word,first meaning`` line per entry.

    Reads ``{wiktlang}output-{lang}-{pos}.tsv`` (``headword<TAB>raw section``),
    skips headwords containing a space or a period, extracts the first meaning
    with ``get_first_meaning_ja`` / ``get_first_meaning_en`` and prints the
    number of lines written. Malformed rows are printed and skipped.

    Args:
        lang: Language of the entries (e.g. "ru").
        pos: Part of speech (key used in the TSV file name).
        wiktlang: Wiktionary edition the TSV was extracted from ("ja" or "en").
    """
    outfile_name = f"{lang}-{pos}-{wiktlang}.csv"
    # Delete the output file if it already exists
    if os.path.exists(outfile_name):
        os.remove(outfile_name)

    with open(f'{wiktlang}output-{lang}-{pos}.tsv', 'r', encoding='utf-8', newline='') as f, \
            open(outfile_name, 'w', encoding='utf-8') as fout:
        count = 0
        # The TSV is written without any quoting, so quote characters must be
        # read literally; default quoting would merge rows at a leading '"'.
        for rows in csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE):
            try:
                word, meaning = rows
            except ValueError:
                # Report the malformed row and keep going instead of truncating the output
                print(rows)
                continue
            # Skip headwords that are phrases or abbreviations
            if " " in word or "." in word:
                continue
            if wiktlang == "ja":
                first_meaning = get_first_meaning_ja(meaning)
            else:
                first_meaning = get_first_meaning_en(meaning, pos)
            if first_meaning:
                fout.write(f"{word},{first_meaning}\n")
                count += 1
        print(count)

def test_get_first_meaning(lang, pos, wiktlang, word):
    """Print the first meaning extracted for a single headword (manual spot check).

    Args:
        lang: Language of the entries (e.g. "ru").
        pos: Part of speech; selects the TSV file and is passed on for English entries.
        wiktlang: Wiktionary edition the TSV was extracted from ("ja" or "en").
        word: Headword to look up in the first TSV column.

    Raises:
        IndexError: If ``word`` does not occur in the TSV.
    """
    with open(f'{wiktlang}output-{lang}-{pos}.tsv', 'r', encoding='utf-8') as f:
        # The TSV is written without quoting, so quote characters are read literally
        df = pd.read_csv(f, header=None, delimiter='\t', quoting=csv.QUOTE_NONE)
        meaning = df[df.iloc[:, 0] == word].iloc[0, 1]
        if wiktlang == "ja":
            first_meaning = get_first_meaning_ja(meaning)
        else:
            # get_first_meaning_en requires the part of speech as second argument
            first_meaning = get_first_meaning_en(meaning, pos)
        print(first_meaning)

# Build dictionaries from the Japanese-edition extracts
# make_simple_dictionary("ru", "verb", "ja")
# make_simple_dictionary("ru", "adjective", "ja")
# make_simple_dictionary("ru", "noun", "ja")
# make_simple_dictionary("ru", "proper_noun", "ja")
# make_simple_dictionary("ru", "pronoun", "ja")

# Build dictionaries from the English-edition extracts
# make_simple_dictionary("ru", "verb", "en")
make_simple_dictionary("ru", "adjective", "en")
# make_simple_dictionary("ru", "noun", "en")
# make_simple_dictionary("ru", "proper_noun", "en")
# make_simple_dictionary("ru", "pronoun", "en")

# Spot checks for single headwords
# test_get_first_meaning("ru", "verb", "ja", "следовать")
# test_get_first_meaning("ru", "adjective", "en", "картавый")

8262


# 出力結果のテスト

## ru-verb-en.csv

In [None]:
import pandas as pd

# Sanity check of the verb dictionary (no header row: column 0 = word, column 1 = gloss)
df = pd.read_csv("ru-verb-en.csv", header=None)
print(df.columns)

# Flag glosses where a space is followed by a character other than an ASCII letter,
# apostrophe or space, and glosses that do not start with "to".
filtered_df = df[df.iloc[:, 1].str.contains(" [^a-zA-Z' ]") | ~df.iloc[:, 1].str.startswith("to")]

# Print the suspicious rows
print(filtered_df)


## ru-noun-en.csv

In [None]:
import pandas as pd

# Show every row and column when printing DataFrames
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Sanity check of the noun dictionary (no header row: column 0 = word, column 1 = gloss)
df = pd.read_csv("ru-noun-en.csv", header=None)
print(df.columns)

# Flag glosses where a space is followed by a character other than an ASCII letter,
# apostrophe or space.
filtered_df = df[df.iloc[:, 1].str.contains(" [^a-zA-Z' ]")]

# Print the suspicious rows
print(filtered_df)


Index([0, 1], dtype='int64')
                      0                                                  1
199                 наш  the name of the letter н in the Old Russian al...
611                  ка                              the Cyrillic letter К
745             постеля                                   same as посте́ль
1079            луковка                             diminutive of луковица
1083              бухма                                         see брюква
1217                 аз                      name of the Cyrillic letter А
1602                 бэ                              The Cyrillic letter Б
1604                 цэ                              The Cyrillic letter Ц
1606                 дэ                              The Cyrillic letter Д
1607                 эф                              The Cyrillic letter Ф
1611                 эм                              The Cyrillic letter М
1612                 эн                              The Cyrillic lette

## ru-adjective-en.csv

In [None]:
import pandas as pd

# Sanity check of the adjective dictionary (no header row: column 0 = word, column 1 = gloss)
df = pd.read_csv("ru-adjective-en.csv", header=None)
print(df.columns)

# Flag glosses where a space is followed by a character other than an ASCII letter,
# apostrophe or space.
filtered_df = df[df.iloc[:, 1].str.contains(" [^a-zA-Z' ]")]

# Print the suspicious rows
print(filtered_df)

Index([0, 1], dtype='int64')
Empty DataFrame
Columns: [0, 1]
Index: []


# 活用形を渡すと他の活用形とその意味を返すスクリプト

## ライブラリインストール

In [None]:
!pip install -U pip
!pip install pymorphy3
!pip install jinf
!pip install rhoknp
!pip install lemminflect
!pip install spacy==3.7.5
# !python -m spacy download en_core_web_lg
# !pip install -q fastapi nest-asyncio uvicorn pyngrok

Collecting pip
  Downloading pip-25.0-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0-py3-none-any.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m55.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0
Collecting pymorphy3
  Downloading pymorphy3-2.0.2-py3-none-any.whl.metadata (1.8 kB)
Collecting dawg-python>=0.7.1 (from pymorphy3)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3

In [None]:
!python -m spacy validate

[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation: /usr/local/lib/python3.11/dist-packages/spacy[0m

NAME             SPACY            VERSION                            
en_core_web_lg   >=3.8.0,<3.9.0   [38;5;2m3.8.0[0m   [38;5;2m✔[0m
en_core_web_sm   >=3.7.2,<3.8.0   [38;5;3m3.7.1[0m   --> 3.8.0     

[1m
Use the following commands to update the packages:
python -m spacy download en_core_web_sm



In [None]:
%cd /content
!wget http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.02.tar.xz
!tar xJvf jumanpp-1.02.tar.xz
%cd jumanpp-1.02
!./configure
!make
!sudo make install
%cd "/content/drive/MyDrive/Wiktionary/"

In [None]:
!which jumanpp

/usr/local/bin/jumanpp


## 日本語活用テスト

In [None]:
from rhoknp import Jumanpp
from jinf import Jinf

# Experiment: analyse a phrase with Juman++ and re-inflect its final morpheme.
jinf = Jinf()
jumanpp = Jumanpp(skip_sanity_check=True)
analised = jumanpp.apply_to_sentence("合体する")
mrphlist = analised.morphemes
# Inspect the last morpheme (the verb) and all of its attributes
print(mrphlist[-1])
print(mrphlist[-1].__dict__)
# Keep the surface forms of everything before the last morpheme unchanged
results = list(map(lambda mrph: mrph.surf, mrphlist[0:-1]))
verb = mrphlist[-1]
# print(list(map(lambda mrph: mrph.conjtype, mrphlist)))
# Convert the verb from its current conjugation form to 未然形 and append it
results.append(jinf(verb.surf, verb.conjtype, verb.conjform, "未然形"))
print(results)
# print(jinf.convert_pyknp_morpheme(mrph, "未然形"))

する
{'_text': 'する', 'reading': 'する', 'lemma': 'する', 'pos': '動詞', 'pos_id': 2, 'subpos': '*', 'subpos_id': 0, 'conjtype': 'サ変動詞', 'conjtype_id': 16, 'conjform': '基本形', 'conjform_id': 2, '_base_phrase': None, '_sentence': <rhoknp.units.sentence.Sentence: '合体する'>, 'semantics': {'代表表記': 'する/する', '付属動詞候補（基本）': True, '自他動詞': '自:成る/なる'}, 'features': {}, 'homographs': [], 'index': 1}
['合体', 'さ']


## 英語活用テスト

In [None]:
from lemminflect import getInflection, getAllInflections

# Experiment: getInflection returns the forms for one tag as a tuple;
# getAllInflections returns a dict that maps each tag to such a tuple.
print(getInflection("get", tag="VBD"))
print(getInflection("watch", tag="VBN"))
print(getAllInflections("get"))

('got',)
('watched',)
{'VBD': ('got',), 'VBN': ('gotten',), 'VBG': ('getting',), 'VBZ': ('gets',), 'VB': ('get',), 'VBP': ('get',)}


## and/or検証

In [None]:
import spacy
from spacy import displacy

# nlp = spacy.load("en_core_web_sm")
nlp = spacy.load("en_core_web_lg")
def detect_heads(s: str) -> list[int]:
    """Return the indices of the ROOT token and of its matching coordinated heads.

    The text is parsed with the module-level ``nlp`` pipeline. The ROOT token's
    index is always recorded. A later token that appears in the ROOT's
    ``conjuncts`` is recorded too, but only when its ``morph`` equals the
    ROOT's. The parsed text, the root and the resulting indices are printed.

    Args:
        s: Text to parse.

    Returns:
        The collected token indices, in the order the tokens were visited.
    """
    doc = nlp(s)
    print(doc.text)
    conjuncts = []
    root = None
    indices = []
    for token in doc:
        if token.dep_ == "ROOT":
            root = token
            indices.append(token.i)
            # extend() rather than append(*...): list.append takes exactly one
            # argument, so unpacking two or more conjuncts raised TypeError.
            conjuncts.extend(token.conjuncts)
            continue
        if root and token in conjuncts and token.morph == root.morph:
            indices.append(token.i)
    print(f"root: {root}, indices: {indices}")
    print()
    return indices

# Sample definitions containing "and"/"or" coordination.
detect_heads("to fast and attend church services in preparation for communion")
detect_heads("to start to toss and turn")
detect_heads("to become strong and healthy")
detect_heads("to get dirty and frayed from being worn a long time")
detect_heads("to go to school or office and come back")

to fast and attend church services in preparation for communion
root: fast, indices: [1, 3]

to start to toss and turn
root: start, indices: [1]

to become strong and healthy
root: become, indices: [1]

to get dirty and frayed from being worn a long time
root: get, indices: [1]

to go to school or office and come back
root: go, indices: [1, 7]



## someone/something検証

In [None]:
import spacy
from spacy import displacy
from spacy.tokens import Token, Doc
from typing import Iterator

# Pipeline shared by the helper functions defined below in this cell.
nlp = spacy.load("en_core_web_lg")

def trace_conj(token: Token, root: Token) -> bool:
    """Check whether ``token`` hangs off ``root`` directly or via a chain of ``conj`` heads.

    Climbs the head links starting at ``token``: succeeds as soon as a head
    equals ``root``, keeps climbing while that head's dependency label is
    ``conj``, and fails on any other label.
    """
    current = token
    while True:
        parent = current.head
        if parent == root:
            return True
        if parent.dep_ != "conj":
            return False
        current = parent

def search_adp_of_root(children: Iterator[Token]) -> list[int]:
    """Collect the indices of adposition (``ADP``) tokens reachable from ``children``.

    Every child whose ``pos_`` is ``ADP`` contributes its index, followed by
    the indices found by applying the same search to its own children.

    Args:
        children: Tokens to inspect (e.g. the children of the ROOT token).

    Returns:
        Indices of the ADP tokens found, in visiting order.
    """
    indices = []
    for child in children:
        if child.pos_ != "ADP":
            continue
        indices.append(child.i)
        grandchildren = tuple(child.children)
        if not grandchildren:
            # A childless ADP ends the search; later siblings are not inspected.
            return indices
        # extend() rather than append(*...): the recursive call can return
        # several indices, and list.append accepts only one argument.
        indices.extend(search_adp_of_root(grandchildren))
    return indices

def is_no_complm(index: int, doc: Doc) -> bool:
    """Return True when no complementizer-like token precedes position ``index``.

    Tokens are scanned from the start of ``doc`` up to, but not including, the
    token whose ``i`` equals ``index``. A token with dependency label ``mark``
    or tag ``WP`` in that span makes the result False.
    """
    def preceding():
        # Yield tokens until the one at ``index`` is reached.
        for candidate in doc:
            if candidate.i == index:
                return
            yield candidate

    return not any(
        earlier.dep_ == "mark" or earlier.tag_ == "WP" for earlier in preceding()
    )

def detect_some(s: str) -> list[int]:
    """Find the ``someone``/``something`` tokens that belong to the ROOT verb.

    The text is parsed with the module-level ``nlp`` pipeline and every token
    is printed for inspection. After the ROOT has been seen, a token whose text
    is ``someone`` or ``something`` is recorded when

    * its head is the ROOT (the scan then stops), or
    * its head is a ``ccomp`` that ``trace_conj`` links to the ROOT and
      ``is_no_complm`` finds no complementizer before the token.

    Any other token whose index was returned by ``search_adp_of_root`` for the
    ROOT's children contributes those of its children that are
    ``someone``/``something``.

    Args:
        s: Text to parse.

    Returns:
        The recorded token indices (also printed).
    """
    somes = ["someone", "something"]
    doc = nlp(s)
    print(doc.text)
    root = None
    indices = []
    child_adp_i = []
    for token in doc:
        print(token.text, token.pos_, token.dep_, token.head, token.tag_)
        if token.dep_ == "ROOT":
            root = token
            child_adp_i = search_adp_of_root(token.children)
            continue
        if root and token.text in somes:
            head = token.head
            if head == root:
                indices.append(token.i)
                # Stop once a someone/something directly governed by ROOT is found.
                break
            elif head.dep_ == "ccomp" and trace_conj(head, root) and is_no_complm(token.i, doc):
                indices.append(token.i)
        elif token.i in child_adp_i:
            for child in token.children:
                if child.text in somes:
                    indices.append(child.i)

    print(indices)
    print()
    # The signature promises list[int]; previously the function fell off the end and returned None.
    return indices

# Sample definitions for detect_some; only the uncommented call is executed.
# detect_some("to fuck someone over")
# detect_some("to close by hanging something over it")
# detect_some("to force someone to do something contrary to his personal will and wishes")
# detect_some("to listen to something incompletely or not to the end")
# detect_some("to make fun of someone")
# detect_some("to get the hang of something")
# detect_some("to make someone occupy some space")
# detect_some("to make someone cold")
# detect_some("to make or let someone sit down")
# detect_some("to treat someone to something delicious")
# detect_some("to name someone as something")
detect_some("to spit past or over something")
# detect_some("to start doing something")
# detect_some("to just happen to say something")
# detect_some("to establish communication or agree on something by exchanging writings")
# detect_some("to buy what someone else wanted to get")
# detect_some("to say that someone else wanted to get it")

to spit past or over something
to PART aux spit TO
spit VERB ROOT spit VB
past ADV advmod spit RB
or CCONJ cc past CC
over ADP conj past IN
something PRON pobj over NN
[]



## 使役動詞判定検証

In [None]:
import spacy

def is_causative(s: str) -> bool:
    """Decide whether ``s`` is headed by a causative verb that takes a verbal complement.

    The text is parsed with the module-level ``nlp`` pipeline and each visited
    token is printed. The result is False as soon as the ROOT's text is not one
    of ``make``/``let``/``have``. Otherwise it is True once a ``ccomp`` or
    ``xcomp`` token tagged ``VERB`` is linked to the ROOT by ``trace_conj`` and
    ``is_no_complm`` finds no complementizer before it. False if the scan ends
    without such a token.
    """
    doc = nlp(s)
    causative_root = None
    for token in doc:
        print(token.text, token.pos_, token.dep_, token.head, token.tag_)
        if token.dep_ == "ROOT":
            if token.text not in ("make", "let", "have"):
                return False
            causative_root = token
            continue
        if not causative_root:
            continue
        is_verbal_complement = token.dep_ in ("ccomp", "xcomp") and token.pos_ == "VERB"
        if (
            is_verbal_complement
            and trace_conj(token, causative_root)
            and is_no_complm(token.i, doc)
        ):
            return True
    return False

# Sample inputs for is_causative; only the uncommented call is executed.
# print(is_causative("to make someone occupy some space"))
# print(is_causative("to make someone cold"))
# print(is_causative("to make what someone wants to buy"))
# print(is_causative("to make or let someone sit down"))
print(is_causative("to make understand"))
# print(is_causative("to make noise"))

to PART aux make TO
make VERB ROOT make VB
understand NOUN dobj make NN
False


## ロシア語活用テスト

### 形容詞

In [None]:
import pymorphy3, itertools

# Gender grammeme (as read from a parse's tag.gender) -> Japanese gender label.
RU_GENDER_DICT = {
    "masc": "男性",
    "femn": "女性",
    "neut": "中性",
}

# Case grammeme (as read from a parse's tag.case) -> Japanese case label.
# Note that "accs" and "acc2" share the same label.
RU_CASE_DICT = {
    "nomn": "主格",
    "gent": "生格",
    "datv": "与格",
    "accs": "対格",
    "ablt": "造格",
    "loct": "前置格",
    "voct": "呼格",
    "gen2": "分格",
    "acc2": "対格",
    "loc2": "処格",
}

# Weights summed by sort_adj_inf to order inflection labels:
# thousands = gender, hundreds = number, tens = case, ones = animacy.
RU_DICT_FOR_SORT_ADJ = {
    "男": 1000,
    "女": 2000,
    "中": 3000,
    "単": 100,
    "複": 200,
    "主": 10,
    "生": 20,
    "与": 30,
    "対": 40,
    "造": 50,
    "前置": 60,
    "呼": 70,
    "分": 80,
    "処": 90,
    "活動": 1,
    "非活動": 2,
}

def sort_adj_inf(inf):
    """Compute a numeric sort key for an inflection label such as ``男性単数対格非活動体``.

    The label is consumed left to right. For each of the markers ``性``, ``数``,
    ``格`` and ``体`` that is present, the text in front of the marker is looked
    up in ``RU_DICT_FOR_SORT_ADJ`` and its weight added. A label containing
    none of the markers yields 0.
    """
    weight = 0
    rest = inf
    for marker in ("性", "数", "格", "体"):
        if marker in rest:
            label, rest = rest.split(marker)
            weight += RU_DICT_FOR_SORT_ADJ[label]
    return weight

def get_inf_desc_adj(word):
    morph = pymorphy3.MorphAnalyzer()
    morphs = morph.parse(word)
    # print(morphs)
    morphemes = list(filter(lambda m: m.tag.POS in ["ADJF", "ADJS", "COMP"], morph.parse(word)))
    # グループ化するためにnormal_formでソート
    morphemes = sorted(morphemes, key=lambda x: x.normal_form)
    print(morphemes)
    descs = []
    # 原形が複数存在する場合があるため，normal_formでグループ化する
    for normal, group in itertools.groupby(morphemes, key=lambda x: x.normal_form):
        morphemes = list(group)
        cases = []
        for morpheme in morphemes:
            # 性・数
            if gender := morpheme.tag.gender:
                gen_num = f"{RU_GENDER_DICT[gender]}単数"
            else:
                gen_num = "複数"

            match morpheme.tag.POS:
                case "ADJF": # 長語尾
                    if {"Supr"} in morpheme.tag:
                        form = ""
                        cases.append(f"最上級")
                        break
                    form = "長語尾"
                    # 格チェック
                    wcase = RU_CASE_DICT[morpheme.tag.case]
                    if wcase == "対格":
                        # 活動体チェック
                        anim = "活動体" if morpheme.tag.animacy == "anim" else "非活動体"
                        cases.append(f"{gen_num}{wcase}{anim}")
                    else:
                        cases.append(f"{gen_num}{wcase}")
                case "ADJS": # 短語尾
                    form = "短語尾"
                    cases.append(f"{gen_num}")
                case "COMP": # 比較級
                    form = ""
                    cases.append(f"比較級")
        descs.append(f"{word}は形容詞{normal}の{form}{'/'.join(sorted(cases, key=sort_adj_inf))}です")
    return descs

# Try the adjective describer on several sample word forms.
print(get_inf_desc_adj("большой"))
print(get_inf_desc_adj("большие"))
print(get_inf_desc_adj("наибольший"))
print(get_inf_desc_adj("больше"))
print(get_inf_desc_adj("больший"))
print(get_inf_desc_adj("велик"))
print(get_inf_desc_adj("рад"))

[Parse(word='большой', tag=OpencorporaTag('ADJF,Qual femn,sing,ablt'), normal_form='большой', score=0.327433, methods_stack=((DictionaryAnalyzer(), 'большой', 532, 11),)), Parse(word='большой', tag=OpencorporaTag('ADJF,Qual femn,sing,gent'), normal_form='большой', score=0.292035, methods_stack=((DictionaryAnalyzer(), 'большой', 532, 8),)), Parse(word='большой', tag=OpencorporaTag('ADJF,Qual femn,sing,loct'), normal_form='большой', score=0.176991, methods_stack=((DictionaryAnalyzer(), 'большой', 532, 13),)), Parse(word='большой', tag=OpencorporaTag('ADJF,Qual masc,sing,nomn'), normal_form='большой', score=0.088495, methods_stack=((DictionaryAnalyzer(), 'большой', 532, 0),)), Parse(word='большой', tag=OpencorporaTag('ADJF,Qual inan,masc,sing,accs'), normal_form='большой', score=0.070796, methods_stack=((DictionaryAnalyzer(), 'большой', 532, 4),)), Parse(word='большой', tag=OpencorporaTag('ADJF,Qual femn,sing,datv'), normal_form='большой', score=0.044247, methods_stack=((DictionaryAnalyze

### 動詞

In [None]:
import pymorphy3

def hoge(word):
    """Print, for every parse of ``word``, the word forms of that parse's lexeme.

    For a parse whose aspect is ``impf``, each lexeme entry is first
    re-inflected with ``perf`` removed from and ``impf`` added to its
    grammemes. Entries that are falsy (e.g. a failed inflection) are dropped
    before printing.
    """
    analyzer = pymorphy3.MorphAnalyzer()
    for parse in analyzer.parse(word):
        forms = parse.lexeme
        if parse.tag.aspect == "impf":
            forms = [
                form.inflect((form.tag.grammemes - {"perf"}) | {"impf"})
                for form in forms
            ]
        print([form.word for form in forms if form])

def fuga(word):
    """Print each form in the lexeme of the first parse of ``word`` with its grammemes.

    Note: a later cell defines another ``fuga``; whichever cell ran last wins.
    """
    analyzer = pymorphy3.MorphAnalyzer()
    first_parse = analyzer.parse(word)[0]
    for form in first_parse.lexeme:
        print(form.word, form.tag.grammemes)

# Sample calls; only the last one is executed. fuga has no return statement,
# so the surrounding print() also prints None.
# aspect = morpheme.tag.aspect

# print(hoge("большой"))
# print(hoge("наибольший"))
# print(hoge("больше"))
# print(hoge("использую"))
# print(hoge("могу"))
# print(hoge("ела"))
# print(hoge("евши"))
# print(hoge("еденный"))
# print(hoge("евший"))
# print(hoge("ешьте"))
# print(hoge("едено"))
# print(hoge("скучать"))
print(fuga("красивее"))

### 名詞

In [None]:
# Scratch checks: str.format with a named placeholder, and reverse sorting of gender labels.
s = "{word}が"
print(s.format(word="猫"))
l = ["masc", "femn", "None"]
print(sorted(l, reverse=True))

猫が
['masc', 'femn', 'None']


In [None]:
import pymorphy3

def fuga(word):
    """Print each entry in the lexeme of the first parse of ``word``: its word form, then the entry itself.

    Note: this redefines the ``fuga`` from the verb section above.
    """
    analyzer = pymorphy3.MorphAnalyzer()
    first_parse = analyzer.parse(word)[0]
    for entry in first_parse.lexeme:
        print(entry.word, entry)

# Sample words for fuga; only the uncommented call is executed.
# fuga("ёж")
# fuga("масс-медиа")
# fuga("очки")
fuga("авось")
# fuga("кафе")
# fuga("лес")
# fuga("собака")
# fuga("Аладдин")
# fuga("Арагви")
# fuga("Чехов")
# fuga("АСЕАН")
# fuga("они")
# fuga("них")
# fuga("моего")
# fuga("это")

авось Parse(word='авось', tag=OpencorporaTag('PRCL'), normal_form='авось', score=1.0, methods_stack=((DictionaryAnalyzer(), 'авось', 22, 0),))


# CSV関係

## CSV結合スクリプト

In [None]:
%cd "/content/drive/MyDrive/Wiktionary/"

import csv
import functools
import itertools
import os
import re

import pandas as pd

dfs_per_lang = []

# Remove a stale merged.csv if one already exists.
if os.path.exists("merged.csv"):
    os.remove("merged.csv")

# Collect the "<lang>-<pos>-<meaning lang>.csv" files in the current directory.
csv_files = [f for f in os.listdir() if re.match(r'.+-.+-.+\.csv', f)]
# groupby() only merges adjacent items, so sort by the grouping key first.
csv_files = sorted(csv_files, key=lambda x: x.split('-')[0])
for lang, lang_group in itertools.groupby(csv_files, key=lambda x: x.split('-')[0]):
    lang_files = list(lang_group)
    print(lang_files)
    dfs_per_pos = []
    # Read the CSVs one part of speech at a time.
    lang_files = sorted(lang_files, key=lambda x: x.split('-')[1])
    for pos, pos_group in itertools.groupby(lang_files, key=lambda x: x.split('-')[1]):
        pos_files = list(pos_group)
        print(pos_files)
        dfs = []
        for f in pos_files:
            df = pd.read_csv(f, header=None)
            # Name the columns: "word" plus "<meaning lang>-meaning" (e.g. ja/en).
            mean_lang = f.split('-')[2].split(".")[0]
            df.columns = ["word", f"{mean_lang}-meaning"]
            dfs.append(df)
        # Outer-join every frame of this part of speech on "word". reduce() also
        # copes with a single file (returned as-is) or more than two files,
        # where indexing dfs[0]/dfs[1] would fail or drop data.
        df_merged = functools.reduce(
            lambda left, right: pd.merge(left, right, on="word", how="outer"), dfs
        )
        # Add a "pos" column holding the part-of-speech name taken from the file name.
        df_merged.insert(1, "pos", pos)
        dfs_per_pos.append(df_merged)

    # Stack the per-part-of-speech frames vertically.
    df_concat = pd.concat(dfs_per_pos)
    # Add a "lang" column holding the language prefix taken from the file name.
    df_concat.insert(1, "lang", lang)
    dfs_per_lang.append(df_concat)

# Stack the per-language frames vertically.
df_concat = pd.concat(dfs_per_lang)

# Write the result; csv is imported in this cell so csv.QUOTE_NONE resolves on a fresh kernel.
df_concat.to_csv("merged.csv", index=False, quoting=csv.QUOTE_NONE)

/content/drive/MyDrive/Wiktionary
['ru-adjective-ja.csv', 'ru-verb-ja.csv', 'ru-verb-en.csv', 'ru-adjective-en.csv', 'ru-noun-ja.csv', 'ru-proper_noun-ja.csv', 'ru-pronoun-ja.csv', 'ru-noun-en.csv', 'ru-proper_noun-en.csv', 'ru-pronoun-en.csv']
['ru-adjective-ja.csv', 'ru-adjective-en.csv']
['ru-noun-ja.csv', 'ru-noun-en.csv']
['ru-pronoun-ja.csv', 'ru-pronoun-en.csv']
['ru-proper_noun-ja.csv', 'ru-proper_noun-en.csv']
['ru-verb-ja.csv', 'ru-verb-en.csv']


## CSV検索テスト

In [None]:
%cd "/content/drive/MyDrive/Wiktionary/"

import pandas as pd

df = pd.read_csv("merged.csv")

# Select the rows whose headword equals the probe word.
df_match = df[df['word'] == 'hoge']

print(df_match)
print(df_match.empty)
# .iloc[0] raises IndexError on an empty selection, so only read the first row when there is one.
if not df_match.empty:
    # First matching row: its language, its Japanese meaning, and whether that meaning is missing.
    print(df_match['lang'].iloc[0])
    print(df_match['ja-meaning'].iloc[0])
    print(pd.isnull(df_match['ja-meaning'].iloc[0]))

## CSV更新テスト

In [None]:
%cd "/content/drive/MyDrive/Wiktionary/"

import csv  # required for csv.QUOTE_NONE below; it was not imported in this cell

import pandas as pd

df = pd.read_csv("update_test.csv")
df.columns = ["word", "meaning"]

# Rewrite the meaning of the one row that matches both the word and its current meaning.
df.loc[(df['word'] == "мы") & (df['meaning'] == "我々は"), 'meaning'] = "我々"
df.to_csv("update_test.csv", index=False, quoting=csv.QUOTE_NONE)

# Read the file back to check the update was persisted.
df2 = pd.read_csv("update_test.csv")
print(df2)

# 辞書データからテストパターンに使える単語を探す

In [None]:
import pandas as pd

def is_positive(series):
    """Return a boolean mask marking which elements of ``series`` are greater than zero."""
    positive_mask = series > 0
    return positive_mask

# Usage example
data = {'numbers': [1, -2, 3, 0, -5, 6]}
df = pd.DataFrame(data)
result = is_positive(df['numbers'])

print(result)

0     True
1    False
2     True
3    False
4    False
5     True
Name: numbers, dtype: bool


In [None]:
%cd "/content/drive/MyDrive/Wiktionary/"

import pymorphy3
import pandas as pd

# One shared analyzer (used by is_fixed_plural below) and the merged dictionary data.
morph = pymorphy3.MorphAnalyzer()
df = pd.read_csv("merged.csv")

def is_fixed_plural(word):
    """Tell whether ``word``, read as a lemma, is tagged both plural and ``Fixd``.

    Uses the module-level ``morph`` analyzer. Only the first parse whose normal
    form equals its own word form is considered. When no parse qualifies the
    result is False.
    """
    # First parse in which the headword is its own normal form.
    lemma_parse = next(
        (m for m in morph.parse(word) if m.normal_form == m.word), None
    )
    if lemma_parse is None:
        return False
    tag = lemma_parse.tag
    return tag.number == "plur" and "Fixd" in tag.grammemes

# Keep the headwords of Russian nouns / proper nouns for which is_fixed_plural is true.
# Note: this rebinds df from the full table to the selected 'word' column.
df = df.loc[(df['lang'] == "ru") & (df['pos'].isin(["noun", "proper_noun"])) & (df['word'].map(is_fixed_plural)), 'word']
print(df)

/content/drive/MyDrive/Wiktionary
8300             ВВС
8308              ВС
8406             СНВ
9378             ара
10496          бонмо
10784         буржуа
11728             вт
11920             га
12659       граффити
13805           евро
15256         кабаре
15307         казино
15322         какаду
15754             кг
15775        кенгуру
15815           киви
16022          клише
16065             км
16517       конфетти
17113           купе
17895           майя
18108     масс-медиа
18220          медиа
18933    мультимедиа
20946             па
21603           песо
22677         портье
24529         реноме
24587         рефери
26389       спагетти
27011           суши
27118           табу
27788          трико
29126          хиппи
30515           эссе
30674           янки
30956      Викимедиа
Name: word, dtype: object


# lingua-pyテスト

In [None]:
!pip install lingua-language-detector

In [None]:
from lingua import Language, LanguageDetectorBuilder

# Build a detector restricted to a fixed set of candidate languages.
languages = [Language.CHINESE, Language.KOREAN, Language.RUSSIAN, Language.SPANISH, Language.VIETNAMESE]
detector = LanguageDetectorBuilder.from_languages(*languages).build()
language = detector.detect_language_of("TT")
# NOTE(review): presumably detect_language_of can return None when nothing is detected,
# in which case the next line would raise AttributeError; confirm against the lingua docs.
print(language.iso_code_639_1.name.lower())

vi


# キリル文字正規表現テスト

In [None]:
import regex

# Two sample inputs; the second assignment overwrites the first, so only it is searched.
s = "ми́лыйの短語尾複数形"
s = "сдо́ба"
# Match a run of Cyrillic-script characters, also allowing the combining accent mark listed in the class.
ret = regex.search(r'[\p{Cyrillic}́]+', s)
print(ret)

<regex.Match object; span=(0, 6), match='сдо́ба'>


# 動詞活用重複調査

In [None]:
import csv, re, regex, os
import wikitextparser as wtp


# newline='' is what the csv module asks for when it is handed a file object.
# The encoding is stated explicitly instead of relying on the platform default
# (the headwords are Cyrillic; UTF-8 is assumed here, confirm against the script that writes the TSV).
with open('enoutput-ru-verb.tsv', 'r', encoding='utf-8', newline='') as f:
    for rows in csv.reader(f, delimiter='\t'):
        word, meaning = rows
        # Skip headwords that are phrases or abbreviations.
        if " " in word or "." in word:
            continue

        # Parse the meaning as wikitext ("<br>" stands for a line break) and collect its lists.
        meaning_list = wtp.parse(meaning.replace("<br>", "\n")).get_lists()
        # Keep only the top-level lists.
        meaning_list = list(filter(lambda m: m.level == 1, meaning_list))
        if meaning_list:
            templates = meaning_list[0].templates
            if templates:
                # Only the first template of the first top-level list is inspected.
                t = templates[0]
                # Report words whose "infl" template has a bare ";" argument.
                if "infl" in t.name and any(a.value == ";" for a in t.arguments):
                    print(word)

говорите
использую
используем
используешь
используете
использует
используют
хотите
браните
горите
дробите
жените
корите
костите
поносит
велим
осип
спите
заспите
благословите
болите
вините
победите
покривите
заболите
запалите
лежите
летите
наводните
обвините
отключите
повелите
предупредите
причините
простите
рачите
решите
сравните
убедите
злоупотребите
одобрите
раним
раните
ранят
пристань
посетите
повторите
навестите
удовлетворите
бегу
бежишь
бежит
бежим
бежите
бегут
грустите
атакую
атакуешь
атакует
атакуем
казню
абсолютизирую
абсолютизируешь
абсолютизирует
абсолютизируем
абсолютизируете
абсолютизируют
ешь
съешь
поешь
атакуете
атакуют
автоматизирую
автоматизируешь
автоматизирует
автоматизируем
автоматизируете
автоматизируют
ездите
казнишь
казнит
казним
казните
казнят
абсорбирую
абсорбируешь
абсорбирует
авансирую
абсорбируем
авансируешь
садитесь
стартую
авансирует
авансируем
годитесь
чтите
абсорбируете
гремите
абсорбируют
бейся
сидите
помните
нахожусь
находишься
находится
находимся
наход