Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #59 from Taiwanese-Corpus/台灣植物名彙
台灣植物名彙
- Loading branch information
Showing
6 changed files
with
120 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
from csv import DictReader | ||
import io | ||
from urllib.request import urlopen | ||
|
||
|
||
from 臺灣言語工具.解析整理.拆文分析器 import 拆文分析器 | ||
from 臺灣言語服務.models import 訓練過渡格式 | ||
from 匯入.指令 import 匯入枋模 | ||
from 臺灣言語工具.解析整理.解析錯誤 import 解析錯誤 | ||
|
||
|
||
class Command(匯入枋模):
    """Import the 1928 plant-name list (台灣植物名彙) as training records.

    The CSV is downloaded from GitHub, every row is paired up as
    (romanisation, candidate hanji) and turned into a 訓練過渡格式 whose
    text is the segmented form returned by 拆文分析器.
    """

    help = 'http://ip194097.ntcu.edu.tw/memory/TGB/thak.asp?id=862'

    # Metadata shared by every record produced by this command.
    公家內容 = {
        '來源': '台灣植物名彙',
        '種類': '字詞',
        '年代': '1928',
    }
    # Location of the raw CSV file that is downloaded on every run.
    github網址 = (
        'https://github.com/Taiwanese-Corpus/'
        'Syuniti-Sasaki_1928_List-of-Plants-of-Formosa/'
        'raw/master/ChhoeTaigi_TaioanSitbutMialui.csv'
    )

    def 全部資料(self, *args, **參數):
        """Yield one 訓練過渡格式 per parsed entry.

        A progress line is written to ``self.stdout`` after every 1000
        records have been handed to the consumer.
        """
        for 第幾筆, 分詞 in enumerate(self.github資料(), start=1):
            yield 訓練過渡格式(文本=分詞, **self.公家內容)
            if 第幾筆 % 1000 == 0:
                self.stdout.write('匯入 {} 筆'.format(第幾筆))

    def github資料(self):
        """Download the CSV and yield the segmented text of each entry.

        Reads the ``poj_unicode`` and ``taigi_hanji`` columns of every row.
        Pairs that 拆文分析器 rejects with 解析錯誤 are reported on
        ``self.stderr`` and skipped.
        """
        with urlopen(self.github網址) as 回應, \
                io.StringIO(回應.read().decode()) as 資料:
            for 一逝 in DictReader(資料):
                lomaji = 一逝['poj_unicode'].strip()
                hanji = 一逝['taigi_hanji'].strip()
                for 音, 型 in self.漢羅組合(lomaji, hanji):
                    try:
                        句物件 = 拆文分析器.建立句物件(型, 音)
                    except 解析錯誤 as 錯誤:
                        self.stderr.write(str(錯誤))
                        continue
                    # 'XXX' is the placeholder 漢羅組合 inserts for an
                    # unknown character ('?'); write its reading instead.
                    for 字 in 句物件.篩出字物件():
                        if 字.型 == 'XXX':
                            字.型 = 字.音
                    yield 句物件.看分詞()

    def 漢羅組合(self, 羅馬字, 可能漢字):
        """Yield (romanisation, hanji) for each '、'-separated hanji variant.

        Every '?' in the hanji column is first replaced by ' XXX ' so the
        unknown character becomes a separate placeholder token.
        """
        補記號 = 可能漢字.replace('?', ' XXX ')
        for 一款漢字 in 補記號.split('、'):
            yield 羅馬字, 一款漢字
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
from django.core.management import call_command | ||
from django.test.testcases import TestCase | ||
from 臺灣言語服務.models import 訓練過渡格式 | ||
from 匯入.management.commands.台灣植物名彙 import Command | ||
from 臺灣言語工具.解析整理.拆文分析器 import 拆文分析器 | ||
|
||
|
||
class 台灣植物名彙試驗(TestCase):
    """Tests for the 台灣植物名彙 import command."""

    @classmethod
    def setUpTestData(cls):
        # Run the import once for the whole class. setUpTestData is called
        # by TestCase.setUpClass *inside* the class-level transaction, so
        # the imported rows are rolled back when the class finishes.
        # Calling the command before super().setUpClass() (as before)
        # committed the rows outside that transaction and leaked them into
        # every test class that ran afterwards.
        call_command('台灣植物名彙')

    def test句數正確(self):
        # The import is expected to produce more than 1700 records.
        self.assertGreater(訓練過渡格式.資料數量(), 1700)

    def test切出一詞(self):
        # A single hanji form gives exactly one (romanisation, hanji) pair.
        self.assertEqual(
            list(Command().漢羅組合('Kau-tîn', '鈎藤')),
            [('Kau-tîn', '鈎藤')],
        )

    def test切出兩詞(self):
        # Variants separated by '、' each pair with the same romanisation.
        self.assertEqual(
            list(Command().漢羅組合('Tsuí-kim-kiann', '水金京、水金驚')),
            [('Tsuí-kim-kiann', '水金京'), ('Tsuí-kim-kiann', '水金驚')],
        )

    def test問號換做XXX(self):
        # '?' (unknown character) becomes the ' XXX ' placeholder.
        self.assertEqual(
            list(Command().漢羅組合('Tò-tiàu-hong', '倒?風')),
            [('Tò-tiàu-hong', '倒 XXX 風')],
        )

    def test問號上尾改漢羅(self):
        # In the stored text the placeholder is replaced by its romanised
        # reading, giving a mixed hanji/romanisation form.
        期待文本 = 拆文分析器.建立句物件('倒-tiàu-風', 'Tò-tiàu-hong').看分詞()
        self.assertTrue(
            訓練過渡格式.objects.filter(文本=期待文本).exists()
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters