Merge a4158c2 into c1e9f38

Taiwanese-Corpus committed on May 10, 2019 · 2eaae01
2 parents c1e9f38 + a4158c2
commit 2eaae01
Show file tree

Hide file tree

Showing 4 changed files with 59 additions and 105 deletions.
diff --git a/tox.ini b/tox.ini
@@ -103,4 +103,4 @@ deps =
     coverage
 commands=
 	coverage run --source=匯入 manage.py test 試驗.test台灣白話基礎語句
-	
+
diff --git a/匯入/management/commands/教典音檔1轉檔.py b/匯入/management/commands/教典音檔1轉檔.py
@@ -7,6 +7,7 @@
 
 from libavwrapper.avconv import Input, Output, AVConv
 from libavwrapper.codec import AudioCodec, NO_VIDEO
+from subprocess import run
 
 
 class Command(BaseCommand):
@@ -31,14 +32,15 @@ def handle(self, *args, **參數):
             if 檔名.endswith('.mp3'):
                 來源 = join(語料目錄, 檔名)
                 目標 = join(目標目錄, 檔名[:-4] + '.wav')
-                目標聲音格式 = AudioCodec('pcm_s16le')
-                目標聲音格式.channels(1)
-                目標聲音格式.frequence(16000)
-                原始檔案 = Input(來源)
-                網頁檔案 = Output(目標).overwrite()
-                指令 = AVConv('avconv', 原始檔案, 目標聲音格式, NO_VIDEO, 網頁檔案)
-                程序 = 指令.run()
-                程序.wait()
+                run([
+                    'ffmpeg', '-i',
+                    來源,
+                    '-acodec', 'pcm_s16le',
+                    '-ar', '16000',
+                    '-ac', '1',
+                    '-y',
+                    目標,
+                ], check=True)
 
                 匯入數量 += 1
                 if 匯入數量 == 參數['匯入幾筆']:

diff --git a/匯入/management/commands/教典音檔2匯入.py b/匯入/management/commands/教典音檔2匯入.py
@@ -3,23 +3,22 @@
 from posix import listdir
 
 from django.conf import settings
-from django.core.management import call_command
-from django.core.management.base import BaseCommand
 from django.utils import timezone
 
 
-from 臺灣言語資料庫.資料模型 import 來源表
-from 臺灣言語資料庫.資料模型 import 影音表
-from 臺灣言語資料庫.資料模型 import 版權表
 from 臺灣言語工具.解析整理.拆文分析器 import 拆文分析器
-from 臺灣言語工具.解析整理.文章粗胚 import 文章粗胚
-from 臺灣言語工具.音標系統.閩南語.臺灣閩南語羅馬字拼音 import 臺灣閩南語羅馬字拼音
-from 臺灣言語工具.基本物件.公用變數 import 分字符號
-from 臺灣言語工具.基本物件.公用變數 import 分詞符號
 from 臺灣言語工具.解析整理.解析錯誤 import 解析錯誤
+from 匯入.指令 import 匯入枋模
+from 臺灣言語服務.models import 訓練過渡格式
 
 
-class Command(BaseCommand):
+class Command(匯入枋模):
+    公家內容 = {
+        '來源': '臺灣閩南語常用詞辭典',
+        '影音語者': '王秀容',
+        '種類': '字詞',
+        '年代': str(timezone.now().year),
+    }
 
     def add_arguments(self, parser):
         parser.add_argument(
@@ -29,9 +28,7 @@ def add_arguments(self, parser):
             help='試驗用，免一擺全匯'
         )
 
-    def handle(self, *args, **參數):
-        call_command('顯示資料數量')
-
+    def 全部資料(self, *args, **參數):
         'https://github.com/g0v/moedict-data-twblg/tree/master/uni'
         詞目 = {}
         with open(join(
@@ -41,28 +38,12 @@ def handle(self, *args, **參數):
             for 一筆 in DictReader(檔案):
                 編號 = 一筆['主編碼'].strip()
                 漢字 = 一筆['詞目'].strip()
-                拼音 = 一筆['音讀'].strip().split('/')[0]
+                羅馬字 = 一筆['音讀'].strip().split('/')[0]
                 try:
-                    正規化臺羅 = (
-                        拆文分析器
-                        .建立句物件(文章粗胚.建立物件語句前處理減號(臺灣閩南語羅馬字拼音, 拼音))
-                        .轉音(臺灣閩南語羅馬字拼音)
-                        .看型(物件分字符號=分字符號, 物件分詞符號=分詞符號)
-                    )
-                    拆文分析器.對齊組物件(漢字, 正規化臺羅)
-                    詞目[int(編號)] = (漢字, 正規化臺羅)
+                    詞目[int(編號)] = 拆文分析器.對齊組物件(漢字, 羅馬字)
                 except 解析錯誤:
                     pass
-        公家內容 = {
-            '收錄者': 來源表.objects.get_or_create(名='系統管理員')[0].編號(),
-            '來源': 來源表.objects.get_or_create(名='臺灣閩南語常用詞辭典')[0].編號(),
-            '版權': 版權表.objects.get_or_create(版權='會使公開')[0].pk,
-            '種類': '字詞',
-            '語言腔口': '臺語',
-            '著作所在地': '臺灣',
-            '著作年': str(timezone.now().year),
-            '屬性': {'語者': '王秀容'}
-        }
+
         音檔目錄 = join(settings.BASE_DIR, '語料', '教育部閩南語常用詞辭典wav')
         匯入數量 = 0
         for 路徑 in sorted(listdir(音檔目錄)):
@@ -73,22 +54,15 @@ def handle(self, *args, **參數):
                 except ValueError:
                     raise ValueError('有的音檔有重錄過')
                 try:
-                    (漢字, 拼音) = 詞目[音檔編號]
+                    台語物件 = 詞目[音檔編號]
                 except KeyError:  # 有的詞條尾仔提掉矣，親像編號5
                     continue
-                else:
-                    影音內容 = {'影音所在': 音檔路徑}
-                    影音內容.update(公家內容)
-                    影音 = 影音表.加資料(影音內容)
-                    文本內容 = {
-                        '文本資料': 漢字,
-                        '音標資料': 拼音,
-                    }
-                    文本內容.update(公家內容)
-                    影音.寫文本(文本內容)
-
-                    匯入數量 += 1
-                    if 匯入數量 == 參數['匯入幾筆']:
-                        break
 
-        call_command('顯示資料數量')
+                yield 訓練過渡格式(
+                    影音所在=音檔路徑,
+                    文本=台語物件.看分詞(),
+                    **self.公家內容
+                )
+                匯入數量 += 1
+                if 匯入數量 == 參數['匯入幾筆']:
+                    break
diff --git a/匯入/management/commands/新北市900例句.py b/匯入/management/commands/新北市900例句.py
@@ -1,41 +1,28 @@
 from os import makedirs
 from os.path import join, isfile
 from posix import listdir
+from subprocess import run
 from tempfile import TemporaryDirectory
 from urllib.request import urlretrieve
 from zipfile import ZipFile
 
 from django.conf import settings
-from django.core.management import call_command
-from django.core.management.base import BaseCommand
 
-from libavwrapper.avconv import Input, Output, AVConv
-from libavwrapper.codec import AudioCodec, NO_VIDEO
 
+from 臺灣言語服務.models import 訓練過渡格式
+from 匯入.指令 import 匯入枋模
 
-from 臺灣言語工具.解析整理.拆文分析器 import 拆文分析器
-from 臺灣言語資料庫.資料模型 import 來源表
-from 臺灣言語資料庫.資料模型 import 版權表
-from 臺灣言語資料庫.資料模型 import 影音表
 
-
-class Command(BaseCommand):
+class Command(匯入枋模):
     help = 'https://github.com/Taiwanese-Corpus/Sin1pak8tshi7_2015_900-le7ku3'
 
     公家內容 = {
-        '收錄者': 來源表.objects.get_or_create(名='系統管理員')[0].編號(),
-        '來源': (
-            來源表.objects
-            .get_or_create(名='新北市104學年度閩南語字音字形-臺語上美麗辭典揣舉例-900例句工作坊')[0]
-            .編號()
-        ),
-        '版權': 版權表.objects.get_or_create(版權='會使公開')[0].pk,
-        '種類': '字詞',
-        '語言腔口': '臺語',
-        '著作所在地': '臺灣',
-        '著作年': '2015',
-        '屬性': {'語者': '王秀容'}
+        '來源': '新北市104學年度閩南語字音字形-臺語上美麗辭典揣舉例-900例句工作坊',
+        '影音語者': '王秀容',
+        '種類': '語句',
+        '年代': '2015',
     }
+    網址 = 'https://github.com/Taiwanese-Corpus/Sin1pak8tshi7_2015_900-le7ku3/archive/master.zip'
 
     def add_arguments(self, parser):
         parser.add_argument(
@@ -50,15 +37,12 @@ def add_arguments(self, parser):
             help='試驗用，免一擺全匯'
         )
 
-    def handle(self, *args, **參數):
-        call_command('顯示資料數量')
-
-        網址 = 'https://github.com/Taiwanese-Corpus/Sin1pak8tshi7_2015_900-le7ku3/archive/master.zip'
+    def 全部資料(self, *args, **參數):
         語料目錄 = join(settings.BASE_DIR, '語料', '新北市900例句')
         makedirs(語料目錄, exist_ok=True)
         暫時檔案 = join(語料目錄, 'master.zip')
         if not isfile(暫時檔案):
-            urlretrieve(網址, 暫時檔案)
+            urlretrieve(self.網址, 暫時檔案)
         ZipFile(暫時檔案).extractall(語料目錄)
         匯入數量 = 0
         with TemporaryDirectory() as 轉檔目錄:
@@ -69,32 +53,26 @@ def handle(self, *args, **參數):
             for 檔名 in sorted(listdir(音檔目錄), key=lambda 名: int(名.split('.')[0])):
                 來源 = join(音檔目錄, 檔名)
                 目標 = join(轉檔目錄, 檔名)
-                目標聲音格式 = (
-                    AudioCodec('pcm_s16le')
-                    .channels(1)
-                    .frequence(參數['頻率'])
-                )
-                原始檔案 = Input(來源)
-                網頁檔案 = Output(目標).overwrite()
-                指令 = AVConv('avconv', 原始檔案, 目標聲音格式, NO_VIDEO, 網頁檔案)
-                指令.run().wait()
+                run([
+                    'ffmpeg', '-i',
+                    來源,
+                    '-acodec', 'pcm_s16le',
+                    '-ar', '{}'.format(參數['頻率']),
+                    '-ac', '1',
+                    '-y',
+                    目標,
+                ], check=True)
                 音檔陣列.append(目標)
+
             with open(
                 join(語料目錄, 'Sin1pak8tshi7_2015_900-le7ku3-master', 'minnan900.分詞')
             ) as 分詞檔案:
                 for 一逝分詞, 音檔路徑 in zip(分詞檔案.readlines(), 音檔陣列):
-                    章物件 = 拆文分析器.分詞章物件(一逝分詞.strip())
-                    影音內容 = {'影音所在': 音檔路徑}
-                    影音內容.update(self.公家內容)
-                    影音 = 影音表.加資料(影音內容)
-                    文本內容 = {
-                        '文本資料': 章物件.看型(),
-                        '音標資料': 章物件.看音(),
-                    }
-                    文本內容.update(self.公家內容)
-                    影音.寫文本(文本內容)
-
+                    yield 訓練過渡格式(
+                        影音所在=音檔路徑,
+                        文本=一逝分詞.strip(),
+                        **self.公家內容
+                    )
                     匯入數量 += 1
                     if 匯入數量 == 參數['匯入幾筆']:
                         break
-        call_command('顯示資料數量')