Merge pull request #73 from Taiwanese-Corpus/Siu-iong

Siù-iong fix #36
Taiwanese-Corpus · May 13, 2019 · f98d719 · f98d719
2 parents c1e9f38 + 1948388
commit f98d719
Show file tree

Hide file tree

Showing 8 changed files with 105 additions and 135 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -5,7 +5,7 @@ python:
 - '3.5'
 env:
 - TOX_ENV=flake8
-- TOX_ENV=e-dictionary.apc
+# - TOX_ENV=e-dictionary.apc
 - TOX_ENV=moedict-twblg
 - TOX_ENV=sin1pak8tshi7-900-le7ku3
 - TOX_ENV=taihoa-dictionary
@@ -20,24 +20,18 @@ env:
 - TOX_ENV=白話字文獻館
 - TOX_ENV=台灣植物名彙
 - TOX_ENV=台灣白話基礎語句
-addons:
-  apt:
-    sources:
-    - ubuntu-toolchain-r-test
-    packages:
-    - libav-tools
-    - libmp3lame0
+services:
+- docker
 install:
-- sudo apt-get install -y libavcodec-extra-54
-- pip install tox
 - pip install python-coveralls
 branches:
   only:
   - master
   - "/\\d+\\.\\d+\\.\\d+/"
 script:
-- tox -e $TOX_ENV
+- docker build -t travis --build-arg TOX_ENV=$TOX_ENV .
 after_success:
+- docker run --rm travis cat .coverage > .coverage
 - coverage report
 - coveralls
 deploy:

diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,10 @@
+FROM i3thuan5/tai5-uan5_gian5-gi2_kang1-ku7
+MAINTAINER i3thuan5
+
+ARG TOX_ENV
+
+RUN apt-get update && apt-get install -y ffmpeg
+WORKDIR /opt/hue7jip8
+RUN pip install tox
+COPY . .
+RUN tox --sitepackages -e ${TOX_ENV}
diff --git a/tox.ini b/tox.ini
@@ -13,17 +13,13 @@ passenv = /bin/bash, bash, python, avconv, wget, *
 deps =
     coverage
 commands=
-	python manage.py migrate
-	coverage run --source=匯入 manage.py 教典音檔0下載 dropbox
-	coverage run --source=匯入 -a manage.py 教典音檔1轉檔 --匯入幾筆 100 # 匯入100筆就好，試驗用
-	coverage run --source=匯入 -a manage.py 教典音檔2匯入 --匯入幾筆 100 # 匯入100筆就好，試驗用
+   coverage run --source=匯入 manage.py test 試驗.test教典音檔
 
 [testenv:sin1pak8tshi7-900-le7ku3]
 deps =
     coverage
 commands=
-	python manage.py migrate
-	coverage run --source=匯入 manage.py 新北市900例句 --匯入幾筆 10
+   coverage run --source=匯入 manage.py test 試驗.test新北市例句
 
 [testenv:flake8]
 deps =
@@ -48,7 +44,7 @@ commands=
 deps =
     coverage
 commands=
-	coverage run --source=匯入 manage.py test -p 'test教典*'
+	coverage run --source=匯入 manage.py test -p 'test教典[^音]*'
 
 [testenv:TGB]
 deps =
@@ -103,4 +99,4 @@ deps =
     coverage
 commands=
 	coverage run --source=匯入 manage.py test 試驗.test台灣白話基礎語句
-	
+
diff --git a/匯入/management/commands/教典音檔1轉檔.py b/匯入/management/commands/教典音檔1轉檔.py
@@ -5,8 +5,7 @@
 from django.conf import settings
 from django.core.management.base import BaseCommand
 
-from libavwrapper.avconv import Input, Output, AVConv
-from libavwrapper.codec import AudioCodec, NO_VIDEO
+from subprocess import run
 
 
 class Command(BaseCommand):
@@ -31,14 +30,15 @@ def handle(self, *args, **參數):
             if 檔名.endswith('.mp3'):
                 來源 = join(語料目錄, 檔名)
                 目標 = join(目標目錄, 檔名[:-4] + '.wav')
-                目標聲音格式 = AudioCodec('pcm_s16le')
-                目標聲音格式.channels(1)
-                目標聲音格式.frequence(16000)
-                原始檔案 = Input(來源)
-                網頁檔案 = Output(目標).overwrite()
-                指令 = AVConv('avconv', 原始檔案, 目標聲音格式, NO_VIDEO, 網頁檔案)
-                程序 = 指令.run()
-                程序.wait()
+                run([
+                    'ffmpeg', '-i',
+                    來源,
+                    '-acodec', 'pcm_s16le',
+                    '-ar', '16000',
+                    '-ac', '1',
+                    '-y',
+                    目標,
+                ], check=True)
 
                 匯入數量 += 1
                 if 匯入數量 == 參數['匯入幾筆']:

diff --git a/匯入/management/commands/教典音檔2匯入.py b/匯入/management/commands/教典音檔2匯入.py
@@ -3,23 +3,22 @@
 from posix import listdir
 
 from django.conf import settings
-from django.core.management import call_command
-from django.core.management.base import BaseCommand
 from django.utils import timezone
 
 
-from 臺灣言語資料庫.資料模型 import 來源表
-from 臺灣言語資料庫.資料模型 import 影音表
-from 臺灣言語資料庫.資料模型 import 版權表
 from 臺灣言語工具.解析整理.拆文分析器 import 拆文分析器
-from 臺灣言語工具.解析整理.文章粗胚 import 文章粗胚
-from 臺灣言語工具.音標系統.閩南語.臺灣閩南語羅馬字拼音 import 臺灣閩南語羅馬字拼音
-from 臺灣言語工具.基本物件.公用變數 import 分字符號
-from 臺灣言語工具.基本物件.公用變數 import 分詞符號
 from 臺灣言語工具.解析整理.解析錯誤 import 解析錯誤
+from 匯入.指令 import 匯入枋模
+from 臺灣言語服務.models import 訓練過渡格式
 
 
-class Command(BaseCommand):
+class Command(匯入枋模):
+    公家內容 = {
+        '來源': '臺灣閩南語常用詞辭典',
+        '影音語者': '王秀容',
+        '種類': '字詞',
+        '年代': str(timezone.now().year),
+    }
 
     def add_arguments(self, parser):
         parser.add_argument(
@@ -29,9 +28,7 @@ def add_arguments(self, parser):
             help='試驗用，免一擺全匯'
         )
 
-    def handle(self, *args, **參數):
-        call_command('顯示資料數量')
-
+    def 全部資料(self, *args, **參數):
         'https://github.com/g0v/moedict-data-twblg/tree/master/uni'
         詞目 = {}
         with open(join(
@@ -41,28 +38,12 @@ def handle(self, *args, **參數):
             for 一筆 in DictReader(檔案):
                 編號 = 一筆['主編碼'].strip()
                 漢字 = 一筆['詞目'].strip()
-                拼音 = 一筆['音讀'].strip().split('/')[0]
+                羅馬字 = 一筆['音讀'].strip().split('/')[0]
                 try:
-                    正規化臺羅 = (
-                        拆文分析器
-                        .建立句物件(文章粗胚.建立物件語句前處理減號(臺灣閩南語羅馬字拼音, 拼音))
-                        .轉音(臺灣閩南語羅馬字拼音)
-                        .看型(物件分字符號=分字符號, 物件分詞符號=分詞符號)
-                    )
-                    拆文分析器.對齊組物件(漢字, 正規化臺羅)
-                    詞目[int(編號)] = (漢字, 正規化臺羅)
+                    詞目[int(編號)] = 拆文分析器.對齊組物件(漢字, 羅馬字)
                 except 解析錯誤:
                     pass
-        公家內容 = {
-            '收錄者': 來源表.objects.get_or_create(名='系統管理員')[0].編號(),
-            '來源': 來源表.objects.get_or_create(名='臺灣閩南語常用詞辭典')[0].編號(),
-            '版權': 版權表.objects.get_or_create(版權='會使公開')[0].pk,
-            '種類': '字詞',
-            '語言腔口': '臺語',
-            '著作所在地': '臺灣',
-            '著作年': str(timezone.now().year),
-            '屬性': {'語者': '王秀容'}
-        }
+
         音檔目錄 = join(settings.BASE_DIR, '語料', '教育部閩南語常用詞辭典wav')
         匯入數量 = 0
         for 路徑 in sorted(listdir(音檔目錄)):
@@ -73,22 +54,15 @@ def handle(self, *args, **參數):
                 except ValueError:
                     raise ValueError('有的音檔有重錄過')
                 try:
-                    (漢字, 拼音) = 詞目[音檔編號]
+                    台語物件 = 詞目[音檔編號]
                 except KeyError:  # 有的詞條尾仔提掉矣，親像編號5
                     continue
-                else:
-                    影音內容 = {'影音所在': 音檔路徑}
-                    影音內容.update(公家內容)
-                    影音 = 影音表.加資料(影音內容)
-                    文本內容 = {
-                        '文本資料': 漢字,
-                        '音標資料': 拼音,
-                    }
-                    文本內容.update(公家內容)
-                    影音.寫文本(文本內容)
-
-                    匯入數量 += 1
-                    if 匯入數量 == 參數['匯入幾筆']:
-                        break
 
-        call_command('顯示資料數量')
+                yield 訓練過渡格式(
+                    影音所在=音檔路徑,
+                    文本=台語物件.看分詞(),
+                    **self.公家內容
+                )
+                匯入數量 += 1
+                if 匯入數量 == 參數['匯入幾筆']:
+                    break
diff --git a/匯入/management/commands/新北市900例句.py b/匯入/management/commands/新北市900例句.py
@@ -1,66 +1,43 @@
 from os import makedirs
 from os.path import join, isfile
 from posix import listdir
+from subprocess import run
 from tempfile import TemporaryDirectory
 from urllib.request import urlretrieve
 from zipfile import ZipFile
 
 from django.conf import settings
-from django.core.management import call_command
-from django.core.management.base import BaseCommand
 
-from libavwrapper.avconv import Input, Output, AVConv
-from libavwrapper.codec import AudioCodec, NO_VIDEO
 
+from 臺灣言語服務.models import 訓練過渡格式
+from 匯入.指令 import 匯入枋模
 
-from 臺灣言語工具.解析整理.拆文分析器 import 拆文分析器
-from 臺灣言語資料庫.資料模型 import 來源表
-from 臺灣言語資料庫.資料模型 import 版權表
-from 臺灣言語資料庫.資料模型 import 影音表
 
-
-class Command(BaseCommand):
+class Command(匯入枋模):
     help = 'https://github.com/Taiwanese-Corpus/Sin1pak8tshi7_2015_900-le7ku3'
 
     公家內容 = {
-        '收錄者': 來源表.objects.get_or_create(名='系統管理員')[0].編號(),
-        '來源': (
-            來源表.objects
-            .get_or_create(名='新北市104學年度閩南語字音字形-臺語上美麗辭典揣舉例-900例句工作坊')[0]
-            .編號()
-        ),
-        '版權': 版權表.objects.get_or_create(版權='會使公開')[0].pk,
-        '種類': '字詞',
-        '語言腔口': '臺語',
-        '著作所在地': '臺灣',
-        '著作年': '2015',
-        '屬性': {'語者': '王秀容'}
+        '來源': '新北市104學年度閩南語字音字形-臺語上美麗辭典揣舉例-900例句工作坊',
+        '影音語者': '王秀容',
+        '種類': '語句',
+        '年代': '2015',
     }
+    網址 = 'https://github.com/Taiwanese-Corpus/Sin1pak8tshi7_2015_900-le7ku3/archive/master.zip'
 
     def add_arguments(self, parser):
         parser.add_argument(
             '--頻率',
             type=int,
             default=44100,
         )
-        parser.add_argument(
-            '--匯入幾筆',
-            type=int,
-            default=100000,
-            help='試驗用，免一擺全匯'
-        )
 
-    def handle(self, *args, **參數):
-        call_command('顯示資料數量')
-
-        網址 = 'https://github.com/Taiwanese-Corpus/Sin1pak8tshi7_2015_900-le7ku3/archive/master.zip'
+    def 全部資料(self, *args, **參數):
         語料目錄 = join(settings.BASE_DIR, '語料', '新北市900例句')
         makedirs(語料目錄, exist_ok=True)
         暫時檔案 = join(語料目錄, 'master.zip')
         if not isfile(暫時檔案):
-            urlretrieve(網址, 暫時檔案)
+            urlretrieve(self.網址, 暫時檔案)
         ZipFile(暫時檔案).extractall(語料目錄)
-        匯入數量 = 0
         with TemporaryDirectory() as 轉檔目錄:
             音檔目錄 = join(
                 語料目錄, 'Sin1pak8tshi7_2015_900-le7ku3-master', '鉸好的1-150音檔'
@@ -69,32 +46,23 @@ def handle(self, *args, **參數):
             for 檔名 in sorted(listdir(音檔目錄), key=lambda 名: int(名.split('.')[0])):
                 來源 = join(音檔目錄, 檔名)
                 目標 = join(轉檔目錄, 檔名)
-                目標聲音格式 = (
-                    AudioCodec('pcm_s16le')
-                    .channels(1)
-                    .frequence(參數['頻率'])
-                )
-                原始檔案 = Input(來源)
-                網頁檔案 = Output(目標).overwrite()
-                指令 = AVConv('avconv', 原始檔案, 目標聲音格式, NO_VIDEO, 網頁檔案)
-                指令.run().wait()
+                run([
+                    'ffmpeg', '-i',
+                    來源,
+                    '-acodec', 'pcm_s16le',
+                    '-ar', '{}'.format(參數['頻率']),
+                    '-ac', '1',
+                    '-y',
+                    目標,
+                ], check=True)
                 音檔陣列.append(目標)
+
             with open(
                 join(語料目錄, 'Sin1pak8tshi7_2015_900-le7ku3-master', 'minnan900.分詞')
             ) as 分詞檔案:
                 for 一逝分詞, 音檔路徑 in zip(分詞檔案.readlines(), 音檔陣列):
-                    章物件 = 拆文分析器.分詞章物件(一逝分詞.strip())
-                    影音內容 = {'影音所在': 音檔路徑}
-                    影音內容.update(self.公家內容)
-                    影音 = 影音表.加資料(影音內容)
-                    文本內容 = {
-                        '文本資料': 章物件.看型(),
-                        '音標資料': 章物件.看音(),
-                    }
-                    文本內容.update(self.公家內容)
-                    影音.寫文本(文本內容)
-
-                    匯入數量 += 1
-                    if 匯入數量 == 參數['匯入幾筆']:
-                        break
-        call_command('顯示資料數量')
+                    yield 訓練過渡格式(
+                        影音所在=音檔路徑,
+                        文本=一逝分詞.strip(),
+                        **self.公家內容
+                    )
diff --git a/試驗/test教典音檔.py b/試驗/test教典音檔.py
@@ -0,0 +1,15 @@
+from django.core.management import call_command
+from django.test.testcases import TestCase
+from 臺灣言語服務.models import 訓練過渡格式
+
+
+class 教典音檔試驗(TestCase):
+    @classmethod
+    def setUpClass(cls):
+        call_command('教典音檔0下載', 'dropbox')
+        call_command('教典音檔1轉檔', '--匯入幾筆', '100')
+        call_command('教典音檔2匯入', '--匯入幾筆', '100')
+        return super().setUpClass()
+
+    def test句數正確(self):
+        self.assertGreater(訓練過渡格式.資料數量(), 80)
diff --git a/試驗/test新北市例句.py b/試驗/test新北市例句.py
@@ -0,0 +1,13 @@
+from django.core.management import call_command
+from django.test.testcases import TestCase
+from 臺灣言語服務.models import 訓練過渡格式
+
+
+class 新北市例句試驗(TestCase):
+    @classmethod
+    def setUpClass(cls):
+        call_command('新北市900例句')
+        return super().setUpClass()
+
+    def test句數正確(self):
+        self.assertEqual(訓練過渡格式.資料數量(), 300)