Skip to content
Permalink
Browse files

Filter out empty lines

  • Loading branch information
guillaumekln committed Mar 23, 2020
1 parent af1b0d0 commit c44b9ffdb561da7e963f835cb90aa5f56e20c6f4
Showing with 41 additions and 3 deletions.
  1. +3 −1 WNMT2018/vmap/Dockerfile
  2. +6 −2 WNMT2018/vmap/buildPT.sh
  3. +23 −0 WNMT2018/vmap/filter.py
  4. +9 −0 WNMT2018/vmap/paste.py
@@ -29,5 +29,7 @@ WORKDIR /root/fast_align
RUN mkdir build ; cd build ; cmake .. ; make

COPY buildPT.sh /root
COPY paste.py /root
COPY filter.py /root

ENTRYPOINT ["/root/buildPT.sh"]
ENTRYPOINT ["/root/buildPT.sh"]
@@ -12,8 +12,12 @@ fastalign(){
tt=$4
cd ${corpus_dir}
dir=/root/workspace/$base.${ss}2${tt}
filter_corpus_dir=/root/workspace/filtered
mkdir -p $dir
paste $base.$ss $base.$tt | perl -pe 's/\t/ \|\|\| /' > $dir/data.${ss}-${tt}
mkdir -p $filter_corpus_dir
python /root/filter.py $base.$ss $base.$tt $filter_corpus_dir
cd $filter_corpus_dir
python /root/paste.py $base.$ss $base.$tt > $dir/data.${ss}-${tt}
$fast/fast_align -i $dir/data.${ss}-${tt} -d -o -v > $dir/data.${ss}-${tt}.forward 2> $dir/data.${ss}-${tt}.forward.log
$fast/fast_align -i $dir/data.${ss}-${tt} -d -o -v -r > $dir/data.${ss}-${tt}.reverse 2> $dir/data.${ss}-${tt}.reverse.log
echo "$fast/atools -i $dir/data.${ss}-${tt}.forward -j $dir/data.${ss}-${tt}.reverse -c grow-diag-final-and > $dir/${base}.${ss}2${tt}.gdfa"
@@ -22,7 +26,7 @@ fastalign(){

learn_PB(){
echo "BUILD PHRASETABLE"
corpus_dir=$1
corpus_dir=/root/workspace/filtered
work_dir=/root/workspace
name=$2
ss=$3
@@ -0,0 +1,23 @@
"""Filter lines when either the source or the target have no content."""

from __future__ import print_function

import sys
import os

def filter_empty_pairs(src_path, tgt_path, out_dir):
    """Copy a parallel corpus into ``out_dir``, dropping pairs with an empty side.

    The two input files are read in lockstep, line by line. A pair is dropped
    when either its source or its target line contains only whitespace, so
    that the two output files stay aligned. The outputs keep the base names
    of the inputs and the kept lines are written back byte-for-byte.

    Args:
        src_path: Path to the source-side file.
        tgt_path: Path to the target-side file.
        out_dir: Existing directory that receives the filtered files.

    Returns:
        The number of line pairs that were kept.
    """
    try:
        # Python 2: the builtin zip() would load both corpora fully into
        # memory before the loop starts; izip() streams them instead.
        from itertools import izip as lazy_zip
    except ImportError:
        # Python 3: zip() is already lazy.
        lazy_zip = zip

    src_path_out = os.path.join(out_dir, os.path.basename(src_path))
    tgt_path_out = os.path.join(out_dir, os.path.basename(tgt_path))

    kept = 0
    # Binary mode: lines are copied untouched, whatever their encoding.
    with open(src_path, "rb") as src_in, \
            open(tgt_path, "rb") as tgt_in, \
            open(src_path_out, "wb") as src_out, \
            open(tgt_path_out, "wb") as tgt_out:
        for src, tgt in lazy_zip(src_in, tgt_in):
            if not src.strip() or not tgt.strip():
                continue
            src_out.write(src)
            tgt_out.write(tgt)
            kept += 1
    return kept


if __name__ == "__main__":
    # Usage: filter.py <source file> <target file> <output directory>
    filter_empty_pairs(sys.argv[1], sys.argv[2], sys.argv[3])
@@ -0,0 +1,9 @@
"""Paste source and target lines with the ||| separator."""

from __future__ import print_function

import sys

def paste_parallel(src_path, tgt_path, out=None):
    """Write ``source ||| target`` lines for two files read in lockstep.

    Each input line is stripped of surrounding whitespace, and the pair is
    written as one line with the `` ||| `` separator. The files are read and
    written as raw bytes, so the text encoding is passed through unchanged.

    Args:
        src_path: Path to the source-side file.
        tgt_path: Path to the target-side file.
        out: Binary writable stream receiving the pasted lines. Defaults to
            the process's standard output.
    """
    try:
        # Python 2: avoid the eager builtin zip(), which would read both
        # files fully into memory.
        from itertools import izip as lazy_zip
    except ImportError:
        # Python 3: zip() is already lazy.
        lazy_zip = zip

    if out is None:
        # On Python 3 the byte stream lives under sys.stdout.buffer; on
        # Python 2 sys.stdout itself accepts byte strings.
        out = getattr(sys.stdout, "buffer", sys.stdout)

    with open(src_path, "rb") as src_file, open(tgt_path, "rb") as tgt_file:
        for src, tgt in lazy_zip(src_file, tgt_file):
            # Join as bytes: "%s" % bytes on Python 3 would emit the
            # repr (b'...') instead of the line content.
            out.write(src.strip() + b" ||| " + tgt.strip() + b"\n")


if __name__ == "__main__":
    # Usage: paste.py <source file> <target file>  (result goes to stdout)
    paste_parallel(sys.argv[1], sys.argv[2])

0 comments on commit c44b9ff

Please sign in to comment.
You can’t perform that action at this time.