In [1]:
import os
from itertools import islice

import pandas as pd

def count_lines(filename):
    """
    Return how many lines a text file contains.

    The file is streamed line by line, so it is never held in memory as a whole.

    :param filename: Path of the file whose lines are counted.
    :return: Total number of lines (0 for an empty file).
    """
    total = 0
    with open(filename, 'r') as handle:
        for _line in handle:
            total += 1
    return total

def merge_n_lines_from_files(files_list, n, output_filename):
    """
    Write the leading lines of each file in files_list to output_filename, in order.

    A file that holds fewer lines than requested is copied in full and reported
    on stdout. A missing file is reported on stdout and skipped; it still
    consumes its own entry of n, so the files after it keep their intended limits.

    :param files_list: List of filenames to read from.
    :param n: Number of lines to read from each file. Either a sequence with one
        entry per file (n[k] applies to files_list[k]) or a single int that is
        applied to every file.
    :param output_filename: Name of the output file to write lines to. It is
        created or overwritten.
    :raises IndexError: If n is a sequence with fewer entries than files_list.
    """
    with open(output_filename, 'w') as outfile:
        for position, filename in enumerate(files_list):
            # The limit is looked up by the file's position in files_list, not by
            # how many files were read successfully; otherwise one missing file
            # would shift the limits of every file that follows it.
            limit = n if isinstance(n, int) else n[position]

            try:
                with open(filename, 'r') as infile:
                    # islice stops quietly at end of file, so a short file is
                    # handled in the same single pass as a long one.
                    lines = list(islice(infile, max(limit, 0)))
            except FileNotFoundError:
                print(f"{filename} not found.")
                continue

            if len(lines) < limit:
                print(f"{filename} had less than {limit} lines.")
                print('used ', len(lines), ' lines')

            # Only the very last line of a file can lack a newline. Terminate it
            # so it is not fused with the first line of the next file, which
            # would shift the line alignment of the merged output.
            if lines and not lines[-1].endswith('\n'):
                lines[-1] += '\n'

            outfile.writelines(lines)

def build_shared_vocab(input_files, output_filename):
    """
    Concatenate several input files into one output file.

    Each file's content is copied verbatim and followed by one extra newline,
    so an input that already ends with a newline leaves a blank line before the
    next file's content. Entries are not de-duplicated. Input files that do not
    exist are reported on stdout and skipped.

    :param input_files: List of filenames to read from, in output order.
    :param output_filename: Name of the output file to write to (created or
        overwritten).
    """
    with open(output_filename, 'w') as merged:
        for source_path in input_files:
            try:
                with open(source_path, 'r') as source:
                    contents = source.read()
                    # Separator newline after every file's content
                    merged.write(contents + "\n")
            except FileNotFoundError:
                print(f"{source_path} not found.")



In [2]:
####################
# INPUTS
####################

# Languages to merge; each one is read from its own 'en_<lang>/' directory
languages = ['be', 'ru', 'hu']

# Number of sentences extracted per language for training (same order as `languages`)
n_points = [4500, 10000, 10000]

# Number of sentences extracted per language for dev (same order as `languages`)
n_points_dev = [450, 1000, 1000]

# Directory that receives every merged file produced by this cell
output_dir = 'merged_files/be_ru_hun'


####################
# Build data
####################

# The limits are matched to the languages by position, so the lists must line up.
assert len(n_points) == len(languages), "n_points needs one entry per language"
assert len(n_points_dev) == len(languages), "n_points_dev needs one entry per language"

# The output files are opened in 'w' mode, which fails if the directory is missing.
os.makedirs(output_dir, exist_ok=True)

# Build source data (the non-English side of each pair)
files_list_train = ['en_' + lan + '/train.tok.norm.' + lan for lan in languages]
files_list_dev = ['en_' + lan + '/dev.tok.norm.' + lan for lan in languages]

merge_n_lines_from_files(files_list_train, n_points, output_dir + '/merged_src_train.txt')
merge_n_lines_from_files(files_list_dev, n_points_dev, output_dir + '/merged_src_dev.txt')


# Build target data (the English side), using the same limits as the source
# side so both merged files take the same number of lines per language
files_list_train = ['en_' + lan + '/train.tok.norm.en' for lan in languages]
files_list_dev = ['en_' + lan + '/dev.tok.norm.en' for lan in languages]

merge_n_lines_from_files(files_list_train, n_points, output_dir + '/merged_tar_train.txt')
merge_n_lines_from_files(files_list_dev, n_points_dev, output_dir + '/merged_tar_dev.txt')

# Build Vocabularies
# Build src vocab
files_list_vocab = ['en_' + lan + '/train.vocab.' + lan for lan in languages]
build_shared_vocab(files_list_vocab, output_dir + '/src_vocab.txt')


# Build trg vocab
files_list_vocab = ['en_' + lan + '/train.vocab.en' for lan in languages]
build_shared_vocab(files_list_vocab, output_dir + '/trg_vocab.txt')

en_be/dev.tok.norm.be had less than 4500 lines.
used  248  lines
en_ru/dev.tok.norm.ru had less than 10000 lines.
used  4814  lines
en_hu/dev.tok.norm.hu had less than 10000 lines.
used  3725  lines
en_be/dev.tok.norm.en had less than 4500 lines.
used  248  lines
en_ru/dev.tok.norm.en had less than 10000 lines.
used  4814  lines
en_hu/dev.tok.norm.en had less than 10000 lines.
used  3725  lines
