In [1]:
# Select the GPU for this notebook. The comments are kept on their own lines, because anything
# that follows the "=" on a %env line would become part of the variable's value.
# CUDA_DEVICE_ORDER=PCI_BUS_ID: number the GPUs by their PCI bus id.
# NOTE(review): these variables presumably have to be set before any library initializes CUDA,
# so this cell should stay the first one that is run.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
# CUDA_VISIBLE_DEVICES=5: expose only the GPU with index 5 to this kernel.
%env CUDA_VISIBLE_DEVICES=5

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=5


In [2]:
import pandas as pd
import os
import numpy as np

# Language code of the corpus - it is a part of all the file names below
lang_code = "UA"

# Root folder of the project
main_path = "/home/tajak/Parlamint-translation"

# Code of the translation model to be used
opus_lang_code = "sla"

# Folder with the source conllu files - check whether this path is ok
path = f"{main_path}/Source-data/ParlaMint-{lang_code}.conllu/ParlaMint-{lang_code}.conllu"

# Dataframes that are produced by the individual steps
extracted_dataframe_path = f"{main_path}/results/{lang_code}/ParlaMint-{lang_code}-extracted-source-data.csv"

translated_dataframe_path = f"{main_path}/results/{lang_code}/ParlaMint-{lang_code}-translated.csv"
translated_tokenized_dataframe_path = f"{main_path}/results/{lang_code}/ParlaMint-{lang_code}-translated-tokenized.csv"
final_dataframe = f"{main_path}/results/{lang_code}/ParlaMint-{lang_code}-final-dataframe.csv"

# Folder for the final conllu files
final_path = f"{main_path}/Final-data/ParlaMint-{lang_code}.conllu/ParlaMint-{lang_code}.conllu"

In [46]:
def create_conllu(file, lang_code, main_path, final_dataframe, nlp):
	"""
	Linguistically process the translated sentences of one file and enrich the parsed CoNLL-U sentences.

	The function reads the dataframe created in previous steps and keeps only the rows that belong to `file`.
	It runs the pipeline on the (already tokenized) translations and saves the result to
	"{main_path}/results/{lang_code}/temp/{file}-troubleshoot". This temporary file is then opened with
	the conllu parser and every sentence is updated in memory: the sentence id and the source text are added
	as metadata, the alignments are added to the MISC column, the NER tags are renamed so that they match
	the source, and the "text" metadata is rebuilt from the tokens and the SpaceAfter information.

	Note: in this troubleshooting version the step that writes the final CoNLL-U file is disabled,
	so the enriched sentences are not saved and the function returns None.

	Args:
		- file (str): file name, as it appears in the "file" column of the dataframe
		- lang_code (str): the lang code that is used in the names of the files, it should be the same as for extract_text()
		- main_path (str): main project folder; the temporary file is saved under "{main_path}/results/{lang_code}/temp",
			which has to exist already
		- final_dataframe (str): path to the tab-separated dataframe file, created in previous steps
		- nlp: the Stanza pipeline; it is called with a list of token lists (one list of tokens per sentence)

	Raises:
		- ValueError: if the dataframe does not contain any row for `file`
	"""

	# Process all sentences in the dataframe and save them to a conllu file
	from stanza.utils.conll import CoNLL
	from conllu import parse
	import ast
	import regex as re
	import pandas as pd

	# Languages for which the information on the original translated word is not added
	no_substitution_info_langs = ["BG", "PT", "IT", "AT", "GR", "HU", "NO", "TR", "NL", "SI", "HR", "RS", "LV", "UA", "ES-GA", "PL", "ES-CT", "FR", "BE-nl", "BE-fr", "EE", "ES", "FI", "ES-PV-es", "ES-PV-eu"]

	# Use the dataframe, created in previous steps
	df = pd.read_csv(final_dataframe, sep="\t", index_col=0, na_filter=False)

	# Filter out only instances from the file in question. We take a copy, so that the columns
	# that are added below are not assigned on a slice of the full dataframe (SettingWithCopyWarning)
	df = df[df["file"] == file].copy()

	if df.empty:
		raise ValueError("No rows for file {} found in {}.".format(file, final_dataframe))

	# Add information on the target path ("Source-data" is a plain string, not a regular expression)
	df["target_path"] = df.file_path.str.replace("Source-data", "Final-data", regex=False)

	# Get target path - it is only needed by the (currently disabled) step that saves the final file
	target_path = list(df.target_path.unique())[0]

	# When we open the dataframe file, the lists and dictionaries turn into strings - change them back
	for column in ["space-after-information", 'fwd_align_dict', 'bwd_align_dict', 'substituted_words', "source_indices"]:
		df[column] = df[column].astype("str").apply(ast.literal_eval)

	# Create lists of information that we need to add to the conllu file
	ids_list = df.sentence_id.to_list()
	source_text = df.text.to_list()
	space_after_list = df["space-after-information"].to_list()
	fwd_align_list = df['fwd_align_dict'].to_list()
	bwd_align_list = df['bwd_align_dict'].to_list()
	substituted_words_list = df['substituted_words'].to_list()

	# To feed the entire list into the pipeline, we need to create lists of tokens, split by space.
	# If translation is empty, replace it with "/"
	sentence_list = []

	for translation in df.new_translations.to_list():
		if len(translation) == 0:
			translation = "/"
		sentence_list.append(translation.split(" "))

	# Linguistically process the list
	doc = nlp(sentence_list)

	# Save the conllu file - rename this for troubleshooting
	temp_conllu_path = "{}/results/{}/temp/{}-troubleshoot".format(main_path, lang_code, file)
	CoNLL.write_doc2conll(doc, temp_conllu_path)

	print("{} processed and saved.".format(file))

	# Open the CONLL-u file with the CONLL-u parser. The file is read as UTF-8 (the encoding in which
	# it was written) and the file handle is closed as soon as the content is read
	with open(temp_conllu_path, "r", encoding="utf-8") as temp_conllu_file:
		data = temp_conllu_file.read()

	sentences = parse(data)

	# Whether the information on substituted words is added depends only on the language
	add_substitution_info = lang_code not in no_substitution_info_langs

	# Adding additional information to the conllu. The positions come from enumerate() - searching
	# for them with list.index() compares the elements one by one, which is slow and relies on
	# all the elements being different
	for sentence_index, sentence in enumerate(sentences):
		# Add metadata
		sentence.metadata["sent_id"] = ids_list[sentence_index]
		sentence.metadata["source"] = source_text[sentence_index]

		# Delete the current metadata for text
		del sentence.metadata["text"]

		new_translation_text = ""

		fwd_align_dict = fwd_align_list[sentence_index]
		bwd_align_dict = bwd_align_list[sentence_index]

		# Iterate through tokens
		for word_index, word in enumerate(sentence):
			word_conllu_index = word["id"]

			# Check whether the word conllu index (word id) is in the substituted_words_list (it is if it was substituted)
			# If it is, add information on the original translated word - do not do this for Bulgarian, Portuguese and other languages mentioned
			if add_substitution_info:
				substituted_word = substituted_words_list[sentence_index].get(word_conllu_index, None)
				if substituted_word is not None:
					word["misc"]["Translated"] = substituted_word

			# Do the same for the forward and backward alignment
			fwd_alignment = fwd_align_dict.get(word_conllu_index, None)
			if fwd_alignment is not None:
				word["misc"]["ForwardAlignment"] = fwd_alignment

			bwd_alignment = bwd_align_dict.get(word_conllu_index, None)
			if bwd_alignment is not None:
				word["misc"]["BackwardAlignment"] = bwd_alignment

			# Remove information on start_char and end_char from the annotation
			del word["misc"]["start_char"]
			del word["misc"]["end_char"]

			# Change the NER tags so that they are the same as in the source
			current_ner = word["misc"]["ner"]
			del word["misc"]["ner"]

			# Substitute parts of the tags so that they are the same as in source
			current_ner = re.sub("S-", "B-", current_ner)
			current_ner = re.sub("E-", "I-", current_ner)

			word["misc"]["NER"] = current_ner

			try:
				# Get information about the space after based on the index
				current_space_after = space_after_list[sentence_index][word_index]
			except (IndexError, KeyError, TypeError):
				# Only failed lookups are reported and replaced with the default - other exceptions
				# (for instance, an interrupt of the kernel) are not swallowed
				print("Error based on current_space after in sentence {}, sentence index: {}, word {}, word index {}.".format(sentence, sentence_index, word, word_index))
				current_space_after = "Yes"

			# Create new text from translation, correcting the spaces around words
			# based on the SpaceAfter information
			new_translation_text += word["form"]

			if current_space_after == "No":
				word["misc"]["SpaceAfter"] = "No"
			elif current_space_after != "Last":
				new_translation_text += " "

		sentence.metadata["text"] = new_translation_text
	
# NOTE: the triple-quoted string below holds the last step of create_conllu (saving the updated
# sentences to target_path). It is wrapped in a string so that it is not executed while troubleshooting.
# Because it starts at column 0, it is a separate top-level expression and not a part of the function;
# as the last expression of the cell, its value is shown as the cell output.
"""
	# Create a new conllu file with the updated information
	# do not save - for troubleshooting

	os.makedirs(os.path.dirname(target_path), exist_ok=True)
	final_file = open("{}".format(target_path), "w")

	for sentence in sentences:
		final_file.write(sentence.serialize())
	
	final_file.close()

	print("Final file {} is saved.".format(target_path))

"""

'\n\t# Create a new conllu file with the updated information\n\t# do not save - for troubleshooting\n\n\tos.makedirs(os.path.dirname(target_path), exist_ok=True)\n\tfinal_file = open("{}".format(target_path), "w")\n\n\tfor sentence in sentences:\n\t\tfinal_file.write(sentence.serialize())\n\t\n\tfinal_file.close()\n\n\tprint("Final file {} is saved.".format(target_path))\n\n'

In [21]:
def produce_final_conllu(lang_code, final_dataframe, only_file="ParlaMint-UA_2023-10-17-m0.conllu"):
	"""
	Load the Stanza pipeline and run create_conllu() on the files that are listed in the final dataframe.

	The function uses the notebook-level variable `main_path` and the function create_conllu(),
	so the cells that define them have to be run first.

	Args:
		- lang_code (str): the lang code that is used in the names of the files
		- final_dataframe (str): path to the tab-separated dataframe file, created in previous steps
		- only_file (str or None): if it is a file name, only this file is processed - by default,
			this is the problematic file that is being troubleshooted. Use None to process all files.

	Returns:
		What create_conllu() returned for the last processed file, or None if there was no file to process.
	"""
	import pandas as pd
	import stanza
	import time
	from stanza.pipeline.core import DownloadMethod

	# Only the "file" column is needed to create the list of files - create_conllu() reads
	# the entire dataframe on its own
	df = pd.read_csv(final_dataframe, sep="\t", usecols=["file"], na_filter=False)

	# Extract only the problematic file
	if only_file is not None:
		df = df[df["file"] == only_file]

	# Create a list of files
	files = list(df["file"].unique())

	start_time = time.time()

	print("Processing started.")

	# Define the pipeline, instruct it to use a specific package: 	CoNLL03
	nlp = stanza.Pipeline(lang='en', processors="tokenize,mwt,pos,lemma,ner", package={"ner": ["conll03"]}, tokenize_pretokenized=True, download_method=DownloadMethod.REUSE_RESOURCES, use_gpu=True)

	# Define the result before the loop, so that the return statement does not fail
	# with an UnboundLocalError when there is no file to process
	sen_list = None

	for file in files:
		sen_list = create_conllu(file, lang_code, main_path, final_dataframe, nlp)
		current_end_time = round((time.time() - start_time)/60,2)
		print("Current running time: {}".format(current_end_time))

	end_time = round((time.time() - start_time)/60,2)

	print("Processing completed. It took {} minutes.".format(end_time))

	return sen_list

In [12]:
# Load the final dataframe (separated by tabs; empty cells are kept as empty strings and not as NaN)
df = pd.read_csv(final_dataframe, sep="\t", index_col=0, na_filter=False)

# Keep only the rows of the problematic file
test_df = df.loc[df["file"] == "ParlaMint-UA_2023-10-17-m0.conllu"]

# Show the extracted rows
test_df

Unnamed: 0,file_path,file,sentence_id,text,tokenized_text,proper_nouns,length,translation,translation-tokenized,space-after-information,fwd_align_dict,bwd_align_dict,alignments,new_translations,substitution_info,substituted_pairs,substituted_words,errors,source_indices,post-processed_translations
2004780,/home/tajak/Parlamint-translation/Source-data/...,ParlaMint-UA_2023-10-17-m0.conllu,ParlaMint-UA_2023-10-17-m0.u1.p1.lang1.s1,Слава Україні!,Слава Україні !,{},2,Glory to Ukraine!,Glory to Ukraine !,"['Yes', 'Yes', 'No', 'Last']","{1: '1', 2: '1', 3: '2', 4: '3'}","{1: '1', 3: '2', 4: '3'}","{0: 0, 1: 2, 2: 3}",Glory to Ukraine !,[],0,{},No,"[['Слава', 1], ['Україні', 2], ['!', 3]]",Glory to Ukraine !
2004781,/home/tajak/Parlamint-translation/Source-data/...,ParlaMint-UA_2023-10-17-m0.conllu,ParlaMint-UA_2023-10-17-m0.u1.p2.lang1.s1,"Шановні народні депутати України, відповідно д...","Шановні народні депутати України , відповідно ...",{},30,"Honourable National Deputies of Ukraine, in ac...","Honourable National Deputies of Ukraine , in a...","['Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes...","{1: '1', 2: '2', 3: '3', 4: '4', 5: '4', 6: '5...","{1: '1', 2: '2', 3: '3', 5: '4', 6: '5', 8: '6...","{0: 0, 1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: ...","Honourable National Deputies of Ukraine , in a...",[],0,{},No,"[['Шановні', 1], ['народні', 2], ['депутати', ...","Honourable National Deputies of Ukraine , in a..."
2004782,/home/tajak/Parlamint-translation/Source-data/...,ParlaMint-UA_2023-10-17-m0.conllu,ParlaMint-UA_2023-10-17-m0.u1.p3.lang1.s1,"Шановні колеги, звертаю увагу, що згідно з при...","Шановні колеги , звертаю увагу , що згідно з п...",{},40,"Dear colleagues, I point out that, according t...","Dear colleagues , I point out that , according...","['Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No'...","{1: '1', 2: '2', 3: '3', 4: '4', 5: '4', 6: '5...","{1: '1', 2: '2', 3: '3', 5: '4', 6: '5', 8: '6...","{0: 0, 1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 6, 7: ...","Dear colleagues , I point out that , according...",[],0,{},No,"[['Шановні', 1], ['колеги', 2], [',', 3], ['зв...","Dear colleagues , I point out that , according..."
2004783,/home/tajak/Parlamint-translation/Source-data/...,ParlaMint-UA_2023-10-17-m0.conllu,ParlaMint-UA_2023-10-17-m0.u1.p3.lang1.s2,Я прошу всіх колег неухильно дотримуватись при...,Я прошу всіх колег неухильно дотримуватись при...,{},21,I ask all colleagues to abide by the decision ...,I ask all colleagues to abide by the decision ...,"['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Ye...","{1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6...","{1: '1', 2: '2', 3: '3', 4: '4', 6: '5, 6', 10...","{0: 0, 1: 1, 2: 2, 3: 3, 4: 5, 5: 5, 6: 9, 7: ...",I ask all colleagues to abide by the decision ...,[],0,{},No,"[['Я', 1], ['прошу', 2], ['всіх', 3], ['колег'...",I ask all colleagues to abide by the decision ...
2004784,/home/tajak/Parlamint-translation/Source-data/...,ParlaMint-UA_2023-10-17-m0.conllu,ParlaMint-UA_2023-10-17-m0.u1.p4.lang1.s1,"Шановні колеги, у разі повітряної тривоги буде...","Шановні колеги , у разі повітряної тривоги буд...",{},32,"Dear colleagues, in the event of air anxiety, ...","Dear colleagues , in the event of air anxiety ...","['Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes...","{1: '1', 2: '2', 3: '3', 4: '4', 6: '5', 8: '6...","{1: '1', 2: '2', 3: '3', 4: '4', 6: '5', 8: '6...","{0: 0, 1: 1, 2: 2, 3: 3, 4: 5, 5: 7, 6: 8, 7: ...","Dear colleagues , in the event of air anxiety ...",[],0,{},No,"[['Шановні', 1], ['колеги', 2], [',', 3], ['у'...","Dear colleagues , in the event of air anxiety ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2006226,/home/tajak/Parlamint-translation/Source-data/...,ParlaMint-UA_2023-10-17-m0.conllu,ParlaMint-UA_2023-10-17-m0.u145.p21.lang1.s1,"По фракціях покажіть, будь ласка.","По фракціях покажіть , будь ласка .",{},5,"Show the factions, please.","Show the factions , please .","['Yes', 'Yes', 'No', 'Yes', 'No', 'Last']","{1: '3', 2: '1', 3: '2', 4: '4', 5: '5', 6: '7'}","{1: '1', 3: '2, 3', 4: '4', 5: '5, 6', 6: '7'}","{0: 0, 1: 2, 2: 2, 3: 3, 4: 4, 5: 4, 6: 5}","Show the factions , please .",[],0,{},No,"[['По', 1], ['фракціях', 2], ['покажіть', 3], ...","Show the factions , please ."
2006227,/home/tajak/Parlamint-translation/Source-data/...,ParlaMint-UA_2023-10-17-m0.conllu,ParlaMint-UA_2023-10-17-m0.u145.p22.lang1.s1,"Шановні колеги, розгляд питань порядку денного...","Шановні колеги , розгляд питань порядку денног...",{},14,"Dear colleagues, the consideration of the agen...","Dear colleagues , the consideration of the age...","['Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes...","{1: '1', 2: '2', 3: '3', 5: '4', 6: '5', 7: '6...","{1: '1', 2: '2', 3: '3', 5: '4', 7: '5', 8: '6...","{0: 0, 1: 1, 2: 2, 3: 4, 4: 6, 5: 7, 6: 7, 7: ...","Dear colleagues , the consideration of the age...",[],0,{},No,"[['Шановні', 1], ['колеги', 2], [',', 3], ['ро...","Dear colleagues , the consideration of the age..."
2006228,/home/tajak/Parlamint-translation/Source-data/...,ParlaMint-UA_2023-10-17-m0.conllu,ParlaMint-UA_2023-10-17-m0.u145.p23.lang1.s1,Зараз я оголошую перерву у нашому пленарному з...,Зараз я оголошую перерву у нашому пленарному з...,{},20,I now announce a recess in our plenary session...,I now announce a recess in our plenary session...,"['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Ye...","{1: '2', 2: '1', 3: '3', 4: '4', 5: '4', 6: '5...","{2: '1', 1: '2', 3: '3', 5: '4', 6: '5', 7: '6...","{0: 1, 1: 0, 2: 2, 3: 4, 4: 5, 5: 6, 6: 7, 7: ...",I now announce a recess in our plenary session...,[],0,{},No,"[['Зараз', 1], ['я', 2], ['оголошую', 3], ['пе...",I now announce a recess in our plenary session...
2006229,/home/tajak/Parlamint-translation/Source-data/...,ParlaMint-UA_2023-10-17-m0.conllu,ParlaMint-UA_2023-10-17-m0.u145.p23.lang1.s2,Нагадую на необхідності непоширення інформації...,Нагадую на необхідності непоширення інформації...,{},8,I remind you of the need for non-proliferation...,I remind you of the need for non-proliferation...,"['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Ye...","{1: '1', 2: '1', 3: '1', 6: '3', 7: '3', 8: '4...","{2: '1', 4: '2', 6: '3', 8: '4', 10: '5', 11: ...","{0: 1, 1: 3, 2: 5, 3: 7, 4: 9, 5: 10, 6: 11, 7...",I remind you of the need for non-proliferation...,[],0,{},No,"[['Нагадую', 1], ['на', 2], ['необхідності', 3...",I remind you of the need for non-proliferation...


In [47]:
# Run the processing. produce_final_conllu() returns what create_conllu() returned for the last file;
# create_conllu() has no return statement, so sen_list is None here and only the printed log is informative.
sen_list = produce_final_conllu(lang_code, final_dataframe)



Processing started.


2024-02-21 12:38:36 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| ner       | conll03  |

2024-02-21 12:38:36 INFO: Use device: gpu
2024-02-21 12:38:36 INFO: Loading: tokenize
2024-02-21 12:38:36 INFO: Loading: pos
2024-02-21 12:38:37 INFO: Loading: lemma
2024-02-21 12:38:37 INFO: Loading: ner
2024-02-21 12:38:37 INFO: Done loading processors!


ParlaMint-UA_2023-10-17-m0.conllu processed and saved.
Current running time: 1.38
Processing completed. It took 1.38 minutes.
