Workflow:
1. Extract information from the CONLL-U
2. Translate
3. Tokenize English translations with Stanza
4. Word alignment, substitute English NE translations with lemmas from the source, get information on NE annotations for each translated word from the source annotations
5. Linguistically process English translation with Stanza (lemmas, POS)
6. Parse CONLL-u file and add additional information (sentence ids, alignments, NER annotations)

In [1]:
from conllu import parse
import pandas as pd
import os

In [2]:
!pip freeze > requirements.txt

In [2]:
# Define the main information
lang_code = "SI"
files_path = "/home/tajak/Parlamint-translation/ParlaMint-SI/ParlaMint-SI.conllu"
opus_lang_code = "sla"

In [3]:
# Get a list of TEI files
dir_list = os.listdir(files_path)

# Keep only files with parliamentary sessions:

parl_list = []

# Filter out only relevant files
for i in dir_list:
	if "ParlaMint-SI_" in i:
		if ".conllu" in i:
			parl_list.append("{}".format(i))

len(parl_list)

414

In [4]:
parl_list[0]

'ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu'

## Extract information from CONLL-U files

In [None]:
"""
Format:

# newdoc id = ParlaMint-SI_2014-08-01-SDZ7-Redna-01.u1
# newpar id = ParlaMint-SI_2014-08-01-SDZ7-Redna-01.seg1
# sent_id = ParlaMint-SI_2014-08-01-SDZ7-Redna-01.seg1.1
# text = Spoštovani, prosim, da zasedete svoja mesta.
1	Spoštovani	spoštovan	ADJ	Appmpn	Case=Nom|Degree=Pos|Gender=Masc|Number=Sing|VerbForm=Part	3	discourse	_	NER=O|SpaceAfter=No
2	,	,	PUNCT	Z	_	1	punct	_	NER=O
3	prosim	prositi	VERB	Vmpr1s	Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin	0	root	_	NER=O|SpaceAfter=No
4	,	,	PUNCT	Z	_	6	punct	_	NER=O
5	da	da	SCONJ	Cs	_	6	mark	_	NER=O
6	zasedete	zasesti	VERB	Vmer2p	Aspect=Perf|Mood=Ind|Number=Plur|Person=2|Tense=Pres|VerbForm=Fin	3	ccomp	_	NER=O
7	svoja	svoj	DET	Px-npa	Case=Acc|Gender=Neut|Number=Plur|Poss=Yes|PronType=Prs|Reflex=Yes	8	det	_	NER=O
8	mesta	mesto	NOUN	Ncnpa	Case=Acc|Gender=Neut|Number=Plur	6	obj	_	NER=O|SpaceAfter=No
9	.	.	PUNCT	Z	_	3	punct	_	NER=O
"""

In [None]:
# CONLLU parser cheatsheet
"""
# Find which words are proper names with the filtering function
sentence.filter(misc__NER=lambda x: x != "O")

# Adding new metadata to the file
sentences[0].metadata["alignment"] =  "1-1"

# To turn back into conll-u
print(sentences[1].serialize())
"""

Extract from each sentence in the CONLL-u file:
- sent_id (in metadata) (# sent_id = ParlaMint-SI_2014-08-01-SDZ7-Redna-01.seg1.1)
- "text" (in metadata): to be feed into the MT system (# text = Spoštovani, prosim, da zasedete svoja mesta.)
- tokenized text (punctuation separated from words by space): by iterating through the tokens in the sentence - create a list of tokens and join them into a string (["Spoštovani", "prosim", ",", "da"] -> "Spoštovani prosim , da)
- list of NE annotations (same length as the tokens) - we want NE annotations for all tokens, with the information on the lemma and index if the NE is not "0": [{0:["O"]}, {1:["O"]}, {2:["O"]}, {3: ["PER-I", "Borut"]}]

In [4]:
# Create an empty df
df = pd.DataFrame({"file": [""], "sentence_id": [""], "text": [""], "tokenized_text": [""], "NER": [""], "proper_nouns": [""]})

In [5]:
for doc in parl_list:
	# Open the file
	data = open("{}/{}".format(files_path,doc), "r").read()

	sentences = parse(data)

KeyboardInterrupt: 

In [5]:
# Parse the data with CONLL-u parser

for doc in parl_list[:4]:
	# Open the file
	data = open("{}/{}".format(files_path,doc), "r").read()

	sentences = parse(data)

	sentence_id_list = []
	text_list = []
	tokenized_text_list = []
	NER_list = []
	proper_noun_list = []

	for sentence in sentences:
		# Find sentence ids
		current_sentence_id = sentence.metadata["sent_id"]
		sentence_id_list.append(current_sentence_id)

		# Find text
		current_text = sentence.metadata["text"]
		text_list.append(current_text)

		# Create a string out of tokens
		current_token_list = []
		current_ner_dict = {}
		word_dict = {}

		for token in sentence:
			current_token_list.append(token["form"])

			# Create a list of NE annotations with word indices.
			# I'll substract one from the word index, because indexing in the CONLLU file starts with 1, not 0
			current_index = int(token["id"]) - 1

			current_ner_dict[current_index] = token["misc"]["NER"]

			# Add information on the lemma if the NE is personal name
			if "PER" in token["misc"]["NER"]:
				word_dict[current_index] = [token["form"], token["lemma"]]

		proper_noun_list.append(word_dict)

		current_string = " ".join(current_token_list)

		tokenized_text_list.append(current_string)
		NER_list.append(current_ner_dict)
	
	new_df = pd.DataFrame({"sentence_id": sentence_id_list, "text": text_list, "tokenized_text": tokenized_text_list, "NER": NER_list, "proper_nouns": proper_noun_list})

	new_df["file"] = doc

	# Merge df to the previous df
	df = pd.concat([df, new_df])

In [6]:
# Reset index
df = df.reset_index(drop=True)

# Remove the first row
df = df.drop([0], axis="index")

# Show the results
df.describe(include="all")

Unnamed: 0,file,sentence_id,text,tokenized_text,NER,proper_nouns
count,11452,11452,11452,11452,11452,11452
unique,4,11452,10411,10411,2928,920
top,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg1.1,Hvala lepa.,Hvala lepa .,"{0: 'O', 1: 'O'}",{}
freq,3487,1,270,270,408,10402


In [7]:
df.head(2)

Unnamed: 0,file,sentence_id,text,tokenized_text,NER,proper_nouns
1,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg1.1,Nadaljujemo s prekinjeno 17. sejo zbora.,Nadaljujemo s prekinjeno 17. sejo zbora .,"{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'O', 5: 'O...",{}
2,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,"Prehajamo na 2. TOČKO DNEVNEGA REDA, TO JE NA ...","Prehajamo na 2. TOČKO DNEVNEGA REDA , TO JE NA...","{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'O', 5: 'O...",{}


In [8]:
# Inspect an example
df.iloc[23].to_dict()

{'file': 'ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu',
 'sentence_id': 'ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg11.1',
 'text': 'Besedo dajem Marjanu Maučecu, predstavniku Državnega sveta kot predlagatelja predloga zakona za predstavitev stališča do predloga matičnega delovnega telesa.',
 'tokenized_text': 'Besedo dajem Marjanu Maučecu , predstavniku Državnega sveta kot predlagatelja predloga zakona za predstavitev stališča do predloga matičnega delovnega telesa .',
 'NER': {0: 'O',
  1: 'O',
  2: 'B-PER',
  3: 'I-PER',
  4: 'O',
  5: 'O',
  6: 'B-ORG',
  7: 'I-ORG',
  8: 'O',
  9: 'O',
  10: 'O',
  11: 'O',
  12: 'O',
  13: 'O',
  14: 'O',
  15: 'O',
  16: 'O',
  17: 'O',
  18: 'O',
  19: 'O',
  20: 'O'},
 'proper_nouns': {2: ['Marjanu', 'Marjan'], 3: ['Maučecu', 'Maučec']}}

In [14]:
# Add information on length
df["length"] = df["text"].str.split().str.len()

print("Number of words in the corpora: {}".format(df["length"].sum()))

df.describe()

Number of words in the corpora: 215856


Unnamed: 0,length
count,11452.0
mean,18.84876
std,14.520027
min,1.0
25%,8.0
50%,16.0
75%,26.0
max,143.0


In [15]:
# Save the dataframe
df.to_csv("results/SI/Parlamint-SI-sentences-conllu-workflow-sample.csv", sep="\t")

## Translate

In [10]:
# Open the df
df = pd.read_csv("results/SI/Parlamint-SI-sentences-conllu-workflow-sample.csv", sep="\t", index_col = 0)
df.head(2)

Unnamed: 0,file,sentence_id,text,tokenized_text,NER,proper_nouns
1,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg1.1,Nadaljujemo s prekinjeno 17. sejo zbora.,Nadaljujemo s prekinjeno 17. sejo zbora .,"{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'O', 5: 'O...",{}
2,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,"Prehajamo na 2. TOČKO DNEVNEGA REDA, TO JE NA ...","Prehajamo na 2. TOČKO DNEVNEGA REDA , TO JE NA...","{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'O', 5: 'O...",{}


We need to translate the following corpora into English:
- Belgian (BE) - which language??
- Bulgarian (BG)
- Croatian (HR) - We will use "South Slavic MT" based on the manual analysis
- Czech (CZ)
- Danish (DK)
- Dutch (NL)
- French (FR)
- Hungarian (HU) - multilingual model only
- Icelandic (IS)
- Italian (IT)
- Latvian (LV)
- Lithuanian (LT)
- Polish (PL)
- Slovenian (SI) - We will use "Slavic MT" based on the results of the manual analysis
- Spanish? (ES)
- Turkish (TR)
- Austrian (AT)
- Basque (ES-PV)
- Bosnian (BA)
- Catalan (ES-CT)
- Estonian (EE)
- Finnish (FI)
- Galician (ES-GA)
- Greek (GR)
- Norwegian (NO) - NO OPUS-MT model (!) - we can use GT or eTranslation
- Portuguese (PT)
- Romanian (RO)
- Serbian (RS)
- Swedish (SE)
- Ukrainian (UA)

Explanation of language codes:
- sla = Slavic
- zls = South Slavic
- zlw = West Slavic
- zle = East Slavic
- gmq = North Germanic
- gem = Germanic
- gmw = West Germanic
- roa = Romance
- itc = Italic
- bat = Baltic
- trk = Turkic
- urj = Uralic
- fiu = Finno-Ugrian

In [12]:
# Create a dictionary with all possible models for each corpora
lang_models_dict = {"BG": ["bg", "sla", "zls"], "HR": ["zls"], "CZ": ["cs", "sla", "zlw" ], "DK": ["da", "gmq", "gem"], "NL": ["nl", "gem", "gmw"], "FR": ["fr", "itc","roa"], "HU": ["mul"], "IS": ["is","gmq", "gem"], "IT": ["it", "roa", "itc"], "LV": ["lv","bat"], "LT": ["bat"], "PL": ["pl", "sla", "zlw"], "SI": ["sla"], "ES": ["es", "roa", "itc"], "TR": ["tr", "trk" ], "AT": ["de", "gem", "gmw"], "ES-PV": ["eu", "mul"], "BA": ["sla", "zls"], 
"ES-CT": ["ca", "roa", "itc"], "EE": ["et", "urj", "fiu"], "FI": ["fi", "urj", "fiu"], "ES-GA": ["gl", "roa", "itc"], "GR": ["grk"], "PT": ["roa", "itc"], "RO":["roa", "itc"], "RS": ["zls", "sla"], "SE": ["sv", "gmq", "gem"], "UA":["uk", "sla", "zle"]}

choose_model()

In [27]:
def choose_model(lang_code):
	"""
	Compare a small sample of translations of all OPUS-MT models that are available
	for the language, to decide which one to use. The function prints out a dataframe with all translations of the sample and saves it as ParlaMint-{lang_code}-sample-model-comparison.csv.

	Args:
	- lang_code: the lang code that is used in the names of the files, it should be the same as for extract_text()
	"""
	import pandas as pd
	import regex as re
	from easynmt import EasyNMT
	from IPython.display import display
	
	lang_models_dict = {"BG": ["bg", "sla", "zls"], "HR": ["zls"], "CZ": ["cs", "sla", "zlw" ], "DK": ["da", "gmq", "gem"], "NL": ["nl", "gem", "gmw"], "FR": ["fr", "itc","roa"], "HU": ["mul"], "IS": ["is","gmq", "gem"], "IT": ["it", "roa", "itc"], "LV": ["lv","bat"], "LT": ["bat"], "PL": ["pl", "sla", "zlw"], "SI": ["sla", "zls"], "ES": ["es", "roa", "itc"], "TR": ["tr", "trk" ], "AT": ["de", "gem", "gmw"], "ES-PV": ["eu", "mul"], "BA": ["sla", "zls"], "ES-CT": ["ca", "roa", "itc"], "EE": ["et", "urj", "fiu"], "FI": ["fi", "urj", "fiu"], "ES-GA": ["gl", "roa", "itc"], "GR": ["grk"], "PT": ["roa", "itc"], "RO":["roa", "itc"], "RS": ["zls", "sla"], "SE": ["sv", "gmq", "gem"], "UA":["uk", "sla", "zle"]}

	# Open the file, created in the previous step
	#df = pd.read_csv("results/{}/ParlaMint-{}-file-for-MT.csv".format(lang_code, lang_code), sep="\t", index_col=0)
	
	# Open the sample
	df = pd.read_csv("results/SI/Parlamint-SI-sentences-conllu-workflow-sample.csv", sep="\t", index_col = 0)

	# Define the model
	model = EasyNMT('opus-mt')

	print("Entire corpus has {} sentences and {} words.".format(df["text"].count(), df["length"].sum()))

	# Create a smaller sample - just a couple of sentences from one file
	df = df[df.file == list(df["file"].unique())[0]][:20]

	print("Sample files has {} sentences and {} words.".format(df["text"].count(), df["length"].sum()))

	# Create a list of sentences from the df
	sentence_list = df.text.to_list()

	# Translate the sample using all available models for this language
	for opus_lang_code in lang_models_dict[lang_code]:
		translation_list = model.translate(sentence_list, source_lang = "{}".format(opus_lang_code), target_lang='en')

		# Add the translations to the df
		df["translation-{}".format(opus_lang_code)] = translation_list
	
	df = df.drop(columns=["file", "sentence_id", "tokenized_text", "NER", "proper_nouns", "length"])

	# Save the df
	df.to_csv("results/{}/ParlaMint-{}-sample-model-comparison.csv".format(lang_code, lang_code))

	print("The file is saved as results/{}/ParlaMint-{}-sample-model-comparison.csv. ".format(lang_code, lang_code))

	return df


In [26]:
df = choose_model(lang_code)

Entire corpus has 11452 sentences and 215856 words.
Sample files has 20 sentences and 456 words.


Unnamed: 0,text,translation-sla,translation-zls
1,Nadaljujemo s prekinjeno 17. sejo zbora.,We continue with the adjourned 17th meeting of the choir.,Let's continue with the 17th session of the assembly.
2,"Prehajamo na 2. TOČKO DNEVNEGA REDA, TO JE NA DRUGO OBRAVNAVO PREDLOGA ZAKONA V SPREMEMBI IN DOPOLNITVI ZAKONA O ZDRAVSTVENEM VARSTVU IN ZDRAVSTVENEM ZAVAROVANJU V OKVIRU SKRAJŠANEGA POSTOPKA.","We're moving to 2. At the end of the day, this is, on a second level, a proposal for a law amending and supplementing the Health Protection and Health Insurance Act within the framework of a shorter procedure.","Moving to 2. ACT OF DAYS, THIS IS A SECOND DETECTION OF THE PROPOSAL OF THE LAW IN THE AMENDMENT OF THE AMENDMENT AND ADJUSTMENT OF THE HEAL PROTECTION LAW AND OF THE SPECIFIC PROTECTION IN THE FACILITIES OF THE CONCERNED PROCEDURE."
3,Predlog zakona je v obravnavo zboru predložil Državni svet.,The draft law was submitted to the Council of State for consideration.,The proposal for a law was submitted to the Assembly by the State Council.
4,"V zvezi s tem predlogom zakona Odbor za zdravstvo predlaga Državnemu zboru sprejem sklepa, da predlog zakona ni primeren za nadaljnjo obravnavo.","With regard to this bill, the Health Committee proposes to the National Assembly that it be decided that the proposal for a law is not suitable for further consideration.","As regards this proposal for a law, the Committee on Health proposes to the National Assembly to adopt the conclusion that the proposal for a law is not appropriate for further consideration."
5,Besedo dajem predsednici odbora Anji Bah Žibert za dopolnilno obrazložitev predloga sklepa.,"I give the floor to the President of the Committee, Anja Bah Žibert, to further explain the proposal for a decision.",I hereby give the President of the Anji Bah Žiber Committee for an additional explanation of the proposal for a decision.
6,"Hvala lepa, predsedujoči.","Thank you very much, Chairman.","Thank you very much, President."
7,"Odbor za zdravstvo je na svoji 7. seji 12. 5. 2020 kot matično delovno telo obravnaval Predlog zakona o spremembi in dopolnitvi Zakona o zdravstvenem varstvu in zdravstvenem zavarovanju, ki ga je Državnemu zboru Republike Slovenije v obravnavo predložil Državni svet Republike Slovenije.",The Health Committee is at its seventh session of 12. 5. 2020 as a working mother body has considered the proposal for a law amending and supplementing the Health and Health Insurance Act submitted to the National Assembly of the Republic of Slovenia for consideration by the National Council of the Republic of Slovenia.,"The Committee on Health is attending its 7th meeting on 12. 5. As a parent body, 2020 considered the draft Act amending and supplementing the Health and Health Insurance Act submitted to the National Assembly of the Republic of Slovenia for consideration by the State Council of the Republic of Slovenia."
8,"Kolegij predsednika Državnega zbora je na 48. seji 23. 10. 2020 odločil, da se predlog zakona obravnava po skrajšanem postopku.",The College of the President of the National Assembly shall be held at the 48th session of the 23rd. Ten. 2020 decided that the proposal for a law should be dealt with in a simplified manner.,"The College of Presidents of the National Assembly shall be held at the 48rd meeting of the 23rd meeting. Ten. In 2020, the proposal for a law is decided to be dealt with by a shortened procedure."
9,V poslovniškem roku amandmaji k predlogu zakona niso bili vloženi.,No amendments to the draft law were tabled within the time limit.,No amendments to the draft law have been tabled within the business period.
10,"Dopolnilno obrazložitev je podal državni svetnik Marjan Maučec in poudaril, da je temeljni cilj predloga zakona uveljavitev pravne podlage, po kateri bi se zdravstveno ranljivim prebivalcem Slovenije zagotovila potrebna zdravstvena oskrba.",Further explanation was given by State Councillor Marjan Maučec and stressed that the basic objective of the bill is to enforce the legal basis by which health care would be provided to the health-stricken inhabitants of Slovenia.,"The additional explanation was given by State Councilman Marjan Maučec, stressing that the fundamental objective of the proposal for a law is to enforce the legal basis for providing medically vulnerable people in Slovenia with the necessary medical care."


In [30]:
# Open the analysed sample

sample = pd.read_csv("results/{}/ParlaMint-{}-sample-model-comparison.csv".format(lang_code, lang_code), index_col = 0)
sample.head(2)

Unnamed: 0,text,translation-sla,translation-zls,comparison
1,Nadaljujemo s prekinjeno 17. sejo zbora.,We continue with the adjourned 17th meeting of...,Let's continue with the 17th session of the as...,zls
2,"Prehajamo na 2. TOČKO DNEVNEGA REDA, TO JE NA ...","We're moving to 2. At the end of the day, this...","Moving to 2. ACT OF DAYS, THIS IS A SECOND DET...",sla


In [33]:
sample.comparison.value_counts()

zls    8
sla    5
Name: comparison, dtype: int64

translate()

In [34]:
def translate(lang_code, opus_lang_code):
	"""
	This function translates the text from the dataframe, created with the extract_text() function
	with OPUS-MT models using EasyNMT. It returns a dataframe with the translation.

	Args:
	- lang_code: the lang code that is used in the names of the files, it should be the same as for extract_text()
	- opus_lang_code: the lang code to be used in the OPUS-MT model - use the one that performed the best in the comparison (see function choose_model())
	"""
	import pandas as pd
	import regex as re
	from easynmt import EasyNMT
	from IPython.display import display

	# Open the file, created in the previous step
	#df = pd.read_csv("results/{}/ParlaMint-{}-file-for-MT.csv".format(lang_code, lang_code), sep="\t", index_col=0)

	# Open the sample
	df = pd.read_csv("results/SI/Parlamint-SI-sentences-conllu-workflow-sample.csv", sep="\t", index_col = 0)

	# Define the model
	model = EasyNMT('opus-mt')

	print("Entire corpus has {} sentences and {} words.".format(df["text"].count(), df["length"].sum()))

	# Create a list of sentences from the df
	sentence_list = df.text.to_list()

	#Translate the list of sentences - you need to provide the source language as it is in the name of the model - the opus_lang_code
	translation_list = model.translate(sentence_list, source_lang = "{}".format(opus_lang_code), target_lang='en')

	# Add the translations to the df
	df["translation"] = translation_list

	# Display the df
	display(df[:3])

	# Save the df
	df.to_csv("results/{}/ParlaMint-{}-translated.csv".format(lang_code, lang_code), sep="\t")

	return df

In [35]:
df = translate(lang_code, opus_lang_code)

Entire corpus has 11452 sentences and 215856 words.
|    | file                                         | sentence_id                                  | text                                                                                                                                                                                             | tokenized_text                                                                                                                                                                                     | NER                                                                                                                                                                                                                                                                                      | proper_nouns   |   length | translation                                                                                                                                     

In [37]:
df.tail(2)

Unnamed: 0,file,sentence_id,text,tokenized_text,NER,proper_nouns,length,translation
11451,ParlaMint-SI_2017-05-08-SDZ7-Izredna-43.conllu,ParlaMint-SI_2017-05-08-SDZ7-Izredna-43.seg626.1,"Obveščam vas, da se bo 29. seja Državnega zbor...","Obveščam vas , da se bo 29. seja Državnega zbo...","{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'O', 5: 'O...",{},16,I inform you that the 29th meeting of the Nati...
11452,ParlaMint-SI_2017-05-08-SDZ7-Izredna-43.conllu,ParlaMint-SI_2017-05-08-SDZ7-Izredna-43.seg627.1,Zaključujem tudi 43. izredno sejo Državnega zb...,Zaključujem tudi 43. izredno sejo Državnega zb...,"{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'O', 5: 'B...",{},7,I also conclude the 43rd extraordinary meeting...


In [49]:
df.translation.to_list()[:3]

['We continue with the adjourned 17th meeting of the choir.',
 "We're moving to 2. At the end of the day, this is, on a second level, a proposal for a law amending and supplementing the Health Protection and Health Insurance Act within the framework of a shorter procedure.",
 'The draft law was submitted to the Council of State for consideration.']

## Word alignment

### Tokenization with Stanza

- We apply the stanza tokenization over the translation; use tokenize_no_ssplit to avoid splitting sentences in multiple sentences.

In [54]:
import stanza

nlp = stanza.Pipeline(lang='en', processors='tokenize', tokenize_no_ssplit = True)

2023-01-10 08:50:08 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-01-10 08:50:09 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2023-01-10 08:50:09 INFO: Use device: gpu
2023-01-10 08:50:09 INFO: Loading: tokenize
2023-01-10 08:50:09 INFO: Done loading processors!


In [65]:
# Apply tokenization to English translation and add the sentences to the df

En_sentences = df.translation.to_list()

tokenized_sentences = []

for i in En_sentences:
	doc = nlp(i)
	current_sentence_list = []

	# Loop through the tokens in the sentence and add them to a current sentence list
	for sentence in doc.sentences:
		for word in sentence.words:
			current_sentence_list.append(word.text)
	
	# Join the list into a space-separated string
	current_string = " ".join(current_sentence_list)

	tokenized_sentences.append(current_string)

# Add the result to the df
df["translation-tokenized"] = tokenized_sentences

df.head()

Unnamed: 0,file,sentence_id,text,tokenized_text,NER,proper_nouns,length,translation,translation-tokenized
1,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg1.1,Nadaljujemo s prekinjeno 17. sejo zbora.,Nadaljujemo s prekinjeno 17. sejo zbora .,"{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'O', 5: 'O...",{},6,We continue with the adjourned 17th meeting of...,We continue with the adjourned 17th meeting of...
2,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,"Prehajamo na 2. TOČKO DNEVNEGA REDA, TO JE NA ...","Prehajamo na 2. TOČKO DNEVNEGA REDA , TO JE NA...","{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'O', 5: 'O...",{},28,"We're moving to 2. At the end of the day, this...","We 're moving to 2 . At the end of the day , t..."
3,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.1,Predlog zakona je v obravnavo zboru predložil ...,Predlog zakona je v obravnavo zboru predložil ...,"{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'O', 5: 'O...",{},9,The draft law was submitted to the Council of ...,The draft law was submitted to the Council of ...
4,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.2,V zvezi s tem predlogom zakona Odbor za zdravs...,V zvezi s tem predlogom zakona Odbor za zdravs...,"{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'O', 5: 'O...",{},22,"With regard to this bill, the Health Committee...","With regard to this bill , the Health Committe..."
5,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1,Besedo dajem predsednici odbora Anji Bah Žiber...,Besedo dajem predsednici odbora Anji Bah Žiber...,"{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'B-PER', 5...","{4: ['Anji', 'Anja'], 5: ['Bah', 'Bah'], 6: ['...",12,I give the floor to the President of the Commi...,I give the floor to the President of the Commi...


In [66]:
# Save the df
df.to_csv("results/{}/ParlaMint-{}-translated.csv".format(lang_code, lang_code), sep="\t")

In [67]:
df.head(1)

Unnamed: 0,file,sentence_id,text,tokenized_text,NER,proper_nouns,length,translation,translation-tokenized
1,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg1.1,Nadaljujemo s prekinjeno 17. sejo zbora.,Nadaljujemo s prekinjeno 17. sejo zbora .,"{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'O', 5: 'O...",{},6,We continue with the adjourned 17th meeting of...,We continue with the adjourned 17th meeting of...


In [137]:
df["NER"] = df["NER"].astype("str")
df["NER"] = df["NER"].apply(lambda x: ast.literal_eval(x))

test = df.NER.to_list()[:3]

for i in test:
	for element in list(i.items()):
		if element[1] != "O":
			print(element)

(18, 'B-MISC')
(19, 'I-MISC')
(20, 'I-MISC')
(21, 'I-MISC')
(7, 'B-ORG')
(8, 'I-ORG')


In [None]:
# When we open the dataframe file, the dictionaries with proper names changed into strings - Change strings in the column proper_nouns into dictionaries

df["proper_nouns"] = df.proper_nouns.astype("str")
df["proper_nouns"] = df.proper_nouns.apply(lambda x: ast.literal_eval(x))

df["NER"] = df["NER"].astype("str")
df["NER"] = df["NER"].apply(lambda x: ast.literal_eval(x))

# Change nan values in the proper_nouns columns
df = df.fillna(0)

# Substitute words in the translation based on alignments
intermediate_list = list(zip(df["translation-tokenized"], df["proper_nouns"], df["alignments"], df["NER"]))

new_translations = []
substituted_all_info = []
substituted_only = []
NER_alignments = []

# Add information whether an error occurred
error_list = []

for i in intermediate_list[:3]:
	current_substituted_list = []
	current_substituted_only = []
	current_error = "No"

	# Create a list of NER alignments - at first, let all elements be "not NE", then we will substitute elements with appropriate tags
	# "O" is repeated as many times as there are tokens in the translation
	current_NER_list = ["O"] * len(intermediate_list[0].split())

	# Loop through the NER list for the source of this sentence
	for NER_pair in list(i[3].items()):
		# If the pair is not "O", get the word index
		if NER_pair[1] != "O":
			source_NE_index = NER_pair[0]

			# Find to which target index it corresponds:
			substituted_word_index = i[2][source_NE_index]

			# Substitute the element it the NER list under this index with the NE tag
			current_NER_list[substituted_word_index] = NER_pair[1]

	# If no proper names were detected, do not change the translation
	if i[1] == 0:
		new_translations.append(i[0])
	
	else:
		current_translation = i[0]

		# Substitute the word with the Slovene lemma based on the index - loop through the proper nouns to be changed
		for word_index in list(i[1].keys()):
			try:
				# split the translation into list of words
				word_list = current_translation.split()

				# Get index of the substituted word
				substituted_word_index = i[2][word_index]

				# Get the lemma to substitute the word with
				correct_lemma = i[1][word_index][1]

				# If the substitute word and lemma are not the same, get substituted word and its match
				if word_list[substituted_word_index] != correct_lemma:
					current_substituted_list.append((word_list[substituted_word_index], correct_lemma))
					current_substituted_only.append((word_list[substituted_word_index], correct_lemma))

					# Substitute the word in the word list
					word_list[substituted_word_index] = correct_lemma
				
				else:
					# Add information that substitution was not performed
					current_substituted_list.append(f"No substitution: {word_list[substituted_word_index], correct_lemma}")
				
				# Change the translation by merging the words back into a string
				current_translation = " ".join(word_list)

			except:
				print(f"Issue: index {word_index}: {i[1][word_index]}")
				current_error = f"Issue: index {word_index}: {i[1][word_index]}"

		# After the loop through proper nouns, save the new translation
		new_translations.append(current_translation)
	
	# Add information on what was substituted
	substituted_all_info.append(current_substituted_list)
	substituted_only.append(current_substituted_only)
	error_list.append(current_error)

## Alignment

- Perform word alignment.
- Save forward and reverse alignment information for each sentence (2 additional columns).
- Transfer NE annotations to the translated sentence based on the alignment: add a column with information to which English token this information should go to (e.g. [{3: "B-PER", 5:"I-LOC"}])
- Substitute translated NE words with lemmas based on the annotation, save new translation to a new column.

In [227]:
def correct_proper_nouns(lang_code):
	"""
	This function takes the translated text and the source text, aligns words with eflomal and corrects proper nouns.
	It takes the dataframe that was created in the function extract_text() and to which the translation was added
	in the function translate().

	To use eflomal, you need to install it first:
	!git clone https://github.com/robertostling/eflomal
	%cd eflomal
	!make
	!sudo make install
	!python3 setup.py install

	Args:
	- lang_code: the lang code that is used in the names of the files, it should be the same as for extract_text()
	"""
	import pandas as pd
	import re
	import ast
	from IPython.display import display

	# Open the file, created in the previous step
	df = pd.read_csv("./results/{}/ParlaMint-{}-translated.csv".format(lang_code, lang_code), sep="\t", index_col=0)

	# Move into the eflomal folder
	%cd eflomal

	# Then we need to create files for all texts and all translations
	source_sentences = open("source_sentences.txt", "w")
	English_sentences = open("English_sentences.txt", "w")

	for i in df["tokenized_text"].to_list():
		source_sentences.write(i)
		source_sentences.write("\n")

	for i in df["translation-tokenized"].to_list():
		English_sentences.write(i)
		English_sentences.write("\n")

	source_sentences.close()
	English_sentences.close()

	# Align sentences with eflomal and get out a file with alignments
	!python3 align.py -s source_sentences.txt -t English_sentences.txt --model 3 -r source-en.rev -f source-en.fwd

	# Create a list of alignments from the returned files which will be added to the final conllu

	# Create target alignments from the source alignment direction (by changing the direction in the fwd file)
	aligns_list_target = open("source-en.fwd", "r").readlines()
	aligns_list_target = [i.replace("\n", "") for i in aligns_list_target]
	aligns_list_target = [i.split(" ") for i in aligns_list_target]

	aligns_list_target_final = []

	for i in aligns_list_target:
		current_sentence_align = ""
		for pair in i:
			current_pair = pair.split("-")
			current_sentence_align += "{}-{}".format(current_pair[1], current_pair[0])
			current_sentence_align += " "
	
		aligns_list_target_final.append(current_sentence_align)
	
	# Add aligns_list to the df
	df["aligns-target"] = aligns_list_target_final

	# Create a list of alignments for the source file
	aligns_list = open("source-en.rev", "r").readlines()
	aligns_list = [i.replace("\n", "") for i in aligns_list]

	# Add information to be added to the conllu
	df["aligns-source"] = aligns_list

	# Continue with processing the list to create the final alignments format which I'll use to correct proper names
	aligns_list = [i.split(" ") for i in aligns_list]

	for i in aligns_list:
		for pair in i:
			current_pair = pair.split("-")
			i[i.index(pair)] = {int(current_pair[0]): int(current_pair[1])}
	
	final_aligns = []

	# Create a dictionary out of the rev alignments
	for i in aligns_list:
		current_line = {}

		try:
			for element in i:
				a = list(element.items())[0][0]
				b = list(element.items())[0][1]
				current_line[a] = b
		
			# Check whether the number of pairs in the list is the same as number of items
			if len(i) != len(list(current_line.items())):
				print("Not okay:")
				print(i)
				print(current_line)

			final_aligns.append(current_line)
		
		except:
			print("error")
			print(aligns_list.index(i))
			print(i)
			final_aligns.append("Error")
		
	print("Number of aligned sentences: {}".format(len(final_aligns)))

	# Add a to the df
	df["alignments"] = final_aligns

	# Remove the rev and fwd file
	%rm source-en.rev
	%rm source-en.fwd

	# When we open the dataframe file, the dictionaries with proper names changed into strings - Change strings in the column proper_nouns into dictionaries

	df["proper_nouns"] = df.proper_nouns.astype("str")
	df["proper_nouns"] = df.proper_nouns.apply(lambda x: ast.literal_eval(x))

	df["NER"] = df["NER"].astype("str")
	df["NER"] = df["NER"].apply(lambda x: ast.literal_eval(x))

	# Change nan values in the proper_nouns columns
	df = df.fillna(0)

	# Substitute words in the translation based on alignments
	intermediate_list = list(zip(df["translation-tokenized"], df["proper_nouns"], df["alignments"], df["NER"]))

	new_translations = []
	substituted_all_info = []
	substituted_only = []
	NER_alignments = []

	# Add information whether an error occurred
	error_list = []

	for i in intermediate_list:
		current_substituted_list = []
		current_substituted_only = []
		current_error = "No"

		# Create a list of NER alignments - at first, let all elements be "not NE", then we will substitute elements with appropriate tags
		# "O" is repeated as many times as there are tokens in the translation
		current_NER_list = ["O"] * len(i[0].split())

		# Loop through the NER list for the source of this sentence
		for NER_pair in list(i[3].items()):
			# If the pair is not "O", get the word index
			if NER_pair[1] != "O":
				try:
					source_NE_index = NER_pair[0]

					# Find to which target index it corresponds:
					substituted_word_index = i[2][source_NE_index]

					# Substitute the element it the NER list under this index with the NE tag
					current_NER_list[substituted_word_index] = NER_pair[1]
				except:
					continue
		
		# Add to the main list
		NER_alignments.append(current_NER_list)

		# If no proper names were detected, do not change the translation
		if i[1] == 0:
			new_translations.append(i[0])
		
		else:
			current_translation = i[0]

			# Substitute the word with the Slovene lemma based on the index - loop through the proper nouns to be changed
			for word_index in list(i[1].keys()):
				try:
					# split the translation into list of words
					word_list = current_translation.split()

					# Get index of the substituted word
					substituted_word_index = i[2][word_index]

					# Get the lemma to substitute the word with
					correct_lemma = i[1][word_index][1]

					# If the substitute word and lemma are not the same, get substituted word and its match
					if word_list[substituted_word_index] != correct_lemma:
						current_substituted_list.append((word_list[substituted_word_index], correct_lemma))
						current_substituted_only.append((word_list[substituted_word_index], correct_lemma))

						# Substitute the word in the word list
						word_list[substituted_word_index] = correct_lemma
					
					else:
						# Add information that substitution was not performed
						current_substituted_list.append(f"No substitution: {word_list[substituted_word_index], correct_lemma}")
					
					# Change the translation by merging the words back into a string
					current_translation = " ".join(word_list)

				except:
					print(f"Issue: index {word_index}: {i[1][word_index]}")
					current_error = f"Issue: index {word_index}: {i[1][word_index]}"

			# After the loop through proper nouns, save the new translation
			new_translations.append(current_translation)
		
		# Add information on what was substituted
		if len(substituted_all_info) != 0:
			substituted_all_info.append(current_substituted_list)
		else:
			substituted_all_info.append(0)

		if len(current_substituted_only) != 0:
			substituted_only.append(current_substituted_only)
		else:
			substituted_only.append(0)

		error_list.append(current_error)


	# Add to the df
	df["new_translations"] = new_translations
	df["substitution_info"] = substituted_all_info
	df["substituted_words"] = substituted_only
	df["errors"] = error_list
	df["target-NER-annotations"] = NER_alignments

	# Change the working directory once again
	%cd ..

	# Save the df
	df.to_csv("results/{}/ParlaMint-{}-final.csv".format(lang_code, lang_code), sep="\t")

	# Display most common substitutions
	df_substituted = df[df["proper_nouns"] != "0"]
	display(df_substituted.substituted_words.value_counts()[:20])

	return df

In [230]:
df = correct_proper_nouns(lang_code)

/home/tajak/Parlamint-translation/eflomal
Number of aligned sentences: 11452
Issue: index 32: ['Slovenci', 'Slovenec']
Issue: index 40: ['Samo', 'Samo']
Issue: index 1: ['Levici', 'Levica']
Issue: index 5: ['Sicer', 'sicer']
Issue: index 0: ['Slovenci', 'Slovenec']
/home/tajak/Parlamint-translation


TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


0                                 10843
[(Slovenians, Slovenec)]             17
[(Left, Levica)]                     16
[(Mir, Miro)]                        14
[(Mira, Miro)]                       11
[(Grimes, Grims)]                    11
[(Irgle, Irgl)]                      11
[(Moon, Mesec)]                       9
[(Weber, Veber)]                      9
[(Horvath, Horvat)]                   8
[(Bah, Žibert)]                       8
[(Juliana, Julijana)]                 8
[(Franz, Franc)]                      7
[(Jozet, Jože), (Tank, Tanko)]        7
[(Luke, Luka)]                        7
[(Mucha, Muha)]                       6
[(Luke, Luka), (Moon, Mesec)]         6
[(Verbich, Verbič)]                   5
[(Tadej, Rebrica)]                    5
[(Levi, Levica)]                      5
Name: substituted_words, dtype: int64

In [231]:
df[df["substituted_words"]!= 0][:5]

Unnamed: 0,file,sentence_id,text,tokenized_text,NER,proper_nouns,length,translation,translation-tokenized,aligns-target,aligns-source,alignments,new_translations,substitution_info,substituted_words,errors,target-NER-annotations
5,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1,Besedo dajem predsednici odbora Anji Bah Žiber...,Besedo dajem predsednici odbora Anji Bah Žiber...,"{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'B-PER', 5...","{4: ['Anji', 'Anja'], 5: ['Bah', 'Bah'], 6: ['...",12,I give the floor to the President of the Commi...,I give the floor to the President of the Commi...,0-1 1-1 3-0 4-1 6-2 9-3 11-4 12-5 13-6 15-7 16...,0-3 1-1 2-6 3-9 4-11 5-12 6-12 7-15 8-16 9-17 ...,"{0: 3, 1: 1, 2: 6, 3: 9, 4: 11, 5: 12, 6: 12, ...",I give the floor to the President of the Commi...,"[No substitution: ('Anja', 'Anja'), No substit...","[(Bah, Žibert)]",No,"[O, O, O, O, O, O, O, O, O, O, O, B-PER, I-PER..."
24,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg11.1,"Besedo dajem Marjanu Maučecu, predstavniku Drž...","Besedo dajem Marjanu Maučecu , predstavniku Dr...","{0: 'O', 1: 'O', 2: 'B-PER', 3: 'I-PER', 4: 'O...","{2: ['Marjanu', 'Marjan'], 3: ['Maučecu', 'Mau...",19,"I give the floor to Marjan Maucec, the represe...","I give the floor to Marjan Maucec , the repres...",0-1 1-1 3-0 4-1 5-2 6-3 7-4 9-5 12-7 14-6 15-8...,0-3 1-1 2-5 3-6 4-7 5-9 6-14 7-12 8-15 9-17 10...,"{0: 3, 1: 1, 2: 5, 3: 6, 4: 7, 5: 9, 6: 14, 7:...","I give the floor to Marjan Maučec , the repres...","[No substitution: ('Marjan', 'Marjan'), (Mauce...","[(Maucec, Maučec)]",No,"[O, O, O, O, O, B-PER, I-PER, O, O, O, O, O, I..."
27,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg13.1,Državni svet Republike Slovenije je Predlog za...,Državni svet Republike Slovenije je Predlog za...,"{0: 'B-ORG', 1: 'I-ORG', 2: 'I-ORG', 3: 'I-ORG...","{29: ['Franca', 'Franc'], 30: ['Kanglerja', 'K...",35,The National Council of the Republic of Sloven...,The National Council of the Republic of Sloven...,1-0 2-1 5-2 7-3 8-4 10-5 13-6 14-8 15-9 16-10 ...,0-1 1-2 2-5 3-7 4-9 5-10 6-13 8-14 9-15 10-16 ...,"{0: 1, 1: 2, 2: 5, 3: 7, 4: 9, 5: 10, 6: 13, 8...",The National Council of the Republic of Sloven...,"[(Frank, Franc), No substitution: ('Kangler', ...","[(Frank, Franc)]",No,"[O, B-ORG, I-ORG, O, O, I-ORG, O, I-ORG, O, O,..."
44,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg16.1,Za predstavitev stališča do predloga matičnega...,Za predstavitev stališča do predloga matičnega...,"{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'O', 5: 'O...","{13: ['Tini', 'Tina'], 14: ['Bregant', 'Bregan...",21,I give the floor to the representative of the ...,I give the floor to the representative of the ...,0-8 1-8 3-9 6-10 9-11 11-12 12-13 13-14 14-15 ...,0-23 1-25 2-26 3-27 4-29 5-32 6-33 7-34 8-1 9-...,"{0: 23, 1: 25, 2: 26, 3: 27, 4: 29, 5: 32, 6: ...",I give the floor to the representative of the ...,"[(Tini, Tina), No substitution: ('Bregant', 'B...","[(Tini, Tina)]",No,"[O, O, O, O, O, O, O, O, O, B-ORG, O, O, B-PER..."
103,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg33.1,Stališče Poslanske skupine Nove Slovenije – kr...,Stališče Poslanske skupine Nove Slovenije – kr...,"{0: 'O', 1: 'B-ORG', 2: 'I-ORG', 3: 'I-ORG', 4...","{10: ['Iva', 'Iva'], 11: ['Dimic.', 'dimic.']}",12,The position of the New Slovenia Group – Chris...,The position of the New Slovenia Group – Chris...,1-0 2-1 3-2 4-3 5-4 6-2 7-5 8-6 9-7 10-5 11-8 ...,0-1 1-3 2-6 3-4 4-5 5-7 6-8 7-9 8-11 9-13 10-1...,"{0: 1, 1: 3, 2: 6, 3: 4, 4: 5, 5: 7, 6: 8, 7: ...",The position of the New Slovenia Group – Chris...,"[No substitution: ('Iva', 'Iva'), (Dimic, dimi...","[(Dimic, dimic.)]",No,"[O, O, O, B-ORG, I-ORG, I-ORG, I-ORG, O, O, O,..."


In [187]:
df.head(1)

Unnamed: 0,file,sentence_id,text,tokenized_text,NER,proper_nouns,length,translation,translation-tokenized,alings-target,aligns-source,alignments,new_translations,substitution_info,substituted_words,errors,target-NER-annotations
1,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg1.1,Nadaljujemo s prekinjeno 17. sejo zbora.,Nadaljujemo s prekinjeno 17. sejo zbora .,"{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'O', 5: 'O...",{},6,We continue with the adjourned 17th meeting of...,We continue with the adjourned 17th meeting of...,"[{0: 0}, {1: 0}, {2: 1}, {4: 2}, {5: 3}, {6: 4...","[{0: 1}, {1: 2}, {2: 4}, {3: 5}, {4: 6}, {5: 9...","{0: 1, 1: 2, 2: 4, 3: 5, 4: 6, 5: 9, 6: 10}",We continue with the adjourned 17th meeting of...,0,0,No,"[O, O, O, O, O, O, O, O, O, O, O]"


## Linguistic processing of translated text

This will have to be done for each file separately - from now onwards, we need to separate the df into files.

In [253]:
# Create a list of files
files = list(df.file.unique())
files

['ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu',
 'ParlaMint-SI_2015-09-22-SDZ7-Redna-11.conllu',
 'ParlaMint-SI_2018-03-27-SDZ7-Redna-39.conllu',
 'ParlaMint-SI_2017-05-08-SDZ7-Izredna-43.conllu']

### Linguistically process with Stanza

- We use Stanza to get POS and lemmas. Send in the "pre-tokenized text" (created in previous steps).
- Transform the result into CONLL-u (which should contain tokens, lemmas, pos).

- Parse the CONLL-u file and add:
	1) sentence_id as metadata
	2) forward and reverse alignment as metadata (# align_s = 1-1 2-2... and #align_t = 1-1 2-2...),
	3) based on alignment, add NER information to each token (misc = {NER:} field)
- Save the file as CONLLU with the same name as the source CONLLU file (so each file will be saved separately). The number of sentences should be the same as in the source CONLLU and ANA file.

In [263]:
def create_conllu(file, lang_code):
	"""
	The function takes the dataframe (df), created in previous steps and takes only the instances from the df that belong
	to the file that is in the argument. It linguistically processes the translated sentences from the file and saves the file.
	Then we add additional information (metadata and NER annotations) to it with the conllu parser and save the final conllu file.

	Args:
		- file (str): file name from the files list (see above)
		- lang_code (str): the lang code that is used in the names of the files, it should be the same as for extract_text()
	"""

	# Process all sentences in the dataframe and save them to a conllu file
	from stanza.utils.conll import CoNLL
	from conllu import parse
	import ast

	# Use the dataframe, created in previous steps
	df = pd.read_csv("results/{}/ParlaMint-{}-final.csv".format(lang_code, lang_code), sep="\t")

	# When we open the df, the NER list turns into a string - we need to change it into a list
	df["target-NER-annotations"] = df["target-NER-annotations"].apply(lambda x: ast.literal_eval(x))
	
	# Filter out only instances from the file in question
	df = df[df["file"] == file]

	# Create lists of information that we need to add to the conllu file
	ids_list = df.sentence_id.to_list()
	aligns_source = df["aligns-source"].to_list()
	aligns_target = df["aligns-target"].to_list()
	ner_list = df["target-NER-annotations"].to_list()
	
	sentence_list = df.new_translations.to_list()

	# To feed the entire list into the pipeline, we need to create lists of tokens, split by space
	sentence_list = [x.split(" ") for x in sentence_list]

	# Linguistically process the list
	doc = nlp(sentence_list)

	# Save the conllu file
	CoNLL.write_doc2conll(doc, "results/{}/ParlaMint-{}-translated.conllu/temp/{}".format(lang_code, lang_code, file))

	print("{} processed and saved.".format(file))

	# Open the CONLL-u file with the CONLL-u parser

	data = open("results/{}/ParlaMint-{}-translated.conllu/temp/{}".format(lang_code, lang_code, file), "r").read()

	sentences = parse(data)

	# Adding additional information to the conllu
	for sentence in sentences:
		# Get the sentence index
		sentence_index = sentences.index(sentence)

		# Add metadata
		sentence.metadata["sent_id"] = ids_list[sentence_index]
		sentence.metadata["align_s"] = aligns_source[sentence_index]
		sentence.metadata["align_t"] = aligns_target[sentence_index]

		# Make the # text element be the last 
		current_text = sentence.metadata["text"]
		del sentence.metadata["text"]

		sentence.metadata["text"] = current_text

		# Iterate through tokens and add NER information to each
		for word in sentence:
			word_index = sentence.index(word)
			# Add NER information based on the word index
			word["misc"]["NER"] = ner_list[sentence_index][word_index]
		
	# Create a new conllu file with the updated information

	final_file = open("results/{}/ParlaMint-{}-translated.conllu/{}".format(lang_code, lang_code, file), "w")

	for sentence in sentences:
		final_file.write(sentence.serialize())
	
	final_file.close()

	print("Final file {} is saved.".format(file))

In [264]:
import stanza

# Now, let's feed the changed translation to the Stanza pipeline to create the final format
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_pretokenized=True)

for file in files[:1]:
	create_conllu(file, lang_code)

2023-01-10 14:00:37 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-01-10 14:00:37 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

2023-01-10 14:00:37 INFO: Use device: gpu
2023-01-10 14:00:37 INFO: Loading: tokenize
2023-01-10 14:00:37 INFO: Loading: pos
2023-01-10 14:00:37 INFO: Loading: lemma
2023-01-10 14:00:37 INFO: Done loading processors!


ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu processed and saved.
Final file <_io.TextIOWrapper name='results/SI/ParlaMint-SI-translated.conllu/ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu' mode='w' encoding='UTF-8'> is saved.


In [265]:
# Check whether the translated and source file have the same no. of sentences
from conllu import parse

source = open("ParlaMint-SI/ParlaMint-SI.conllu/ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu", "r").read()
source_sen = parse(source)

translation = open("results/SI/ParlaMint-SI-translated.conllu/ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu", "r").read()

translations_sen = parse(translation)

TypeError: list indices must be integers or slices, not str

In [267]:
# Check if number of sentences match
print(len(source_sen))
print(len(translations_sen))

3487
3487


In [268]:
# Check if ids match
for i in [3484, 3485, 3486]:
	print(source_sen[i].metadata["sent_id"])
	print(translations_sen[i].metadata["sent_id"])

ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg1075.2
ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg1075.2
ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg1077.1
ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg1077.1
ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg1078.1
ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg1078.1


In [272]:
# Check if content matches
for i in [3484, 3485, 3486]:
	print(source_sen[i].metadata["text"])
	print(translations_sen[i].metadata["text"])

Navzočih je 75 poslank in poslancev, za je glasovalo 29, proti 46.
There are 75 Members and Members present , 29 votes against 46 .
Ugotavljam, da sklep ni sprejet.
I take it that the decision is not taken .
S tem zaključujem to točko in 17. sejo Državnega zbora.
This concludes this point and the 17th session of the National Assembly .


In [273]:
# Check weird stuff in the final product
df[df["sentence_id"] == "ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1"]

Unnamed: 0,file,sentence_id,text,tokenized_text,NER,proper_nouns,length,translation,translation-tokenized,aligns-target,aligns-source,alignments,new_translations,substitution_info,substituted_words,errors,target-NER-annotations
5,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1,Besedo dajem predsednici odbora Anji Bah Žiber...,Besedo dajem predsednici odbora Anji Bah Žiber...,"{0: 'O', 1: 'O', 2: 'O', 3: 'O', 4: 'B-PER', 5...","{4: ['Anji', 'Anja'], 5: ['Bah', 'Bah'], 6: ['...",12,I give the floor to the President of the Commi...,I give the floor to the President of the Commi...,0-1 1-1 3-0 4-1 6-2 9-3 11-4 12-5 13-6 15-7 16...,0-3 1-1 2-6 3-9 4-11 5-12 6-12 7-15 8-16 9-17 ...,"{0: 3, 1: 1, 2: 6, 3: 9, 4: 11, 5: 12, 6: 12, ...",I give the floor to the President of the Commi...,"[""No substitution: ('Anja', 'Anja')"", ""No subs...","[('Bah', 'Žibert')]",No,"[O, O, O, O, O, O, O, O, O, O, O, B-PER, I-PER..."
