In [1]:
# Set the CUDA environment variables for this kernel before any GPU library is loaded:
# devices are ordered by PCI bus id and only device index 0 is made visible.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [3]:
import regex as re
import pandas as pd
from bs4 import BeautifulSoup as bs
import os
import argparse
from knockknock import discord_sender

In [3]:
# Corpus code of the parliament processed by this notebook
lang_code = "AT"

# Source-language code handed to the OPUS-MT model
opus_lang_code = "gmw"

# Location templates; the placeholder is filled with the corpus code
_source_template = "/home/tajak/Parlamint-translation/Note-translation/Source-data-TEI/ParlaMint-{}.TEI"
_target_template = "/home/tajak/Parlamint-translation/Note-translation/Final-data-CSV/ParlaMint-{}.notes.translated.csv"

# Folder with the source TEI files
path = _source_template.format(lang_code)

# CSV file the translated notes are written to
final_path = _target_template.format(lang_code)


## Create df

In [4]:
# Collect the paths of the XML session files and, in parallel, their file names
parl_list = []
file_name_list = []

# Only files whose name contains this corpus-specific marker are kept
session_marker = "ParlaMint-{}_".format(lang_code)

for dir1 in os.listdir(path):
    full_path = os.path.join(path, dir1)
    # Session files live one level down; plain files directly in `path` are ignored
    if not os.path.isdir(full_path):
        continue
    for file in os.listdir(full_path):
        if session_marker in file and ".xml" in file:
            # Deliberately NOT named `final_path`: that name holds the output CSV
            # location and must not be overwritten with a source file path.
            session_path = "{}/{}".format(full_path, file)
            parl_list.append(session_path)
            file_name_list.append(file)

# See how many files we have:
print("No. of files: {}.".format(len(parl_list)))

No. of files: 1197.


In [13]:
# Define function to extract all tags

def _note_type(element):
    """Return the element's ``type`` attribute, else its ``reason`` attribute, else ""."""
    for attribute in ("type", "reason"):
        if attribute in element.attrs:
            return element.attrs[attribute]
    return ""


def _note_lang(element):
    """Return the element's ``xml:lang`` attribute, or "" when it has none."""
    return element.attrs.get("xml:lang", "")


def extract_tag(tag, df, content):
    """Append one row per ``tag`` element found in ``content`` to ``df``.

    For "note" and "head" the row text is the element's own text. For every
    other tag the text is taken from the nested ``desc`` elements, one row per
    ``desc``; an element without any ``desc`` is printed as an error and
    contributes no row.

    Args:
        tag: name of the elements to look for.
        df: DataFrame the new rows are appended to (it is not modified).
        content: parsed document offering ``find_all`` (a BeautifulSoup object
            in this notebook).

    Returns:
        A new DataFrame: ``df`` followed by the extracted rows, which carry the
        columns "type", "content", "xml:lang" and "tag". The index is not reset.
    """
    text_is_inline = tag in ("note", "head")
    rows = []

    for element in content.find_all(tag):
        note_type = _note_type(element)

        if text_is_inline:
            rows.append((note_type, element.get_text(), _note_lang(element)))
            continue

        descriptions = element.find_all("desc")
        if not descriptions:
            print("Error - empty desc_list")
            print(element)
            continue

        # The type belongs to the parent element, the language to each desc
        for desc in descriptions:
            rows.append((note_type, desc.get_text(), _note_lang(desc)))

    new_df = pd.DataFrame({
        "type": [row[0] for row in rows],
        "content": [row[1] for row in rows],
        "xml:lang": [row[2] for row in rows],
    })
    new_df["tag"] = tag

    # Merge the new rows with the previous df
    return pd.concat([df, new_df])

In [14]:
def _print_tag_overview(df):
    """Print the summary table, the first rows and the per-tag / per-type counts of ``df``."""
    print(df.describe(include="all").to_markdown())

    print("\n")

    print(df.head().to_markdown())

    print("\n")

    print("Statistics for tags:\n")

    print(df.tag.value_counts().to_markdown())

    print("\n")

    print(df.groupby("tag").type.value_counts().to_markdown())


# Get notified once the code ends
# The key file is closed right after reading; surrounding whitespace (e.g. a
# trailing newline) is not part of the webhook URL.
with open("/home/tajak/Parlamint-translation/discord_key.txt", "r") as key_file:
    webhook_url = key_file.read().strip()


@discord_sender(webhook_url=webhook_url)
def create_note_df(parl_list):
    """Extract the notes of all session files into one DataFrame and print statistics.

    Every file in ``parl_list`` is parsed as XML and the elements "note", "gap",
    "head", "kinesic", "vocal" and "incident" are extracted with ``extract_tag``.
    Statistics are printed before and after exact duplicate rows are dropped.

    Args:
        parl_list: iterable of paths to the XML session files.

    Returns:
        The deduplicated DataFrame with the columns "tag", "type", "content",
        "xml:lang" and "length" (number of whitespace-separated words in
        "content"). The index is the one left by ``drop_duplicates``.
    """
    import pandas as pd
    from bs4 import BeautifulSoup as bs

    tags = ["note", "gap", "head", "kinesic", "vocal", "incident"]

    # extract_tag appends to an existing frame, so every file starts from this
    # one-row placeholder, which is sliced off again once the file is done.
    placeholder = pd.DataFrame({"tag": [""], "type": [""], "content": [""], "xml:lang": [""]})

    # Collect one small frame per file and concatenate once at the end instead
    # of re-copying the ever-growing result for every tag of every file.
    file_frames = []
    for file_path in parl_list:
        # Parse the file with beautifulsoup; the handle is closed after parsing
        with open(file_path, "r") as file:
            content = bs(file, "xml")

        file_df = placeholder
        for tag in tags:
            file_df = extract_tag(tag, file_df, content)
        file_frames.append(file_df.iloc[1:])

    if file_frames:
        df = pd.concat(file_frames, ignore_index=True)
    else:
        df = placeholder.iloc[:0].reset_index(drop=True)

    print("Statistics before droping duplicates:\n\n\n")

    _print_tag_overview(df)

    print("Most common notes:\n")

    print(df.content.value_counts()[:20].to_markdown())

    # Remove duplicated rows (exact duplicates - all values in all columns match)
    df = df.drop_duplicates()

    print("Statistics after deduplication:\n")

    # Add information on length
    df["length"] = df["content"].str.split().str.len()

    print("Number of words in the notes: {}\n".format(df["length"].sum()))

    _print_tag_overview(df)

    return df

In [15]:
# Build the notes frame from the session files listed in parl_list (statistics are printed along the way)
df = create_note_df(parl_list)

Statistics before droping duplicates:



|        | tag    | type    | content       | xml:lang   |
|:-------|:-------|:--------|:--------------|:-----------|
| count  | 789776 | 789776  | 789776        | 789776     |
| unique | 2      | 4       | 130717        | 1          |
| top    | note   | speaker | De voorzitter |            |
| freq   | 783676 | 530090  | 139293        | 789776     |


|    | tag   | type    | content                   | xml:lang   |
|---:|:------|:--------|:--------------------------|:-----------|
|  0 | note  | speaker | De voorzitter             |            |
|  1 | note  | speaker | Mevrouw Ouwehand (PvdD)   |            |
|  2 | note  | comment | Motie                     |            |
|  3 | note  | comment | De Kamer,                 |            |
|  4 | note  | comment | gehoord de beraadslaging, |            |


Statistics for tags:

|      |    tag |
|:-----|-------:|
| note | 783676 |
| head |   6100 |


|                     |   type |
|:--------

## Translate

In [15]:
# Get notified once the code ends
# The key file is closed right after reading; surrounding whitespace (e.g. a
# trailing newline) is not part of the webhook URL.
with open("/home/tajak/Parlamint-translation/discord_key.txt", "r") as key_file:
    webhook_url = key_file.read().strip()


@discord_sender(webhook_url=webhook_url)
def translate(opus_lang_code, df, final_path):
    """
    This function translates the text from the dataframe, created with the create_note_df() function
    with OPUS-MT models using EasyNMT. It returns a dataframe with the translation.

    Args:
    - opus_lang_code: the lang code to be used in the OPUS-MT model - use the one that performed the best in the comparison (see function choose_model())
    - df: dataframe with a "content" column holding the texts to translate; a "translation" column is added to this same object
    - final_path: path of the tab-separated file the dataframe is saved to

    Returns:
    - the dataframe passed in, extended with the "translation" column
    """
    from easynmt import EasyNMT
    import time

    # Define the model
    model = EasyNMT('opus-mt')

    # Create a list of sentences from the df
    sentence_list = df.content.to_list()

    # Not used by the code below; apparently the candidate OPUS-MT source codes
    # per corpus that the commented-out loop further down iterated over.
    lang_models_dict = {
        "BG": ["bg"], "HR": ["zls", "sla"], "CZ": ["cs", "sla", "zlw"], "DK": ["da"],
        "NL": ["nl", "gem", "gmw"], "FR": ["fr", "itc", "roa"], "HU": ["hu", "fiu", "urj"],
        "IS": ["is", "gmq", "gem"], "IT": ["it", "roa", "itc"], "LV": ["lv", "bat"],
        "LT": ["bat"], "PL": ["pl", "sla", "zlw"], "SI": ["sla"], "ES": ["es", "roa", "itc"],
        "TR": ["tr", "trk"], "AT": ["de", "gem", "gmw"], "ES-PV": ["eu", "mul"],
        "BA": ["sla", "zls"], "ES-CT": ["ca", "roa", "itc"], "EE": ["et", "urj", "fiu"],
        "FI": ["fi", "urj", "fiu"], "ES-GA": ["gl", "roa", "itc"], "GR": ["el", "grk"],
        "NO": ["gem", "gmq"], "PT": ["pt", "roa", "itc"], "RO": ["roa", "itc"],
        "RS": ["zls", "sla"], "SE": ["sv", "gmq", "gem"], "UA": ["uk", "sla", "zle"],
    }

    print("Translation started.")

    start_time = time.time()

    #Translate the list of sentences - you need to provide the source language as it is in the name of the model - the opus_lang_code
    #for opus_lang_code in lang_models_dict[lang_code]:
    if sentence_list:
        translation_list = model.translate(sentence_list, source_lang=str(opus_lang_code), target_lang='en')
    else:
        # Nothing to translate: skip the model call for an empty frame
        translation_list = []

    translation_time = round((time.time() - start_time)/60,2)

    # An empty frame must not turn the timing report into a ZeroDivisionError
    minutes_per_sentence = translation_time/len(sentence_list) if sentence_list else 0.0

    print("Translation completed. It took {} minutes for {} instances - {} minutes per one sentence.".format(translation_time, len(sentence_list), minutes_per_sentence))

    # Add the translations to the df
    df["translation"] = translation_list

    # Display the df
    print(df[:3].to_markdown())

    print("\n\n\n")

    # Save the df
    df.to_csv("{}".format(final_path), sep="\t")

    print("The file is saved as {}".format(final_path))

    return df

In [16]:
# Translate the "content" column to English and save the result to final_path as a tab-separated file
df = translate(opus_lang_code, df, final_path)

  from .autonotebook import tqdm as notebook_tqdm
2023-03-09 15:41:30.220550: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-09 15:41:31.190218: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-09 15:41:31.190310: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


Translation started.




Translation completed. It took 3.56 minutes for 10236 instances - 0.0003477921062915201 minutes per one sentence.
|    | tag   | type    | content                       | xml:lang   |   length | translation                        |
|---:|:------|:--------|:------------------------------|:-----------|---------:|:-----------------------------------|
|  0 | note  | time    | Seja se je začela ob 12. uri. |            |        7 | The meeting started at 12 o'clock. |
|  1 | note  | speaker | PREDSEDNIK JANKO VEBER:       |            |        3 | President JANKO WEBER:             |
|  2 | note  |         | Za je glasovalo 56.           |            |        4 | 56 votes in favour.                |




The file is saved as /home/tajak/Parlamint-translation/Note-translation/Final-data-CSV/ParlaMint-SI.notes.translated.csv


In [None]:
# Preview the first rows of the current df
df.head()

# Inspect the results

In [6]:
# Load a saved tab-separated translation table; this replaces the df held in memory.
# NOTE(review): the file name is hard-coded to the IS corpus and relative to the
# working directory - it does not follow lang_code / final_path.
df = pd.read_csv("Final-data-CSV/ParlaMint-IS.notes.translated.csv", sep="\t")

# Show the first two rows
df.head(2)

Unnamed: 0,tag,type,content,xml:lang,length,translation,corpus
0,note,signing,"Þorsteinn V. Einarsson, 10. þm. Reykv. n., og ...",is,18,"Stone of V. Einarsson, 1 0th. Smoke. No., and ...",IS
1,kinesic,ringing,Forseti hringir.,is,2,The president calls.,IS


In [8]:
# Show the first 20 rows after sorting by tag and then by type (df itself is not changed)
df.sort_values(by=["tag", "type"])[:20]

Unnamed: 0,tag,type,content,xml:lang,length,translation,corpus
491,incident,editorial,Leiðr. ráðherra: 1%,is,3,I'm sorry. Minister: 1%,IS
492,incident,editorial,"Leiðr. ráðherra: 0,6%",is,3,I'm sorry. Minister: 0.6%,IS
864,incident,editorial,leiðrétting viðmiða vegna greiðslna ríkissjóðs,is,5,correction of state funds payment criteria,IS
532,incident,incident,Ljósin kvikna aftur.,is,3,The lights are on again.,IS
16,incident,,Þingmenn risu úr sætum.,is,4,Senators rose from seats.,IS
32,incident,,"Þingmenn risu úr sætum og forsætisráðherra, Ka...",is,24,The senators rose up from their seats and prim...,IS
190,incident,,"Þingmenn risu úr sætum og forsætisráðherra, Bj...",is,24,The senators rose up from their seats and prim...,IS
597,incident,,"Þingmenn risu úr sætum og forsætisráðherra, Si...",is,25,The senators rose from their seats and prime m...,IS
953,incident,,Þingmenn rísa úr sætum.,is,4,Senators rise from their seats.,IS
1753,incident,,Þingmenn berja í borð.,is,4,The senators are hitting the table.,IS


In [25]:
# Show the rows whose xml:lang value is "en"
df[df["xml:lang"] == "en"]

Unnamed: 0,tag,type,content,xml:lang,length,translation


In [26]:
# Preview the first 50 rows of the current df
df.head(50)

Unnamed: 0,tag,type,content,xml:lang,length,translation
0,note,time,\n4:31:17\n,,1,\n4:31:17\n
1,note,time,Beginn der Sitzung:14:31Uhr,,3,Start of meeting: 14 a.m.:31 p.m.
2,note,chairpersons,Zweiter Präsident Karlheinz Kopf,,4,Second President Karlheinz Kopf
3,note,speaker,Präsident Karlheinz Kopf,,3,President Karlheinz Kopp
4,note,time,\n14:31:24\n,,1,\n14:31:24\n
7,note,time,\n14:32:37\n,,1,\n14:32:37\n
8,note,time,Schluss der Sitzung:14:33Uhr,,3,End of meeting: 14 a.m.: 33 p.m.
10,gap,editorial,Titelseite und Inhaltsangabe entfernt,de,4,Page of the title and content removed
12,gap,editorial,Zitierte Druckfassung entfernt,de,3,Quoted print version removed
13,note,time,\n10:01:17\n,,1,\n10:01:17\n


In [7]:
# Show the rows whose xml:lang value is "en" as a dict so the cell values are printed in full
df[df["xml:lang"] == "en"].to_dict()

{'tag': {343: 'gap'},
 'type': {343: 'editorial'},
 'content': {343: 'The frontmatter of the document has been removed'},
 'xml:lang': {343: 'en'},
 'length': {343: 8},
 'translation': {343: 'The frontmatter of the document has been removable'}}

In [22]:
# Keep only the rows whose xml:lang value is not "en" (rows without a language value are kept).
# NOTE: this reassigns df, so re-running earlier inspection cells will show the filtered frame.

df = df[df["xml:lang"] != "en"]
# Display (rows, columns) of the filtered frame
df.shape

(287509, 6)

In [27]:
# Print the sum of the "length" column
print("Number of words in the notes: {}\n".format(df["length"].sum()))

Number of words in the notes: 2160675



In [28]:
# Print summary statistics for all columns as a markdown table
print(df.describe(include="all").to_markdown())

|        | tag    | type                      | content                     | xml:lang   |       length | translation                         |
|:-------|:-------|:--------------------------|:----------------------------|:-----------|-------------:|:------------------------------------|
| count  | 287509 | 287509                    | 287509                      | 4          | 287509       | 287509                              |
| unique | 4      | 17                        | 287495                      | 1          |    nan       | 281408                              |
| top    | note   | unauthorized_interruption | Präsident Dr. Heinz Fischer | de         |    nan       | (The President cut off the speaker) |
| freq   | 267589 | 213265                    | 2                           | 4          |    nan       | 293                                 |
| mean   | nan    | nan                       | nan                         | nan        |      7.51516 | nan                           

In [29]:

# Print the first rows as a markdown table
print(df.head().to_markdown())

|    | tag   | type         | content                          |   xml:lang |   length | translation                       |
|---:|:------|:-------------|:---------------------------------|-----------:|---------:|:----------------------------------|
|  0 | note  | time         | 4:31:17                          |        nan |        1 | 4:31:17                           |
|  1 | note  | time         | Beginn der Sitzung:14:31Uhr      |        nan |        3 | Start of meeting: 14 a.m.:31 p.m. |
|  2 | note  | chairpersons | Zweiter Präsident Karlheinz Kopf |        nan |        4 | Second President Karlheinz Kopf   |
|  3 | note  | speaker      | Präsident Karlheinz Kopf         |        nan |        3 | President Karlheinz Kopp          |
|  4 | note  | time         | 14:31:24                         |        nan |        1 | 14:31:24                          |


In [30]:
# Print how many rows each tag has, as a markdown table
print("Statistics for tags:\n")

print(df.tag.value_counts().to_markdown())

Statistics for tags:

|         |    tag |
|:--------|-------:|
| note    | 267589 |
| vocal   |  10419 |
| kinesic |   9497 |
| gap     |      4 |


In [31]:
# Print the number of rows per (tag, type) combination as a markdown table
print(df.groupby("tag").type.value_counts().to_markdown())

|                                       |   type |
|:--------------------------------------|-------:|
| ('gap', 'editorial')                  |      4 |
| ('kinesic', 'applause')               |   9357 |
| ('kinesic', 'signal')                 |    140 |
| ('note', 'unauthorized_interruption') | 213265 |
| ('note', 'time')                      |  40351 |
| ('note', 'comment')                   |   6373 |
| ('note', 'speaker_action')            |   3020 |
| ('note', 'speaker')                   |   2352 |
| ('note', 'procedural')                |    925 |
| ('note', 'side_talk')                 |    568 |
| ('note', 'referencing_document')      |    353 |
| ('note', 'objection')                 |    193 |
| ('note', 'chairpersons')              |    111 |
| ('note', 'inquietude')                |     72 |
| ('note', 'p')                         |      6 |
| ('vocal', 'interruption')             |   8109 |
| ('vocal', 'laughter')                 |   2310 |


In [23]:
# Save the current df as a tab-separated file.
# NOTE(review): the target is a hard-coded absolute path for the AT corpus rather than a
# value derived from lang_code; adjust it by hand when working on another corpus.
df.to_csv("/home/tajak/Parlamint-translation/Note-translation/Final-data-CSV/ParlaMint-AT.notes.translated.csv", sep="\t")

In [17]:
# Count the rows per type within each tag
df.groupby("tag").type.value_counts()

tag   type   
note  comment    122829
      time         4026
      speaker       762
Name: type, dtype: int64

In [18]:
# Show the first ten rows with tag "head", with the column width limit lifted for this display only.
# NOTE(review): numpy is imported here but not used in this cell.
import numpy as np
with pd.option_context('display.max_colwidth', None):
	display(df[df["tag"] == "head"].head(10))

Unnamed: 0,tag,type,content,xml:lang,length,translation
142,head,,Landbouw- en Visserijraad d.d. 6 en 7 november 2017,,9,Council of Agriculture and Fisheries 6 and 7 November 2017
303,head,,Begroting Koninkrijksrelaties 2018,,3,Budget Kingdom relations 2018
310,head,,Regeling van werkzaamheden,,3,Order of business
313,head,,Regeling van werkzaamheden (stemmingen),,4,Order of business (votes)
316,head,,Herdenking van de heer H. van Rossum,,7,Commemoration of Mr H. van Rossum
334,head,,Verwerving F-35,,2,Acquisition F-35
529,head,,"Sterk beroepsonderwijs, praktijkleren en passende ondersteuning mbo-studenten",,7,"Strong vocational education, practical learning and appropriate support for MBO students"
533,head,,Vragenuur: Vragen Diertens,,3,Question Time: Questions about animals
810,head,,Belastingontwijking,,1,Tax avoidance
937,head,,Opwarming in de steden,,4,Global warming in the cities


In [7]:
# Show the first 60 rows whose type is missing, with the column width limit lifted for this display only.
# NOTE(review): numpy is imported here but not used in this cell.
import numpy as np
with pd.option_context('display.max_colwidth', None):
	display(df[df.type.isnull()].head(60))

Unnamed: 0,tag,type,content,xml:lang,length,translation
3,note,,Mikrofon otomatik cihaz tarafından kapatıldı,tr,5,Microphone turned off by automatic device
10,note,,İstanbul,tr,1,İstanbul
11,note,,Gaziantep,tr,1,Gaziantep
12,note,,Aydın,tr,1,Aydın
13,note,,Erzincan,tr,1,Erzincan
14,note,,Balıkesir,tr,1,Balıkesir
15,note,,Sivas,tr,1,Sivas
16,note,,Adana,tr,1,Adana
17,note,,Hatay,tr,1,Hatay
18,note,,İzmir,tr,1,İzmir
