## Import the data

In [1]:
import pandas as pd
import regex as re
from bs4 import BeautifulSoup as bs
import os
import tqdm as tqdm

### Option 1: Beautiful Soup (for TEI) - worse option if you need sentences

I helped myself with the tutorial at https://komax.github.io/blog/text/python/xml/parsing_tei_xml_python/

In [None]:
# Get a lis of TEI files

path = "ParlaMint-SI.TEI"
dir_list = os.listdir(path)

# Keep only files with parliamentary sessions:

parl_list = []

for i in dir_list:
	if "ParlaMint-SI_" in i:
		parl_list.append(i)

parl_list


The number of all files is 414 sessions.

In [None]:
with open(f"ParlaMint-SI.TEI.ana/ParlaMint-SI_2014-08-01-SDZ7-Redna-01.ana.xml", "r") as file:
	# Read each line in the file, readlines() returns a list of lines
	content = bs(file, 'lxml')

In [None]:
# Search for all segments
seg_list = content.find("seg")

# Find sentences and words using the findChildren method
sentences = seg_list.findChildren("s", recursive=False)
sentences

for i in sentences:
	print(i.getText())

In [None]:
# Read each XML file using the lxml parser.
# This parser transforms the XML document into a traversable tree, a beautiful soup stored in variable content.

# Create an empty df
df = pd.DataFrame({"session": [""], "xlmid": [""], "text": [""]})

for i in parl_list:
    with open(f"ParlaMint-SI.TEI.ana/{i}", "r") as file:
        # Read each line in the file, readlines() returns a list of lines
        content = bs(file, 'lxml')

        # Search for all segments
        seg_list = content.find_all("seg")

        # Get the session ID
        current_id = content.tei["xml:id"]

        current_df = pd.DataFrame({"xlmid": [x["xml:id"] for x in seg_list], "text": [x.getText() for x in seg_list]})
        current_df["session"] = current_id

        # Merge the df with the previous df
        df = pd.concat([df,current_df])

In [None]:
df.describe(include="all")

### Option 2: From CONLLU with regex

In [2]:
# Get a list of TEI files

path = "ParlaMint-SI.conllu"
dir_list = os.listdir(path)

# Keep only files with parliamentary sessions:

parl_list = []

# Filter out only relevant files
for i in dir_list:
	if "ParlaMint-SI_" in i:
		if ".conllu" in i:
			parl_list.append(i)

len(parl_list)


414

In [3]:
# Create an empty df
df = pd.DataFrame({"file": [""], "sentence_id": [""], "text": [""]})

In [18]:
# Create regexes
sen_re = re.compile("# sent_id = (.*?)\n")
text_re = re.compile("# text = (.*?)\n")

In [5]:
# Find all relevant information from the conllu files and save everything in a dataframe

for i in parl_list:

	file = open(f"ParlaMint-SI.conllu/{i}", "r").read()

	# Find sentence ids and texts
	sen_ids = sen_re.findall(file)
	texts = text_re.findall(file)
	
	new_df = pd.DataFrame({"sentence_id": sen_ids, "text": texts})
	new_df["file"] = i

	# Merge df to the previous df
	df = pd.concat([df, new_df])

df.head()

Unnamed: 0,file,sentence_id,text
0,,,
0,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg1.1,Nadaljujemo s prekinjeno 17. sejo zbora.
1,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,"Prehajamo na 2. TOČKO DNEVNEGA REDA, TO JE NA ..."
2,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.1,Predlog zakona je v obravnavo zboru predložil ...
3,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.2,V zvezi s tem predlogom zakona Odbor za zdravs...


In [6]:
# Remove the first row
df = df.drop([0], axis="index")

# Reset index
df = df.reset_index(drop=True)

# Show the results
df.describe(include="all")

Unnamed: 0,file,sentence_id,text
count,1078584,1078584,1078584
unique,414,1078584,918195
top,ParlaMint-SI_2016-11-23-SDZ7-Redna-24.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,Hvala lepa.
freq,6185,1,27653


The Parlamint-SI consists of 414 files and 1,078,584 sentences.

In [7]:
# Add information on length
df["length"] = df["text"].str.split().str.len()

df.head(2)


Unnamed: 0,file,sentence_id,text,length
0,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,"Prehajamo na 2. TOČKO DNEVNEGA REDA, TO JE NA ...",28
1,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.1,Predlog zakona je v obravnavo zboru predložil ...,9


In [8]:
# Add information on the segment id (which is also used in TEI files)
seg_re = re.compile("(.*?)\.\d+")

df["seg_id"] = [seg_re.findall(x)[0] for x in df.sentence_id]

df.head(2)


Unnamed: 0,file,sentence_id,text,length,seg_id
0,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,"Prehajamo na 2. TOČKO DNEVNEGA REDA, TO JE NA ...",28,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2
1,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.1,Predlog zakona je v obravnavo zboru predložil ...,9,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3


In [9]:
df.tail()

Unnamed: 0,file,sentence_id,text,length,seg_id
1078579,ParlaMint-SI_2017-12-14-SDZ7-Redna-36.conllu,ParlaMint-SI_2017-12-14-SDZ7-Redna-36.seg835.1,Glasujemo.,1,ParlaMint-SI_2017-12-14-SDZ7-Redna-36.seg835
1078580,ParlaMint-SI_2017-12-14-SDZ7-Redna-36.conllu,ParlaMint-SI_2017-12-14-SDZ7-Redna-36.seg835.2,"Navzočih je 59 poslank in poslancev, za je gla...",13,ParlaMint-SI_2017-12-14-SDZ7-Redna-36.seg835
1078581,ParlaMint-SI_2017-12-14-SDZ7-Redna-36.conllu,ParlaMint-SI_2017-12-14-SDZ7-Redna-36.seg837.1,"Ugotavljam, da je zakon sprejet.",5,ParlaMint-SI_2017-12-14-SDZ7-Redna-36.seg837
1078582,ParlaMint-SI_2017-12-14-SDZ7-Redna-36.conllu,ParlaMint-SI_2017-12-14-SDZ7-Redna-36.seg837.2,S tem zaključujem to točko dnevnega reda.,7,ParlaMint-SI_2017-12-14-SDZ7-Redna-36.seg837
1078583,ParlaMint-SI_2017-12-14-SDZ7-Redna-36.conllu,ParlaMint-SI_2017-12-14-SDZ7-Redna-36.seg838.1,"Prekinjam 36. sejo Državnega zbora, ki jo bomo...",13,ParlaMint-SI_2017-12-14-SDZ7-Redna-36.seg838


In [10]:
# Save the dataframe
df.to_csv("Parlamint-SI-sentences.csv", sep="\t")

## Machine translate a sample

In [2]:
# Open the file, created in the previous step
df = pd.read_csv("Parlamint-SI-sentences.csv", sep="\t", index_col=0)

df.head()


  mask |= (ar1 == a)


Unnamed: 0,file,sentence_id,text,length,seg_id
0,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,"Prehajamo na 2. TOČKO DNEVNEGA REDA, TO JE NA ...",28,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2
1,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.1,Predlog zakona je v obravnavo zboru predložil ...,9,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3
2,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.2,V zvezi s tem predlogom zakona Odbor za zdravs...,22,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3
3,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1,Besedo dajem predsednici odbora Anji Bah Žiber...,12,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4
4,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg5.1,"Hvala lepa, predsedujoči.",3,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg5


In [10]:
df.describe(include="all")

Unnamed: 0,file,sentence_id,text,length,seg_id
count,1078584,1078584,1078584,1078584.0,1078584
unique,414,1078584,918195,,280629
top,ParlaMint-SI_2016-11-23-SDZ7-Redna-24.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,Hvala lepa.,,ParlaMint-SI_2014-11-20-SDZ7-Redna-02.seg69
freq,6185,1,27653,,147
mean,,,,18.75484,
std,,,,14.89782,
min,,,,1.0,
25%,,,,7.0,
50%,,,,16.0,
75%,,,,26.0,


In [19]:
sample_files = list(df["file"].unique())[:3]
sample_files

['ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu',
 'ParlaMint-SI_2015-09-22-SDZ7-Redna-11.conllu',
 'ParlaMint-SI_2018-03-27-SDZ7-Redna-39.conllu']

In [20]:
# Take the first 3 files for a sample

df_sample = df[df.file.isin(sample_files)]
df_sample.describe(include="all")

Unnamed: 0,file,sentence_id,text,length,seg_id
count,8217,8217,8217,8217.0,8217
unique,3,8217,7386,,2055
top,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,Hvala lepa.,,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg408
freq,3486,1,211,,95
mean,,,,18.55799,
std,,,,14.180222,
min,,,,1.0,
25%,,,,8.0,
50%,,,,16.0,
75%,,,,26.0,


The sample file has 8217 sentences.

### Find proper names in the sample

In [84]:
# Use the TEI files instead of conllu
sample_files_TEI = [x.replace("conllu", "ana.xml") for x in sample_files]
sample_files_TEI

['ParlaMint-SI_2020-05-27-SDZ8-Redna-17.ana.xml',
 'ParlaMint-SI_2015-09-22-SDZ7-Redna-11.ana.xml',
 'ParlaMint-SI_2018-03-27-SDZ7-Redna-39.ana.xml']

In [90]:
# Create a regex to separate word id into segment id and word index
word_seg_re = re.compile("(.*)\.(\d+)")

In [102]:
# Find names from the TEI file with beautiful soup
names_list = []

for i in sample_files_TEI:

	# Open the file
	file = open(f"ParlaMint-SI.TEI.ana/{i}", "r")
	content = bs(file, 'lxml')

	# Search for all sentences
	seg_list = content.find_all("s")

	# Create a list of dictionaries for each sentence

	for i in content.find_all("s"):
		result = i.findChildren("name", type = "PER", recursive=False)
		if len(result) > 0:
			result_list = []
			for i in result:
				words = i.findChildren("w", recursive = False)
				for word in words:
					current_name = word.getText()
					current_lemma = word["lemma"]
					current_word_id = word["xml:id"]
					current_seg_id = word_seg_re.findall(current_word_id)
					word_list = [current_word_id, current_name, current_lemma]
					word_list.extend(current_seg_id)
					result_list.append(word_list)
			names_list.append(result_list)

len(names_list)

816

In [101]:
names_list[:3]

[['ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1.5',
  'Anji',
  'Anja',
  ('ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1', '5')],
 ['ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1.6',
  'Bah',
  'Bah',
  ('ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1', '6')],
 ['ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1.7',
  'Žibert',
  'Žibert',
  ('ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1', '7')]]

In [104]:
# Create a dictionary of sentences out of the names list
sent_dict = {}

for sentence in names_list:
	info_dic = {}
	for word in sentence:
		current_sent_id = word[3][0]
		# I'll substract one from the word index, because indexing in the TEI file starts with 1, not 0
		current_word_index = int(word[3][1]) - 1
		info_dic[current_word_index] = (word[1], word[2])
	sent_dict[current_sent_id] = info_dic

sent_dict

{'ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1': {4: ('Anji', 'Anja'),
  5: ('Bah', 'Bah'),
  6: ('Žibert', 'Žibert')},
 'ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg8.1': {6: ('Marjan', 'Marjan'),
  7: ('Maučec', 'Maučec')},
 'ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg9.1': {7: ('Tomaž', 'Tomaž'),
  8: ('Gantar', 'Gantar')},
 'ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg11.1': {2: ('Marjanu', 'Marjan'),
  3: ('Maučecu', 'Maučec')},
 'ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg13.1': {29: ('Franca', 'Franc'),
  30: ('Kanglerja', 'Kangler')},
 'ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg16.1': {13: ('Tini', 'Tina'),
  14: ('Bregant', 'Bregant')},
 'ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg21.2': {9: ('Monika', 'Monika'),
  10: ('Gregorčič', 'Gregorčič')},
 'ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg23.2': {21: ('Krisa', 'Kris')},
 'ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg28.1': {7: ('Željko', 'Željko'),
  8: ('Cigler', 'Cigler')},
 'ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg33.1': {10: ('Iva', 'Iva

In [107]:
# Create a dataframe out of this information
names_df = pd.DataFrame({"sentence_id":list(sent_dict.keys()), "proper_nouns": list(sent_dict.values())})

names_df.head()

Unnamed: 0,sentence_id,proper_nouns
0,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1,"{4: ('Anji', 'Anja'), 5: ('Bah', 'Bah'), 6: ('..."
1,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg8.1,"{6: ('Marjan', 'Marjan'), 7: ('Maučec', 'Mauče..."
2,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg9.1,"{7: ('Tomaž', 'Tomaž'), 8: ('Gantar', 'Gantar')}"
3,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg11.1,"{2: ('Marjanu', 'Marjan'), 3: ('Maučecu', 'Mau..."
4,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg13.1,"{29: ('Franca', 'Franc'), 30: ('Kanglerja', 'K..."


In [108]:
# Add this information to the main table - sample df

df_sample = pd.merge(df_sample, names_df, on="sentence_id", how="outer")

df_sample.head()

Unnamed: 0,file,sentence_id,text,length,seg_id,no_punctuation,list_of_words,proper_nouns
0,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,"Prehajamo na 2. TOČKO DNEVNEGA REDA, TO JE NA ...",28,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2,Prehajamo na 2 TOČKO DNEVNEGA REDA TO JE NA DR...,"[Prehajamo, na, 2, TOČKO, DNEVNEGA, REDA, TO, ...",
1,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.1,Predlog zakona je v obravnavo zboru predložil ...,9,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3,Predlog zakona je v obravnavo zboru predložil ...,"[Predlog, zakona, je, v, obravnavo, zboru, pre...",
2,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.2,V zvezi s tem predlogom zakona Odbor za zdravs...,22,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3,V zvezi s tem predlogom zakona Odbor za zdravs...,"[V, zvezi, s, tem, predlogom, zakona, Odbor, z...",
3,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1,Besedo dajem predsednici odbora Anji Bah Žiber...,12,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4,Besedo dajem predsednici odbora Anji Bah Žiber...,"[Besedo, dajem, predsednici, odbora, Anji, Bah...","{4: ('Anji', 'Anja'), 5: ('Bah', 'Bah'), 6: ('..."
4,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg5.1,"Hvala lepa, predsedujoči.",3,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg5,Hvala lepa predsedujoči,"[Hvala, lepa, predsedujoči]",


In [110]:
df_sample.describe(include="all")

Unnamed: 0,file,sentence_id,text,length,seg_id,no_punctuation,list_of_words,proper_nouns
count,8217,8217,8217,8217.0,8217,8217,8217,816
unique,3,8217,7386,,2055,7373,7373,721
top,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,Hvala lepa.,,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg408,Hvala lepa,"[Hvala, lepa]","{1: ('Levici', 'Levica')}"
freq,3486,1,211,,95,212,212,7
mean,,,,18.55799,,,,
std,,,,14.180222,,,,
min,,,,1.0,,,,
25%,,,,8.0,,,,
50%,,,,16.0,,,,
75%,,,,26.0,,,,


In [113]:
# Save sample file
df_sample.to_csv("Parlamint-SI-sample.csv")

### Machine translate

In [None]:
# Install easynmt
#!pip install -q -U easynmt

In [114]:
from easynmt import EasyNMT

# Define the model - The system will automatically detect the suitable Opus-MT model and load it.
model = EasyNMT('opus-mt')

In [115]:
# Create a list of sentences from the df
sentence_list = df_sample.text.to_list()
len(sentence_list)

8217

In [116]:
#Translate the list of sentences - you need to provide the source language as it is in the name of the model (zls - South Slavic)
translation_list = model.translate(sentence_list, source_lang = 'zls', target_lang='en')

# Add the translations to the df
df_sample["translation"] = translation_list

df_sample.head()

Unnamed: 0,file,sentence_id,text,length,seg_id,no_punctuation,list_of_words,proper_nouns,translation
0,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,"Prehajamo na 2. TOČKO DNEVNEGA REDA, TO JE NA ...",28,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2,Prehajamo na 2 TOČKO DNEVNEGA REDA TO JE NA DR...,"[Prehajamo, na, 2, TOČKO, DNEVNEGA, REDA, TO, ...",,"Moving to 2. ACT OF DAYS, THIS IS A SECOND DET..."
1,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.1,Predlog zakona je v obravnavo zboru predložil ...,9,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3,Predlog zakona je v obravnavo zboru predložil ...,"[Predlog, zakona, je, v, obravnavo, zboru, pre...",,The proposal for a law was submitted to the As...
2,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.2,V zvezi s tem predlogom zakona Odbor za zdravs...,22,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3,V zvezi s tem predlogom zakona Odbor za zdravs...,"[V, zvezi, s, tem, predlogom, zakona, Odbor, z...",,"As regards this proposal for a law, the Commit..."
3,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1,Besedo dajem predsednici odbora Anji Bah Žiber...,12,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4,Besedo dajem predsednici odbora Anji Bah Žiber...,"[Besedo, dajem, predsednici, odbora, Anji, Bah...","{4: ('Anji', 'Anja'), 5: ('Bah', 'Bah'), 6: ('...",I hereby give the President of the Anji Bah Ži...
4,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg5.1,"Hvala lepa, predsedujoči.",3,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg5,Hvala lepa predsedujoči,"[Hvala, lepa, predsedujoči]",,"Thank you very much, President."


230 segments (11,158 words) were translated in 19 s -> 10,000 words are translated in 17s. We would need 10 hours to translate the entire ParlaMint-SL.

In [117]:
# Save the df
df_sample.to_csv("Parlamint-SI-sample-translated.csv", sep = "\t")