## Import the data

In [72]:
import pandas as pd
import regex as re
from bs4 import BeautifulSoup as bs
import os
import time

### Option 1: Beautiful Soup (for TEI)

I helped myself with the tutorial at https://komax.github.io/blog/text/python/xml/parsing_tei_xml_python/

To get correct text (with punctuation marks separated from the words), we need to extract them from the annotated XLM TEI file.

In [7]:
# Get a list of TEI files

path = "ParlaMint-SI.TEI.ana"
dir_list = os.listdir(path)

# Keep only files with parliamentary sessions:

parl_list = []

for i in dir_list:
	if "ParlaMint-SI_" in i:
		parl_list.append(i)

len(parl_list)


414

The number of all files is 414 sessions.

In [49]:
# Create an empty df
df = pd.DataFrame({"file": [""], "sentence_id": [""], "text": [""], "proper_nouns": [""]})

In [30]:
# Create a regex to separate word id into segment id and word index
word_seg_re = re.compile("(.*)\.(\d+)")

In [50]:
for doc in parl_list:

	# Open the file
	file = open(f"ParlaMint-SI.TEI.ana/{doc}", "r")
	content = bs(file, 'lxml')

	# Create a list of sentences
	sentence_list = []
	sen_id_list = []
	proper_nouns_list = []

	# Search for all segments
	seg_list = content.find_all("seg")

	for segment in seg_list:
		sentences = segment.find_all("s")
		for sentence in sentences:
			# Get text, replace \n with a space and remove spaces from the beginning and end of string
			sentence_list.append(sentence.getText().replace("\n", " ").strip(" "))
			
			# Add information on sentence id
			sen_id = sentence["xml:id"]
			sen_id_list.append(sen_id)

			# Add information on proper nouns
			current_proper_nouns_list = []

			result = sentence.find_all("name", type = "PER", recursive=False)

			if len(result) > 0:
				word_dict = {}
				for i in result:
					words = i.find_all("w", recursive = False)
					for word in words:
						current_name = word.getText()
						current_lemma = word["lemma"]
						current_word_id = word["xml:id"]
						current_word_index = word_seg_re.findall(current_word_id)[0][1]
						# I'll substract one from the word index, because indexing in the TEI file starts with 1, not 0
						current_word_index = int(current_word_index) - 1
						word_dict[current_word_index] = [current_name, current_lemma]
				
				current_proper_nouns_list.append(word_dict)

			proper_nouns_list.append(current_proper_nouns_list)

	new_df = pd.DataFrame({"sentence_id": sen_id_list, "text": sentence_list, "proper_nouns": proper_nouns_list})
	new_df["file"] = doc

	# Merge df to the previous df
	df = pd.concat([df, new_df])

In [51]:
df.head()

Unnamed: 0,file,sentence_id,text,proper_nouns
0,,,,
0,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.ana.xml,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.seg1.1,"Spoštovane kolegice poslanke , spoštovani kole...",[]
1,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.ana.xml,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.seg2.1,Začenjam nadaljevanje 12. seje Državnega zbor...,[]
2,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.ana.xml,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.seg3.1,"Obveščen sem , da se današnje seje ne morejo u...","[{15: ['Eva', 'Eva'], 16: ['Irgl', 'Irgl'], 19..."
3,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.ana.xml,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.seg3.2,Vsem prisotnim še enkrat lep pozdrav !,[]


In [52]:
df.describe(include="all")

Unnamed: 0,file,sentence_id,text,proper_nouns
count,1078999,1078999.0,1078999,1078999
unique,415,1078999.0,918207,46914
top,ParlaMint-SI_2016-11-23-SDZ7-Redna-24.ana.xml,,Hvala lepa .,[]
freq,6186,1.0,27651,978131


### Option 2: From CONLLU with regex

In [2]:
# Get a list of TEI files

path = "ParlaMint-SI.conllu"
dir_list = os.listdir(path)

# Keep only files with parliamentary sessions:

parl_list = []

# Filter out only relevant files
for i in dir_list:
	if "ParlaMint-SI_" in i:
		if ".conllu" in i:
			parl_list.append(i)

len(parl_list)


414

In [3]:
# Create an empty df
df = pd.DataFrame({"file": [""], "sentence_id": [""], "text": [""]})

In [18]:
# Create regexes
sen_re = re.compile("# sent_id = (.*?)\n")
text_re = re.compile("# text = (.*?)\n")

In [5]:
# Find all relevant information from the conllu files and save everything in a dataframe

for i in parl_list:

	file = open(f"ParlaMint-SI.conllu/{i}", "r").read()

	# Find sentence ids and texts
	sen_ids = sen_re.findall(file)
	texts = text_re.findall(file)
	
	new_df = pd.DataFrame({"sentence_id": sen_ids, "text": texts})
	new_df["file"] = i

	# Merge df to the previous df
	df = pd.concat([df, new_df])

df.head()

Unnamed: 0,file,sentence_id,text
0,,,
0,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg1.1,Nadaljujemo s prekinjeno 17. sejo zbora.
1,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,"Prehajamo na 2. TOČKO DNEVNEGA REDA, TO JE NA ..."
2,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.1,Predlog zakona je v obravnavo zboru predložil ...
3,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.2,V zvezi s tem predlogom zakona Odbor za zdravs...


### Continue editing the dataframe (from option 1 or 2)

In [53]:
# Remove the first row
df = df.drop([0], axis="index")

# Reset index
df = df.reset_index(drop=True)

# Show the results
df.describe(include="all")

Unnamed: 0,file,sentence_id,text,proper_nouns
count,1078584,1078584,1078584,1078584
unique,414,1078584,918159,46912
top,ParlaMint-SI_2016-11-23-SDZ7-Redna-24.ana.xml,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.seg2.1,Hvala lepa .,[]
freq,6185,1,27651,977718


The Parlamint-SI consists of 414 files and 1,078,584 sentences.

In [54]:
# Add information on length
df["length"] = df["text"].str.split().str.len()

df.head(2)


Unnamed: 0,file,sentence_id,text,proper_nouns,length
0,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.ana.xml,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.seg2.1,Začenjam nadaljevanje 12. seje Državnega zbor...,[],7
1,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.ana.xml,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.seg3.1,"Obveščen sem , da se današnje seje ne morejo u...","[{15: ['Eva', 'Eva'], 16: ['Irgl', 'Irgl'], 19...",89


In [57]:
df.length.describe().round()

count    1078584.0
mean          22.0
std           17.0
min            1.0
25%            9.0
50%           18.0
75%           30.0
max          532.0
Name: length, dtype: float64

In [63]:
df.tail()

Unnamed: 0,file,sentence_id,text,proper_nouns,length
1078579,ParlaMint-SI_2018-02-13-SDZ7-Redna-38.ana.xml,ParlaMint-SI_2018-02-13-SDZ7-Redna-38.seg540.1,Na glasovanje dajem naslednji sklep : Državni...,"[{24: ['Dejana', 'Dejan'], 25: ['Židana', 'Žid...",41
1078580,ParlaMint-SI_2018-02-13-SDZ7-Redna-38.ana.xml,ParlaMint-SI_2018-02-13-SDZ7-Redna-38.seg541.1,Glasujemo .,[],2
1078581,ParlaMint-SI_2018-02-13-SDZ7-Redna-38.ana.xml,ParlaMint-SI_2018-02-13-SDZ7-Redna-38.seg541.2,"Navzočih je 65 poslank in poslancev , za je gl...",[],14
1078582,ParlaMint-SI_2018-02-13-SDZ7-Redna-38.ana.xml,ParlaMint-SI_2018-02-13-SDZ7-Redna-38.seg543.1,"Ugotavljam , da sklep ni sprejet .",[],7
1078583,ParlaMint-SI_2018-02-13-SDZ7-Redna-38.ana.xml,ParlaMint-SI_2018-02-13-SDZ7-Redna-38.seg544.1,S tem zaključujem 1. točko dnevnega reda in pr...,[],23


In [64]:
# Save the dataframe
df.to_csv("Parlamint-SI-sentences.csv", sep="\t")

## Machine translate a sample

In [2]:
# Open the file, created in the previous step
df = pd.read_csv("Parlamint-SI-sentences.csv", sep="\t", index_col=0)

df.head()


  mask |= (ar1 == a)


Unnamed: 0,file,sentence_id,text,length,seg_id
0,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,"Prehajamo na 2. TOČKO DNEVNEGA REDA, TO JE NA ...",28,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2
1,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.1,Predlog zakona je v obravnavo zboru predložil ...,9,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3
2,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3.2,V zvezi s tem predlogom zakona Odbor za zdravs...,22,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg3
3,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4.1,Besedo dajem predsednici odbora Anji Bah Žiber...,12,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg4
4,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg5.1,"Hvala lepa, predsedujoči.",3,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg5


In [10]:
df.describe(include="all")

Unnamed: 0,file,sentence_id,text,length,seg_id
count,1078584,1078584,1078584,1078584.0,1078584
unique,414,1078584,918195,,280629
top,ParlaMint-SI_2016-11-23-SDZ7-Redna-24.conllu,ParlaMint-SI_2020-05-27-SDZ8-Redna-17.seg2.1,Hvala lepa.,,ParlaMint-SI_2014-11-20-SDZ7-Redna-02.seg69
freq,6185,1,27653,,147
mean,,,,18.75484,
std,,,,14.89782,
min,,,,1.0,
25%,,,,7.0,
50%,,,,16.0,
75%,,,,26.0,


In [65]:
sample_files = list(df["file"].unique())[:3]
sample_files

['ParlaMint-SI_2019-10-23-SDZ8-Redna-12.ana.xml',
 'ParlaMint-SI_2019-12-19-SDZ8-Redna-14.ana.xml',
 'ParlaMint-SI_2019-03-27-SDZ8-Redna-06.ana.xml']

In [66]:
# Take the first 3 files for a sample

df_sample = df[df.file.isin(sample_files)]
df_sample.describe(include="all")

Unnamed: 0,file,sentence_id,text,proper_nouns,length
count,11750,11750,11750,11750,11750.0
unique,3,11750,10639,1052,
top,ParlaMint-SI_2019-12-19-SDZ8-Redna-14.ana.xml,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.seg2.1,Hvala lepa .,[],
freq,5958,1,312,10499,
mean,,,,,20.336936
std,,,,,15.324123
min,,,,,1.0
25%,,,,,9.0
50%,,,,,17.0
75%,,,,,28.0


The sample file has 11750 sentences, out of which 1251 (11%) have proper nouns.

In [69]:
# Save sample file
df_sample.to_csv("Parlamint-SI-sample.csv")

### Machine translate

In [None]:
# Install easynmt
#!pip install -q -U easynmt

In [70]:
from easynmt import EasyNMT

# Define the model - The system will automatically detect the suitable Opus-MT model and load it.
model = EasyNMT('opus-mt')

In [71]:
# Create a list of sentences from the df
sentence_list = df_sample.text.to_list()
len(sentence_list)

11750

In [73]:
#Translate the list of sentences - you need to provide the source language as it is in the name of the model (zls - South Slavic)
translation_list = model.translate(sentence_list, source_lang = 'zls', target_lang='en')

# Add the translations to the df
df_sample["translation"] = translation_list

df_sample.head()

1193.013848043


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample["translation"] = translation_list


Unnamed: 0,file,sentence_id,text,proper_nouns,length,translation
0,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.ana.xml,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.seg2.1,Začenjam nadaljevanje 12. seje Državnega zbor...,[],7,I am starting a continuation of the 12th meeti...
1,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.ana.xml,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.seg3.1,"Obveščen sem , da se današnje seje ne morejo u...","[{15: ['Eva', 'Eva'], 16: ['Irgl', 'Irgl'], 19...",89,I am informed that today's meeting cannot be a...
2,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.ana.xml,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.seg3.2,Vsem prisotnim še enkrat lep pozdrav !,[],7,I'm here to say hello again!
3,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.ana.xml,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.seg4.1,"Prehajamo na 2. , 3. , 4. IN 5. TOČKO DNEVNEGA...",[],70,"Moving to 2, 3, 4. AND 5. DETECTION OF THE DAY..."
4,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.ana.xml,ParlaMint-SI_2019-10-23-SDZ8-Redna-12.seg5.1,Predloge zakonov je v obravnavo Državnemu zbo...,[],10,Proposals for laws were submitted by the Gover...


The sample which consists of 11750 sentences and 23,516,095 words (3 files) was translated in 6 minutes.

In [78]:
# Save the df
df_sample.to_csv("Parlamint-SI-sample-translated.csv", sep = "\t")