# The genre prediction pipeline

## Definitions of functions

download_unzip

In [1]:
def download_unzip(url, folder_name):
	"""
	Download a gzipped file from a URL (from the CLARIN.SI repository) and decompress it.

	The archive is downloaded into the current working directory and its decompressed
	content is written to a file named `folder_name`. It should be used for the MaCoCu
	data, which is compressed with gzip. It should be followed by the tmx_to_json function.

	Args:
	- url (string): the URL from which the file can be downloaded
	- folder_name (string): name under which the decompressed TMX file is saved
	  (the name of the archive without ".gz")
	"""
	import gzip
	import shutil
	import wget

	# Downloading the file by sending the request to the URL.
	# wget.download returns the path under which the archive was actually saved, so that
	# path is used below instead of guessing it from folder_name.
	corpus_file = wget.download(url)
	print('Downloading Completed')

	# Unzip the downloaded archive; copyfileobj streams it in chunks
	with gzip.open(corpus_file, 'rb') as f_in, open(f'{folder_name}', 'wb') as f_out:
		shutil.copyfileobj(f_in, f_out)

tmx_to_json

In [2]:
def tmx_to_json(file_name, lang_code):
	"""
	Takes the TMX file of the MaCoCu corpora and transforms it into a JSON.

	Every <tu> element is reduced to a flat dictionary with the bicleaner score (as float),
	the biroamer flag, the translation direction, the English variety predictions and,
	for both languages, the source URL, the paragraph ID and the text.
	The list of dictionaries is saved as "MaCoCu-{lang_code}-en.json", to which the
	preprocess function is to be applied. A sample tu and the matches of all regexes
	on it are printed as a sanity check.

	Args:
	- file_name (string): name of the TMX file
	- lang_code (string): the language code for the language, used along English - it is the same as in the name of the corpus (e.g. "mk" for MaCoCu-mk-en)

	Raises:
	- AttributeError: if a tu does not contain one of the expected fields
	"""
	import json
	try:
		import regex as re
	except ImportError:
		# The patterns below only use syntax that the standard library module accepts too
		import re

	# Read the corpus once and close the handle.
	# NOTE(review): UTF-8 is assumed for the TMX file - confirm against the corpus.
	with open(f"{file_name}", "r", encoding="utf-8") as tmx_file:
		corpus = tmx_file.read()

	# The first 5000 characters are enough for a quick sanity check of the regexes
	corpus_sample = corpus[:5000]

	# Prepare all the regexes
	# Compile all tus
	tu_re = re.compile(r'<tu tuid=".*?>\n(.*?)</tu>', re.DOTALL)

	# Escape the language code so that it is always matched literally
	lang = re.escape(lang_code)

	# Compile relevant information inside tus; the keys are the names of the JSON fields
	# and their order is the order of the fields in the saved file
	field_patterns = {
		"score_bicleaner_ai": re.compile(r'<prop type="score-bicleaner-ai">(.*?)</prop>'),
		"biroamer_entities": re.compile(r'<prop type="biroamer-entities">(.*?)</prop>'),
		"translation_direction": re.compile(r'<prop type="translation-direction">(.*?)</prop>'),
		"en_source": re.compile(r'<tuv xml:lang="en">.*?<prop type="source-document">(.*?)</prop>', re.DOTALL),
		"en_par_id": re.compile(r'<tuv xml:lang="en">.*?<prop type="paragraph-id">(.*?)</prop', re.DOTALL),
		"en_par": re.compile(r'<tuv xml:lang="en">.*?<seg>(.*?)</seg>', re.DOTALL),
		"en_var_doc": re.compile(r'<prop type="english-variant-document">(.*?)</prop>'),
		"en_var_dom": re.compile(r'<prop type="english-variant-domain">(.*?)</prop>'),
		f"{lang_code}_source": re.compile(rf'<tuv xml:lang="{lang}">.*?<prop type="source-document">(.*?)</prop>', re.DOTALL),
		f"{lang_code}_par_id": re.compile(rf'<tuv xml:lang="{lang}">.*?<prop type="paragraph-id">(.*?)</prop', re.DOTALL),
		f"{lang_code}_par": re.compile(rf'<tuv xml:lang="{lang}">.*?<seg>(.*?)</seg>', re.DOTALL),
	}

	# Create a list of all tus from the sample corpus
	tus_list_sample = tu_re.findall(corpus_sample)

	# View a tu from the sample: the second one if available, otherwise the first one
	print("A sample of the tus in the corpora:\n")
	if len(tus_list_sample) > 1:
		sample_tu = tus_list_sample[1]
	elif tus_list_sample:
		sample_tu = tus_list_sample[0]
	else:
		sample_tu = None

	if sample_tu is None:
		print("No complete tu was found in the first 5000 characters of the file.")
	else:
		print(sample_tu)

	# Check if regexes work
	print("A check if regexes work:")

	if sample_tu is not None:
		for pattern in field_patterns.values():
			print(pattern.findall(sample_tu))

	# Create a list of all tus from the corpus
	tus_list = tu_re.findall(corpus)
	print("\n\nAll tus from the corpora were extracted. The number of sentence pairs (tus) is:")
	print(len(tus_list))

	# Create a list of dictionaries from the tus_list based on regexes
	tus_content = []

	for tu in tus_list:
		# Find all relevant information based on regexes
		current_tu = {key: pattern.search(tu).group(1) for key, pattern in field_patterns.items()}
		# The score is the only numeric field
		current_tu["score_bicleaner_ai"] = float(current_tu["score_bicleaner_ai"])
		# Append the dictionary to the list
		tus_content.append(current_tu)

	print("\n\nThe JSON format created. A sample: \n")
	# Print some instances of the tus_content
	print(tus_content[:2])

	# Save json
	with open(f"MaCoCu-{lang_code}-en.json", "w", encoding="utf-8") as file:
		json.dump(tus_content, file, indent= "")

	print(f"\n\nThe JSON file is saved as MaCoCu-{lang_code}-en.json.")

preprocess

In [3]:
def preprocess(lang_code):
	"""
	Transforms the sentence-level JSON file, created in the tmx_to_json function,
	into a document-level CSV file.

	The function reads "MaCoCu-{lang_code}-en.json" into a pandas DataFrame, adds the web
	domains of both source URLs, discards pairs whose domains differ, discards duplicated
	English paragraphs, merges the paragraphs into texts based on the source URL, discards
	duplicated English texts and texts with less than 79 words, and saves the result as
	"Macocu-{lang_code}-en-doc-format.csv" (tab-separated), to which the filter_non_textual
	function is to be applied. Statistics are printed after each step.

	Args:
	- lang_code (str): the code of the language that is in the pair with English, it is the same as in the name of the MaCoCu file (e.g., mk in MaCoCu-mk-en)

	Raises:
	- AttributeError: if a source URL does not start with "http://" or "https://"
	"""
	import pandas as pd
	import numpy as np
	import regex as re
	import json

	with open(f"MaCoCu-{lang_code}-en.json", "r") as file:
		tus_content = json.load(file)

	# Convert data to a dataframe
	corpus_df = pd.DataFrame(tus_content)

	# Sort by english url and then by en_par_id to order the paragraphs into texts
	# NOTE(review): en_par_id is compared as a string here - confirm that this gives
	# the intended paragraph order for the ID format used in the corpus.
	corpus_df = corpus_df.sort_values(by = ["en_source", "en_par_id"])

	# Add information about domains.
	# The domain ends at the first slash or at the end of the URL; the alternation is
	# needed because "$" inside a character class would only match a literal dollar sign,
	# so URLs without any path (e.g. "https://example.com") would not be matched at all.
	domain_re = re.compile(r'^https?://(?:www\.)?(.+?)(?:/|$)')

	en_domain_list = [domain_re.search(i).group(1) for i in corpus_df.en_source.to_list()]

	corpus_df["en_domain"] = en_domain_list

	# Repeat with domain of the other language
	sl_domain_list = [domain_re.search(i).group(1) for i in corpus_df[f"{lang_code}_source"].to_list()]
	corpus_df[f"{lang_code}_domain"] = sl_domain_list

	# Add information whether the domains are the same
	corpus_df["same_domains"] = np.where(corpus_df["en_domain"] == corpus_df[f"{lang_code}_domain"], "yes", 'no')

	# Add column for domains that are different
	corpus_df["different_domains"] = corpus_df["en_domain"] + " " + corpus_df[f"{lang_code}_domain"]

	# Print the information
	print("Information about the web domains for the two languages is added. See the head of the dataframe:\n")
	display(corpus_df.head(2))

	print("Number of same and different domains in the corpus:\n")

	print(corpus_df["same_domains"].value_counts().to_markdown())

	# Number of texts and sentences up to now
	previous_no_sentences = corpus_df.en_source.count()
	previous_no_texts = len(corpus_df.en_source.unique())
	print(f"\nCurrent number of sentences: {previous_no_sentences}")
	print(f"Current number of texts: {previous_no_texts}\n\n")

	def discarded_share(previous, new):
		# Share of discarded items; 0.0 when there was nothing to discard from,
		# so that an emptied corpus does not stop the report with a division by zero
		return (previous - new) / previous if previous else 0.0

	# See number of discarded texts and sentences.
	# The helper reads the current corpus_df of the enclosing function each time it is called.
	def calculate_discarded(previous_no_sentences, previous_no_texts, calculate_texts_only):
		new_number_sentences = corpus_df.en_source.count()
		new_number_texts = len(corpus_df.en_source.unique())
		if not calculate_texts_only:
			print(f"New number of sentences: {new_number_sentences}")
			print(f"No. of discarded sentences: {previous_no_sentences-new_number_sentences}, percentage: {discarded_share(previous_no_sentences, new_number_sentences)}")

		print(f"New number of texts: {new_number_texts}")
		print(f"No. of discarded texts: {previous_no_texts-new_number_texts}, percentage: {discarded_share(previous_no_texts, new_number_texts)}")

		return new_number_sentences, new_number_texts

	# Discard instances that are from different domains.
	# The copy makes the result an independent dataframe, so that the columns added
	# below are not assigned to a slice of the original dataframe.
	corpus_df = corpus_df[corpus_df["same_domains"] == "yes"].copy()

	print("Instances from different domains were discarded.\n")

	sentences_same_domains, texts_same_domains = calculate_discarded(previous_no_sentences, previous_no_texts, False)

	# Calculate average bicleaner ai score based on the en_source
	corpus_df["average_score"] = corpus_df["score_bicleaner_ai"].groupby(corpus_df['en_source']).transform('mean')

	# Join par id and text
	corpus_df["en-par-text"] = corpus_df["en_par_id"] + "-" + corpus_df["en_par"]

	# Discard all duplicated English paragraphs with the same par id
	corpus_df = corpus_df.drop_duplicates("en-par-text")

	print("\nAll duplicated English sentences with the same paragraph and sentence ID were discarded.\n")

	sentences_dupl_sent, text_dupl_sent = calculate_discarded(sentences_same_domains, texts_same_domains, False)

	# Add to each instance from the same en_source joint text from all sentences
	corpus_df["en_doc"] = corpus_df["en_par"].groupby(corpus_df['en_source']).transform(' '.join)

	# Repeat with the text in other language
	corpus_df[f"{lang_code}_doc"] = corpus_df[f"{lang_code}_par"].groupby(corpus_df[f'{lang_code}_source']).transform(' '.join)

	# Keep only one example of each text
	corpus_df = corpus_df.drop_duplicates("en_doc")

	print("\nThe sentences were merged into texts based on the source URL and the English duplicated texts were removed.\n")

	sentences_after_text_deduplication, texts_after_text_deduplication = calculate_discarded(sentences_dupl_sent, text_dupl_sent, True)

	# Add information about length (number of whitespace-separated words)
	corpus_df["en_length"] = corpus_df.en_doc.str.split().str.len()

	# Add information about length of the other language
	corpus_df[f"{lang_code}_length"] = corpus_df[f"{lang_code}_doc"].str.split().str.len()


	print("\nInitial length of texts in the corpus:")

	print(corpus_df.en_length.describe().to_markdown())

	# Discard instances that have length less than  79 (median from other datasets)
	corpus_df = corpus_df[corpus_df["en_length"] > 78]

	print("\nTexts that have less than 79 words were discarded.\n")

	sentences_after_length, texts_after_length = calculate_discarded(sentences_after_text_deduplication, texts_after_text_deduplication, True)

	# Discard irrelevant columns
	corpus_df = corpus_df.drop(columns = ['score_bicleaner_ai', 'en_par_id', 'en_par', f'{lang_code}_par_id', f'{lang_code}_par', 'en-par-text', 'same_domains', 'different_domains'])

	# View the final dataframe
	print("The final dataframe: \n")

	display(corpus_df.head(5))

	# Save the dataframe to csv
	corpus_df.to_csv(f"Macocu-{lang_code}-en-doc-format.csv", sep= "\t")

	print(f"The preparation of the file is finished and the file is saved as Macocu-{lang_code}-en-doc-format.csv.")

filter_non_textual

In [4]:
def filter_non_textual(lang_code, lower_limit = 0.015, upper_limit = 0.2):
	"""
	Filters non-textual texts out of the CSV file, produced with the preprocess function,
	based on a heuristic: the number of punctuation marks per number of tokens.

	The function reads "Macocu-{lang_code}-en-doc-format.csv", prints the first 5 texts
	that fall below the lower and above the upper limit, keeps only the texts with the ratio
	between the two limits (inclusive) and saves them as
	"Macocu-{lang_code}-en-doc-format-filtered.csv", to which genre predictions are to be made.

	Args:
	- lang_code (str): the language code from the name of the MaCoCu file (e.g., mk in MaCoCu-mk-en)
	- lower_limit (float): default is 0.015, can be changed if the results show that this would filter out mostly okay texts
	- upper_limit (float): default is 0.2, can be changed if the results show that this would filter out mostly okay texts
	"""
	import pandas as pd
	import regex as re

	source_csv = f"Macocu-{lang_code}-en-doc-format.csv"
	target_csv = f"Macocu-{lang_code}-en-doc-format-filtered.csv"

	# A token is either a run of word characters or a single non-space character
	word_or_symbol = re.compile(r'\w+|\S',re.UNICODE)

	def punctuation_share(document):
		# Ratio of the tokens that are one of the punctuation marks . ; , ! ? :
		pieces = word_or_symbol.findall(document)
		marks = sum(1 for piece in pieces if piece in '.;,!?:')
		return marks/len(pieces)

	corpus_df = pd.read_csv(source_csv, sep= "\t", index_col = 0)
	corpus_df["punct_ratio"] = corpus_df.en_doc.apply(punctuation_share)

	# Preview: below the lower limit are the non-textual texts with (almost) no punctuation
	print(f"Texts (first 5) that would be discarded with the lower limit: {lower_limit}\n")
	below_lower = corpus_df.query(f"punct_ratio < {lower_limit}").en_doc.to_list()
	for document in below_lower[:5]:
		print(document)

	# Preview: above the upper limit are the non-textual texts with a lot of punctuation
	print(f"\n\nTexts (first 5) that would be discarded with the upper limit: {upper_limit}\n")
	above_upper = corpus_df.query(f"punct_ratio > {upper_limit}").en_doc.to_list()
	for document in above_upper[:5]:
		print(document)

	# Remember the number of texts before filtering to report how many were discarded
	texts_before = len(corpus_df.en_source.unique())

	# Keep only the instances with the ratio between the lower and the upper limit
	corpus_df = corpus_df.query(f"punct_ratio >= {lower_limit} & punct_ratio <= {upper_limit}")

	print("The non-textual texts were discarded.\n")

	# Only texts are reported here; the corpus is already on the document level
	texts_after = len(corpus_df.en_source.unique())
	print(f"New number of texts: {texts_after}")
	print(f"No. of discarded texts: {texts_before-texts_after}, percentage: {(texts_before-texts_after)/texts_before}")

	display(corpus_df.head(5))

	# Save the filtered dataframe
	corpus_df.to_csv(target_csv, sep= "\t")

	print(f"The preparation of the file is finished and the file is saved as {target_csv}.")

analyze_prepared_corpus

In [5]:
def analyze_prepared_corpus(lang_code):
	"""
	Analyzes the corpus in the CSV file, created by the filter_non_textual function.

	The function reads "Macocu-{lang_code}-en-doc-format-filtered.csv" and prints
	the head of the corpus, its summary statistics, the distribution of the predicted
	English varieties and translation directions, statistics on the bicleaner score
	and text length, and the 30 most frequent English domains.

	Args:
	- lang_code (str): the language code from the name of the MaCoCu file (e.g., mk in MaCoCu-mk-en)
	"""
	import pandas as pd

	corpus_df = pd.read_csv(f"Macocu-{lang_code}-en-doc-format-filtered.csv", sep= "\t", index_col = 0)

	print("View the corpus:")
	display(corpus_df.head(3))

	# Inspect corpus information
	print("All information about the corpus: \n")
	display(corpus_df.describe(include="all"))

	# Inspect en_var_doc statistics

	print("\nPrediction of English varieties (on document level):\n")
	print(corpus_df.en_var_doc.value_counts(normalize = True).to_markdown())

	print("\nPrediction of English varieties (on domain level):\n")
	print(corpus_df.en_var_dom.value_counts(normalize = True).to_markdown())

	# Inspect translation direction
	print("\nPrediction of translation direction:\n")
	print(corpus_df.translation_direction.value_counts(normalize = True).to_markdown())

	print("\nInformation on the bicleaner score:\n")
	print(corpus_df.average_score.describe().to_markdown())

	print("\nFinal length of texts in the corpus:")
	print(corpus_df.en_length.describe().to_markdown())

	# Analyze English domains in the corpus_df: absolute counts and shares in percents.
	# Both series are ordered from the most to the least frequent domain.
	domain_counts = corpus_df.en_domain.value_counts()
	domain_percentages = corpus_df.en_domain.value_counts(normalize = True) * 100

	count = pd.DataFrame(
		{
			"Count": domain_counts.to_list()[:30],
			"Percentage": domain_percentages.to_list()[:30],
		},
		index = domain_counts.index[:30],
	)

	print("\nAn analysis of the 30 most frequent English domains:")
	print(count.to_markdown())

	print("\n\nAnalysis completed.")


postprocess_results

In [12]:
def postprocess_results(file_name, lang_code):
	"""
	Takes the CSV file with genre predictions, applies filtering - discards some of the non-reliable results,
	and saves the final file as CSV.

	The predicted labels from the column "X-GENRE" are copied to a new column "final-X-GENRE",
	except for the labels "Other" and "Forum" and for the labels, predicted with confidence
	(column "chosen_category_distr") lower than 0.9 - these are left empty (NaN).
	The result is saved as "Macocu-{lang_code}-en-predicted-post-processed.csv" (comma-separated).

	Args:
	- file_name (str): path to the tab-separated CSV file with predictions
	- lang_code (str): the language code from the name of the MaCoCu file (e.g., mk in MaCoCu-mk-en)
	"""
	import pandas as pd
	import numpy as np

	output_file = f"Macocu-{lang_code}-en-predicted-post-processed.csv"

	def distribution_table(labels):
		# Absolute counts and shares in percents, from the most to the least frequent label
		counts = labels.value_counts()
		percentages = labels.value_counts(normalize = True) * 100
		return pd.DataFrame({"Count": counts.to_list(), "Percentage": percentages.to_list()}, index = counts.index)

	corpus = pd.read_csv(f"{file_name}", sep = "\t", index_col = 0)

	# View the Dataframe
	display(corpus.head(3))

	# Analyze genre distribution
	print("Genre distribution before post-processing: \n")
	print(distribution_table(corpus["X-GENRE"]).to_markdown())

	initial_number_of_labels = corpus["X-GENRE"].count()

	# Post-process the data

	# Copy all predicted labels to a column "final-X-GENRE", except if the label is "Other" or "Forum"
	corpus["final-X-GENRE"] = np.where(corpus["X-GENRE"].isin(["Other", "Forum"]), np.nan, corpus["X-GENRE"])

	print("The Forum and Other label were discarded from the column with final genre labels.")

	print("New genre distribution:\n")
	print(corpus["final-X-GENRE"].value_counts().to_markdown())

	current_no_final_labels = corpus["final-X-GENRE"].count()

	# Discard the labels for which the prediction confidence is lower than 0.9
	corpus["final-X-GENRE"] = np.where(corpus["chosen_category_distr"] < 0.9, np.nan, corpus["final-X-GENRE"])

	print("Labels, predicted with confidence, lower than 0.9, were discarded from the final labels.\n")

	final_no_of_labels = corpus["final-X-GENRE"].count()

	print(f"Number of discarded labels due to confidence being to low: {current_no_final_labels-final_no_of_labels}, percentage: {(current_no_final_labels-final_no_of_labels)/current_no_final_labels}")

	print(f"Final number of labelled texts: {final_no_of_labels}")

	print(f"Total number of labels discarded due to post-processing: {initial_number_of_labels-final_no_of_labels}, percentage: {(initial_number_of_labels-final_no_of_labels)/initial_number_of_labels}")

	# Analyze final genre distribution
	print("Final genre distribution:\n")
	print(distribution_table(corpus["final-X-GENRE"]).to_markdown())

	# Save the new file under the name that the analyze_results function reads
	corpus.to_csv(output_file)

	# Report the name under which the file was actually saved
	print(f"The file with final labels is saved as {output_file}.")

analyze_results

In [20]:
def analyze_results(lang_code, no_of_domains):
	"""
	Takes the post-processed CSV file with genre predictions, produced with the postprocess_results function and analyzes the results.

	The function reads "Macocu-{lang_code}-en-predicted-post-processed.csv" and saves
	a file with results, named "MaCoCu-{lang_code}-en-predicted-analysis.txt". The file contains
	the distribution of genres in the most frequent domains, the 5 most frequent domains
	for each genre, the distribution of English varieties for each genre
	and the text length statistics for the entire corpus and for each genre.

	Args:
	- lang_code (str): the language code from the name of the MaCoCu file (e.g., mk in MaCoCu-mk-en)
	- no_of_domains (int): define how many most frequent domains you want to analyze in terms of genre distribution - usually, we take the number of domains that represent more than 1% of data (this is analyzed in analyze_prepared_corpus)
	"""
	import pandas as pd

	# The genres are written to the results file in these two orders
	genres_for_domains = ('Opinion/Argumentation', 'News', 'Legal', 'Information/Explanation', 'Promotion', 'Instruction', 'Prose/Lyrical')
	genres_for_statistics = ('News', 'Opinion/Argumentation', 'Promotion', 'Instruction', 'Information/Explanation', 'Legal', 'Prose/Lyrical')

	corpus = pd.read_csv(f"Macocu-{lang_code}-en-predicted-post-processed.csv", index_col = 0)

	# View the Dataframe
	print("View the final dataframe:\n\n")
	display(corpus.head(3))

	# The most frequent English domains in the corpus, from the most to the least frequent
	domains_to_analyse = corpus.en_domain.value_counts().index.to_list()[:no_of_domains]

	# The context manager closes the results file even if one of the steps below fails
	with open(f"MaCoCu-{lang_code}-en-predicted-analysis.txt", "w", encoding="utf-8") as results_file:

		def write_section(title, table):
			# One entry of the report: the title, followed by the table in markdown
			results_file.write(f"\n\n{title}\n\n{table.to_markdown()}")

		results_file.write(f"Analysis of results for file: Macocu-{lang_code}-en-predicted-post-processed.csv\n\n")

		# See the distribution of genres in the most frequent domains:
		results_file.write("Distribution of genres in the most frequent domains:\n\n")

		for domain in domains_to_analyse:
			filtered_corpus = corpus[corpus["en_domain"] == domain]
			write_section(domain, filtered_corpus["final-X-GENRE"].value_counts(normalize = True))

		print("The distribution of genres in the most frequent domains analyzed.")

		# Analyze differences in genres based on domain frequency
		results_file.write("\n\nDistribution of domains in genres:\n\n")

		for genre in genres_for_domains:
			filtered_corpus = corpus[corpus["final-X-GENRE"] == genre]
			write_section(genre, filtered_corpus["en_domain"].value_counts(normalize = True)[:5])

		print("Differences of the domain distribution for each genre analyzed.")

		# Analyze differences in genres based on language varieties
		results_file.write("\n\nDistribution of English varieties in genres:\n\n")

		for genre in genres_for_statistics:
			filtered_corpus = corpus[corpus["final-X-GENRE"] == genre]
			write_section(genre, filtered_corpus["en_var_doc"].value_counts(normalize = True))

		print("Differences in language varieties distribution per genres analyzed.")

		# Length distribution of the entire corpus
		results_file.write("\n\nLength distribution of the entire corpus:\n\n")
		results_file.write(corpus["en_length"].describe().to_markdown())

		# Analyze differences in genres based on text length
		results_file.write("\n\nLength distribution for each of the genre subcorpus:\n\n")

		for genre in genres_for_statistics:
			filtered_corpus = corpus[corpus["final-X-GENRE"] == genre]
			write_section(genre, filtered_corpus["en_length"].describe())

	print(f"Analysis completed. Inspect the result file MaCoCu-{lang_code}-en-predicted-analysis.txt for results.")


## The pipeline

In [None]:
# Complete pipeline of MaCoCu corpus preparation for genre prediction
# Fill in the empty values below before running the following cells.

# Define the URL from which the MaCoCu corpus is to be downloaded:
url = ""

# Define the name of the zipped folder; without ".GZ":
folder_name = ""

# Define the language code of the language that is in the combination with English
# (it is in the name of the MaCoCu file, e.g. "mk" in Macocu-mk-en)
lang_code = ""

# After the prediction of genres is complete, run the post-processing and analysis of results
# Define the name of the file with predictions (stated in the predict_genres.py)
file_name = ""

# Define the number of the most frequent domains you wish to analyze
# (the best is to choose domains that are present in more than 1% of data - this information
# is obtained with the analyze_prepared_corpus function.)
no_of_domains = 7

# No need to change anything below this point.
# -------------------------------------------------------------------------------------


In [None]:
# Download and unzip the files
download_unzip(url, folder_name)

In [None]:
# Transform the downloaded TMX file to JSON
tmx_to_json(folder_name, lang_code)

In [None]:
# Preprocess the JSON file, transformed to a DataFrame and save it to the CSV file
preprocess(lang_code)

In [None]:
# Apply final preprocessing step: filter out non-textual texts if applicable (change lower and upper limit if necessary)
filter_non_textual(lang_code, lower_limit = 0.015, upper_limit = 0.2)

In [None]:
# Analyze prepared corpus
analyze_prepared_corpus(lang_code)

# After preparation of the corpus, you can apply genre prediction to it - modify the beginning of the predict_genres.py file and run it in the terminal:
# nohup python predict_genres.py

In [None]:
# Post-process the results
postprocess_results(lang_code)

In [None]:
# Analyze the results
analyze_results(lang_code, no_of_domains)