# The genre prediction pipeline

## Definitions of functions

download_unzip

In [1]:
def download_unzip(url, folder_name):
	"""
	The function downloads the zipped folder from an URL (from the CLARIN.SI repository),
	unzips it and saves the TMX file. It should be used for the MaCoCu data which is zipped
	using the GZ appendix and the name of the file is the same as the name of the folder. It should be followed by the tmx_to_json function.

	Args:
	- url(string): the URL from which the file can be downloaded
	- folder_name (string): name of the zipped folder; without ".GZ"
	"""
	import gzip
	import shutil
	import wget

	# Downloading the file by sending the request to the URL
	corpus_file = wget.download(url)
	print('Downloading Completed')

	# Unzip the file
	with gzip.open(f'{folder_name}.gz', 'rb') as f_in:
		with open(f'{folder_name}', 'wb') as f_out:
			shutil.copyfileobj(f_in, f_out)

tmx_to_json

In [2]:
def tmx_to_json(file_name, lang_code):
	"""
	Takes the TMX file of the MaCoCu corpora and transforms it into a JSON.
	It saves the JSON file to which preprocess function is to be applied.

	Args:
	- file_name (string): name of the TMX file
	- lang_code (string): the language code for the language, used along English - it is the same as in the name of the corpus (e.g. "mk" for MaCoCu-mk-en)
	"""
	import regex as re
	import json
	
	corpus = open(f"{file_name}", "r").read()
	corpus_sample = open(f"{file_name}", "r").read(5000)

	# Prepare all the regexes
	# Compile all tus
	tu_re = re.compile('<tu tuid=".*?>\n(.*?)<\/tu>', re.DOTALL)

	# Compile relevant information inside tus
	bi_score_re = re.compile('<prop type="score-bicleaner-ai">(.*?)</prop>')
	biroamer_re = re.compile('<prop type="biroamer-entities">(.*?)</prop>')
	translation_dir_re = re.compile('<prop type="translation-direction">(.*?)</prop>')
	en_source_re = re.compile('<tuv xml:lang="en">.*?<prop type="source-document">(.*?)</prop>', re.DOTALL)
	en_par_id_re = re.compile('<tuv xml:lang="en">.*?<prop type="paragraph-id">(.*?)</prop', re.DOTALL)
	en_par_re = re.compile('<tuv xml:lang="en">.*?<seg>(.*?)</seg>', re.DOTALL)
	en_var_doc_re = re.compile('<prop type="english-variant-document">(.*?)</prop>')
	en_var_dom_re = re.compile('<prop type="english-variant-domain">(.*?)</prop>')
	sl_source_re = re.compile(f'<tuv xml:lang="{lang_code}">.*?<prop type="source-document">(.*?)</prop>', re.DOTALL)
	sl_par_id_re = re.compile(f'<tuv xml:lang="{lang_code}">.*?<prop type="paragraph-id">(.*?)</prop', re.DOTALL)
	sl_par_re = re.compile(f'<tuv xml:lang="{lang_code}">.*?<seg>(.*?)</seg>', re.DOTALL)

	# Create a list of all tus from the sample corpus
	tus_list_sample = tu_re.findall(corpus_sample)

	# View the tus_list
	print("A sample of the tus in the corpora:\n")
	print(tus_list_sample[1])

	# Check if regexes work
	regexes =  [bi_score_re, biroamer_re, translation_dir_re, en_source_re, en_par_id_re, en_par_re, en_var_doc_re, en_var_dom_re, sl_source_re, sl_par_id_re, sl_par_re]

	print("A check if regexes work:")

	for rex in regexes:
		test_list = rex.findall(tus_list_sample[1])
		print(test_list)

	# Create a list of all tus from the corpus
	tus_list = tu_re.findall(corpus)
	print("\n\nAll tus from the corpora were extracted. The number of sentence pairs (tus) is:")
	print(len(tus_list))

		# Create a list of dictionaries from the tus_list based on regexes
	tus_content = []

	for i in tus_list:
		# Find all relevant information based on regexes
		bi_score = bi_score_re.search(i).group(1)
		biroamer = biroamer_re.search(i).group(1)
		translation_dir = translation_dir_re.search(i).group(1)
		en_source = en_source_re.search(i).group(1)
		en_par_id = en_par_id_re.search(i).group(1)
		en_par = en_par_re.search(i).group(1)
		en_var_doc = en_var_doc_re.search(i).group(1)
		en_var_dom = en_var_dom_re.search(i).group(1)
		sl_source = sl_source_re.search(i).group(1)
		sl_par_id = sl_par_id_re.search(i).group(1)
		sl_par = sl_par_re.search(i).group(1)

		# Add information to the dictionary
		current_tu = {"score_bicleaner_ai": float(bi_score), "biroamer_entities": biroamer, "translation_direction": translation_dir, "en_source": en_source, "en_par_id": en_par_id, "en_par": en_par, "en_var_doc": en_var_doc, "en_var_dom": en_var_dom, f"{lang_code}_source": sl_source, f"{lang_code}_par_id": sl_par_id, f"{lang_code}_par": sl_par}
		# Append the dictionary to the list
		tus_content.append(current_tu)

	print("\n\nThe JSON format created. A sample: \n")
	# Print some instances of the tus_content
	print(tus_content[:2])

	# Save json

	with open(f"MaCoCu-{lang_code}-en.json", "w") as file:
		json.dump(tus_content,file, indent= "")
	
	print(f"\n\nThe JSON file is saved as MaCoCu-{lang_code}-en.json.")

preprocess

In [3]:
def preprocess(lang_code):
	"""
	Takes the JSON file name, created in the tmx_to_json function,
	transforms it into a pandas DataFrame, preprocesses it
	and saves the final document-level CSV file to which filter_non_textual function is to be applied.

	Args:
	- file name (str): the path to the JSON file
	- lang code: the code of the language that is in the pair with English,	it is the same as in the name of the MaCoCu file (e.g., mk in MaCoCu-mk-en)
	"""
	import pandas as pd
	import numpy as np
	import regex as re
	import json

	with open(f"MaCoCu-{lang_code}-en.json", "r") as file:
		tus_content = json.load(file)

	# Convert data to a dataframe
	corpus_df = pd.DataFrame(tus_content)

	# Sort by english url and then by en_par_id to order the paragraphs into texts
	corpus_df = corpus_df.sort_values(by = ["en_source", "en_par_id"])

	# Add information about domains
	domain_re=re.compile(r'^https?://(?:www\.)?(.+?)[/$]')

	en_domain_list = [domain_re.search(i).group(1) for i in corpus_df.en_source.to_list()]

	corpus_df["en_domain"] = en_domain_list

	# Repeat with domain of the other language
	sl_domain_list = [domain_re.search(i).group(1) for i in corpus_df[f"{lang_code}_source"].to_list()]
	corpus_df[f"{lang_code}_domain"] = sl_domain_list

	# Add information whether the domains are the same
	corpus_df["same_domains"] = np.where(corpus_df["en_domain"] == corpus_df[f"{lang_code}_domain"], "yes", 'no')

	# Add column for domains that are different
	corpus_df["different_domains"] = corpus_df["en_domain"] + " " + corpus_df[f"{lang_code}_domain"]

	# Print the information
	print("Information about the web domains for the two languages is added. See the head of the dataframe:\n")
	display(corpus_df.head(2))

	print("Number of same and different domains in the corpus:\n")

	print(corpus_df["same_domains"].value_counts().to_markdown())

	# Number of texts and sentences up to now
	previous_no_sentences = corpus_df.en_source.count()
	previous_no_texts = len(corpus_df.en_source.unique())
	print(f"\nCurrent number of sentences: {previous_no_sentences}")
	print(f"Current number of texts: {previous_no_texts}\n\n")

	# See number of discarded texts and sentences
	def calculate_discarded(previous_no_sentences, previous_no_texts, calculate_texts_only):
		new_number_sentences = corpus_df.en_source.count()
		new_number_texts = len(corpus_df.en_source.unique())
		if calculate_texts_only == False:
			print(f"New number of sentences: {new_number_sentences}")
			print(f"No. of discarded sentences: {previous_no_sentences-new_number_sentences}, percentage: {(previous_no_sentences-new_number_sentences)/previous_no_sentences}")
		
		print(f"New number of texts: {new_number_texts}")
		print(f"No. of discarded texts: {previous_no_texts-new_number_texts}, percentage: {(previous_no_texts-new_number_texts)/previous_no_texts}")

		return new_number_sentences, new_number_texts
	
	# Discard instances that are from different domains
	corpus_df = corpus_df[corpus_df["same_domains"] == "yes"]

	print("Instances from different domains were discarded.\n")

	sentences_same_domains, texts_same_domains = calculate_discarded(previous_no_sentences, previous_no_texts, False)

	# Calculate average bicleaner ai score based on the en_source
	corpus_df["average_score"] = corpus_df["score_bicleaner_ai"].groupby(corpus_df['en_source']).transform('mean')

	# Join par id and text
	corpus_df["en-par-text"] = corpus_df["en_par_id"] + "-" + corpus_df["en_par"]

	# Discard all duplicated English paragraphs with the same par id
	corpus_df = corpus_df.drop_duplicates("en-par-text")

	print("\nAll duplicated English sentences with the same paragraph and sentence ID were discarded.\n")

	sentences_dupl_sent, text_dupl_sent = calculate_discarded(sentences_same_domains, texts_same_domains, False)

	# Add to each instance from the same en_source joint text from all sentences
	corpus_df["en_doc"] = corpus_df["en_par"].groupby(corpus_df['en_source']).transform(' '.join)

	# Repeat with the text in other language
	corpus_df[f"{lang_code}_doc"] = corpus_df[f"{lang_code}_par"].groupby(corpus_df[f'{lang_code}_source']).transform(' '.join)

	# Keep only one example of each text
	corpus_df = corpus_df.drop_duplicates("en_doc")

	print("\nThe sentences were merged into texts based on the source URL and the English duplicated texts were removed.\n")

	sentences_after_text_deduplication, texts_after_text_deduplication = calculate_discarded(sentences_dupl_sent, text_dupl_sent, True)

	# Add information about length
	corpus_df["en_length"] = corpus_df.en_doc.str.split().str.len()

	# Add information about length of the other language
	corpus_df[f"{lang_code}_length"] = corpus_df[f"{lang_code}_doc"].str.split().str.len()


	print("\nInitial length of texts in the corpus:")

	print(corpus_df.en_length.describe().to_markdown())

	# Discard instances that have length less than  79 (median from other datasets)
	corpus_df = corpus_df[corpus_df["en_length"] > 78]

	print("\nTexts that have less than 79 words were discarded.\n")

	sentences_after_length, texts_after_length = calculate_discarded(sentences_after_text_deduplication, texts_after_text_deduplication, True)

	# Discard irrelevant columns
	corpus_df = corpus_df.drop(columns = ['score_bicleaner_ai', 'en_par_id', 'en_par', f'{lang_code}_par_id', f'{lang_code}_par', 'en-par-text', 'same_domains', 'different_domains'])

	# View the final dataframe
	print("The final dataframe: \n")

	display(corpus_df.head(5))

	# Save the dataframe to csv
	corpus_df.to_csv(f"Macocu-{lang_code}-en-doc-format.csv", sep= "\t")

	print(f"The preparation of the file is finished and the file is saved as Macocu-{lang_code}-en-doc-format.csv.")

filter_non_textual

In [4]:
def filter_non_textual(lang_code, lower_limit = 0.015, upper_limit = 0.2):
	"""
	Takes the CSV file, produced with the preprocess function
	and applies filtering of the non-textual texts based on a no. of punctuations per no. of words heuristic.

	Args:
	- file_name (str): path to the CSV file, without the ".csv" (!)
	- lower_limit (float): default is 0.015, can be changed if the results show that this would filter out mostly okay texts
	- upper_limit (float): default is 0.2, can be changed if the results show that this would filter out mostly okay texts
	
	Saves the filtered dataframe as a CSV to which genre predictions are to be made.
	"""
	import pandas as pd
	import regex as re

	corpus_df = pd.read_csv(f"Macocu-{lang_code}-en-doc-format.csv", sep= "\t", index_col = 0)

	# Filter out the non-textual texts

	# Calculate ratio of punctuations per words

	def paragraph_punct_ratio(text):
		token_re=re.compile(r'\w+|\S',re.UNICODE)
		tokens=token_re.findall(text)
		punct=len([e for e in tokens if e in '.;,!?:'])
		ratio = punct/len(tokens)
		return ratio

	corpus_df["punct_ratio"] = corpus_df.en_doc.apply(paragraph_punct_ratio)

	print(f"Texts (first 5) that would be discarded with the lower limit: {lower_limit}\n")

	# With the ratio below the lower limit, we catch non-textual texts without any punctuation
	for i in corpus_df.query(f"punct_ratio < {lower_limit}").en_doc.to_list()[:5]:
		print(i)

	print(f"\n\nTexts (first 5) that would be discarded with the upper limit: {upper_limit}\n")

	# With ratio above the upper limit, we catch non-textual texts with a lot of punctuations
	for i in corpus_df.query(f"punct_ratio > {upper_limit}").en_doc.to_list()[:5]:
		print(i)

	# Number of texts up to now
	previous_no_texts = len(corpus_df.en_source.unique())

	# See number of discarded texts and sentences
	def calculate_discarded(previous_no_sentences, previous_no_texts, calculate_texts_only):
		new_number_sentences = corpus_df.en_source.count()
		new_number_texts = len(corpus_df.en_source.unique())
		if calculate_texts_only == False:
			print(f"New number of sentences: {new_number_sentences}")
			print(f"No. of discarded sentences: {previous_no_sentences-new_number_sentences}, percentage: {(previous_no_sentences-new_number_sentences)/previous_no_sentences}")
		
		print(f"New number of texts: {new_number_texts}")
		print(f"No. of discarded texts: {previous_no_texts-new_number_texts}, percentage: {(previous_no_texts-new_number_texts)/previous_no_texts}")

		return new_number_sentences, new_number_texts

	# Filter the corpus by using only instances with ratio between the lower and upper limit
	corpus_df = corpus_df.query(f"punct_ratio >= {lower_limit} & punct_ratio <= {upper_limit}")

	print("The non-textual texts were discarded.\n")

	sentences_after_heuristic, texts_after_heuristic = calculate_discarded(100, previous_no_texts, True)

	display(corpus_df.head(5))

	# Save the dataframe to csv
	corpus_df.to_csv(f"Macocu-{lang_code}-en-doc-format-filtered.csv", sep= "\t")

	print(f"The preparation of the file is finished and the file is saved as Macocu-{lang_code}-en-doc-format-filtered.csv.")

analyze_prepared_corpus

In [5]:
def analyze_prepared_corpus(lang_code):
	"""
	Takes the CSV file, created by the filter_non_textual function and analyzes the corpus.

	Args:
	- file_name (str): path to the CSV file
	"""
	import pandas as pd

	corpus_df = pd.read_csv(f"Macocu-{lang_code}-en-doc-format-filtered.csv", sep= "\t", index_col = 0)

	print("View the corpus:")
	display(corpus_df.head(3))

	# Inspect corpus information
	print("All information about the corpus: \n")
	display(corpus_df.describe(include="all"))

	# Inspect en_var_doc statistics

	print("\nPrediction of English varieties (on document level):\n")
	print(corpus_df.en_var_doc.value_counts(normalize = True).to_markdown())

	print("\nPrediction of English varieties (on domain level):\n")
	print(corpus_df.en_var_dom.value_counts(normalize = True).to_markdown())

	# Inspect translation direction
	print("\nPrediction of translation direction:\n")
	print(corpus_df.translation_direction.value_counts(normalize = True).to_markdown())

	print("\nInformation on the bicleaner score:\n")
	print(corpus_df.average_score.describe().to_markdown())

	print("\nFinal length of texts in the corpus:")
	print(corpus_df.en_length.describe().to_markdown())
	
	# Analyze English domains in the corpus_df
	count = pd.DataFrame({"Count": list(corpus_df.en_domain.value_counts())[:30], "Percentage": list(corpus_df.en_domain.value_counts(normalize="True")*100)[:30]}, index = corpus_df.en_domain.value_counts()[:30].index)

	print("\nAn analysis of the 30 most frequent English domains:")
	print(count.to_markdown())

	print("\n\nAnalysis completed.")


postprocess_results

In [6]:
def postprocess_results(file_name, lang_code):
	"""
	Takes the CSV file with genre predictions, applies filtering - discards some of the non-reliable results,
	and saves the final file as CSV.

	Args:
	- file_name: path to the CSV file with predictions
	"""
	import pandas as pd
	import numpy as np

	corpus = pd.read_csv(f"{file_name}", sep = "\t", index_col = 0)

	# View the Dataframe
	display(corpus.head(3))

	# Analyze genre distribution
	count = pd.DataFrame({"Count": list(corpus["X-GENRE"].value_counts()), "Percentage": list(corpus["X-GENRE"].value_counts(normalize="True")*100)}, index = corpus["X-GENRE"].value_counts().index)

	print("Genre distribution before post-processing: \n")
	print(count.to_markdown())

	initial_number_of_labels = corpus["X-GENRE"].count()

	# Post-process the data

	# Copy all predicted labels to a new column, except if the label is "Other"
	corpus["final-X-GENRE"] = np.where(corpus["X-GENRE"] == "Other", np.nan, corpus["X-GENRE"])

	# Copy all predicted labels to a column "final-X-GENRE", except if the label is "Forum"
	corpus["final-X-GENRE"] = np.where(corpus["final-X-GENRE"] == "Forum", np.nan, corpus["final-X-GENRE"])

	print("The Forum and Other label were discarded from the column with final genre labels.")

	print("New genre distribution:\n")
	print(corpus["final-X-GENRE"].value_counts().to_markdown())

	current_no_final_labels = corpus["final-X-GENRE"].count()

	# Copy all predicted labels to a column "final-X-GENRE", except if the prediction confidence is lower than 0.9
	corpus["final-X-GENRE"] = np.where(corpus["chosen_category_distr"] < 0.9, np.nan, corpus["final-X-GENRE"])

	print("Labels, predicted with confidence, lower than 0.9, were discarded from the final labels.\n")

	final_no_of_labels = corpus["final-X-GENRE"].count()

	print(f"Number of discarded labels due to confidence being to low: {current_no_final_labels-final_no_of_labels}, percentage: {(current_no_final_labels-final_no_of_labels)/current_no_final_labels}")

	print(f"Final number of labelled texts: {final_no_of_labels}")

	print(f"Total number of labels discarded due to post-processing: {initial_number_of_labels-final_no_of_labels}, percentage: {(initial_number_of_labels-final_no_of_labels)/initial_number_of_labels}")

	# Analyze final genre distribution
	count = pd.DataFrame({"Count": list(corpus["final-X-GENRE"].value_counts()), "Percentage": list(corpus["final-X-GENRE"].value_counts(normalize="True")*100)}, index = corpus["final-X-GENRE"].value_counts().index)

	print("Final genre distribution:\n")
	print(count.to_markdown())

	LABELS = list(corpus["final-X-GENRE"].unique())

	# Save the new file
	corpus.to_csv(f"Macocu-{lang_code}-en-predicted-post-processed.csv")

	print(f"The file with final labels is saved as {file_name}-post-processed.csv.")

analyze_results

In [20]:
def analyze_results(lang_code, no_of_domains):
	"""
	Takes the post-processed CSV file with genre predictions, produced with the postprocess_results function and analyzes the results.
	Saves a file with results, named "file_name-analysis.txt"

	Args:
	- file_name (str): path to the post-processed CSV file with predictions
	- no_of_domains (int): define how many most frequent domains you want to analyze in terms of genre distribution - usually, we take the number of domains that represent more than 1% of data (this is analyzed in analyze_prepared_corpus)
	"""
	import pandas as pd
	import numpy as np

	corpus = pd.read_csv(f"Macocu-{lang_code}-en-predicted-post-processed.csv", index_col = 0)

	results_file = open(f"MaCoCu-{lang_code}-en-predicted-analysis.txt", "w")
	results_file.write(f"Analysis of results for file: Macocu-{lang_code}-en-predicted-post-processed.csv\n\n")

	# View the Dataframe
	print("View the final dataframe:\n\n")
	display(corpus.head(3))

	# Analyze English domains in the corpus
	count = pd.DataFrame({"Count": list(corpus.en_domain.value_counts()), "Percentage": list(corpus.en_domain.value_counts(normalize="True")*100)}, index = corpus.en_domain.value_counts().index)

	domains_to_analyse = count.index.to_list()[:no_of_domains]

	# See the distribution of genres in the most frequent domains:
	results_file.write("Distribution of genres in the most frequent domains:\n\n")

	for i in domains_to_analyse:
		results_file.write("\n\n")
		results_file.write(i)
		results_file.write("\n\n")
		filtered_corpus = corpus[corpus["en_domain"] == i]
		results_file.write(filtered_corpus["final-X-GENRE"].value_counts(normalize="True").to_markdown())

	print("The distribution of genres in the most frequent domains analyzed.")

	# Analyze differences in genres based on domain frequency
	results_file.write("\n\nDistribution of domains in genres:\n\n")

	for i in ['Opinion/Argumentation', 'News', 'Legal', 'Information/Explanation', 'Promotion', 'Instruction', 'Prose/Lyrical']:
		results_file.write("\n\n")
		results_file.write(i)
		results_file.write("\n\n")
		filtered_corpus = corpus[corpus["final-X-GENRE"] == i]
		results_file.write(filtered_corpus["en_domain"].value_counts(normalize="True")[:5].to_markdown())

	print("Differences of the domain distribution for each genre analyzed.")

	# Analyze differences in genres based on language varieties
	#results_file.write("\n\nDistribution of English varieties in genres:\n\n")

	#for i in ['News', 'Opinion/Argumentation', 'Promotion', 'Instruction', 'Information/Explanation', 'Legal', 'Prose/Lyrical']:
	#	results_file.write("\n\n")
	#	results_file.write(i)
	#	results_file.write("\n\n")
	#	filtered_corpus = corpus[corpus["final-X-GENRE"] == i]
	#	results_file.write(filtered_corpus["en_var_doc"].value_counts(normalize="True").to_markdown())

	# Analyze differences in genres based on language varieties - print raw scores
	print("\n\nDistribution of English varieties in genres - raw scores:\n\n")

	print("Labels in this order: 'News', 'Opinion/Argumentation', 'Promotion', 'Instruction','Information/Explanation', 'Legal','Prose/Lyrical'\n\n")

	for i in ['News', 'Opinion/Argumentation', 'Promotion', 'Instruction','Information/Explanation', 'Legal','Prose/Lyrical']:
		#results_file.write("\n\n")
		#results_file.write(i)
		#results_file.write("\n\n")
		filtered_corpus = corpus[corpus["final-X-GENRE"] == i]
		print(dict(sorted(filtered_corpus["en_var_doc"].value_counts().to_dict().items())))

	# Analyze differences in genres based on language varieties - print normalized scores

	print("\n\nDistribution of English varieties in genres - normalized scores:\n\n")

	for i in ['News', 'Opinion/Argumentation', 'Promotion', 'Instruction','Information/Explanation', 'Legal','Prose/Lyrical']:
		#results_file.write("\n\n")
		#results_file.write(i)
		#results_file.write("\n\n")
		filtered_corpus = corpus[corpus["final-X-GENRE"] == i]
		print((dict(sorted((filtered_corpus["en_var_doc"].value_counts(normalize = True)*100).round(2).to_dict().items()))))

	print("Differences in language varieties distribution per genres analyzed.")

	# Length distribution of the entire corpus
	results_file.write("\n\nLength distribution of the entire corpus:\n\n")
	results_file.write(corpus["en_length"].describe().to_markdown())

	# Analyze differences in genres based on text length
	results_file.write("\n\nLength distribution for each of the genre subcorpus:\n\n")

	for i in ['News', 'Opinion/Argumentation', 'Promotion', 'Instruction', 'Information/Explanation', 'Legal', 'Prose/Lyrical']:
		results_file.write("\n\n")
		results_file.write(i)
		results_file.write("\n\n")
		filtered_corpus = corpus[corpus["final-X-GENRE"] == i]
		results_file.write(filtered_corpus["en_length"].describe().to_markdown())
	
	results_file.close()
	
	print(f"Analysis completed. Inspect the result file MaCoCu-{lang_code}-en-predicted-analysis.txt for results.")


## The pipeline

In [8]:
# Complete pipeline of MaCoCu corpus preparation for genre prediction

# Define the URL from which the MaCoCu corpus is to be downloaded:
url = "https://www.clarin.si/repository/xmlui/bitstream/handle/11356/1521/MaCoCu-bg-en.tmx.gz"

# Define the name of the zipped folder; without ".GZ":
folder_name = "MaCoCu-bg-en.tmx"

# Define the language code of the language that is in the combination with English
# (it is in the name of the MaCoCu file, e.g. "mk" in Macocu-mk-en)
lang_code = "bg"

In [9]:
# After the prediction of genres is complete, run the post-processing and analysis of results
# Define the name of the file with predictions (stated in the predict_genres.py)
file_name = "Macocu-bg-en-predicted.csv"

# Define the number of the most frequent domains you wish to analyze
# (the best is to choose domains that are present in more than 1% of data - this information
# is obtained with the analyze_prepared_corpus function.)
no_of_domains = 7

# No need to change anything below this point.
# -------------------------------------------------------------------------------------


download_unzip

In [9]:
# Download and unzip the files
download_unzip(url, folder_name)

Downloading Completed


tmx_to_json

In [10]:
# Transform the downloaded TMX file to JSON
tmx_to_json(folder_name, lang_code)

A sample of the tus in the corpora:

    <prop type="score-bicleaner-ai">0.956</prop>
    <prop type="biroamer-entities">No</prop>
    <prop type="translation-direction">bg-orig</prop>
    <prop type="type">1:1</prop>
    <tuv xml:lang="en">
     <prop type="source-document">https://www.motoroads.com/italy/milan.aspx</prop>
     <prop type="checksum-seg">bbddf8a31aae529</prop>
     <prop type="paragraph-id">p359s11</prop>
    <prop type="english-variant-document">A</prop>
    <prop type="english-variant-domain">MIX</prop>
     <seg>Scooter and motorcycle rent gives you the freedom to explore the city and its surroundings with your own rhythm and collect unforgettable memories.</seg>
    </tuv>
    <tuv xml:lang="bg">
     <prop type="source-document">https://www.motoroads.com/bulgaria/italy/milan.aspx</prop>
     <prop type="checksum-seg">7fdf8f6207e3a7b4</prop>
     <prop type="paragraph-id">p359s10</prop>
     <seg>Това ви дава свободата да разгледате града и околностите му по ваш со

preprocess

In [11]:
# Preprocess the JSON file, transformed to a DataFrame and save it to the CSV file
preprocess(lang_code)

Information about the web domains for the two languages is added. See the head of the dataframe:



Unnamed: 0,score_bicleaner_ai,biroamer_entities,translation_direction,en_source,en_par_id,en_par,en_var_doc,en_var_dom,bg_source,bg_par_id,bg_par,en_domain,bg_domain,same_domains,different_domains
2602218,0.506,No,bg-orig,http://100-years.geography.bg/,p120s0,The celebration of the 100th anniversary of th...,UNK,MIX,http://www.geography.bg/news/item/40-konferent...,p41s0,Българският географски фестивал ще се проведе ...,100-years.geography.bg,geography.bg,no,100-years.geography.bg geography.bg
3677287,0.639,No,bg-orig,http://100-years.geography.bg/conference-agenda/,p50s3+p50s4,Recent behavior and possible future evolution ...,MIX,MIX,http://www.geography.bg/news/item/123-%D0%BF%D...,p54s0,"– Петър Ножаров, Емил Гачев, Анализ на зависим...",100-years.geography.bg,geography.bg,no,100-years.geography.bg geography.bg


Number of same and different domains in the corpus:

|     |   same_domains |
|:----|---------------:|
| yes |    2.3591e+06  |
| no  |    1.49855e+06 |

Current number of sentences: 3857653
Current number of texts: 287456


Instances from different domains were discarded.

New number of sentences: 2359104
No. of discarded sentences: 1498549, percentage: 0.38846132609646333
New number of texts: 215654
No. of discarded texts: 71802, percentage: 0.24978431481687632

All duplicated English sentences with the same paragraph and sentence ID were discarded.

New number of sentences: 1773771
No. of discarded sentences: 585333, percentage: 0.24811665784975992
New number of texts: 213259
No. of discarded texts: 2395, percentage: 0.011105752733545401

The sentences were merged into texts based on the source URL and the English duplicated texts were removed.

New number of texts: 212201
No. of discarded texts: 1058, percentage: 0.004961103634547662

Initial length of texts in the corpus:
|       

Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,bg_source,en_domain,bg_domain,average_score,en_doc,bg_doc,en_length,bg_length
2485230,No,bg-orig,http://11meats.com/en/,UNK,B,http://11meats.com/,11meats.com,11meats.com,0.920227,11 Meats is a boutique meat-processing shop th...,КОИ СМЕ НИЕ Meats Meats е бутиков месопреработ...,344,322
1809399,No,en-orig,http://11meats.com/en/about-us-2/,B,B,http://11meats.com/%d0%b7%d0%b0-%d0%bd%d0%b0%d...,11meats.com,11meats.com,0.934524,Roca Moo – Barcelona Noma – Copenhagen Amass –...,Roca Moo – Барселона Noмa – Копенхаген Amass –...,429,334
1597721,No,en-orig,http://11meats.com/en/products/,UNK,B,http://11meats.com/%d0%bf%d1%80%d0%be%d0%b4%d1...,11meats.com,11meats.com,0.91965,The product does not contain allergens. Availa...,Крехки свински ребърца от селектирани породи п...,205,257
822943,No,en-orig,http://11meats.com/en/technique-2/,B,B,http://11meats.com/%d1%82%d0%b5%d1%85%d0%bd%d0...,11meats.com,11meats.com,0.913438,About Sous-Vide This innovative cooking techno...,За първи път информация за произхода на иноват...,385,349
294828,No,en-orig,http://1allsystemsbg.com/en/pages/grooming_nav...,A,A,http://1allsystemsbg.com/bg/pages/grooming_nav...,1allsystemsbg.com,1allsystemsbg.com,0.901636,"* Concentrated, low sudsing formula is Ph bala...","Формула с балансирано Ph (6.3), който не се пе...",2611,2526


The preparation of the file is finished and the file is saved as Macocu-bg-en-doc-format.csv.


filter_non_textual

In [12]:
# Apply final preprocessing step: filter out non-textual texts if applicable (change lower and upper limit if necessary)
filter_non_textual(lang_code, lower_limit = 0.015, upper_limit = 0.2)

Texts (first 5) that would be discarded with the lower limit: 0.015

News The Guidelines for Data Entry and Actualization of the Information Contained in the Management Information System for EU Structural and Cohesion Funds (UMIS) have been Published The Minister of Finance has endorsed the Guidelines for Data Entry and Actualization of the Information Contained in the Management Information System for EU Structural and Cohesion Funds (UMIS) on 23 February 2009. The purpose of these Guidelines is to facilitate the use of UMIS which will contribute to the effective and efficient management of the financial aid granted by the EU and the corresponding national co-financing and will also improve the supply of data and information to the European Commission for the purposes of documentary checks and on the spot checks and provide for the control and transparency of the physical and financial implementation of each project.
Savings driven by economies of scale Savings from leveraging the “t

Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,bg_source,en_domain,bg_domain,average_score,en_doc,bg_doc,en_length,bg_length,punct_ratio
2485230,No,bg-orig,http://11meats.com/en/,UNK,B,http://11meats.com/,11meats.com,11meats.com,0.920227,11 Meats is a boutique meat-processing shop th...,КОИ СМЕ НИЕ Meats Meats е бутиков месопреработ...,344,322,0.085213
1809399,No,en-orig,http://11meats.com/en/about-us-2/,B,B,http://11meats.com/%d0%b7%d0%b0-%d0%bd%d0%b0%d...,11meats.com,11meats.com,0.934524,Roca Moo – Barcelona Noma – Copenhagen Amass –...,Roca Moo – Барселона Noмa – Копенхаген Amass –...,429,334,0.065126
1597721,No,en-orig,http://11meats.com/en/products/,UNK,B,http://11meats.com/%d0%bf%d1%80%d0%be%d0%b4%d1...,11meats.com,11meats.com,0.91965,The product does not contain allergens. Availa...,Крехки свински ребърца от селектирани породи п...,205,257,0.105932
822943,No,en-orig,http://11meats.com/en/technique-2/,B,B,http://11meats.com/%d1%82%d0%b5%d1%85%d0%bd%d0...,11meats.com,11meats.com,0.913438,About Sous-Vide This innovative cooking techno...,За първи път информация за произхода на иноват...,385,349,0.092135
294828,No,en-orig,http://1allsystemsbg.com/en/pages/grooming_nav...,A,A,http://1allsystemsbg.com/bg/pages/grooming_nav...,1allsystemsbg.com,1allsystemsbg.com,0.901636,"* Concentrated, low sudsing formula is Ph bala...","Формула с балансирано Ph (6.3), който не се пе...",2611,2526,0.103859


The preparation of the file is finished and the file is saved as Macocu-bg-en-doc-format-filtered.csv.


analyze_prepared_corpus

In [13]:
# Analyze prepared corpus
analyze_prepared_corpus(lang_code)

# After preparation of the corpus, you can apply genre prediction to it - modify the beginning of the predict_genres.py file and run it in the terminal:
# nohup python predict_genres.py

View the corpus:


Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,bg_source,en_domain,bg_domain,average_score,en_doc,bg_doc,en_length,bg_length,punct_ratio
2485230,No,bg-orig,http://11meats.com/en/,UNK,B,http://11meats.com/,11meats.com,11meats.com,0.920227,11 Meats is a boutique meat-processing shop th...,КОИ СМЕ НИЕ Meats Meats е бутиков месопреработ...,344,322,0.085213
1809399,No,en-orig,http://11meats.com/en/about-us-2/,B,B,http://11meats.com/%d0%b7%d0%b0-%d0%bd%d0%b0%d...,11meats.com,11meats.com,0.934524,Roca Moo – Barcelona Noma – Copenhagen Amass –...,Roca Moo – Барселона Noмa – Копенхаген Amass –...,429,334,0.065126
1597721,No,en-orig,http://11meats.com/en/products/,UNK,B,http://11meats.com/%d0%bf%d1%80%d0%be%d0%b4%d1...,11meats.com,11meats.com,0.91965,The product does not contain allergens. Availa...,Крехки свински ребърца от селектирани породи п...,205,257,0.105932


All information about the corpus: 



Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,bg_source,en_domain,bg_domain,average_score,en_doc,bg_doc,en_length,bg_length,punct_ratio
count,107404,107404,107404,107404,107404,107404,107404,107404,107404.0,107404,107404,107404.0,107404.0,107404.0
unique,2,2,107404,4,4,98726,5362,5362,,107404,98664,,,
top,No,bg-orig,http://11meats.com/en/,UNK,A,https://www.websiteplanet.com/bg/,goldenpages.bg,goldenpages.bg,,11 Meats is a boutique meat-processing shop th...,Подарък за Вашия Рожден Ден – ще получите елек...,,,
freq,84830,57848,1,45933,43275,6,13020,13020,,1,9,,,
mean,,,,,,,,,0.890131,,,301.515325,274.295771,0.093803
std,,,,,,,,,0.072742,,,552.040784,594.647883,0.02894
min,,,,,,,,,0.5025,,,79.0,2.0,0.015267
25%,,,,,,,,,0.847292,,,107.0,85.0,0.075203
50%,,,,,,,,,0.91,,,170.0,148.0,0.088596
75%,,,,,,,,,0.9463,,,318.0,290.0,0.107639



Prediction of English varieties (on document level):

|     |   en_var_doc |
|:----|-------------:|
| UNK |    0.427666  |
| A   |    0.32874   |
| B   |    0.178755  |
| MIX |    0.0648393 |

Prediction of English varieties (on domain level):

|     |   en_var_dom |
|:----|-------------:|
| A   |   0.402918   |
| B   |   0.304793   |
| MIX |   0.282885   |
| UNK |   0.00940375 |

Prediction of translation direction:

|         |   translation_direction |
|:--------|------------------------:|
| bg-orig |                0.538602 |
| en-orig |                0.461398 |

Information on the bicleaner score:

|       |   average_score |
|:------|----------------:|
| count |  107404         |
| mean  |       0.890131  |
| std   |       0.0727416 |
| min   |       0.5025    |
| 25%   |       0.847292  |
| 50%   |       0.91      |
| 75%   |       0.9463    |
| max   |       0.99225   |

Final length of texts in the corpus:
|       |   en_length |
|:------|------------:|
| count |  107404    

postprocess_results

In [12]:
# Post-process the results
postprocess_results(file_name, lang_code)

Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,bg_source,en_domain,bg_domain,average_score,en_doc,bg_doc,en_length,bg_length,punct_ratio,X-GENRE,label_distribution,chosen_category_distr
2485230,No,bg-orig,http://11meats.com/en/,UNK,B,http://11meats.com/,11meats.com,11meats.com,0.920227,11 Meats is a boutique meat-processing shop th...,КОИ СМЕ НИЕ Meats Meats е бутиков месопреработ...,344,322,0.085213,Promotion,"{'Other': 0.0002, 'Information/Explanation': 0...",0.99888
1809399,No,en-orig,http://11meats.com/en/about-us-2/,B,B,http://11meats.com/%d0%b7%d0%b0-%d0%bd%d0%b0%d...,11meats.com,11meats.com,0.934524,Roca Moo – Barcelona Noma – Copenhagen Amass –...,Roca Moo – Барселона Noмa – Копенхаген Amass –...,429,334,0.065126,Promotion,"{'Other': 0.0002, 'Information/Explanation': 0...",0.998795
1597721,No,en-orig,http://11meats.com/en/products/,UNK,B,http://11meats.com/%d0%bf%d1%80%d0%be%d0%b4%d1...,11meats.com,11meats.com,0.91965,The product does not contain allergens. Availa...,Крехки свински ребърца от селектирани породи п...,205,257,0.105932,Promotion,"{'Other': 0.0003, 'Information/Explanation': 0...",0.997783


Genre distribution before post-processing: 

|                         |   Count |   Percentage |
|:------------------------|--------:|-------------:|
| Promotion               |   36397 |    33.8879   |
| Information/Explanation |   22651 |    21.0895   |
| News                    |   18278 |    17.018    |
| Other                   |    9860 |     9.18029  |
| Instruction             |    7697 |     7.1664   |
| Opinion/Argumentation   |    7648 |     7.12078  |
| Legal                   |    3113 |     2.8984   |
| Forum                   |    1186 |     1.10424  |
| Prose/Lyrical           |     574 |     0.534431 |
The Forum and Other label were discarded from the column with final genre labels.
New genre distribution:

|                         |   final-X-GENRE |
|:------------------------|----------------:|
| Promotion               |           36397 |
| Information/Explanation |           22651 |
| News                    |           18278 |
| Instruction             |        

analyze_results

In [22]:
# Analyze the results
analyze_results(lang_code, no_of_domains)

View the final dataframe:




Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,bg_source,en_domain,bg_domain,average_score,en_doc,bg_doc,en_length,bg_length,punct_ratio,X-GENRE,label_distribution,chosen_category_distr,final-X-GENRE
2485230,No,bg-orig,http://11meats.com/en/,UNK,B,http://11meats.com/,11meats.com,11meats.com,0.920227,11 Meats is a boutique meat-processing shop th...,КОИ СМЕ НИЕ Meats Meats е бутиков месопреработ...,344,322,0.085213,Promotion,"{'Other': 0.0002, 'Information/Explanation': 0...",0.99888,Promotion
1809399,No,en-orig,http://11meats.com/en/about-us-2/,B,B,http://11meats.com/%d0%b7%d0%b0-%d0%bd%d0%b0%d...,11meats.com,11meats.com,0.934524,Roca Moo – Barcelona Noma – Copenhagen Amass –...,Roca Moo – Барселона Noмa – Копенхаген Amass –...,429,334,0.065126,Promotion,"{'Other': 0.0002, 'Information/Explanation': 0...",0.998795,Promotion
1597721,No,en-orig,http://11meats.com/en/products/,UNK,B,http://11meats.com/%d0%bf%d1%80%d0%be%d0%b4%d1...,11meats.com,11meats.com,0.91965,The product does not contain allergens. Availa...,Крехки свински ребърца от селектирани породи п...,205,257,0.105932,Promotion,"{'Other': 0.0003, 'Information/Explanation': 0...",0.997783,Promotion


The distribution of genres in the most frequent domains analyzed.
Differences of the domain distribution for each genre analyzed.


Distribution of English varieties in genres - raw scores:


Labels in this order: 'News', 'Opinion/Argumentation', 'Promotion', 'Instruction','Information/Explanation', 'Legal','Prose/Lyrical'


{'A': 4715, 'B': 3941, 'MIX': 911, 'UNK': 7426}
{'A': 1507, 'B': 729, 'MIX': 271, 'UNK': 3195}
{'A': 14934, 'B': 5605, 'MIX': 2865, 'UNK': 11425}
{'A': 2478, 'B': 1168, 'MIX': 347, 'UNK': 2793}
{'A': 7614, 'B': 4446, 'MIX': 1713, 'UNK': 7347}
{'A': 673, 'B': 818, 'MIX': 200, 'UNK': 1027}
{'A': 124, 'B': 119, 'MIX': 31, 'UNK': 122}


Distribution of English varieties in genres - normalized scores:


{'A': 27.75, 'B': 23.19, 'MIX': 5.36, 'UNK': 43.7}
{'A': 26.43, 'B': 12.78, 'MIX': 4.75, 'UNK': 56.03}
{'A': 42.88, 'B': 16.09, 'MIX': 8.23, 'UNK': 32.8}
{'A': 36.52, 'B': 17.21, 'MIX': 5.11, 'UNK': 41.16}
{'A': 36.05, 'B': 21.05, 'MIX': 8.11, 'UNK': 34.79}
{'A': 24.76, 

Add combined English variety scores - if there is UNK or MIX at the doc-level, use the domain level, otherwise, use doc level.

In [8]:
import pandas as pd
import numpy as np
corpus = pd.read_csv("Macocu-bg-en-predicted-post-processed.csv", index_col = 0)

corpus.shape

(107404, 18)

In [10]:
filtered_df = corpus.dropna(subset=["final-X-GENRE"])
print(filtered_df.shape)

print(filtered_df.en_var_doc.value_counts(normalize=True))

# Add combined lang variety labels (if en_var_doc mix or unknown, use en_var_dom)

filtered_df["combined_en_var"] = np.where((filtered_df["en_var_doc"] == "UNK") | (filtered_df["en_var_doc"] == "MIX"), filtered_df["en_var_dom"], filtered_df["en_var_doc"])

print(filtered_df[["en_var_doc", "en_var_dom", "combined_en_var"]].head(10).to_markdown())

print(filtered_df["combined_en_var"].value_counts(normalize=True).to_markdown())

(88544, 18)
UNK    0.376479
A      0.361910
B      0.190030
MIX    0.071580
Name: en_var_doc, dtype: float64
|         | en_var_doc   | en_var_dom   | combined_en_var   |
|--------:|:-------------|:-------------|:------------------|
| 2485230 | UNK          | B            | B                 |
| 1809399 | B            | B            | B                 |
| 1597721 | UNK          | B            | B                 |
|  822943 | B            | B            | B                 |
|  294828 | A            | A            | A                 |
| 2857944 | UNK          | B            | B                 |
|    8733 | B            | B            | B                 |
| 3546406 | B            | B            | B                 |
| 1793981 | UNK          | B            | B                 |
|  897637 | UNK          | B            | B                 |
|     |   combined_en_var |
|:----|------------------:|
| A   |         0.548258  |
| B   |         0.305667  |
| MIX |         0.135548  |
| UNK |

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["combined_en_var"] = np.where((filtered_df["en_var_doc"] == "UNK") | (filtered_df["en_var_doc"] == "MIX"), filtered_df["en_var_dom"], filtered_df["en_var_doc"])
