In [1]:
# Expose only GPU 0 (devices ordered by PCI bus ID) to this kernel
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

import pandas as pd
import numpy as np
from prevert import dataset
import re

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


## Add language distribution information

In [3]:
# Input file with the genre-annotated texts. Alternative input (kept for reference):
#suffix = "uk-1.0"
#path = f"/cache/tajak/macocu-mt/datasets/annotated/MaCoCu-{suffix}.tsv-genre-annotated.jsonl"
path = "datasets/annotated/MaCoCu-sq-texts-with-genres.tsv"
# Code of the target language; used further down to build the "single language" marker
lang = "sq"

# Open the original dataset to extract information about lang distribution
prevert_path = "datasets/initial/MaCoCu-sq-1.0.xml"

# Open tsv (created by the earlier process) or json (created with the python_extended.py script).
# The reader is chosen from the file *suffix* only: file names may contain ".tsv" in the
# middle (see the commented-out path above), so a plain substring test would be wrong.
if path.endswith(".tsv"):
    df = pd.read_csv(path, sep="\t", index_col=0)
elif path.endswith((".json", ".jsonl")):
    df = pd.read_json(path, orient="records", lines=True)
else:
    # Fail loudly instead of leaving `df` undefined (or silently reusing a stale `df`
    # that is still alive in the kernel from a previous run).
    raise ValueError(f"Unsupported input format (expected .tsv, .json or .jsonl): {path}")

display(df)

Unnamed: 0,text_id,text,genre,text_length
0,macocu.sq.1,Te rejat Fondi Shqiptar i Zhvillimit ka hapur ...,News,512
1,macocu.sq.2,You are using an out of date browser. It may n...,Forum,512
2,macocu.sq.3,Shkarkoni aplikacionin në celularin tuaj. Nga ...,News,512
3,macocu.sq.4,"Persona që nuk kanë një kult, që nuk kanë një ...",Opinion/Argumentation,143
4,macocu.sq.7,4 risitë e Bindjes Demokratike në këto zgjedhj...,Mix,100
...,...,...,...,...
1303397,macocu.sq.1684318,Enter full names of the beneficiary of the acc...,Legal,512
1303398,macocu.sq.1684319,Platforma Magazina franceze “Closer” pretendon...,News,512
1303399,macocu.sq.1684320,“Ky aktvendim u mor për arsye se ekziston dysh...,News,512
1303400,macocu.sq.1684321,"Saturday, 30 November 2013 Ka ndërruar jetë so...",News,512


In [5]:
# Parse the original corpus with the prevert reader
dset = dataset(prevert_path)

# Map every document id to the language distribution stored in its metadata
lang_distr_dict = {doc.meta["id"]: doc.meta["lang_distr"] for doc in dset}

# Use the same id column name as the lookup table built below
df = df.rename(columns={"text_id": "document_id"})

# Two-column lookup table: document id -> language distribution
lang_distr_df = pd.DataFrame(
    {
        "document_id": list(lang_distr_dict.keys()),
        "lang_distr": list(lang_distr_dict.values()),
    }
)

# The outer join also brings in corpus documents that have no genre prediction ...
df = df.merge(lang_distr_df, on="document_id", how="outer")

# ... so drop every row that has no predicted genre
df = df.dropna(subset=["genre"])

df

Unnamed: 0,document_id,text,genre,text_length,lang_distr
0,macocu.sq.1,Te rejat Fondi Shqiptar i Zhvillimit ka hapur ...,News,512.0,"[('sq', 1.0)]"
1,macocu.sq.2,You are using an out of date browser. It may n...,Forum,512.0,"[('sq', 0.96), ('en', 0.04)]"
2,macocu.sq.3,Shkarkoni aplikacionin në celularin tuaj. Nga ...,News,512.0,"[('sq', 1.0)]"
3,macocu.sq.4,"Persona që nuk kanë një kult, që nuk kanë një ...",Opinion/Argumentation,143.0,"[('sq', 1.0)]"
4,macocu.sq.7,4 risitë e Bindjes Demokratike në këto zgjedhj...,Mix,100.0,"[('sq', 1.0)]"
...,...,...,...,...,...
1303397,macocu.sq.1684318,Enter full names of the beneficiary of the acc...,Legal,512.0,"[('sq', 0.98), ('en', 0.02)]"
1303398,macocu.sq.1684319,Platforma Magazina franceze “Closer” pretendon...,News,512.0,"[('sq', 0.98), ('en', 0.02)]"
1303399,macocu.sq.1684320,“Ky aktvendim u mor për arsye se ekziston dysh...,News,512.0,"[('sq', 1.0)]"
1303400,macocu.sq.1684321,"Saturday, 30 November 2013 Ka ndërruar jetë so...",News,512.0,"[('sq', 1.0)]"


In [6]:
# Show the five most frequent language distributions
df["lang_distr"].value_counts().head(5)

lang_distr
[('sq', 1.0)]                   1253573
[('sq', 0.98), ('en', 0.02)]        731
[('sq', 0.99), ('en', 0.01)]        690
[('sq', 0.97), ('en', 0.03)]        616
[('sq', 0.9), ('en', 0.1)]          590
Name: count, dtype: int64

In [7]:
# Label each text: the target language code when the distribution is exactly
# "100 % target language", otherwise "mix". The distribution is matched by its
# string form, so the literal below has to mirror that formatting exactly.
target_only = f"[('{lang}', 1.0)]"
df["lang"] = np.where(df["lang_distr"].eq(target_only), lang, "mix")

df.head(3)

Unnamed: 0,document_id,text,genre,text_length,lang_distr,lang
0,macocu.sq.1,Te rejat Fondi Shqiptar i Zhvillimit ka hapur ...,News,512.0,"[('sq', 1.0)]",sq
1,macocu.sq.2,You are using an out of date browser. It may n...,Forum,512.0,"[('sq', 0.96), ('en', 0.04)]",mix
2,macocu.sq.3,Shkarkoni aplikacionin në celularin tuaj. Nga ...,News,512.0,"[('sq', 1.0)]",sq


In [8]:
# Share of single-language vs. mixed-language texts within each genre
df.groupby("genre")["lang"].value_counts(normalize=True)

genre                    lang
Forum                    sq      0.657051
                         mix     0.342949
Information/Explanation  sq      0.956284
                         mix     0.043716
Instruction              sq      0.976110
                         mix     0.023890
Legal                    sq      0.963282
                         mix     0.036718
Mix                      sq      0.937485
                         mix     0.062515
News                     sq      0.984509
                         mix     0.015491
Opinion/Argumentation    sq      0.971300
                         mix     0.028700
Other                    sq      0.955638
                         mix     0.044362
Promotion                sq      0.946613
                         mix     0.053387
Prose/Lyrical            sq      0.828850
                         mix     0.171150
Name: proportion, dtype: float64

In [9]:
# Write the frame, now carrying the lang_distr and lang columns, as JSON lines
suffix = "sq-1.0"
df.to_json(
    f"/cache/tajak/macocu-mt/datasets/annotated/MaCoCu-{suffix}.tsv-genre-annotated-with-lang-distr.jsonl",
    orient="records",
    lines=True,
)

## Filter out texts with mixed languages

In [10]:
# Filter the corpus: keep only the texts labelled with the target language (`lang`),
# i.e. drop everything that was marked as "mix"
is_target_lang = df["lang"] == lang
df_filtered = df[is_target_lang]

# Write the single-language subset as JSON lines
df_filtered.to_json(
    f"/cache/tajak/macocu-mt/datasets/annotated/MaCoCu-{suffix}.tsv-genre-annotated-only-target-lang.jsonl",
    orient="records",
    lines=True,
)

df_filtered.head(2)


Unnamed: 0,document_id,text,genre,text_length,lang_distr,lang
0,macocu.sq.1,Te rejat Fondi Shqiptar i Zhvillimit ka hapur ...,News,512.0,"[('sq', 1.0)]",sq
2,macocu.sq.3,Shkarkoni aplikacionin në celularin tuaj. Nga ...,News,512.0,"[('sq', 1.0)]",sq


Get statistics

In [12]:
# Size of the single-language subset
print(df_filtered.shape)

# Absolute number of texts per genre
print(df_filtered.genre.value_counts().to_dict())
print("\n\n")
# Relative genre distribution as a markdown table.
# `normalize` expects a boolean; the string "True" only worked because it is truthy.
print(df_filtered.genre.value_counts(normalize=True).to_markdown())

(1253573, 6)
{'News': 849691, 'Information/Explanation': 100602, 'Opinion/Argumentation': 94963, 'Mix': 42529, 'Promotion': 40179, 'Instruction': 38162, 'Forum': 30796, 'Prose/Lyrical': 27178, 'Other': 25118, 'Legal': 4355}



| genre                   |   proportion |
|:------------------------|-------------:|
| News                    |   0.677815   |
| Information/Explanation |   0.0802522  |
| Opinion/Argumentation   |   0.0757539  |
| Mix                     |   0.0339262  |
| Promotion               |   0.0320516  |
| Instruction             |   0.0304426  |
| Forum                   |   0.0245666  |
| Prose/Lyrical           |   0.0216804  |
| Other                   |   0.0200371  |
| Legal                   |   0.00347407 |


In [34]:
# Summary statistics for all columns of the filtered frame (include="all")
df_filtered.describe(include="all")

## Reprocess texts that have mixed languages 

The code to reprocess and annotate texts is `4-process-mixed-lang-texts.py`

After reprocessing, let's merge the texts that did not need reprocessing (they were in one language) with the texts that were reprocessed.

In [2]:
# Expose only GPU 0 (devices ordered by PCI bus ID) to this kernel
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

import pandas as pd
import numpy as np
from prevert import dataset
import re

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [2]:
# Load the texts that are in a single language
suffix = "el-1.0"
df = pd.read_json(
    f"datasets/annotated/MaCoCu-{suffix}.tsv-genre-annotated-only-target-lang.jsonl",
    orient="records",
    lines=True,
)

# Quick look: label counts and the first rows
display(df["lang"].value_counts(), df.head(3))

# Load the mixed-language texts after they were cleaned and annotated again
clean_df = pd.read_json(
    f"datasets/annotated/MaCoCu-{suffix}-mixed-texts-genre-annotated.jsonl",
    orient="records",
    lines=True,
)

# Optionally remove the logit column
#clean_df = clean_df.drop(columns=["logit"])

# Text length = number of pieces obtained by splitting the text on single spaces
# NOTE(review): the single-language frame displayed for this cell has no text_length
# column, so those rows will hold NaN in this column after concatenation - confirm
# whether that is intended.
clean_df["text_length"] = clean_df["text"].map(lambda text: len(text.split(" ")))

# Mark these rows as coming from the reprocessing step
clean_df["lang"] = "reprocessed"

clean_df.head(3)

lang
el    10411807
Name: count, dtype: int64

Unnamed: 0,document_id,text,genre,logit,lang_distr,lang
0,macocu.el.1,Αριστείδης Αλαφούζος: Ο επιχειρηματικός θρίαμβ...,Information/Explanation,"[-0.6428499222, 8.249838829, -0.3538880944, -0...","[('el', 1.0)]",el
1,macocu.el.4,Πιο σοβαροί αποδεικνύονται οι τραυματισμοί των...,News,"[-1.4598587751, -0.6933389306000001, 7.8427786...","[('el', 1.0)]",el
2,macocu.el.5,Πρωτοβουλία του ΥφΑΑΤ Σίμου Κεδίκογλου για την...,News,"[-1.4910259247, -0.9129921794, 7.8686022758, -...","[('el', 1.0)]",el


Unnamed: 0,document_id,text,genre,logit,text_length,lang
0,macocu.el.420,Νέα Σμύρνη: Δεν έκαναν δωρεάν διαγνωστικά τεστ...,News,"[-1.485710501670837, -0.8395971059799191, 7.89...",445,reprocessed
1,macocu.el.8525,Οι πλατείες των συνωστισμών και της επιπολαιότ...,News,"[-1.482446551322937, -0.8473572731018061, 7.90...",384,reprocessed
2,macocu.el.11669,ΘΕΜΑ: Τακτική Γενική Συνέλευση της ΑΣΠΕ στις 1...,Legal,"[2.524170160293579, -1.799233317375183, -1.014...",512,reprocessed


In [3]:
# Stack the single-language texts and the reprocessed texts into one frame
merged_df = pd.concat([df, clean_df])

# Write the combined corpus as JSON lines
merged_df.to_json(
    f"/cache/tajak/macocu-mt/datasets/annotated/MaCoCu-{suffix}-genre-annotated-reprocessed-final.jsonl",
    orient="records",
    lines=True,
)

merged_df

In [5]:
# Summary of the merged corpus: size, origin of the rows, genre distribution

print(merged_df.shape)
# How many rows are single-language vs. reprocessed
print(merged_df["lang"].value_counts().to_markdown())
print("\n")
# Absolute number of texts per genre
print(merged_df["genre"].value_counts().to_dict())
print("\n")
# Relative genre distribution as a markdown table
print(merged_df["genre"].value_counts(normalize=True).to_markdown())

(513437, 7)
| lang        |   count |
|:------------|--------:|
| mt          |  466327 |
| reprocessed |   47110 |


{'Information/Explanation': 209770, 'Mix': 124395, 'Forum': 52162, 'Opinion/Argumentation': 48625, 'Other': 36497, 'Prose/Lyrical': 35932, 'Instruction': 2271, 'Promotion': 2239, 'News': 1546}


| genre                   |   proportion |
|:------------------------|-------------:|
| Information/Explanation |   0.40856    |
| Mix                     |   0.242279   |
| Forum                   |   0.101594   |
| Opinion/Argumentation   |   0.0947049  |
| Other                   |   0.0710837  |
| Prose/Lyrical           |   0.0699833  |
| Instruction             |   0.00442313 |
| Promotion               |   0.00436081 |
| News                    |   0.00301108 |
