In [1]:
import pandas as pd
from morfeusz2 import Morfeusz
morfeusz = Morfeusz() 

In [2]:
pd.set_option('display.max_colwidth', None)

## Loading data and choosing only appropiate MWE

In [3]:
poleval_df = pd.read_csv("poleval2019_task2_training_190221/index.tsv", sep="\t", header=None)
poleval_df.columns = ["phrase_id", "doc_id", "text", "lemma"]
poleval_df.head()

Unnamed: 0,phrase_id,doc_id,text,lemma
0,31822,99883,Toronto Dominion Centre,Toronto Dominion Centre
1,40025,99883,Toronto,Toronto
2,343873,99883,kompleks handlowo-kulturalny,kompleks handlowo-kulturalny
3,31833,99883,Joe Fafard,Joe Fafard
4,327365,99883,kanadyjskim,kanadyjski


In [4]:
print(f"Shape of the original Poleval 2019 dataset: {poleval_df.shape}")

Shape of the original Poleval 2019 dataset: (22177, 4)


In [5]:
mwe_df = poleval_df[poleval_df["text"].str.split().str.len() > 1]
mwe_df.head()

Unnamed: 0,phrase_id,doc_id,text,lemma
0,31822,99883,Toronto Dominion Centre,Toronto Dominion Centre
2,343873,99883,kompleks handlowo-kulturalny,kompleks handlowo-kulturalny
3,31833,99883,Joe Fafard,Joe Fafard
5,343872,99883,Ludwiga Mies van der Rohe,Ludwig Mies van der Rohe
7,343871,99883,Toronto Dominion Gallery of Inuit Art,Toronto Dominion Gallery of Inuit Art


In [6]:
print(f"Shape of the Poleval 2019 dataset (only multi word expressions): {mwe_df.shape}")

Shape of the Poleval 2019 dataset (only multi word expressions): (9718, 4)


**remove not polish MWE**


In [7]:
def is_polish_mwe(text):
    """
    Check if the text is a valid Polish MWE.
    Returns True if at least one word is recognized (not tagged as 'ign').
    """
    analysis = morfeusz.analyse(text)
    for _, _, interpretations in analysis:
        tag = interpretations[2]
        if tag != 'ign':
            return True
    return False

In [8]:
df_unrecognized = mwe_df[~mwe_df['text'].apply(is_polish_mwe)].reset_index(drop=True)
print(f"Number of unrecognized MWEs: {len(df_unrecognized)}")
df_unrecognized.sample(10)

Number of unrecognized MWEs: 566


Unnamed: 0,phrase_id,doc_id,text,lemma
16,99721,100524,Barvicha Luxury Village,Barvicha Luxury Village
393,287881,102435,History Carnival,History Carnival
482,471925,103954,Angry Birds,Angry Birds
225,72068,101751,Creative Commons,Creative Commons
473,471465,103927,Andrei Strbik,Andrea Strbik
161,67462,101384,Tiengiz Grigorjewicz Sułakwielidze,Tiengiz Grigorjewicz Sułakwielidze
549,490467,107326,Nicolasem Sarkozy,Nicolas Sarkozy
63,52562,101186,Pearson Television,Pearson Television
271,73836,101886,Two Harbors,Two Harbors
57,52395,101173,Olle Hansson,Olle Hansson


In [9]:
mwe_df = mwe_df[mwe_df["text"].apply(is_polish_mwe)].reset_index(drop=True)
print(f"Shape of the Poleval 2019 dataset (only valid Polish MWEs): {mwe_df.shape}")

Shape of the Poleval 2019 dataset (only valid Polish MWEs): (9152, 4)


There was 566 fully not recognized expressions by Morfeusz.


**Remove duplicates**


In [10]:
def remove_duplicates(df):
    """
    Remove duplicates from the DataFrame based on 'text' and 'lemma'.
    Keeps the first occurrence.
    """
    return df.drop_duplicates(subset=["text", "lemma"], keep='first')

In [11]:
mwe_df = remove_duplicates(mwe_df)
print(f"Shape of the Poleval 2019 dataset (after removing duplicates): {mwe_df.shape}")

Shape of the Poleval 2019 dataset (after removing duplicates): (7453, 4)


**TO DO: podzielić na nieodmienne i mianowniki:**


In [12]:
mwe_df[mwe_df["text"] == mwe_df["lemma"]]


Unnamed: 0,phrase_id,doc_id,text,lemma
0,31822,99883,Toronto Dominion Centre,Toronto Dominion Centre
1,343873,99883,kompleks handlowo-kulturalny,kompleks handlowo-kulturalny
2,31833,99883,Joe Fafard,Joe Fafard
4,343871,99883,Toronto Dominion Gallery of Inuit Art,Toronto Dominion Gallery of Inuit Art
5,343875,100499,PZL.13 (PZL-13),PZL.13 (PZL-13)
...,...,...,...,...
9137,329445,107360,Marin Leovac,Marin Leovac
9139,329446,107360,Roland Linz,Roland Linz
9142,329448,107360,Tomas Jun,Tomas Jun
9144,329449,107360,Patrick Salomon,Patrick Salomon


In [13]:
mwe_df = mwe_df[mwe_df["text"] != mwe_df["lemma"]]


In [14]:
print(f"Shape of the Poleval 2019 dataset (only inflected mwe): {mwe_df.shape}")


Shape of the Poleval 2019 dataset (only inflected mwe): (3828, 4)


In [15]:
mwe_df[['text', 'lemma']].sample(10)

Unnamed: 0,text,lemma
2283,Lexusa IS,Lexus IS
280,igrzyskach w Nagano,igrzyska w Nagano
3188,Sesji Sejmu Dzieci i Młodzieży,Sejm Dzieci i Młodzieży
7531,A Johnsona,A Johnson
1673,Anny Marii z domu Petardówny,Anna Maria z domu Petardówny
7322,stanów kwantowych,stany kwantowe
1746,Willa Wrighta,Will Wright
2100,Osiedle XXV-lecia PRL,osiedle XXV-lecia PRL
6803,Porozumienia Międzyinstytucjonalnego,Porozumienie Międzyinstytucjonalne
1137,departamencie Loiret,departament Loiret


## Cleaning

In [16]:
mwe_df.isna().sum()

phrase_id    0
doc_id       0
text         0
lemma        1
dtype: int64

In [17]:
mwe_df[mwe_df["lemma"].isna()]

Unnamed: 0,phrase_id,doc_id,text,lemma
1334,307628,101286,San Diego Rockets,


In [18]:
# I can drop it because it's not a polish MWE
mwe_df = mwe_df.dropna()

In [19]:
mwe_df.isna().sum()

phrase_id    0
doc_id       0
text         0
lemma        0
dtype: int64

In [20]:
mwe_df[mwe_df["text"] == "Dolce & Gabbana"]

Unnamed: 0,phrase_id,doc_id,text,lemma
161,99725,100524,Dolce & Gabbana,Dolce &amp; Gabbana


In [21]:
mwe_df[mwe_df["text"] == "Lublin R-VIII a"]

Unnamed: 0,phrase_id,doc_id,text,lemma
734,65875,101200,Lublin R-VIII a,Lublin R-VIII


In [22]:
mwe_df[mwe_df["text"] == "BBC Radio 1's Live Lounge"]

Unnamed: 0,phrase_id,doc_id,text,lemma
2264,426217,101434,BBC Radio 1's Live Lounge,BBC Radio 1


In [23]:
mwe_df[mwe_df["text"] == "Jammin' CRT.5"]

Unnamed: 0,phrase_id,doc_id,text,lemma
5108,468926,102125,Jammin' CRT.5,Jammin


In [24]:
mwe_df[mwe_df["text"] == "Traxxas Rustler XL-1 '09"]


Unnamed: 0,phrase_id,doc_id,text,lemma
6580,469441,102506,Traxxas Rustler XL-1 '09,Traxxas Rustler XL-1


In [25]:
mwe_df[mwe_df["text"] == "Żerań F S O"]


Unnamed: 0,phrase_id,doc_id,text,lemma
6916,350246,102673,Żerań F S O,Żerań FSO


In [26]:
mwe_df.loc[mwe_df["text"] == "Żerań F S O", "text"] = "Żerań FSO"

In [27]:
mwe_df.loc[mwe_df["text"] == "R C S Orzeł", "text"] = "RCS Orzeł"

In [28]:
mwe_df = mwe_df[mwe_df["text"] != "S K M"]
mwe_df = mwe_df[mwe_df["text"] != "R K S"]

In [29]:
mwe_df[mwe_df["lemma"].str.contains("ZTM")]

Unnamed: 0,phrase_id,doc_id,text,lemma
6897,352573,102669,Z T M,ZTM


In [30]:
mwe_df[mwe_df["lemma"].str.contains("WKD")]


Unnamed: 0,phrase_id,doc_id,text,lemma
6988,352545,102697,W K D,WKD


In [31]:
mwe_df[mwe_df["text"] == "átlátszó ("]

Unnamed: 0,phrase_id,doc_id,text,lemma
8097,481079,103921,átlátszó (,átlátszó


In [32]:
mwe_df[mwe_df["text"] == "Społecznej Rady Konsultacyjnej ds. aktualizacji „Strategii Rozwoju Bydgoszczy do 2015 roku”"]

Unnamed: 0,phrase_id,doc_id,text,lemma
5224,79818,102157,Społecznej Rady Konsultacyjnej ds. aktualizacji „Strategii Rozwoju Bydgoszczy do 2015 roku”,Społeczna Rada Konsultacyjna ds. aktualizacji„Strategii Rozwoju Bydgoszczy do 2015 roku


In [33]:
mwe_df.loc[mwe_df["text"] == 'Społecznej Rady Konsultacyjnej ds. aktualizacji „Strategii Rozwoju Bydgoszczy do 2015 roku”', "lemma"] = 'Społeczna Rada Konsultacyjna ds. aktualizacji „Strategii Rozwoju Bydgoszczy do 2015 roku"'

In [34]:
mwe_df[mwe_df["text"].str.contains("Fundacji Ośrodka")]

Unnamed: 0,phrase_id,doc_id,text,lemma
6812,71729,102602,"Fundacji Ośrodka "" Karta ""","Fundacja Ośrodka ""Karta """


In [35]:
mwe_df.loc[mwe_df["text"] == 'Fundacji Ośrodka " Karta "', ["text", "lemma"]] = ['Fundacji Ośrodka "Karta"', 'Fundacja Ośrodka "Karta"']

In [36]:
mwe_df[mwe_df["text"].str.contains("FILLER")] # error during creating dataset

Unnamed: 0,phrase_id,doc_id,text,lemma
6973,474289,102694,Dworca FILLER Centralnego,Dworzec Centralny
6978,350524,102696,Dworzec FILLER Śródmieście,Dworzec Śródmieście


In [37]:
mwe_df.loc[mwe_df["text"] == 'Dworca FILLER Centralnego', "text"] = 'Dworca Centralnego'
mwe_df.loc[mwe_df["text"] == 'Dworzec FILLER Śródmieście', "text"] = 'Dworzec Śródmieście'

In [38]:
mwe_df[mwe_df["text"].str.contains("Herburt")]

Unnamed: 0,phrase_id,doc_id,text,lemma
1202,50764,101263,Jan Herburt (Arłamowski,Jan Herburt (Arłamowski)


In [39]:
mwe_df[mwe_df["text"].str.contains("Naukę")]

Unnamed: 0,phrase_id,doc_id,text,lemma
4075,310468,101854,Naukę 2.0,Nauka2.0


In [40]:
mwe_df.loc[mwe_df["text"] == "Naukę 2.0", "lemma"] = "Nauka 2.0"


In [41]:
mwe_df[mwe_df["text"] == "Sesji Sejmu Dzieci i Młodzieży"]

Unnamed: 0,phrase_id,doc_id,text,lemma
3188,63764,101638,Sesji Sejmu Dzieci i Młodzieży,Sejm Dzieci i Młodzieży


In [42]:
mwe_df.loc[mwe_df["text"] == "Sesji Sejmu Dzieci i Młodzieży", "lemma"] = "Sesja Sejmu Dzieci i Młodzieży"

In [43]:
mwe_df[mwe_df["text"]=='ustawy o zmianie ustawy Kodeks postępowania karnego']

Unnamed: 0,phrase_id,doc_id,text,lemma
3279,63911,101644,ustawy o zmianie ustawy Kodeks postępowania karnego,Ustawao zmianie ustawy Kodeks postępowania karnego


In [44]:
mwe_df.loc[mwe_df["text"]=='ustawy o zmianie ustawy Kodeks postępowania karnego', "lemma"] = 'ustawa o zmianie ustawy Kodeks postępowania karnego'


In [45]:
mwe_df[mwe_df["text"].str.contains("o ujawnieniu pracy lub służby")]


Unnamed: 0,phrase_id,doc_id,text,lemma
4449,81044,101946,o zmianie ustawy o ujawnieniu pracy lub służby w organach bezpieczeństwa państwa lub współpracy z nimi w latach 1944–1990 osób pełniących funkcje publiczne,ustawa o ujawnieniu pracy lub służby w organach bezpieczeństwa państwa lub współpracy z nimi w latach 1944–1990 osób pełniących funkcje publiczne
4451,81046,101946,ustawie z dnia 11 kwietnia 1997 r. o ujawnieniu pracy lub służby w organach bezpieczeństwa państwa lub współpracy z nimi w latach 1944-1990 osób pełniących funkcje publiczne,Ustawa z dnia 11 kwietnia 1997 r. o ujawnieniu pracy lub służby w organach bezpieczeństwa państwa lub współpracy z nimi w latach 1944-1990 osób pełniących funkcje publiczne


In [46]:
mwe_df = mwe_df.drop(index=4449)

In [47]:
mwe_df.drop(mwe_df[mwe_df["text"].isin([
    "Dolce & Gabbana", # not a polish MWE
    "Lublin R-VIII a", # ???
    "BBC Radio 1's Live Lounge", # not a polish MWE
    "Jammin' CRT.5", # not a polish MWE
    "Traxxas Rustler XL-1 '09", # not a polish MWE
    "átlátszó (", # not a polish MWE
    "Z T M",
    "W K D"
])].index, inplace=True)

In [48]:
print(f"Shape of the Poleval 2019 dataset (after cleaning): {mwe_df.shape}")

Shape of the Poleval 2019 dataset (after cleaning): (3816, 4)


In [49]:
mwe_df.to_csv("mwe_cleaned_df.csv")

## Actual analysis

In [50]:
mwe_df = pd.read_csv("mwe_cleaned_df.csv", index_col=0)
print(f"Shape of the cleaned Poleval 2019 dataset: {mwe_df.shape}")

Shape of the cleaned Poleval 2019 dataset: (3816, 4)


In [51]:
mwe_df.head()

Unnamed: 0,phrase_id,doc_id,text,lemma
3,343872,99883,Ludwiga Mies van der Rohe,Ludwig Mies van der Rohe
7,343879,100499,samolotu PZL.23 „Karaś”,samolot PZL.23 „Karaś”
8,31843,100499,Ministerstwa Komunikacji,Ministerstwo Komunikacji
11,343880,100499,silnikiem Pratt-Whitney Wasp,silnik Pratt-Whitney Wasp
14,31839,100499,Stanisława Praussa,Stanisław Prauss


In [52]:
mwe_df.isna().sum()

phrase_id    0
doc_id       0
text         0
lemma        0
dtype: int64

In [53]:
def is_conjunction(word):
    """
    Check if the word is a conjunction.
    Returns True if the word is a conjunction.
    """
    analysis = morfeusz.analyse(word)
    for _, _, interpretations in analysis:
        tag = interpretations[2]
        if tag.startswith('conj'):
            return True
    return False

In [54]:
def analyze_pair(words, lemmas):
    """
    Analyze a 2 word expression.
    :param words: List of 2 words in the expression.
    :param lemmas: List of 2 lemmas corresponding to the words.
    :return: number, case, relation type and head of the expression.
    """
    # possible numbers and cases for the words
    number_case = {}
    part_of_speech = {}

    # returned values
    number = None
    case = None
    relation = None
    head = None

    #print(f"Analyzing pair: {words} with lemmas: {lemmas}")

    for i, word in enumerate(words):
        number_case_i = set()
        analyses = morfeusz.analyse(word)

        for _, _, interpretations in analyses:
            tag = interpretations[2].split(":")
            if len(tag) > 2:
                number_i = tag[1]
                case_i = tag[2].split('.')

                # Morfeusz can return more than 1 possibilty of case after the dot so I make a list of it 
                for c in case_i:
                    number_case_i.add((number_i, c))

                part_of_speech[word] = tag[0]

        number_case[i] = number_case_i

    available = [i for i in number_case if number_case[i]]

    # check if it is possible to find common number and case for the words in the mwe -> ZWIĄZEK ZGODY
    common_cases = set.intersection(*number_case.values())
    if len(common_cases)>=1:
        # if there is only one common pair(number, case), then it is an agreement relation and the number and case of the expression can be assigned to the whole expression
        
        if len(common_cases) == 1:
            number_case_head_list = list(common_cases)[0]
            number = number_case_head_list[0]
            case = number_case_head_list[1]
 
        else:
            number = [x[0] for x in list(common_cases)]
            case = [x[1] for x in list(common_cases)]
            
        relation = "agreement"

        # noun -> HEAD
        for i, (word, tag) in enumerate(part_of_speech.items()):
            if tag == 'subst':
                head = lemmas[i]
                break

        # if there is no noun in the expression, then the head is the first word in the expression
            else:
                head = lemmas[0]

    elif [w_i for w_i, l_i in zip(words, lemmas) if w_i == l_i]:
        # government relation    
        for i, word in enumerate(words):
            if word == lemmas[i]:
                continue
            else: 
                if number_case[i]:
                    head = lemmas[i]
                    number_case_head_list = list(number_case[i])
                    number = [x[0] for x in number_case_head_list]
                    case = [x[1] for x in number_case_head_list]
                    relation = "government"
        
    if len(available) == 1:
        #print(f"Only one word has a valid analysis: {words[available[0]]} with lemma {lemmas[available[0]]}, words: {words}, lemmas: {lemmas}, number_case: {number_case}")
        # if there is only one word in the expression that has a valid analysis, then it is the head of the expression
        head = lemmas[available[0]]
        number_case_head_list = list(number_case[available[0]])
        number = [x[0] for x in number_case_head_list]
        case = [x[1] for x in number_case_head_list]
        
        idx_j = 1 - available[0]  # index of the second word in the pair
        if words[idx_j] == lemmas[idx_j]:
            # if the second word is the same as the lemma, then it is a government relation
            relation = "government"
        else:
            relation = "agreement"

    return number, case, relation, head


def analyze_mwe(phrase, lemma):
    """
    Analyze a multi-word expression.
    :param phrase: The multi-word expression.
    :param lemma: The lemma of the multi-word expression.
    :return: analysis results: numbers, cases, relations, heads, pairs.
    """

    phrase = phrase.replace(" - ", "-")
    phrase = phrase.replace("___", "")

    
    words = phrase.strip().split()
    lemmas = lemma.strip().split() 

    numbers = []
    cases = []
    relations = []
    heads = []
    pairs = []

    i = 0
    while i < len(words)-1:
        if is_conjunction(words[i+1]) and i + 2 < len(words):
            pair_words = [words[i], words[i+2]]
            pair_lemmas = [lemmas[i], lemmas[i+2]]
            i += 2 # skip the conjunction
        
        else:
            pair_words = words[i:i+2]
            pair_lemmas = lemmas[i:i+2]
            i += 1 

        # analyze the pair of words
        analyses = analyze_pair(pair_words, pair_lemmas)
        number, case, relation, head = analyses
        pairs.append(tuple(pair_words))
        numbers.append(number)  
        cases.append(case)
        relations.append(relation)
        heads.append(head)


    return numbers, cases, relations, heads, pairs

In [55]:
mwe_df[["numbers", "cases", "relations", "heads", "pairs"]] = mwe_df.apply(lambda row: pd.Series(analyze_mwe(row["text"], row["lemma"])), axis=1)

In [56]:
mwe_df.sample(10)

Unnamed: 0,phrase_id,doc_id,text,lemma,numbers,cases,relations,heads,pairs
1718,425138,101348,gminie Wieczfnia Kościelna,gmina Wieczfnia Kościelna,"[[sg, sg, sg], sg]","[[voc, loc, dat], nom]","[government, agreement]","[gmina, Wieczfnia]","[(gminie, Wieczfnia), (Wieczfnia, Kościelna)]"
2648,58422,101503,Juliusza III,Juliusz III,"[[sg, sg]]","[[acc, gen]]",[government],[Juliusz],"[(Juliusza, III)]"
7151,466640,102772,Instytutu Naukowo-Badawczego Kolejnictwa,Instytut Naukowo-Badawczy Kolejnictwa,"[sg, sg]","[gen, gen]","[agreement, agreement]","[Instytut, Kolejnictwa]","[(Instytutu, Naukowo-Badawczego), (Naukowo-Badawczego, Kolejnictwa)]"
2971,467312,101573,Don Pedrem de Mendez,Don Pedro de Mendez,"[[sg, sg], [sg.pl, sg.pl, sg.pl, sg.pl, sg.pl, sg.pl, sg.pl], [sg.pl, sg.pl, sg.pl, sg.pl, sg.pl, sg.pl, sg.pl]]","[[acc, nom], [inst, acc, loc, nom, gen, dat, voc], [inst, acc, loc, nom, gen, dat, voc]]","[agreement, agreement, government]","[Don, de, de]","[(Don, Pedrem), (Pedrem, de), (de, Mendez)]"
4686,349215,102030,X Liceum Ogólnokształcącego,X Liceum Ogólnokształcące,"[[sg, sg, sg, sg, sg, sg, sg], [sg, sg]]","[[inst, acc, loc, gen, dat, voc, nom], [acc, gen]]","[government, agreement]","[Liceum, Liceum]","[(X, Liceum), (Liceum, Ogólnokształcącego)]"
107,32097,100520,Papieskim Uniwersytecie Salezjańskim,Papieski Uniwersytet Salezjański,"[sg, sg]","[loc, loc]","[agreement, agreement]","[Uniwersytet, Uniwersytet]","[(Papieskim, Uniwersytecie), (Uniwersytecie, Salezjańskim)]"
1544,422936,101315,hrabstwie Dorset,hrabstwo Dorset,[[sg]],[[loc]],[government],[hrabstwo],"[(hrabstwie, Dorset)]"
4542,473585,101956,niszowych artystów,niszowi artyści,"[[pl, pl]]","[[acc, gen]]",[agreement],[artyści],"[(niszowych, artystów)]"
8189,482833,103944,Julii Marcel,Julia Marcel,[pl],[gen],[agreement],[Julia],"[(Julii, Marcel)]"
2878,356812,101534,pracowników 50+,pracownik 50+,"[[pl, pl]]","[[acc, gen]]",[government],[pracownik],"[(pracowników, 50+)]"


In [57]:
def extract_all_relation_data(numbers, cases, relations, heads):
    results = []

    filtered = [
        (n, c, r, h)
        for n, c, r, h in zip(numbers, cases, relations, heads)
        if n is not None and c is not None and r is not None and h is not None
    ]

    gov_data = [(n, c, r, h) for n, c, r, h in filtered if r == "government"]
    agree_data = [(n, c, r, h) for n, c, r, h in filtered if r == "agreement"]

    # GOV: only 1 pair
    if len(gov_data) == 1:
        n, c, r, h = gov_data[0]
        n_list = n if isinstance(n, list) else [n]
        c_list = c if isinstance(c, list) else [c]
        for num, case in zip(n_list, c_list):
            if num is not None and case is not None:
                results.append(("government", h, num, case))
        return results

    # GOV: more than 1 pair
    elif len(gov_data) > 1:
        all_pairs = []
        for n, c, *_ in gov_data:
            n_list = n if isinstance(n, list) else [n]
            c_list = c if isinstance(c, list) else [c]
            pairs = set(zip(n_list, c_list))
            all_pairs.append(pairs)

        common_pairs = set.intersection(*all_pairs)
        # If there are common pairs, return them
        if common_pairs:
            h = gov_data[0][3]
            for num, case in common_pairs:
                results.append(("government", h, num, case))
            return results

        # If no common pairs, return the first pair
        n = gov_data[0][0]
        c = gov_data[0][1]
        h = gov_data[0][3]
        n_list = n if isinstance(n, list) else [n]
        c_list = c if isinstance(c, list) else [c]

        for num, case in zip(n_list, c_list):
            if num is not None and case is not None:
                results.append(("government", h, num, case))
        return results


    # AGREEMENT
    if agree_data:
        all_pairs = []
        for n, c, *_ in agree_data:
            n_list = n if isinstance(n, list) else [n]
            c_list = c if isinstance(c, list) else [c]
            pairs = set(zip(n_list, c_list))
            all_pairs.append(pairs)

        common_pairs = set.intersection(*all_pairs)
        if common_pairs:
            h = agree_data[0][3]
            for num, case in common_pairs:
                results.append(("agreement", h, num, case))
            return results


        n = agree_data[0][0]
        c = agree_data[0][1]
        h = agree_data[0][3]
        n_list = n if isinstance(n, list) else [n]
        c_list = c if isinstance(c, list) else [c]

        for num, case in zip(n_list, c_list):
            if num is not None and case is not None:
                results.append(("agreement", h, num, case))
        return results

    
    return [(None, None, None, None)]  

expanded_rows = []
for idx, row in mwe_df.iterrows():
    results = extract_all_relation_data(row["numbers"], row["cases"], row["relations"], row["heads"])
    #print(results)
    for rel, head, num, case in results:
        new_row = row.copy()
        new_row["relation"] = rel
        new_row["head"] = head
        new_row["final_number"] = num
        new_row["final_case"] = case
        expanded_rows.append(new_row)

expanded_df = pd.DataFrame(expanded_rows)

In [68]:
expanded_df[160:170]

Unnamed: 0,phrase_id,doc_id,text,lemma,numbers,cases,relations,heads,pairs,relation,head,final_number,final_case
143,309612,100523,Wambierzyckiej Królowej Rodzin,Wambierzycka Królowa Rodzin,"[[sg, sg, sg], [sg, sg, sg]]","[[gen, loc, dat], [gen, loc, dat]]","[agreement, government]","[Królowa, Królowa]","[(Wambierzyckiej, Królowej), (Królowej, Rodzin)]",government,Królowa,sg,gen
143,309612,100523,Wambierzyckiej Królowej Rodzin,Wambierzycka Królowa Rodzin,"[[sg, sg, sg], [sg, sg, sg]]","[[gen, loc, dat], [gen, loc, dat]]","[agreement, government]","[Królowa, Królowa]","[(Wambierzyckiej, Królowej), (Królowej, Rodzin)]",government,Królowa,sg,loc
143,309612,100523,Wambierzyckiej Królowej Rodzin,Wambierzycka Królowa Rodzin,"[[sg, sg, sg], [sg, sg, sg]]","[[gen, loc, dat], [gen, loc, dat]]","[agreement, government]","[Królowa, Królowa]","[(Wambierzyckiej, Królowej), (Królowej, Rodzin)]",government,Królowa,sg,dat
147,305858,100523,wojny trzydziestoletniej,wojna trzydziestoletnia,[sg],[gen],[agreement],[wojna],"[(wojny, trzydziestoletniej)]",agreement,wojna,sg,gen
148,344031,100523,bazyliki mniejszej,bazylika mniejsza,[sg],[gen],[agreement],[bazylika],"[(bazyliki, mniejszej)]",agreement,bazylika,sg,gen
149,344034,100523,hrabiego Franciszka Antoniego von Goetzen,hrabia Franciszek Antoni von Goetzen,"[[sg, sg], [sg, sg], [sg, sg], None]","[[acc, gen], [acc, gen], [acc, gen], None]","[agreement, agreement, government, None]","[hrabia, Franciszek, Antoni, None]","[(hrabiego, Franciszka), (Franciszka, Antoniego), (Antoniego, von), (von, Goetzen)]",government,Antoni,sg,acc
149,344034,100523,hrabiego Franciszka Antoniego von Goetzen,hrabia Franciszek Antoni von Goetzen,"[[sg, sg], [sg, sg], [sg, sg], None]","[[acc, gen], [acc, gen], [acc, gen], None]","[agreement, agreement, government, None]","[hrabia, Franciszek, Antoni, None]","[(hrabiego, Franciszka), (Franciszka, Antoniego), (Antoniego, von), (von, Goetzen)]",government,Antoni,sg,gen
150,32847,100523,Franciszka Antoniego von Goetzen,Franciszek Antoni von Goetzen,"[[sg, sg], [sg, sg], None]","[[acc, gen], [acc, gen], None]","[agreement, government, None]","[Franciszek, Antoni, None]","[(Franciszka, Antoniego), (Antoniego, von), (von, Goetzen)]",government,Antoni,sg,acc
150,32847,100523,Franciszka Antoniego von Goetzen,Franciszek Antoni von Goetzen,"[[sg, sg], [sg, sg], None]","[[acc, gen], [acc, gen], None]","[agreement, government, None]","[Franciszek, Antoni, None]","[(Franciszka, Antoniego), (Antoniego, von), (von, Goetzen)]",government,Antoni,sg,gen
151,309617,100523,Matki Bożej,Matka Boża,[sg],[gen],[agreement],[Matka],"[(Matki, Bożej)]",agreement,Matka,sg,gen


In [59]:
expanded_df.isna().sum()

phrase_id        0
doc_id           0
text             0
lemma            0
numbers          0
cases            0
relations        0
heads            0
pairs            0
relation        43
head            43
final_number    43
final_case      43
dtype: int64

In [60]:
expanded_df[expanded_df['head'].isna()]

Unnamed: 0,phrase_id,doc_id,text,lemma,numbers,cases,relations,heads,pairs,relation,head,final_number,final_case
184,345762,100527,archipelagu Seszeli,archipelag Seszele,[None],[None],[None],[None],"[(archipelagu, Seszeli)]",,,,
407,41471,100620,R. Brautigana,R. Brautigan,[None],[None],[None],[None],"[(R., Brautigana)]",,,,
421,40626,100626,Sokolnikach Lesie,Sokolniki Las,[None],[None],[None],[None],"[(Sokolnikach, Lesie)]",,,,
536,52361,101170,ziemie sanocką,ziemia sanocka,[None],[None],[None],[None],"[(ziemie, sanocką)]",,,,
1152,50581,101256,J. A. Marinsky'ego,J. A. Marinsky,[None],[None],[None],[None],"[(J., Marinsky'ego)]",,,,
1156,50583,101256,L. E. Glendenina,L. E. Glendenin,"[None, None]","[None, None]","[None, None]","[None, None]","[(L., E.), (E., Glendenina)]",,,,
1268,51007,101275,Henrietą von Grunburg,Henrieta von Grunburg,"[None, None]","[None, None]","[None, None]","[None, None]","[(Henrietą, von), (von, Grunburg)]",,,,
1532,422905,101311,V. Vanýska,V. Vanýsk,[None],[None],[None],[None],"[(V., Vanýska)]",,,,
1883,425275,101365,paśmie Błotnego,pasmo Błotny,[None],[None],[None],[None],"[(paśmie, Błotnego)]",,,,
2201,67569,101419,Vincenta Siewa,Vincent Siew,[None],[None],[None],[None],"[(Vincenta, Siewa)]",,,,


In [62]:
expanded_df.shape

(6844, 13)

In [65]:
final_df = expanded_df.dropna()
print(f"Shape of the final Poleval 2019 dataset: {final_df.shape}")

Shape of the final Poleval 2019 dataset: (6801, 13)


In [67]:
final_df.to_csv("final_poleval2019_mwe_dataset.csv")

## NOtes

In [63]:
# def extract_relation_head(relations, heads):
#     for relation, head in zip(relations, heads):
#         if relation == 'government':
#             return 'governemnt', head
#         elif relation == 'agreement':
#             return 'agreement', head
#     return None, None

# mwe_df[["relation", "head"]] = mwe_df.apply(lambda row: pd.Series(extract_relation_head(row["relations"], row["heads"])), axis=1)

In [64]:
# def extract_full_info(numbers, cases, relations, heads):
#     # ignore None values
#     filtered = [
#         (n, c, r, h)
#         for n, c, r, h in zip(numbers, cases, relations, heads)
#         if n is not None and c is not None and r is not None and h is not None
#         ]

#     if not filtered:
#         return None, None, None, None

#     for n, c, r, h in filtered:
#         if r == "government":
#             if len(c)==1:
#                 return r, h, n[0], c[0]
#             else:
#                 all_numbers = [set(x) if isinstance(x, list) else {x} for x in n]

#     # Jeżeli wszystko to związki zgody
#     if all(r == "agreement" for _, _, r, _ in filtered):
#         norm_numbers = [set(x) if isinstance(x, list) else {x} for x, *_ in filtered]
#         norm_cases = [set(x) if isinstance(x, list) else {x} for _, x, *_ in filtered]

#         common_number = set.intersection(*norm_numbers)
#         common_case = set.intersection(*norm_cases)

#         head = filtered[0][3]  # pierwszy head
#         return "agreement", head, (
#             list(common_number)[0] if common_number else None
#         ), (
#             list(common_case)[0] if common_case else None
#         )


#     return None, None, None, None


# mwe_df[["relation", "head", "final_number", "final_case"]] = mwe_df.apply(
#     lambda row: pd.Series(extract_full_info(row["numbers"], row["cases"], row["relations"], row["heads"])),
#     axis=1
# )