In [458]:
import pandas as pd
from morfeusz2 import Morfeusz
morfeusz = Morfeusz() 

In [459]:
pd.set_option('display.max_colwidth', None)

## Loading data and choosing only appropiate MWE

In [460]:
poleval_df = pd.read_csv("poleval2019_task2_training_190221/index.tsv", sep="\t", header=None)
poleval_df.columns = ["phrase_id", "doc_id", "text", "lemma"]
poleval_df.head()

Unnamed: 0,phrase_id,doc_id,text,lemma
0,31822,99883,Toronto Dominion Centre,Toronto Dominion Centre
1,40025,99883,Toronto,Toronto
2,343873,99883,kompleks handlowo-kulturalny,kompleks handlowo-kulturalny
3,31833,99883,Joe Fafard,Joe Fafard
4,327365,99883,kanadyjskim,kanadyjski


In [461]:
print(f"Shape of the original Poleval 2019 dataset: {poleval_df.shape}")

Shape of the original Poleval 2019 dataset: (22177, 4)


In [462]:
mwe_df = poleval_df[poleval_df["text"].str.split().str.len() > 1]
mwe_df.head()

Unnamed: 0,phrase_id,doc_id,text,lemma
0,31822,99883,Toronto Dominion Centre,Toronto Dominion Centre
2,343873,99883,kompleks handlowo-kulturalny,kompleks handlowo-kulturalny
3,31833,99883,Joe Fafard,Joe Fafard
5,343872,99883,Ludwiga Mies van der Rohe,Ludwig Mies van der Rohe
7,343871,99883,Toronto Dominion Gallery of Inuit Art,Toronto Dominion Gallery of Inuit Art


In [463]:
print(f"Shape of the Poleval 2019 dataset (only multi word expressions): {mwe_df.shape}")

Shape of the Poleval 2019 dataset (only multi word expressions): (9718, 4)


**remove not polish MWE**


In [464]:
def is_polish_mwe(text):
    """
    Check if the text is a valid Polish MWE.
    Returns True if at least one word is recognized (not tagged as 'ign').
    """
    analysis = morfeusz.analyse(text)
    for _, _, interpretations in analysis:
        tag = interpretations[2]
        if tag != 'ign':
            return True
    return False

In [465]:
df_unrecognized = mwe_df[~mwe_df['text'].apply(is_polish_mwe)].reset_index(drop=True)
print(f"Number of unrecognized MWEs: {len(df_unrecognized)}")
df_unrecognized.sample(10)

Number of unrecognized MWEs: 566


Unnamed: 0,phrase_id,doc_id,text,lemma
123,51264,101307,Ive Jerolimov,Ive Jerolimov
470,471463,103927,Gézy Komoróczy,Géza Komoróczy
14,305840,100514,Copra Berni Piacenza,Copra Berni Piacenza
386,287721,102423,Creative Commons,Creative Commons
330,293763,102126,Hot Bodies D4,Hot Bodies D4
334,78132,102132,Philip Brickman,Philip Brickman
181,56951,101437,Deep Purple,Deep Purple
74,52684,101190,Harri Haatainen,Harri Haatainen
33,40122,100608,Creative Commons,Creative Commons
193,57645,101459,Deutsche Forschungsgemeinschaft,Deutsche Forschungsgemeinschaft


In [466]:
mwe_df = mwe_df[mwe_df["text"].apply(is_polish_mwe)].reset_index(drop=True)
print(f"Shape of the Poleval 2019 dataset (only valid Polish MWEs): {mwe_df.shape}")

Shape of the Poleval 2019 dataset (only valid Polish MWEs): (9152, 4)


There was 566 fully not recognized expressions by Morfeusz.


**Remove duplicates**


In [467]:
def remove_duplicates(df):
    """
    Remove duplicates from the DataFrame based on 'text' and 'lemma'.
    Keeps the first occurrence.
    """
    return df.drop_duplicates(subset=["text", "lemma"], keep='first')

In [468]:
mwe_df = remove_duplicates(mwe_df)
print(f"Shape of the Poleval 2019 dataset (after removing duplicates): {mwe_df.shape}")

Shape of the Poleval 2019 dataset (after removing duplicates): (7453, 4)


**TO DO: podzielić na nieodmienne i mianowniki:**


In [469]:
mwe_df[mwe_df["text"] == mwe_df["lemma"]]


Unnamed: 0,phrase_id,doc_id,text,lemma
0,31822,99883,Toronto Dominion Centre,Toronto Dominion Centre
1,343873,99883,kompleks handlowo-kulturalny,kompleks handlowo-kulturalny
2,31833,99883,Joe Fafard,Joe Fafard
4,343871,99883,Toronto Dominion Gallery of Inuit Art,Toronto Dominion Gallery of Inuit Art
5,343875,100499,PZL.13 (PZL-13),PZL.13 (PZL-13)
...,...,...,...,...
9137,329445,107360,Marin Leovac,Marin Leovac
9139,329446,107360,Roland Linz,Roland Linz
9142,329448,107360,Tomas Jun,Tomas Jun
9144,329449,107360,Patrick Salomon,Patrick Salomon


In [470]:
mwe_df = mwe_df[mwe_df["text"] != mwe_df["lemma"]]


In [471]:
print(f"Shape of the Poleval 2019 dataset (only inflected mwe): {mwe_df.shape}")


Shape of the Poleval 2019 dataset (only inflected mwe): (3828, 4)


In [472]:
mwe_df[['text', 'lemma']].sample(10)

Unnamed: 0,text,lemma
7091,Polskim Kontyngencie Wojskowym,Polski Kontyngent Wojskowy
965,Anonima tzw. Galla,Anonim tzw. Gall
5751,zjawisk społeczno-ekonomicznych,zjawiska społeczno-ekonomiczne
8544,Radzie UE,Rada UE
5450,"MINISTRA GOSPODARKI, PRACY I POLITYKI SPOŁECZNEJ","Minister Gospodarki, Pracy i Polityki Społecznej"
4625,Ery Ryb,Era Ryb
3322,źródeł energii,źródła energii
3474,Ministerstwa Sprawiedliwości,Ministerstwo Sprawiedliwości
6578,śrubami AXA088,śruby AXA088
1993,Ludzi Zachodu,Ludze Zachodu


## Cleaning

In [473]:
mwe_df.isna().sum()

phrase_id    0
doc_id       0
text         0
lemma        1
dtype: int64

In [474]:
mwe_df[mwe_df["lemma"].isna()]

Unnamed: 0,phrase_id,doc_id,text,lemma
1334,307628,101286,San Diego Rockets,


In [475]:
# I can drop it because it's not a polish MWE
mwe_df = mwe_df.dropna()

In [476]:
mwe_df.isna().sum()

phrase_id    0
doc_id       0
text         0
lemma        0
dtype: int64

In [477]:
mwe_df[mwe_df["text"] == "Dolce & Gabbana"]

Unnamed: 0,phrase_id,doc_id,text,lemma
161,99725,100524,Dolce & Gabbana,Dolce &amp; Gabbana


In [478]:
mwe_df[mwe_df["text"] == "Lublin R-VIII a"]

Unnamed: 0,phrase_id,doc_id,text,lemma
734,65875,101200,Lublin R-VIII a,Lublin R-VIII


In [479]:
mwe_df[mwe_df["text"] == "BBC Radio 1's Live Lounge"]

Unnamed: 0,phrase_id,doc_id,text,lemma
2264,426217,101434,BBC Radio 1's Live Lounge,BBC Radio 1


In [480]:
mwe_df[mwe_df["text"] == "Jammin' CRT.5"]

Unnamed: 0,phrase_id,doc_id,text,lemma
5108,468926,102125,Jammin' CRT.5,Jammin


In [481]:
mwe_df[mwe_df["text"] == "Traxxas Rustler XL-1 '09"]


Unnamed: 0,phrase_id,doc_id,text,lemma
6580,469441,102506,Traxxas Rustler XL-1 '09,Traxxas Rustler XL-1


In [482]:
mwe_df[mwe_df["text"] == "Żerań F S O"]


Unnamed: 0,phrase_id,doc_id,text,lemma
6916,350246,102673,Żerań F S O,Żerań FSO


In [483]:
mwe_df.loc[mwe_df["text"] == "Żerań F S O", "text"] = "Żerań FSO"

In [484]:
mwe_df = mwe_df[mwe_df["text"] != "S K M"]
mwe_df = mwe_df[mwe_df["text"] != "R K S"]

In [485]:
mwe_df[mwe_df["text"] == "átlátszó ("]

Unnamed: 0,phrase_id,doc_id,text,lemma
8097,481079,103921,átlátszó (,átlátszó


In [486]:
mwe_df[mwe_df["text"] == "Społecznej Rady Konsultacyjnej ds. aktualizacji „Strategii Rozwoju Bydgoszczy do 2015 roku”"]

Unnamed: 0,phrase_id,doc_id,text,lemma
5224,79818,102157,Społecznej Rady Konsultacyjnej ds. aktualizacji „Strategii Rozwoju Bydgoszczy do 2015 roku”,Społeczna Rada Konsultacyjna ds. aktualizacji„Strategii Rozwoju Bydgoszczy do 2015 roku


In [487]:
mwe_df.loc[mwe_df["text"] == 'Społecznej Rady Konsultacyjnej ds. aktualizacji „Strategii Rozwoju Bydgoszczy do 2015 roku”', "lemma"] = 'Społeczna Rada Konsultacyjna ds. aktualizacji „Strategii Rozwoju Bydgoszczy do 2015 roku"'

In [488]:
mwe_df[mwe_df["text"].str.contains("Fundacji Ośrodka")]

Unnamed: 0,phrase_id,doc_id,text,lemma
6812,71729,102602,"Fundacji Ośrodka "" Karta ""","Fundacja Ośrodka ""Karta """


In [489]:
mwe_df.loc[mwe_df["text"] == 'Fundacji Ośrodka " Karta "', ["text", "lemma"]] = ['Fundacji Ośrodka "Karta"', 'Fundacja Ośrodka "Karta"']

In [490]:
mwe_df[mwe_df["text"].str.contains("FILLER")] # error during creating dataset

Unnamed: 0,phrase_id,doc_id,text,lemma
6973,474289,102694,Dworca FILLER Centralnego,Dworzec Centralny
6978,350524,102696,Dworzec FILLER Śródmieście,Dworzec Śródmieście


In [491]:
mwe_df.loc[mwe_df["text"] == 'Dworca FILLER Centralnego', "text"] = 'Dworca Centralnego'
mwe_df.loc[mwe_df["text"] == 'Dworzec FILLER Śródmieście', "text"] = 'Dworzec Śródmieście'

In [492]:
mwe_df[mwe_df["text"].str.contains("Herburt")]

Unnamed: 0,phrase_id,doc_id,text,lemma
1202,50764,101263,Jan Herburt (Arłamowski,Jan Herburt (Arłamowski)


In [493]:
mwe_df[mwe_df["text"] == "Sesji Sejmu Dzieci i Młodzieży"]

Unnamed: 0,phrase_id,doc_id,text,lemma
3188,63764,101638,Sesji Sejmu Dzieci i Młodzieży,Sejm Dzieci i Młodzieży


In [494]:
mwe_df.loc[mwe_df["text"] == "Sesji Sejmu Dzieci i Młodzieży", "lemma"] = "Sesja Sejmu Dzieci i Młodzieży"

In [495]:
mwe_df[mwe_df["text"]=='ustawy o zmianie ustawy Kodeks postępowania karnego']

Unnamed: 0,phrase_id,doc_id,text,lemma
3279,63911,101644,ustawy o zmianie ustawy Kodeks postępowania karnego,Ustawao zmianie ustawy Kodeks postępowania karnego


In [496]:
mwe_df.loc[mwe_df["text"]=='ustawy o zmianie ustawy Kodeks postępowania karnego', "lemma"] = 'ustawa o zmianie ustawy Kodeks postępowania karnego'


In [497]:
mwe_df[mwe_df["text"].str.contains("o ujawnieniu pracy lub służby")]


Unnamed: 0,phrase_id,doc_id,text,lemma
4449,81044,101946,o zmianie ustawy o ujawnieniu pracy lub służby w organach bezpieczeństwa państwa lub współpracy z nimi w latach 1944–1990 osób pełniących funkcje publiczne,ustawa o ujawnieniu pracy lub służby w organach bezpieczeństwa państwa lub współpracy z nimi w latach 1944–1990 osób pełniących funkcje publiczne
4451,81046,101946,ustawie z dnia 11 kwietnia 1997 r. o ujawnieniu pracy lub służby w organach bezpieczeństwa państwa lub współpracy z nimi w latach 1944-1990 osób pełniących funkcje publiczne,Ustawa z dnia 11 kwietnia 1997 r. o ujawnieniu pracy lub służby w organach bezpieczeństwa państwa lub współpracy z nimi w latach 1944-1990 osób pełniących funkcje publiczne


In [498]:
mwe_df = mwe_df.drop(index=4449)

In [499]:
mwe_df.drop(mwe_df[mwe_df["text"].isin([
    "Dolce & Gabbana", # not a polish MWE
    "Lublin R-VIII a", # ???
    "BBC Radio 1's Live Lounge", # not a polish MWE
    "Jammin' CRT.5", # not a polish MWE
    "Traxxas Rustler XL-1 '09", # not a polish MWE
    "átlátszó (" # not a polish MWE
])].index, inplace=True)

In [500]:
print(f"Shape of the Poleval 2019 dataset (after cleaning): {mwe_df.shape}")

Shape of the Poleval 2019 dataset (after cleaning): (3818, 4)


In [501]:
mwe_df.to_csv("mwe_cleaned_df.csv")

## Actual analysis

In [502]:
mwe_df = pd.read_csv("mwe_cleaned_df.csv", index_col=0)
print(f"Shape of the cleaned Poleval 2019 dataset: {mwe_df.shape}")

Shape of the cleaned Poleval 2019 dataset: (3818, 4)


In [503]:
mwe_df.head()

Unnamed: 0,phrase_id,doc_id,text,lemma
3,343872,99883,Ludwiga Mies van der Rohe,Ludwig Mies van der Rohe
7,343879,100499,samolotu PZL.23 „Karaś”,samolot PZL.23 „Karaś”
8,31843,100499,Ministerstwa Komunikacji,Ministerstwo Komunikacji
11,343880,100499,silnikiem Pratt-Whitney Wasp,silnik Pratt-Whitney Wasp
14,31839,100499,Stanisława Praussa,Stanisław Prauss


In [504]:
mwe_df.isna().sum()

phrase_id    0
doc_id       0
text         0
lemma        0
dtype: int64

In [505]:
def is_conjunction(word):
    """
    Check if the word is a conjunction.
    Returns True if the word is a conjunction.
    """
    analysis = morfeusz.analyse(word)
    for _, _, interpretations in analysis:
        tag = interpretations[2]
        if tag.startswith('conj'):
            return True
    return False

In [None]:
def analyze_pair(words, lemmas):
    """
    Analyze a 2 word expression.
    :param words: List of 2 words in the expression.
    :param lemmas: List of 2 lemmas corresponding to the words.
    :return: number, case, relation type and head of the expression.
    """
    # possible numbers and cases for the words
    number_case = {}
    part_of_speech = {}

    # returned values
    number = None
    case = None
    relation = None
    head = None

    print(f"Analyzing pair: {words} with lemmas: {lemmas}")

    for i, word in enumerate(words):
        number_case_i = set()
        analyses = morfeusz.analyse(word)

        for _, _, interpretations in analyses:
            tag = interpretations[2].split(":")
            if len(tag) > 2:
                number_i = tag[1]
                case_i = tag[2].split('.')

                # Morfeusz can return more than 1 possibilty of case after the dot so I make a list of it 
                for c in case_i:
                    number_case_i.add((number_i, c))

                part_of_speech[word] = tag[0]

        number_case[i] = number_case_i


    # check if it is possible to find common number and case for the words in the mwe -> ZWIĄZEK ZGODY
    common_cases = set.intersection(*number_case.values())
    if len(common_cases)>=1:
        # if there is only one common pair(number, case), then it is an agreement relation and the number and case of the expression can be assigned to the whole expression
        
        if len(common_cases) == 1:
            number_case_head_list = list(common_cases)[0]
            number = number_case_head_list[0]
            case = number_case_head_list[1]
 
        else:
            number = [x[0] for x in list(common_cases)]
            case = [x[1] for x in list(common_cases)]
            
        relation = "agreement"

        # noun -> HEAD
        for i, (word, tag) in enumerate(part_of_speech.items()):
            if tag == 'subst':
                head = lemmas[i]
                break

        # if there is no noun in the expression, then the head is the first word in the expression
            else:
                head = lemmas[0]

    elif [w_i for w_i, l_i in zip(words, lemmas) if w_i == l_i]:
        # government relation    
        for i, word in enumerate(words):
            if word == lemmas[i]:
                continue
            else: 
                if number_case[i]:
                    head = lemmas[i]
                    number_case_head_list = list(number_case[i])
                    number = [x[0] for x in number_case_head_list]
                    case = [x[1] for x in number_case_head_list]
                    relation = "government"
        
    # else:
    #     print(f"{words}: {number_case}")
    #TODO: zająć się przypadkami gdy jakiegoś wyrazu nie ma w Morfeuszu - jeżeli tylko jedno jest w Morfeuszu to na podstawie tego -> przypadek itd i to jest nasza glowa a zwiazek sprawdzamy po tym czy jest to takie samo w lemma i text
     
    return number, case, relation, head


def analyze_mwe(phrase, lemma):
    """
    Analyze a multi-word expression.
    :param phrase: The multi-word expression.
    :param lemma: The lemma of the multi-word expression.
    :return: analysis results: numbers, cases, relations, heads, pairs.
    """

    phrase = phrase.replace(" - ", "-")
    phrase = phrase.replace("___", "")

    
    words = phrase.strip().split()
    lemmas = lemma.strip().split() 

    numbers = []
    cases = []
    relations = []
    heads = []
    pairs = []

    i = 0
    while i < len(words)-1:
        if is_conjunction(words[i+1]) and i + 2 < len(words):
            pair_words = [words[i], words[i+2]]
            pair_lemmas = [lemmas[i], lemmas[i+2]]
            i += 2 # skip the conjunction
        
        else:
            pair_words = words[i:i+2]
            pair_lemmas = lemmas[i:i+2]
            i += 1 

        # analyze the pair of words
        analyses = analyze_pair(pair_words, pair_lemmas)
        number, case, relation, head = analyses
        pairs.append(tuple(pair_words))
        numbers.append(number)  
        cases.append(case)
        relations.append(relation)
        heads.append(head)


    return numbers, cases, relations, heads, pairs

In [507]:
mwe_df[["numbers", "cases", "relations", "heads", "pairs"]] = mwe_df.apply(lambda row: pd.Series(analyze_mwe(row["text"], row["lemma"])), axis=1)

Analyzing pair: ['Ludwiga', 'Mies'] with lemmas: ['Ludwig', 'Mies']
Analyzing pair: ['Mies', 'van'] with lemmas: ['Mies', 'van']
Analyzing pair: ['van', 'der'] with lemmas: ['van', 'der']
Analyzing pair: ['der', 'Rohe'] with lemmas: ['der', 'Rohe']
Analyzing pair: ['samolotu', 'PZL.23'] with lemmas: ['samolot', 'PZL.23']
Analyzing pair: ['PZL.23', '„Karaś”'] with lemmas: ['PZL.23', '„Karaś”']
Analyzing pair: ['Ministerstwa', 'Komunikacji'] with lemmas: ['Ministerstwo', 'Komunikacji']
Analyzing pair: ['silnikiem', 'Pratt-Whitney'] with lemmas: ['silnik', 'Pratt-Whitney']
Analyzing pair: ['Pratt-Whitney', 'Wasp'] with lemmas: ['Pratt-Whitney', 'Wasp']
Analyzing pair: ['Stanisława', 'Praussa'] with lemmas: ['Stanisław', 'Prauss']
Analyzing pair: ['Państwowych', 'Zakładach'] with lemmas: ['Państwowe', 'Zakłady']
Analyzing pair: ['Zakładach', 'Lotniczych'] with lemmas: ['Zakłady', 'Lotnicze']
Analyzing pair: ['Lotniczych', 'w'] with lemmas: ['Lotnicze', 'w']
Analyzing pair: ['w', 'Warszawie

In [520]:
mwe_df.sample(10)

Unnamed: 0,phrase_id,doc_id,text,lemma,numbers,cases,relations,heads,pairs,relation,head,final_number,final_case
825,307218,101218,wojny 30 letniej,wojna 30 letnia,"[[pl, pl, sg, pl], [sg, sg, sg]]","[[acc, nom, gen, voc], [dat, gen, loc]]","[government, government]","[wojna, letnia]","[(wojny, 30), (30, letniej)]",government,wojna,"[pl, pl, sg, pl]","[acc, nom, gen, voc]"
4794,310817,102048,Rady Osiedla,Rada Osiedla,"[[pl, pl, pl, sg]]","[[acc, nom, voc, gen]]",[agreement],[Rada],"[(Rady, Osiedla)]",agreement,Rada,pl,nom
4046,468428,101849,licencjami Creative Commons,licencja Creative Commons,"[[pl], None]","[[inst], None]","[government, None]","[licencja, None]","[(licencjami, Creative), (Creative, Commons)]",government,licencja,[pl],[inst]
7313,484019,103623,ﬁzyki klasycznej,ﬁzyka klasyczna,[None],[None],[None],[None],"[(ﬁzyki, klasycznej)]",,,,
4060,76084,101852,Cory’ego Doctorowa,Cory Doctorow,[None],[None],[None],[None],"[(Cory’ego, Doctorowa)]",,,,
553,52386,101172,Azji Dalekowschodniej,Azja Dalekowschodnia,"[[sg, sg, sg]]","[[dat, gen, loc]]",[agreement],[Azja],"[(Azji, Dalekowschodniej)]",agreement,Azja,sg,gen
3553,467207,101684,Radę Europejską,Rada Europejska,[sg],[acc],[agreement],[Rada],"[(Radę, Europejską)]",agreement,Rada,sg,acc
5354,466369,102167,Komendanta Chorągwi Ziemi Lubuskiej Związku Harcerstwa Polskiego,Komendant Chorągwi Ziemi Lubuskiej Związku Harcerstwa Polskiego,"[sg, [sg, sg, sg], [sg, sg, sg], [sg, sg], sg, sg]","[gen, [dat, gen, loc], [dat, gen, loc], [gen, loc], gen, gen]","[agreement, agreement, agreement, agreement, agreement, agreement]","[Komendant, Chorągwi, Ziemi, Związku, Związku, Harcerstwa]","[(Komendanta, Chorągwi), (Chorągwi, Ziemi), (Ziemi, Lubuskiej), (Lubuskiej, Związku), (Związku, Harcerstwa), (Harcerstwa, Polskiego)]",agreement,Komendant,sg,gen
5809,340025,102353,Konwentu Seniorów Sejmu,Konwent Seniorów Sejmu,"[[sg], None]","[[gen], None]","[government, None]","[Konwent, None]","[(Konwentu, Seniorów), (Seniorów, Sejmu)]",government,Konwent,[sg],[gen]
3033,466716,101584,projektem ustawy,projekt ustawy,[[sg]],[[inst]],[government],[projekt],"[(projektem, ustawy)]",government,projekt,[sg],[inst]


In [521]:
morfeusz.analyse("ﬁzyki klasycznej") # nie ma fizyki XD

[(0, 1, ('ﬁzyki', 'ﬁzyki', 'ign', [], [])),
 (1, 2, ('klasycznej', 'klasyczny', 'adj:sg:dat:f:pos', [], [])),
 (1, 2, ('klasycznej', 'klasyczny', 'adj:sg:gen:f:pos', [], [])),
 (1, 2, ('klasycznej', 'klasyczny', 'adj:sg:loc:f:pos', [], []))]

In [None]:
def extract_all_relation_data(numbers, cases, relations, heads):
    results = []

    filtered = [
        (n, c, r, h)
        for n, c, r, h in zip(numbers, cases, relations, heads)
        if n is not None and c is not None and r is not None and h is not None
    ]

    gov_data = [(n, c, r, h) for n, c, r, h in filtered if r == "government"]
    agree_data = [(n, c, r, h) for n, c, r, h in filtered if r == "agreement"]

    # GOV: only 1 pair
    if len(gov_data) == 1:
        n, c, r, h = gov_data[0]
        n_list = n if isinstance(n, list) else [n]
        c_list = c if isinstance(c, list) else [c]
        for num, case in zip(n_list, c_list):
            if num is not None and case is not None:
                results.append(("government", h, num, case))
        return results

    # GOV: more than 1 pair
    elif len(gov_data) > 1:
        all_pairs = []
        for n, c, *_ in gov_data:
            n_list = n if isinstance(n, list) else [n]
            c_list = c if isinstance(c, list) else [c]
            pairs = set(zip(n_list, c_list))
            all_pairs.append(pairs)

        common_pairs = set.intersection(*all_pairs)
        if common_pairs:
            h = gov_data[0][3]
            for num, case in common_pairs:
                results.append(("government", h, num, case))
            return results

    # AGREEMENT
    if agree_data:
        all_pairs = []
        for n, c, *_ in agree_data:
            n_list = n if isinstance(n, list) else [n]
            c_list = c if isinstance(c, list) else [c]
            pairs = set(zip(n_list, c_list))
            all_pairs.append(pairs)

        common_pairs = set.intersection(*all_pairs)
        if common_pairs:
            h = agree_data[0][3]
            for num, case in common_pairs:
                results.append(("agreement", h, num, case))
            return results

    return [("unknown", None, None, None)]


for idx, row in mwe_df.iterrows():
    results = extract_all_relation_data(row["numbers"], row["cases"], row["relations"], row["heads"])
    for rel, head, num, case in results:
        new_row = row.copy()
        new_row["relation"] = rel
        new_row["head"] = head
        new_row["final_number"] = num
        new_row["final_case"] = case
        expanded_rows.append(new_row)

expanded_df = pd.DataFrame(expanded_rows)

In [528]:
expanded_df.head(10)

Unnamed: 0,phrase_id,doc_id,text,lemma,numbers,cases,relations,heads,pairs,relation,head,final_number,final_case
3,343872,99883,Ludwiga Mies van der Rohe,Ludwig Mies van der Rohe,"[[sg, sg], None, None, None]","[[gen, acc], None, None, None]","[government, None, None, None]","[Ludwig, None, None, None]","[(Ludwiga, Mies), (Mies, van), (van, der), (der, Rohe)]",government,Ludwig,sg,gen
3,343872,99883,Ludwiga Mies van der Rohe,Ludwig Mies van der Rohe,"[[sg, sg], None, None, None]","[[gen, acc], None, None, None]","[government, None, None, None]","[Ludwig, None, None, None]","[(Ludwiga, Mies), (Mies, van), (van, der), (der, Rohe)]",government,Ludwig,sg,acc
7,343879,100499,samolotu PZL.23 „Karaś”,samolot PZL.23 „Karaś”,"[[sg], None]","[[gen], None]","[government, None]","[samolot, None]","[(samolotu, PZL.23), (PZL.23, „Karaś”)]",government,samolot,sg,gen
3,343872,99883,Ludwiga Mies van der Rohe,Ludwig Mies van der Rohe,"[[sg, sg], None, None, None]","[[gen, acc], None, None, None]","[government, None, None, None]","[Ludwig, None, None, None]","[(Ludwiga, Mies), (Mies, van), (van, der), (der, Rohe)]",government,Ludwig,sg,gen
3,343872,99883,Ludwiga Mies van der Rohe,Ludwig Mies van der Rohe,"[[sg, sg], None, None, None]","[[gen, acc], None, None, None]","[government, None, None, None]","[Ludwig, None, None, None]","[(Ludwiga, Mies), (Mies, van), (van, der), (der, Rohe)]",government,Ludwig,sg,acc
7,343879,100499,samolotu PZL.23 „Karaś”,samolot PZL.23 „Karaś”,"[[sg], None]","[[gen], None]","[government, None]","[samolot, None]","[(samolotu, PZL.23), (PZL.23, „Karaś”)]",government,samolot,sg,gen
8,31843,100499,Ministerstwa Komunikacji,Ministerstwo Komunikacji,[sg],[gen],[agreement],[Ministerstwo],"[(Ministerstwa, Komunikacji)]",agreement,Ministerstwo,sg,gen
11,343880,100499,silnikiem Pratt-Whitney Wasp,silnik Pratt-Whitney Wasp,"[[sg], None]","[[inst], None]","[government, None]","[silnik, None]","[(silnikiem, Pratt-Whitney), (Pratt-Whitney, Wasp)]",government,silnik,sg,inst
14,31839,100499,Stanisława Praussa,Stanisław Prauss,"[[sg, sg]]","[[gen, acc]]",[agreement],[Stanisław],"[(Stanisława, Praussa)]",agreement,Stanisław,sg,gen
14,31839,100499,Stanisława Praussa,Stanisław Prauss,"[[sg, sg]]","[[gen, acc]]",[agreement],[Stanisław],"[(Stanisława, Praussa)]",agreement,Stanisław,sg,acc


In [530]:
expanded_df.isna().sum()

phrase_id         0
doc_id            0
text              0
lemma             0
numbers           0
cases             0
relations         0
heads             0
pairs             0
relation          0
head            349
final_number    349
final_case      349
dtype: int64

In [529]:
expanded_df.shape

(6389, 13)

In [None]:
morfeusz.analyse("Woli Mieleckiej")

[(0, 1, ('Woli', 'wola', 'subst:sg:gen:f', ['nazwa_pospolita'], [])),
 (0, 1, ('Woli', 'wola', 'subst:sg:dat.loc:f', ['nazwa_pospolita'], [])),
 (0, 1, ('Woli', 'wola', 'subst:pl:gen:f', ['nazwa_pospolita'], [])),
 (0, 1, ('Woli', 'Wola:Sf~i', 'subst:sg:gen:f', ['nazwa_geograficzna'], [])),
 (0,
  1,
  ('Woli', 'Wola:Sf~i', 'subst:sg:dat.loc:f', ['nazwa_geograficzna'], [])),
 (0, 1, ('Woli', 'Wola:Sf~i', 'subst:pl:gen:f', ['nazwa_geograficzna'], [])),
 (0, 1, ('Woli', 'Wola:Sf~i~l', 'subst:sg:gen:f', ['nazwisko'], [])),
 (0, 1, ('Woli', 'Wola:Sf~i~l', 'subst:sg:dat.loc:f', ['nazwisko'], [])),
 (0, 1, ('Woli', 'Wola:Sf~i~l', 'subst:pl:gen:f', ['nazwisko'], ['hom.'])),
 (0, 1, ('Woli', 'Wola:Sm1', 'subst:sg:gen:m1', ['nazwisko'], [])),
 (0, 1, ('Woli', 'Wola:Sm1', 'subst:sg:dat.loc:m1', ['nazwisko'], [])),
 (0, 1, ('Woli', 'wole', 'subst:pl:gen:n:ncol', ['nazwa_pospolita'], [])),
 (0, 1, ('Woli', 'woli', 'adj:pl:nom.voc:m1:pos', [], [])),
 (0, 1, ('Woli', 'woli', 'adj:sg:acc:m3:pos', [],

## NOtes

In [531]:
# def extract_relation_head(relations, heads):
#     for relation, head in zip(relations, heads):
#         if relation == 'government':
#             return 'governemnt', head
#         elif relation == 'agreement':
#             return 'agreement', head
#     return None, None

# mwe_df[["relation", "head"]] = mwe_df.apply(lambda row: pd.Series(extract_relation_head(row["relations"], row["heads"])), axis=1)

In [532]:
# def extract_full_info(numbers, cases, relations, heads):
#     # ignore None values
#     filtered = [
#         (n, c, r, h)
#         for n, c, r, h in zip(numbers, cases, relations, heads)
#         if n is not None and c is not None and r is not None and h is not None
#         ]

#     if not filtered:
#         return None, None, None, None

#     for n, c, r, h in filtered:
#         if r == "government":
#             if len(c)==1:
#                 return r, h, n[0], c[0]
#             else:
#                 all_numbers = [set(x) if isinstance(x, list) else {x} for x in n]

#     # Jeżeli wszystko to związki zgody
#     if all(r == "agreement" for _, _, r, _ in filtered):
#         norm_numbers = [set(x) if isinstance(x, list) else {x} for x, *_ in filtered]
#         norm_cases = [set(x) if isinstance(x, list) else {x} for _, x, *_ in filtered]

#         common_number = set.intersection(*norm_numbers)
#         common_case = set.intersection(*norm_cases)

#         head = filtered[0][3]  # pierwszy head
#         return "agreement", head, (
#             list(common_number)[0] if common_number else None
#         ), (
#             list(common_case)[0] if common_case else None
#         )


#     return None, None, None, None


# mwe_df[["relation", "head", "final_number", "final_case"]] = mwe_df.apply(
#     lambda row: pd.Series(extract_full_info(row["numbers"], row["cases"], row["relations"], row["heads"])),
#     axis=1
# )